In [46]:
from Bio import SeqIO
from Bio import Seq
import pandas as pd
import numpy as np
import random
import csv
import re
import os

In [2]:
def seqToString(motif):
    """
    Convert an indexable sequence of characters into a plain string.

    motif: a Seq that represents the motif (any object that supports
           len() and integer indexing yielding strings works the same way)

    Returns the String representation of the motif, i.e. the items at
    positions 0 .. len(motif) - 1 concatenated in order.
    """
    # Index explicitly instead of calling str(): only len() and integer
    # indexing are required of `motif`, exactly as before.
    return "".join(motif[idx] for idx in range(len(motif)))

In [3]:
def getNegative(pos_seq):
    """
    Return the reverse complement of a DNA sequence.

    pos_seq = dna sequence in the positive direction reading from the file
              (case-insensitive; may contain '-' gap characters and IUPAC
              ambiguity codes)

    Returns the negative counterpart of the positive sequence as an
    upper-case string: every base is complemented and the order of the
    bases is reversed. An empty input yields an empty string.

    Raises KeyError if a character has no entry in the complement table.
    """
    # Named `complement` (not `dict`) so the builtin type is not shadowed.
    # Beyond A/C/G/T, '-' and 'N', the table carries the standard IUPAC
    # ambiguity codes so such sequences no longer abort with a KeyError.
    complement = {
        'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
        'R': 'Y', 'Y': 'R',   # purine (A/G) <-> pyrimidine (C/T)
        'K': 'M', 'M': 'K',   # keto (G/T) <-> amino (A/C)
        'S': 'S', 'W': 'W',   # strong (C/G) and weak (A/T) are self-complementary
        'B': 'V', 'V': 'B',   # not-A <-> not-T
        'D': 'H', 'H': 'D',   # not-C <-> not-G
        'N': 'N', '-': '-',
    }
    return "".join(complement[base.upper()] for base in reversed(pos_seq))

In [73]:
def randomize_strand(length):
    """
    Draw a random strand label for each of `length` entries.

    length: number of labels to generate

    Returns a list with one string per entry: "positive" when a fresh
    random.random() draw is below 0.5, otherwise "negative".
    """
    return [
        "positive" if random.random() < 0.5 else "negative"
        for _ in np.arange(length)
    ]
    

In [124]:
def randomize_position(spec, no_thresh_path, length, path_to_raw):
    """
    Draw one random alignment position and strand per species and look the
    draws up in the non-thresholded table.

    spec: region identifier (not used inside this function)
    no_thresh_path: path to the non-thresholded CSV; the code reads the
        columns 'species', 'align_position', 'strand', 'raw_position',
        'score', 'motif' and 'Unnamed: 0' from it
    length: requested sequence length (not used inside this function)
    path_to_raw: location of the raw data (not used inside this function)

    Returns a DataFrame produced by merging the random
    (align_position, strand) pairs with the non-thresholded rows.
    Also prints the list of drawn positions.
    """
    full_table = pd.read_csv(no_thresh_path)

    # 1. One row per species, every column reduced to its maximum.
    per_species = full_table.groupby('species').max()

    # 2. For each species draw an index between 0 and its largest
    #    align_position (randint includes both ends).
    rand_pos = [
        random.randint(0, upper)
        for upper in per_species["align_position"].values
    ]
    print(rand_pos)

    # 3. Keep only the columns that are overwritten with the random draws.
    per_species = per_species.drop(
        columns=['score', 'motif', 'raw_position', 'Unnamed: 0']
    )
    per_species['align_position'] = np.array(rand_pos)
    per_species['strand'] = np.array(randomize_strand(len(rand_pos)))

    # 4. Fetch the rows (and with them the raw_position) matching each draw.
    # NOTE(review): the join uses position and strand only, so a draw made
    # for one species matches rows of any species - confirm this is intended.
    lookup = full_table.drop(columns=['motif', 'Unnamed: 0', 'score'])
    return pd.merge(per_species, lookup, on=['align_position', 'strand'])

In [125]:
# Try the sampler on region VT6436; the returned frame is what the cell displays.
randomize_position(
    spec="VT6436",
    no_thresh_path="~/motif_extraction/data/sub_no_map_motif_bcd_no_threshold/VT6436.fa.csv",
    length=5,
    path_to_raw="VT6436.fa",
)

[1308, 1430, 3185, 2166, 1824, 1243, 3590, 2612, 826, 136, 997, 2052, 1660, 624, 3110, 956, 2792, 542, 267, 998, 279, 1454, 3635, 1994]


Unnamed: 0,strand,align_position,species,raw_position
0,negative,1308,VT6436|1|MEMB004E|-|2496,979
1,negative,1308,VT6436|1|MEMB006C|-|2759,1051
2,positive,1430,VT6436|1|MEMB005D|+|2691,1085
3,positive,3185,VT6436|1|MEMB002A|+|2701,2106
4,positive,3185,VT6436|1|MEMB002B|-|2408,1877
5,positive,3185,VT6436|1|MEMB002C|+|2709,2128
6,positive,3185,VT6436|1|MEMB002D|-|2501,1945
7,positive,3185,VT6436|1|MEMB002E|-|2699,2120
8,positive,3185,VT6436|1|MEMB002F|-|2670,2129
9,positive,3185,VT6436|1|MEMB003A|-|2262,1807


In [126]:
def raw_string(spec, length, no_thresh_path, motif_key, path_to_raw,
               output_dir="/Users/niharikadesaraju/motif_extraction/data/output"):
    """ Extract a raw sequence window at randomly drawn positions and save
        the table as a csv.

        spec: The region name ex: "VT6436" or "VT6436.fa"; everything from
              the first '.' on and one leading '_' are stripped
        length: The length of the sequence that we want
        no_thresh_path: Path to the non thresholded csv for this region
        motif_key: Not used by this function; kept for existing callers
        path_to_raw: Folder that holds
                     outlier_rm_with_length_<region>.fa
        output_dir: Folder the csv is written to; defaults to the location
                    that used to be hard-coded

        Returns None. Writes <output_dir>/<region>_random.csv and prints the
        table (before and after adding 'raw_seq') and the output path.
        """

    result = randomize_position(spec, no_thresh_path, length, path_to_raw)
    print(result)

    num_spec = spec.split('.')[0]
    # startswith() instead of num_spec[0]: no IndexError when the name is
    # empty (e.g. spec == "" or spec begins with '.').
    if num_spec.startswith("_"):
        num_spec = num_spec[1:]
    spec_path = path_to_raw + "/outlier_rm_with_length_" + str(num_spec) + ".fa"
    record_dict = SeqIO.to_dict(SeqIO.parse(spec_path, "fasta"))

    sequences = []
    for _, row in result.iterrows():
        record = record_dict[row['species']]
        start = row['raw_position']
        # Slicing clips at the end of the record, so a window that starts
        # near the end is shorter than `length`.
        window = seqToString(record[start:start + length])
        if row['strand'] == 'negative':
            # Negative strand: report the reverse complement of the window.
            window = getNegative(window)
        sequences.append(window)

    result['raw_seq'] = np.array(sequences)
    print(result)
    path = os.path.join(output_dir, str(num_spec) + "_random.csv")
    print(path)
    result.to_csv(path)

In [127]:
# Run the extraction for region VT6436 with windows of 5 bases.
raw_string(
    spec="VT6436",
    length=5,
    no_thresh_path="/Users/niharikadesaraju/motif_extraction/data/sub_no_map_motif_bcd_no_threshold/VT6436.fa.csv",
    motif_key="VT6436.fa",
    path_to_raw="/Users/niharikadesaraju/motif_extraction/data/raw",
)

[1606, 2188, 1825, 1374, 2488, 1875, 3876, 47, 1656, 3362, 691, 109, 2480, 2061, 103, 2882, 2295, 6, 2964, 1448, 1574, 2892, 637, 1019]
       strand  align_position                   species  raw_position
0    positive            1606  VT6436|1|MEMB002A|+|2701          1093
1    positive            1606  VT6436|1|MEMB002B|-|2408          1091
2    positive            1606  VT6436|1|MEMB002C|+|2709          1115
3    positive            1606  VT6436|1|MEMB002D|-|2501          1020
4    positive            1606  VT6436|1|MEMB002E|-|2699          1014
5    positive            1606  VT6436|1|MEMB002F|-|2670          1171
6    positive            1606  VT6436|1|MEMB003A|-|2262          1017
7    positive            1606  VT6436|1|MEMB003B|+|2585          1130
8    positive            1606  VT6436|1|MEMB003C|-|2741          1135
9    positive            1606  VT6436|1|MEMB003D|-|2312           986
10   positive            1606  VT6436|1|MEMB003F|-|2611          1128
11   positive           