In [1]:
from Bio import SeqIO
from Bio import Seq
import pandas as pd
import numpy as np
import csv
import re
import os

In [2]:
def seqToString(motif):
    """
    motif: a Seq that represents the motif 
    
    Returns the String representation of the motif
    """
    i = 0
    string = ""
    length = motif.__len__()
    while i < length:
        string += motif.__getitem__(i)
        i += 1
    return string

In [49]:
def getNegative(pos_seq):
    """
    pos_seq = dna sequence in the positive direction reading from the file
    
    Returns the negative counterpart of the positive sequence.
    """
    dict = {"A":'T','T':'A','G':'C','C':'G','-':'-','N':'N'}
    negative = ""
    last_index = len(pos_seq) - 1
    while last_index > -1:
        negative += dict[pos_seq[last_index].upper()]
        last_index -= 1
    return negative

In [4]:
def getMotifLength(motif_key):
    dir = "../data/jaspar_fm/modified/"
    name = motif_key + ".jaspar"
    for file in os.listdir(dir):
        if file.endswith(name):
            file_dir = dir + file
            f = open(file_dir, 'r')
            length = 0
            for line in f:
                a = line.split()
                length = len(a)
                return length

In [44]:
getMotifLength("cad")

11

In [53]:
getMotifLength("eve")

7

In [6]:
species_list = [11048, 16679, 19895, 40548, 43692, 44110, 48156, 50550, 59000, 60074, 61209, 6436, 6705, 7859, 8646]
member = ['Dkik', 'MEMB002A', 'MEMB002B', 'MEMB002C', 'MEMB002D', 'MEMB002E', 'MEMB002F', 'MEMB003A', 'MEMB003B', 
          'MEMB003C', 'MEMB003D', 'MEMB003E', 'MEMB003F', 'MEMB004A', 'MEMB004B', 'MEMB004E', 'MEMB005D', 'MEMB006B', 
          'MEMB006C', 'MEMB007A', 'MEMB007B', 'MEMB007C', 'MEMB007D', 'MEMB008C']


## Testing

In [18]:
thresh = pd.read_csv("../data/output/map_motif_bcd_with_threshold/occurance_align_outlier_rm_with_length_VT11048.fa.csv")
no_thresh = pd.read_csv("../data/output/map_motif_bcd_no_threshold/VT11048.fa.csv")

In [19]:
""" Getting the threshold csv in the format we want:
        - drop duplicates of aligned positions
        - drop score, motif, raw_postition, Unnamed: 0   """ 

verif_motifs = thresh.drop(columns = ['score', 'motif', 'raw_position', 'Unnamed: 0', 'strand'])
thresh = thresh.drop_duplicates(subset = 'align_position')
thresh = thresh.drop(columns = ['score', 'motif', 'raw_position', 'Unnamed: 0'])

In [20]:
thresh

Unnamed: 0,species,strand,align_position
0,VT11048|0|MEMB005D|-|287,negative,60


In [21]:
verif_motifs

Unnamed: 0,species,align_position
0,VT11048|0|MEMB005D|-|287,60
1,VT11048|0|MEMB006A|+|286,60


In [22]:
keys = verif_motifs['species'].to_list()
vals = verif_motifs['align_position'].to_list()
print(keys)
print(vals)
verif_motifs_dict = dict(zip(keys, vals))
print(verif_motifs_dict)
k = 'VT11048|0|MEMB005D|-|287' in verif_motifs_dict.keys()
print(k)

['VT11048|0|MEMB005D|-|287', 'VT11048|0|MEMB006A|+|286']
[60, 60]
{'VT11048|0|MEMB005D|-|287': 60, 'VT11048|0|MEMB006A|+|286': 60}
True


In [23]:
""" Getting the no threshold csv in the format that we want it. 
        - drop the columns: motif', 'Unnamed:0'. """

no_thresh = no_thresh.drop(columns = ['motif', 'Unnamed: 0'])

In [24]:
no_thresh

Unnamed: 0,score,species,raw_position,strand,align_position
0,-6.140227,VT11048|0|MEMB002A|+|284,0,positive,21
1,1.092593,VT11048|0|MEMB002A|+|284,0,negative,21
2,-6.140227,VT11048|0|MEMB002A|+|284,1,positive,22
3,1.092593,VT11048|0|MEMB002A|+|284,1,negative,22
4,-1.892300,VT11048|0|MEMB002A|+|284,2,positive,23
5,3.355628,VT11048|0|MEMB002A|+|284,2,negative,23
6,-4.140227,VT11048|0|MEMB002A|+|284,3,positive,24
7,-3.307337,VT11048|0|MEMB002A|+|284,3,negative,24
8,2.355628,VT11048|0|MEMB002A|+|284,4,positive,25
9,0.107700,VT11048|0|MEMB002A|+|284,4,negative,25


In [25]:
""" Merging the two dfs to get the corresponding raw_positions for every species
        - merge based on align_position
        - drop ever duplicate of the species_y  """
result = pd.merge(thresh, no_thresh, on = ['align_position', 'strand'])

In [26]:
result.drop(columns = ['species_x'])

Unnamed: 0,strand,align_position,score,species_y,raw_position
0,negative,60,-2.193808,VT11048|0|MEMB002A|+|284,39
1,negative,60,-2.193808,VT11048|0|MEMB002B|-|284,39
2,negative,60,-2.193808,VT11048|0|MEMB002C|-|284,39
3,negative,60,-3.193808,VT11048|0|MEMB002D|+|269,35
4,negative,60,1.007825,VT11048|0|MEMB002E|+|288,40
5,negative,60,-2.193808,VT11048|0|MEMB002F|+|278,36
6,negative,60,1.007825,VT11048|0|MEMB003A|+|278,39
7,negative,60,-1.193808,VT11048|0|MEMB003B|+|289,43
8,negative,60,-2.193808,VT11048|0|MEMB003C|-|284,39
9,negative,60,1.007825,VT11048|0|MEMB003D|+|274,35


In [27]:
species = result['species_y']
species[0]

'VT11048|0|MEMB002A|+|284'

In [28]:
pos = result['align_position']
print(pos[0])

60


In [29]:
orig = []
for i in np.arange(len(result)):
    orig += ['no']
for key in verif_motifs_dict:
    species = result['species_y']
    pos = result['align_position']
    for i in np.arange(len(species)):
        if key == species[i]:
            if str(verif_motifs_dict[key]) == str(pos[i]):
                orig[i] = 'yes'
orig

['no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no']

In [30]:
print(len(result))
print(result)
print(len(orig))
print(orig)

24
                   species_x    strand  align_position     score  \
0   VT11048|0|MEMB005D|-|287  negative              60 -2.193808   
1   VT11048|0|MEMB005D|-|287  negative              60 -2.193808   
2   VT11048|0|MEMB005D|-|287  negative              60 -2.193808   
3   VT11048|0|MEMB005D|-|287  negative              60 -3.193808   
4   VT11048|0|MEMB005D|-|287  negative              60  1.007825   
5   VT11048|0|MEMB005D|-|287  negative              60 -2.193808   
6   VT11048|0|MEMB005D|-|287  negative              60  1.007825   
7   VT11048|0|MEMB005D|-|287  negative              60 -1.193808   
8   VT11048|0|MEMB005D|-|287  negative              60 -2.193808   
9   VT11048|0|MEMB005D|-|287  negative              60  1.007825   
10  VT11048|0|MEMB005D|-|287  negative              60 -1.193808   
11  VT11048|0|MEMB005D|-|287  negative              60 -2.193808   
12  VT11048|0|MEMB005D|-|287  negative              60 -1.193808   
13  VT11048|0|MEMB005D|-|287  negative       

In [31]:
result['motif?'] = orig
result

Unnamed: 0,species_x,strand,align_position,score,species_y,raw_position,motif?
0,VT11048|0|MEMB005D|-|287,negative,60,-2.193808,VT11048|0|MEMB002A|+|284,39,no
1,VT11048|0|MEMB005D|-|287,negative,60,-2.193808,VT11048|0|MEMB002B|-|284,39,no
2,VT11048|0|MEMB005D|-|287,negative,60,-2.193808,VT11048|0|MEMB002C|-|284,39,no
3,VT11048|0|MEMB005D|-|287,negative,60,-3.193808,VT11048|0|MEMB002D|+|269,35,no
4,VT11048|0|MEMB005D|-|287,negative,60,1.007825,VT11048|0|MEMB002E|+|288,40,no
5,VT11048|0|MEMB005D|-|287,negative,60,-2.193808,VT11048|0|MEMB002F|+|278,36,no
6,VT11048|0|MEMB005D|-|287,negative,60,1.007825,VT11048|0|MEMB003A|+|278,39,no
7,VT11048|0|MEMB005D|-|287,negative,60,-1.193808,VT11048|0|MEMB003B|+|289,43,no
8,VT11048|0|MEMB005D|-|287,negative,60,-2.193808,VT11048|0|MEMB003C|-|284,39,no
9,VT11048|0|MEMB005D|-|287,negative,60,1.007825,VT11048|0|MEMB003D|+|274,35,no


In [32]:
result.drop(columns=['species_x'])

Unnamed: 0,strand,align_position,score,species_y,raw_position,motif?
0,negative,60,-2.193808,VT11048|0|MEMB002A|+|284,39,no
1,negative,60,-2.193808,VT11048|0|MEMB002B|-|284,39,no
2,negative,60,-2.193808,VT11048|0|MEMB002C|-|284,39,no
3,negative,60,-3.193808,VT11048|0|MEMB002D|+|269,35,no
4,negative,60,1.007825,VT11048|0|MEMB002E|+|288,40,no
5,negative,60,-2.193808,VT11048|0|MEMB002F|+|278,36,no
6,negative,60,1.007825,VT11048|0|MEMB003A|+|278,39,no
7,negative,60,-1.193808,VT11048|0|MEMB003B|+|289,43,no
8,negative,60,-2.193808,VT11048|0|MEMB003C|-|284,39,no
9,negative,60,1.007825,VT11048|0|MEMB003D|+|274,35,no


## Final Code

In [5]:
def prep_table(spec, motif_key, thresh_path, no_thresh_path):
    """ 
        spec: The number representing a region ex: 11048
        motif_key: The three character code representing a motif
        path: The path to the folder with the thresholded and non thresholded directories
        
        Returns table containing the columns: strand direction, aligned position, score, species id, raw postion,
        and whether or not the position is considered the beginning of a motif
        
    """
    print("is thresh path")
    print(os.path.isfile(thresh_path))
    print("is no thresh path")
    print(os.path.isfile(no_thresh_path))

    thresh = pd.read_csv(thresh_path)
    no_thresh = pd.read_csv(no_thresh_path)
    verif_motifs = thresh.drop(columns = ['score', 'motif', 'raw_position', 'Unnamed: 0', 'strand'])
    keys = verif_motifs['species'].to_list()
    vals = verif_motifs['align_position'].to_list()
    verif_motifs_dict = dict(zip(keys, vals))
    thresh = thresh.drop_duplicates(subset = 'align_position')
    thresh = thresh.drop(columns = ['score', 'motif', 'raw_position', 'Unnamed: 0'])
    no_thresh = no_thresh.drop(columns = ['motif', 'Unnamed: 0'])
    result = pd.merge(thresh, no_thresh, on = ['align_position', 'strand'])
    result.drop(columns = ['species_x'])
    orig = []
    for i in np.arange(len(result)):
        orig += ['no']
    for key in verif_motifs_dict:
        species = result['species_y']
        pos = result['align_position']
        for i in np.arange(len(species)):
            if key == species[i]:
                if str(verif_motifs_dict[key]) == str(pos[i]):
                    orig[i] = 'yes'
    result['motif?'] = orig
    result = result.drop(columns=['species_x'])
    return result
                    
                    
                    

In [36]:
prep_table(48156)

Unnamed: 0,strand,align_position,score,species_y,raw_position,motif?
0,negative,153,-4.140227,VT48156|0|MEMB002A|-|586,111,no
1,negative,153,-4.140227,VT48156|0|MEMB002B|-|618,137,no
2,negative,153,-4.140227,VT48156|0|MEMB002C|+|593,111,no
3,negative,153,-3.193808,VT48156|0|MEMB002D|+|551,77,yes
4,negative,153,-4.140227,VT48156|0|MEMB002E|-|515,44,no
5,negative,153,-4.140227,VT48156|0|MEMB002F|-|568,112,no
6,negative,153,-4.140227,VT48156|0|MEMB003A|+|530,71,no
7,negative,153,-4.140227,VT48156|0|MEMB003B|+|553,90,no
8,negative,153,-4.140227,VT48156|0|MEMB003C|+|564,89,no
9,negative,153,-6.140227,VT48156|0|MEMB003D|+|541,83,no


In [37]:
result = prep_table(11048)

In [175]:
thresh2 = pd.read_csv("../data/output/map_motif_bcd_with_threshold/occurance_align_outlier_rm_with_length_VT11048.fa.csv")
thresh2 = thresh2.aggregate('align_position')




In [176]:
thresh2

0    60
1    60
Name: align_position, dtype: int64

In [19]:
#def raw_positions(name, thresh_file, length):
for file in os.listdir('../data/output/map_motif_bcd_with_threshold'):
    thresh = pd.read_csv(file)
    num = file.split('length_', 1)[1]
    no_thresh = pd.read_csv("../data/output/map_motif_bcd_no_threshold/"+str(num)+".fa.csv")
    print(num)

FileNotFoundError: [Errno 2] File b'occurance_align_outlier_rm_with_length_VT7859.fa.csv' does not exist: b'occurance_align_outlier_rm_with_length_VT7859.fa.csv'

In [67]:
def raw_string(spec, motif_length, thresh_path, no_thresh_path, motif_key, path_to_raw):
    """ spec: The region number ex: 11048
        length: The length of the sequence that we want
        path: The path to the folder with the thresholded and non thresholded directories
        
        Returns a csv saved under data/output/full_raw_motif_extraction/*spec*
        """
    result = prep_table(spec, motif_key, thresh_path, no_thresh_path)
    num_spec = spec[6:11]
    print(path_to_raw)
    print("num_spec:" + num_spec)
    spec_path = path_to_raw + "/outlier_rm_with_length_VT"+str(num_spec)+".fa"
    record_dict = SeqIO.to_dict(SeqIO.parse(spec_path, "fasta"))
    sequences = []
    before = []
    after = []
    length = motif_length
    for index, row in result.iterrows():
        speci = row['species_y']
        #print(spec)
        pos = row['raw_position']
        #print(pos)
        strand = row['strand']
        #print(strand)
        seq = record_dict[speci]
        if strand == 'negative':
            sequences.append(getNegative(seqToString(seq[pos:pos + 6])))
            before.append(getNegative(seqToString(seq[pos - length:pos])))
            after.append(getNegative(seqToString(seq[pos + 6:pos + 6 + length])))
        else:
            sequences.append(seqToString(seq[pos:pos + 6]))
            before.append(seqToString(seq[pos - length:pos]))
            after.append(seqToString(seq[pos + 6:pos + 6 + length]))
    # print(sequences)
    # print(len(sequences))
    # print(sequences.index('TAAGCC'))
    result['raw_seq'] = np.array(sequences)
    result['before_seq'] = np.array(before)
    result['after_seq'] = np.array(after)
    result
    if os.path.exists("/Volumes/Samsung_T5/output") == False:
        os.mkdir("/Volumes/Samsung_T5/output")
    result.to_csv("/Volumes/Samsung_T5/output/"+str(spec)+"_final_raw.fa.csv")

In [41]:
s = 11048
raw_string(s, 5)

TypeError: raw_string() missing 4 required positional arguments: 'thresh_path', 'no_thresh_path', 'motif_key', and 'path_to_raw'

In [202]:
for k in species_list:
    raw_string(k, 5)

In [66]:
def in_dir(path_to_extracted, path_to_raw, length):
    """
    path_to_extracted: string indicating the path to the directory with the preprocessed csv's
    path_to_raw: string indicating the path to the directory with the raw data
    length: length of the DNA sequence
    """
    no_thresh = ""
    for direc in os.listdir(path_to_extracted):
        if direc.startswith("sub_no"):
            no_thresh = os.path.join(path_to_extracted, direc)
    for direc in os.listdir(path_to_extracted):
        thresh_path = ""
        if direc.startswith("sub_threshold"):
            direc_motif = os.path.join(path_to_extracted, direc)
            #6_TFBS.../sub_threshold...
            for motif in os.listdir(direc_motif):
                motif_key = motif[:3]
                motif_length = getMotifLength(motif_key)
                print(motif)
                print(os.system("pwd"))
                print("motif: " + motif_key )
                if motif_length is not None:
                    motif_path = os.path.join(direc_motif, motif)
                    print("motif path: " + motif_path)
                    for thresh_file in os.listdir(motif_path):
                        print("thresh_file: " + thresh_file)
                        if not thresh_file.startswith("."):
                            thresh_path = os.path.join(motif_path, thresh_file)
                            no_thresh_path = no_thresh + "/" + motif + "/" + thresh_file + ".csv"
                            print("thresh path:" + thresh_path)
                            print("no thresh path:" + no_thresh_path)
                            if not os.path.isfile("/Volumes/Samsung_T5/output/"+str(thresh_file)+"_final_raw.fa.csv"):
                                raw_string(thresh_file, motif_length, thresh_path, no_thresh_path, motif_key, path_to_raw)
    

In [64]:
path_to_extracted = "/Volumes/Samsung_T5/6_TFBS_subset_30April2019"
path_to_raw = "/Volumes/Samsung_T5/3.24_species_only"

In [68]:
in_dir(path_to_extracted, path_to_raw, 5)

eve_new6
0
motif: eve
motif path: /Volumes/Samsung_T5/6_TFBS_subset_30April2019/sub_threshold_5_TFBS_scores_19Oct2018/eve_new6
thresh_file: eve_VT63706.fa
thresh path:/Volumes/Samsung_T5/6_TFBS_subset_30April2019/sub_threshold_5_TFBS_scores_19Oct2018/eve_new6/eve_VT63706.fa
no thresh path:/Volumes/Samsung_T5/6_TFBS_subset_30April2019/sub_no_threshold_Map_Motif_no_threshold_14Nov2018/eve_new6/eve_VT63706.fa.csv
thresh_file: eve_VT63732.fa
thresh path:/Volumes/Samsung_T5/6_TFBS_subset_30April2019/sub_threshold_5_TFBS_scores_19Oct2018/eve_new6/eve_VT63732.fa
no thresh path:/Volumes/Samsung_T5/6_TFBS_subset_30April2019/sub_no_threshold_Map_Motif_no_threshold_14Nov2018/eve_new6/eve_VT63732.fa.csv
thresh_file: eve_VT63747.fa
thresh path:/Volumes/Samsung_T5/6_TFBS_subset_30April2019/sub_threshold_5_TFBS_scores_19Oct2018/eve_new6/eve_VT63747.fa
no thresh path:/Volumes/Samsung_T5/6_TFBS_subset_30April2019/sub_no_threshold_Map_Motif_no_threshold_14Nov2018/eve_new6/eve_VT63747.fa.csv
thresh_file:

In [25]:
k = "cad_VT63706.fa"
k[6:11]

'63706'

In [26]:
k[1:]

'ad_VT63706.fa'