In [1]:
import os
import re
import collections
import numpy as py
import pandas as pd
import itertools

from ic_functions_py3 import*

In [2]:
# Build the lookup tables that tie every known gene name to its FlyBase symbol
geneID, NametoIDs, NametoFBid = geneIDdictionary(
    'Datasets/fbgn_annotation_ID_fb_2016_05.tsv'
)

# Load position frequency matrices: promoter motifs first, then transcription factors.
# Both calls return a second value bound to checkFBgn; the TF call's value is the one kept.
pPFM, checkFBgn = readFASTA(
    "Datasets/promoter_PFMs.fasta", 'PWM', 'v', geneID, 0, 0, 0, 0
)
PFM, checkFBgn = readFASTA(
    "Datasets/Modified/PFMs.fasta", 'PWM', 'v', geneID, 0, 0, 0, 0
)

# Load the BDGP in-situ annotations
Stage = [
    '1_3',
    '4_6',
    '7_8',
    '9_10',
    '11_12',
    '13_16',
]
insitu = pd.read_csv(
    'Datasets/insitu_annot.csv',
    names=['gene_symb', 'CG', 'FBgn', 'stage', 'staining'],
)
# Keep only the rows whose annotation is something other than 'no staining'
insitu_ns = insitu.loc[insitu['staining'].ne('no staining')]
insitu.head()

Unnamed: 0,gene_symb,CG,FBgn,stage,staining
0,a10,CG6642,FBgn0011293,1,no staining
1,a10,CG6642,FBgn0011293,2,no staining
2,a10,CG6642,FBgn0011293,3,no staining
3,a10,CG6642,FBgn0011293,4,no staining
4,a10,CG6642,FBgn0011293,5,no staining


In [3]:
# Read in all datasets
# Cross-referencing dictionary for promoters
crossref = pd.read_csv('Datasets/EPD/cross_references.txt',header=0,sep='\t')

# Promoters (-50 to +50)
# The pattern is a raw string so that \d, \s, \w, ... reach `re` verbatim instead of
# being treated as (invalid) string escape sequences by the Python parser.
raw_promoters = readFASTA('Datasets/EPD/dm6promoters_minus50plus50.fa','sequences',0,geneID,re.compile(r'>FP\d+\s([\w\-\(\)\.]+_\d)'))

# Re-key every promoter as '<geneID[...][0]><suffix>', trying progressively looser
# matches of the EPD name against geneID:
#   1. exact name, 2. lower-cased, 3. title-cased, 4. via the cross-reference table.
promoters = {}
title = []    # title-cased names that matched in step 3
neither = []  # original keys that could not be resolved by any step
for k,v in raw_promoters.items():
    # The last two characters are carried over as the suffix of the new key
    # (presumably '_<digit>', matching the regex capture -- confirm against readFASTA).
    name = k[:-2]
    number = k[-2:]
    lower_name = name.lower()
    title_name = name.title()
    if name in geneID:
        promoters[geneID[name][0]+number] = v
    elif lower_name in geneID:
        promoters[geneID[lower_name][0]+number] = v
    elif title_name in geneID:
        promoters[geneID[title_name][0]+number] = v
        title.append(title_name)
    else:
        # Look the name up once in the cross-reference table; the first hit wins.
        xref_ids = crossref.loc[crossref['Associated Gene Name'] == name, 'Ensembl Gene ID']
        if not xref_ids.empty:
            FBgn = xref_ids.values[0]
            if FBgn in geneID:
                promoters[geneID[FBgn][0]+number] = v
            else:
                print('%s is in the cross references but not in geneIDs' %FBgn)
        else:
            neither.append(k)
            # note: these genes seem to be withdrawn, in progress, or not found in FlyBase
# Note: Both FBgn0283709 and FBgn0267825 are unannotated
# Same mapping as `promoters`, ordered by key.
od_promoters = collections.OrderedDict(sorted(promoters.items()))

FBgn0283709 is in the cross references but not in geneIDs
FBgn0267825 is in the cross references but not in geneIDs


In [4]:
# Load the Vienna Tile enhancer table (Supplementary Table 4) and preview the first rows
VT = pd.read_csv(
    'Datasets/Stark_ViennaTile/nature13395-s2/2014-01-00083C-Supplementary Table 4.csv'
)
VT.head()

Unnamed: 0,VTID,Chromosome,Start,End,FlyBase ID,Symbol,Chromosome.1,Start.1,End.1,Orientation,Match
0,VT0006,chr2L,16836,18924,FBgn0002121,l(2)gl,chr2L,9839,21376,-,2
1,VT0025,chr2L,54047,56104,FBgn0051973,Cda5,chr2L,25402,65404,-,2
2,VT0131,chr2L,285217,287380,FBgn0031245,CG3625,chr2L,283385,291795,-,2
3,VT0132,chr2L,288875,290893,FBgn0025686,Amnionless,chr2L,287252,289144,-,2
4,VT0132,chr2L,288875,290893,FBgn0031245,CG3625,chr2L,283385,291795,-,1


In [5]:
# Load the CAD/RedFly "known" enhancer table and preview the first rows
Known = pd.read_excel(
    'Datasets/Furlong_4C/nature13417/nature13417-s3.xlsx'
)
Known.head()

Unnamed: 0,Source,Name,Chr,Start,Stop,assigned gene,FBGn
0,REDFly,ci_7.1,4,76479,83352,ci,FBgn0004859
1,CAD,ey_UE2.0,4,720530,722315,ey,FBgn0005558
2,CAD,ey_UE0.9,4,724596,725290,ey,FBgn0005558
3,CAD,ey_DO2,4,730630,730841,ey,FBgn0005558
4,REDFly,sphinx_1067bp_5'_fragment,4,994776,995842,sphinx,FBgn0083990


In [6]:
# Load the Furlong 4C enhancer table and preview the first rows
Furlong4C = pd.read_excel(
    'Datasets/Furlong_4C/nature13417/nature13417-s2.xlsx'
)
Furlong4C.head()

Unnamed: 0,view,cond,pvalue,chr,start,end,midpoint,mid_dist,max_dist,short_dist,zscore,feature_id,feature
0,CRM_1088,MESO_3-4h,0.001,chr2R,15164485,15165376,15164930,-135286,-135286,-134840,4.194707,,intragenic
1,CRM_1088,MESO_3-4h,0.001,chr2R,15182513,15183814,15183164,-117052,-117052,-116402,4.234489,,intragenic
2,CRM_1088,MESO_3-4h,0.001,chr2R,15191016,15191709,15191362,-108854,-108854,-108507,3.489552,,intragenic
3,CRM_1088,MESO_3-4h,0.001,chr2R,15197589,15198942,15198266,-101950,-101950,-101274,3.378263,FBgn0034433,promoter
4,CRM_1088,MESO_3-4h,0.001,chr2R,15206172,15206872,15206522,-93694,-93694,-93344,3.572291,,intragenic


In [9]:
# Find core promoter motifs
# Take only the files directly inside 'matrix'; if the directory is missing the
# default tuple yields an empty file list. Indexing the result avoids binding `_`,
# which IPython uses for the last cell output.
filenames = next(os.walk('matrix'),(None, None, []))[2]
# Motif name = file name without its '.txt' suffix. Matching on the suffix (rather than
# on '.txt' appearing anywhere in the name) keeps files such as 'x.txt.bak' out.
pmotif_list = [x[:-4] for x in filenames if x.endswith('.txt')]

# NOTE(review): cutoffs are paired with motifs purely by position, and os.walk returns
# files in filesystem order, which is not guaranteed to be stable across machines.
# Confirm the intended motif -> cutoff pairing and consider spelling it out as a dict.
ES_cutoff=dict(zip(pmotif_list,[0.5,0.05,0.01,0.01,0.01,0.01,0.1,0.01,0.01,0.01,0.01]))
ppatser_output = ElemeNT_scoring(promoters,'promoters',pmotif_list,pPFM,ES_cutoff,os.getcwd())
ppatser_output.head(5)

Unnamed: 0,sequence,motif,position,score
0,ECSIT_1,BREd,71,0.6065
1,Fhos_4,BREd,32,0.9317
2,CG33552_1,BREd,58,0.5349
3,CG33552_1,BREd,61,0.7859
4,mrt_2,BREd,63,0.5518
