In [168]:
import numpy as np
import pandas as pd
from Bio import SeqIO
import os
import re
from Bio.Seq import Seq
from Bio import Restriction
import random


In [23]:
#%% load mouse 3' utrs from biomart - GRCm38.p6

input_file = './Datasets/mouse_3utrs.txt'

#fasta_sequences = SeqIO.parse(open(input_file),'fasta')

utrsdict={rec.id : str(rec.seq) for rec in SeqIO.parse(input_file, "fasta")}
#for fasta in fasta_sequences:
#    name, sequence = fasta.id, str(fasta.seq)
#    utrsdict[name]=sequence

utrsdictdf=pd.DataFrame.from_dict(data=utrsdict, orient='index').iloc[:,0]

genenames={rec.id : rec.id.split('|')[2] for rec in SeqIO.parse(input_file, "fasta")}
#for key in utrsdict.keys():
#    genenames[key]=key.split('|')[2]

genenamesdf=pd.DataFrame.from_dict(data=genenames, orient='index').iloc[:,0]



In [26]:
len(genenamesdf.unique())

55486

In [45]:
genesutrs={}
for vals in genenamesdf.unique():
    genesutrs[vals]=list(genenamesdf[genenamesdf==vals].index)

genesutrsdf=pd.DataFrame.from_dict(data=genesutrs, orient='index').iloc[:,0]



In [46]:
genesutrsdf.to_pickle('./dataframes/genesutrsdf.pkl')

In [47]:
genesutrsdf

Wnt9a         ENSMUSG00000000126|ENSMUST00000000128|Wnt9a|11...
Cntn1         ENSMUSG00000055022|ENSMUST00000000109|Cntn1|15...
Ngfr          ENSMUSG00000000120|ENSMUST00000000122|Ngfr|11|...
Scmh1         ENSMUSG00000000085|ENSMUST00000000087|Scmh1|4|...
Wnt3          ENSMUSG00000000125|ENSMUST00000000127|Wnt3|11|...
                                    ...                        
AL772407.1    ENSMUSG00000118517|ENSMUST00000239054|AL772407...
AC087801.2    ENSMUSG00000118583|ENSMUST00000239183|AC087801...
AC141882.1    ENSMUSG00000118503|ENSMUST00000239096|AC141882...
AC122861.1    ENSMUSG00000118610|ENSMUST00000239201|AC122861...
AC124742.1    ENSMUSG00000118552|ENSMUST00000238938|AC124742...
Name: 0, Length: 55486, dtype: object

In [14]:
utrsdictdf=pd.Series({rec.id.split('|')[2] : str(rec.seq) for rec in SeqIO.parse("./Datasets/mouse_3utrs.txt", "fasta")})

In [48]:
#%% functions

def get_wt_utrs(genelist):
    genelistdf=pd.DataFrame()
    for gene in genelist:
        try:
            genedf=utrsdictdf.loc[genesutrs[gene]]
            
            genedf.drop(genedf[genedf=='Sequenceunavailable'].index, inplace=True)
            genedf=genedf.to_frame()
            genedf.loc[:,'gene_name']=gene
            genedf.columns=['sequence','gene_name']
            
            for i in genedf.index:
                for j in genedf.index:
                    if (j!=i) & bool(genedf.loc[j,'sequence'].find(genedf.loc[i,'sequence'])>-1):
                        genedf.drop(i, inplace=True)
                        break
            genelistdf=genelistdf.append(genedf)
        except:
            pass
    genelistdf['seqlength']=genelistdf.sequence.apply(lambda x: len(x))
    genelistdf['paposition']=genelistdf.sequence.apply(lambda seq: len(seq)-50+re.search('A[AT]TAAA', seq[-50:]).span()[0] if not
                              re.search('A[AT]TAAA', seq[-50:])==None else -1)
    genelistdf['sequence_minuspa']=genelistdf.apply(lambda x: x.loc['sequence']
               [:x.loc['paposition']] if x.loc['paposition']>0 else x.loc['sequence'], axis=1)
    return genelistdf
            
def complists(list1, list2):
    listout=[]
    for i in list1:
        if i in list2:
            listout+=[i]
    print(len(listout))
    return listout

def make_scanning(sq, increment):
    varseqs=[]
    remaining=len(sq)
    startpos=0
    while remaining>=150:
        varseqs.append(sq[startpos:startpos+150])
        startpos+=increment
        remaining-=increment
    return varseqs

In [30]:
if os.path.isdir('./dataframes/design')==False:
    os.mkdir('./dataframes/design')

posgenes=['Actb','Arc','Camk2a','Ranbp1','Bdnf','Map2','Shank1','Dlg4','Map1a','Cplx2',
          'Calm1','Grin1','Nrgn','Ranbp1']

posgenesdf=get_wt_utrs(posgenes) # 17

In [32]:
posgenesdf.shape

(16, 5)

In [150]:
#%% pick contexts

'''
Middleton et al., 2019:
    deDent - significantly enriched in neurites
    consDent - found in 90% of the cells in the neurite (and also in the soma potentially)
    isoDent - genes with differential localization of isoforms
    "We found 1863 RNAs in at least 90% of the dendrite samples, which included 
    well-characterized localized RNAs such as Actb, Bdnf, Calm1, Dlg4, Grin1, 
    and Map2. "

    
Zappulo et al., 2017:
    "We identified 1,292 transcripts enriched in neurites by at least 2-fold when 
    compared with the soma (P-values < 0.05)."
    positive controls: "We found transcripts known to be preferentially localized, 
    such as syntaxin-3 (Stx3)22, glutamate receptor-1 (Gria1)5, calcium channel 
    Ryr2 5, inositol 1,4,5-trisphosphate receptor type 1 (Itpr1)5, neuregulins 
    (Nrg1 23 and Nrg2 24), voltage-dependent L-type calcium channel subunit α-1D 
    (Cacna1d)5, ephrin type-A receptor 2 (Epha2)25, unconventional myosin-Ic 
    (Myo1c)26, low-density lipoprotein receptor adapter protein 1 (Ldlrap1)27, 
    vang-like protein (Vangl)28, and transcripts encoding mitochondrial proteins6,
    10, consistent with previous works (see also Supplementary Data 5). "
    
Taliaferro et al., 2016:
    "Several genes with known projection-enriched RNA localization patterns, 
    including β-actin (Actb), neurogranin (Nrgn), and Ranbp1, were identified 
    as neurite-enriched in both CAD and N2A cells"
    isoform spec: "Previously, differential localization of alternative mRNA isoforms 
    has been observed in a few cases (An et al., 2008, Buckley et al., 2011, Harrison 
    et al., 2014, Whittaker et al., 1999)."
    "Using more stringent criteria, we identified a confident set of 195 localized ALEs 
    and 96 localized tandem UTRs (Figure S1O). "
    "We identified 421 distal ALE isoforms that were preferentially localized to 
    neurites, with criteria including ΔΨ ≥ 0.1 in both CAD and N2A cells (Table S2)."

Ciolli Mattioli et al., 2019:
    alternative utrs
    TableS2.xlsx

Additional established neurite-localizing transcripts:
    Arc
    Camk2a
    Ranbp1
    Bdnf
    Map2
    Shank1
    Dlg4
    Map1a
    Cplx2
    
'''

if os.path.isdir('./dataframes/design')==False:
    os.mkdir('./dataframes/design')

posgenes=['Actb','Arc','Camk2a','Ranbp1','Bdnf','Map2','Shank1','Dlg4','Map1a','Cplx2',
          'Calm1','Grin1','Nrgn','Ranbp1']

posgenesdf=get_wt_utrs(posgenes) # 17

posgenesdf.to_pickle('./dataframes/design/posgenesdf.pkl')


# 40 top

middleton_comp=pd.read_excel('./Datasets/MiddletonKimBMCBiol2019/12915_2019_630_MOESM6_ESM.xlsx')

top40=middleton_comp.GeneName.head(n=40).values

top40df=get_wt_utrs(top40) # 52

top40df.to_pickle('./dataframes/design/top40df.pkl')


# Middleton

deDent=pd.read_excel('./Datasets/MiddletonKimBMCBiol2019/12915_2019_630_MOESM6_ESM.xlsx', 
                     sheet_name='TableS5-deDend genes')

deDentlist=deDent.iloc[:,0].values

deDend_df=get_wt_utrs(deDentlist) # 347

deDend_df.to_pickle('./dataframes/design/deDend_df.pkl')


consDent=pd.read_excel('./Datasets/MiddletonKimBMCBiol2019/12915_2019_630_MOESM6_ESM.xlsx', 
                     sheet_name='TableS6-consDend genes')

consDentlist=consDent.iloc[:,0].values

consDend_df=get_wt_utrs(consDentlist) # 2790

consDend_df.to_pickle('./dataframes/design/consDend_df.pkl')


isoDent=pd.read_excel('./Datasets/MiddletonKimBMCBiol2019/12915_2019_630_MOESM6_ESM.xlsx', 
                     sheet_name='TableS7-isoDend genes')

isoDentlist=isoDent.GeneName.values

isoDend_df=get_wt_utrs(isoDentlist) # 533

isoDend_df.to_pickle('./dataframes/design/isoDend_df.pkl')


#Ciolli Matiolli Isoforms

ciolli=pd.read_excel('./Datasets/CiolliMattioliChekulaevaNAR2018_gky1270_supplemental_files/TableS2.xlsx', header=2)

ciolli['gene_name']=ciolli.Names.apply(lambda x: x.split('|')[1])

ciolligenes=ciolli.gene_name.unique()

asoverlap=complists(ciolligenes, isoDentlist)

asoverlap_df=get_wt_utrs(asoverlap) # 33


# Zappulo

zappulo3=pd.read_excel('./Datasets/ZappuloChekulaevaNatCommun2017/SuppData2.xlsx', header=0)
zappulo6=pd.read_excel('./Datasets/ZappuloChekulaevaNatCommun2017/41467_2017_690_MOESM6_ESM.xlsx', header=1)

zappulo_dend=zappulo3[(zappulo3['rzRNA_log2FC_Neurite_Soma']>2)&(zappulo3.rzRNA_padj_Neurite_Soma<0.01)].gene_name.unique()

zappulo_dend_df=get_wt_utrs(zappulo_dend) # 127

zappulo_dend_df.to_pickle('./dataframes/design/zappulo_dend_df.pkl')


### Negative:

middleton_background=pd.read_table('./Datasets/MiddletonKimBMCBiol2019/bg_pool.no_loc.txt')

middleton_background_genes=middleton_background[middleton_background.Stat<-5].Gene.values

middleton_background_df=get_wt_utrs(middleton_background_genes) # 340

middleton_background_df.to_pickle('./dataframes/design/middleton_background_df.pkl')


zappulo_soma=zappulo3[(zappulo3['rzRNA_log2FC_Neurite_Soma']<-2)&(zappulo3.rzRNA_padj_Neurite_Soma<0.01)].gene_name.unique()

zappulo_soma_df=get_wt_utrs(zappulo_soma) # 190

zappulo_soma_df.to_pickle('./dataframes/design/zappulo_soma_df.pkl')


sharedbg=complists(middleton_background_genes,zappulo_soma)

# ['Pdp1', 'Rragb', 'St6gal1', 'Kcna6']

BurgeCAD=list(zappulo6[zappulo6['Burge_CAD_Neurites.Soma']<-1].index)
BurgeN2A=list(zappulo6[zappulo6['Burge_N2A_Neurites.Soma']<-1].index)
BurgeCord=list(zappulo6[zappulo6['Burge_Cort_Neurites.Soma']<-1].index)

shared_cadn2a=complists(zappulo6.loc[BurgeCAD, 'gene_name'].values, zappulo6.loc[BurgeN2A, 'gene_name'].values)
shared_cord=complists(zappulo6.loc[BurgeCord, 'gene_name'].values, shared_cadn2a)

# ['Cntn2', 'Disp2', 'Gpr17', 'Magi2', 'Ogt', 'Pgap1']

backgroundgenesdf=get_wt_utrs(sharedbg+shared_cord)


# selection criteria: UTR > 250 nt, only one isoform, function in the nucleus or cell body

backgroundgenes=['Rragb','St6gal1','Gpr17','Ogt','Pgap1']

bggenes_df=get_wt_utrs(backgroundgenes)





17
4
127
6


In [151]:
print('overlap backgroundgenes, deDentlist')
complists(backgroundgenes, deDentlist)
print('overlap backgroundgenes, consDentlist')
complists(backgroundgenes, consDentlist)
print('overlap backgroundgenes, isoDentlist')
complists(backgroundgenes, isoDentlist)
print('overlap backgroundgenes, zappulo_dend')
complists(backgroundgenes, zappulo_dend)
# all 0


print('overlap asoverlap, deDentlist')
complists(asoverlap, deDentlist)
print('overlap asoverlap, consDentlist')
complists(asoverlap, consDentlist)
print('overlap asoverlap, isoDentlist')
complists(asoverlap, isoDentlist) # 8
print('overlap asoverlap, zappulo_dend')
complists(asoverlap, zappulo_dend)

print('overlap posgenes, deDentlist')
complists(posgenes, deDentlist)
print('overlap posgenes, consDentlist')
complists(posgenes, consDentlist)
print('overlap posgenes, isoDentlist')
complists(posgenes, isoDentlist)
print('overlap posgenes, zappulo_dend')
complists(posgenes, zappulo_dend)


print('overlap top40, deDentlist')
complists(top40, deDentlist)
print('overlap top40, consDentlist')
complists(top40, consDentlist)
print('overlap top40, isoDentlist')
complists(top40, isoDentlist)
print('overlap top40, zappulo_dend')
complists(top40, zappulo_dend)

overlap backgroundgenes, deDentlist
0
overlap backgroundgenes, consDentlist
0
overlap backgroundgenes, isoDentlist
0
overlap backgroundgenes, zappulo_dend
0
overlap asoverlap, deDentlist
0
overlap asoverlap, consDentlist
8
overlap asoverlap, isoDentlist
17
overlap asoverlap, zappulo_dend
0
overlap posgenes, deDentlist
4
overlap posgenes, consDentlist
9
overlap posgenes, isoDentlist
3
overlap posgenes, zappulo_dend
0
overlap top40, deDentlist
12
overlap top40, consDentlist
37
overlap top40, isoDentlist
4
overlap top40, zappulo_dend
0


[]

In [201]:
#%% scanning of wild type

### posgenes

vars_scan50_posgenes=pd.DataFrame()

for idx in posgenesdf.index:
    seqs=make_scanning(posgenesdf.sequence_minuspa[idx], 50)
    genedf = pd.Series(seqs, name='varseq').to_frame()
    genedf['gene_name']=posgenesdf.gene_name[idx]
    genedf['transcript']=idx
    genedf['subset']='wt scanning 50'
    genedf['positioninutr']=pd.Series(np.arange(0,50*len(seqs),50))
    vars_scan50_posgenes=vars_scan50_posgenes.append(genedf, ignore_index=True)
    
vars_scan50_posgenes.dropna(inplace=True, subset=['varseq'])
vars_scan50_posgenes.drop_duplicates(inplace=True, subset=['varseq'])

vars_scan50_posgenes.to_pickle('./dataframes/design/vars_scan50_posgenes.pkl')

# 715

#### top40

vars_scan50_top40=pd.DataFrame()

for idx in top40df.index:
    seqs=make_scanning(top40df.sequence_minuspa[idx], 50)
    genedf = pd.Series(seqs, name='varseq').to_frame()
    genedf['gene_name']=top40df.gene_name[idx]
    genedf['transcript']=idx
    genedf['subset']='wt scanning 50'
    genedf['positioninutr']=pd.Series(np.arange(0,50*len(seqs),50))
    vars_scan50_top40=vars_scan50_top40.append(genedf, ignore_index=True)
    
vars_scan50_top40.dropna(inplace=True, subset=['varseq'])
vars_scan50_top40.drop_duplicates(inplace=True, subset=['varseq'])

vars_scan50_top40.to_pickle('./dataframes/design/vars_scan50_top40.pkl')

# 1178


#### backgroundgenesdf

vars_scan50_backgroundgenes=pd.DataFrame()

for idx in backgroundgenesdf.index:
    seqs=make_scanning(backgroundgenesdf.sequence_minuspa[idx], 50)
    genedf = pd.Series(seqs, name='varseq').to_frame()
    genedf['gene_name']=backgroundgenesdf.gene_name[idx]
    genedf['transcript']=idx
    genedf['subset']='wt scanning 50'
    genedf['positioninutr']=pd.Series(np.arange(0,50*len(seqs),50))
    vars_scan50_backgroundgenes=vars_scan50_backgroundgenes.append(genedf, ignore_index=True)
    
vars_scan50_backgroundgenes.dropna(inplace=True, subset=['varseq'])
vars_scan50_backgroundgenes.drop_duplicates(inplace=True, subset=['varseq'])

vars_scan50_backgroundgenes.to_pickle('./dataframes/design/vars_scan50_backgroundgenes.pkl')

# 710

#### deDent

vars_scan50_deDend=pd.DataFrame()

for idx in deDend_df.index:
    seqs=make_scanning(deDend_df.sequence_minuspa[idx], 50)
    genedf = pd.Series(seqs, name='varseq').to_frame()
    genedf['gene_name']=deDend_df.gene_name[idx]
    genedf['transcript']=idx
    genedf['subset']='wt scanning 50'
    genedf['positioninutr']=pd.Series(np.arange(0,50*len(seqs),50))
    vars_scan50_deDend=vars_scan50_deDend.append(genedf, ignore_index=True)
    
vars_scan50_deDend.dropna(inplace=True, subset=['varseq'])
vars_scan50_deDend.drop_duplicates(inplace=True, subset=['varseq'])

vars_to_drop3=[]
for i in vars_scan50_deDend.index:
    seq=vars_scan50_deDend.varseq[i]
    if ('AATAAA' in seq) | ('ATTAAA' in seq):
        vars_to_drop3.append(i)
   
vars_scan50_deDend.drop(vars_to_drop3, inplace=True)     
vars_scan50_deDend.to_pickle('./dataframes/design/vars_scan50_deDend.pkl')


# 7948



#### zappulo_dend

vars_scan50_zappulo_dend=pd.DataFrame()

for idx in zappulo_dend_df.index:
    seqs=make_scanning(zappulo_dend_df.sequence_minuspa[idx], 50)
    genedf = pd.Series(seqs, name='varseq').to_frame()
    genedf['gene_name']=zappulo_dend_df.gene_name[idx]
    genedf['transcript']=idx
    genedf['subset']='wt scanning 50'
    genedf['positioninutr']=pd.Series(np.arange(0,50*len(seqs),50))
    vars_scan50_zappulo_dend=vars_scan50_zappulo_dend.append(genedf, ignore_index=True)
    
vars_scan50_zappulo_dend.dropna(inplace=True, subset=['varseq'])
vars_scan50_zappulo_dend.drop_duplicates(inplace=True, subset=['varseq'])

vars_to_drop4=[]
for i in vars_scan50_zappulo_dend.index:
    seq=vars_scan50_zappulo_dend.varseq[i]
    if ('AATAAA' in seq)|('ATTAAA' in seq):
        vars_to_drop4.append(i)
   
vars_scan50_zappulo_dend.drop(vars_to_drop4, inplace=True)     

vars_scan50_zappulo_dend.to_pickle('./dataframes/design/vars_scan50_zappulo_dend.pkl')


# 2000


#### asoverlap

vars_scan50_asoverlap=pd.DataFrame()

for idx in asoverlap_df.index:
    seqs=make_scanning(asoverlap_df.sequence_minuspa[idx], 50)
    genedf = pd.Series(seqs, name='varseq').to_frame()
    genedf['gene_name']=asoverlap_df.gene_name[idx]
    genedf['transcript']=idx
    genedf['subset']='wt scanning 50'
    genedf['positioninutr']=pd.Series(np.arange(0,50*len(seqs),50))
    vars_scan50_asoverlap=vars_scan50_asoverlap.append(genedf, ignore_index=True)
    
vars_scan50_asoverlap.dropna(inplace=True, subset=['varseq'])
vars_scan50_asoverlap.drop_duplicates(inplace=True, subset=['varseq'])

vars_scan50_asoverlap.to_pickle('./dataframes/design/vars_scan50_asoverlap.pkl')

# 1203


wt_scan = pd.concat([vars_scan50_posgenes, vars_scan50_posgenes[[x in ['Nrgn','Actb','Map2'] for x in vars_scan50_posgenes.gene_name]], 
                     vars_scan50_top40, vars_scan50_backgroundgenes, 
                    vars_scan50_deDend, vars_scan50_zappulo_dend, vars_scan50_asoverlap], 
                    ignore_index=True)

wt_scan['changes']='no changes'



  if __name__ == '__main__':


In [None]:
'''
vars_scan50_posgenes: 715
vars_scan50_top40: 1178
vars_scan50_backgroundgenes: 710
vars_scan50_deDend: 7948
vars_scan50_zappulo_dend: 2000
vars_scan50_asoverlap: 1203

'''

In [202]:

#%% RBP motifs

# from Middleton
# SRSF3-binding motif AUCAWCG   Therefore, one hypothesis could be that SRSF3 
# plays a role in the early steps of dendritic localization by promoting inclusion 
# of alternative 3′UTRs (theoretically containing DTEs) and by facilitating nuclear 
# export.
# de novo motifs: UUCGAU CCGCAA GUGGGU

dendkmers_middletonpub=['ATCAACG','ATCATCG','TTCGAT','CCGCAA','GTGGGT']

# ZBP?

### From Taliaferro

dendkmers_taliaferropub=['GCTGCT','CTGCTG','GCGCTG','CTGGAC','CCTGCT','TCTGGA','CCCCAA','CTGCCC','ACACTG','TTTTCA','TTTTTT','ATACAG']
somakmers_taliaferropub=['TAGGTC','TCTTCT','CTCTTT','TCTCTT','TCTCTC','AGGTAA']



### From Ati

middleton_dendkmers=pd.read_table('./Datasets/Middleton/deDent_VS_bk_results.txt')
middleton_somakmers=pd.read_table('./Datasets/Middleton/bk_VS_deDent_results.txt')

dendkmers_middleton = middleton_dendkmers.drop_duplicates('protein',keep='first').drop_duplicates('binding_motif',keep='first').binding_motif.values #28
somakmers_middleton = middleton_somakmers.drop_duplicates('protein',keep='first').drop_duplicates('binding_motif',keep='first').binding_motif.values #42

zappulo_dendkmers=pd.read_table('./Datasets/Zappulo/neurit_th1_VS_soma_thminus2.txt')
zappulo_somakmers=pd.read_table('./Datasets/Zappulo/soma_thminus2_VS_neurit_th1.txt')

dendkmers_zappulo = zappulo_dendkmers.drop_duplicates('protein',keep='first').drop_duplicates('binding_motif',keep='first').binding_motif.values #15
somakmers_zappulo = zappulo_somakmers.drop_duplicates('protein',keep='first').drop_duplicates('binding_motif',keep='first').binding_motif.values #45


complists(dendkmers_middleton, somakmers_middleton) # 0
dendkmers_middletonzappulo = complists(dendkmers_middleton, dendkmers_zappulo) # 4
somakmers_middletonzappulo = complists(somakmers_middleton, somakmers_zappulo) # 9
complists(dendkmers_zappulo, somakmers_zappulo) # 0

dendkmers_short = list(set(dendkmers_middletonpub + dendkmers_taliaferropub + [mtf.replace('U','T') for mtf in dendkmers_middletonzappulo]))
somakmers_short = list(set(somakmers_taliaferropub + [mtf.replace('U','T') for mtf in somakmers_middletonzappulo]))
# 21 and 15 = 36

dendkmers_long = list(set([mtf.replace('U','T') for mtf in dendkmers_middleton] + [mtf.replace('U','T') for mtf in dendkmers_zappulo] + dendkmers_middletonpub + dendkmers_middletonpub))
somakmers_long = list(set([mtf.replace('U','T') for mtf in somakmers_middleton] + [mtf.replace('U','T') for mtf in somakmers_zappulo] + somakmers_taliaferropub))
# 44 and 84 = 128

# Get RBPs
dendkmers_middletonzappulo_rbps = middleton_dendkmers[[x in dendkmers_middletonzappulo for x in middleton_dendkmers.binding_motif]]
'''
     protein binding_motif  ...  target.seq.count  bkg.seq.count
1      PCBP1      CCCUUCCC  ...               377            612
36    SAMD4A       GCUGGCC  ...               377            612
42   HNRNPH2       GGGAGGG  ...               377            612
43    LIN28A       GGGAGGG  ...               377            612
169  BRUNOL6       UGUGGGG  ...               377            612
'''
middleton_dendkmers[[x in dendkmers_middleton for x in middleton_dendkmers.binding_motif]]
'''
       protein binding_motif  ...  target.seq.count  bkg.seq.count
1        PCBP1      CCCUUCCC  ...               377            612
2        PCBP2       CCCUCCC  ...               377            612
7       HNRNPK       CCAGCCC  ...               377            612
9      HNRNPH2       GGGGGGG  ...               377            612
15      PABPC1       AAAAAAA  ...               377            612
16      PABPC4       AAAAAAA  ...               377            612
17       SART3       AAAAAAA  ...               377            612
19       PPRC1       GCGCGCC  ...               377            612
20        RBM4       GCGCGCC  ...               377            612
21       RBM8A       GCGCGCC  ...               377            612
24       PCBP3      CUAUCCCU  ...               377            612
28        RBM5       GAGGGAG  ...               377            612
29        RBM5       GAGGGAG  ...               377            612
30       ESRP2       UGGGGGG  ...               377            612
32       SRSF1       GGAGGGC  ...               377            612
33       SRSF1       GGAGGGC  ...               377            612
34       SRSF1       GGAGGGC  ...               377            612
35       SRSF1       GGAGGGC  ...               377            612
36      SAMD4A       GCUGGCC  ...               377            612
42     HNRNPH2       GGGAGGG  ...               377            612
43      LIN28A       GGGAGGG  ...               377            612
45      SRSF10       AGAGGGA  ...               377            612
55       SRSF2      AGGAGAGG  ...               377            612
56      HNRNPL       CCACACA  ...               377            612
58      PABPC1       AGAAAAA  ...               377            612
59      PABPC4       AGAAAAA  ...               377            612
60      PABPC5       AGAAAAA  ...               377            612
61       SART3       AGAAAAA  ...               377            612
62      SRSF10       AGAAAAA  ...               377            612
76     HNRNPH2       GGGAGGC  ...               377            612
77       SRSF9       GGGAGGC  ...               377            612
84         FUS       CGCGCGC  ...               377            612
87     IGF2BP2       AAAAACA  ...               377            612
88     IGF2BP3       AAAAACA  ...               377            612
89      PABPC3       AAAAACA  ...               377            612
90      PABPC4       AAAAACA  ...               377            612
96      ZC3H10       CCAGCGC  ...               377            612
112       FMR1       GGACGGG  ...               377            612
113       FXR2       GGACGGG  ...               377            612
114      SRSF1       GGACGGG  ...               377            612
136    HNRNPA1       GUAGGGG  ...               377            612
137  HNRNPA2B1       GUAGGGG  ...               377            612
138       SFPQ       GUGGUGG  ...               377            612
145     PABPN1       AAAAGAA  ...               377            612
165      CNOT4       GACAGAG  ...               377            612
169    BRUNOL6       UGUGGGG  ...               377            612
182      RBM38       GUGUGGG  ...               377            612
200     HNRPLL       ACACGCA  ...               377            612
'''

somakmers_middletonzappulo_rbps = middleton_somakmers[[x in somakmers_middletonzappulo for x in middleton_somakmers.binding_motif]]
'''
      protein binding_motif  ...  target.seq.count  bkg.seq.count
1      HNRNPC       AUUUUUA  ...               612            377
2    HNRNPCL1       AUUUUUA  ...               612            377
3         HuR       UUAAUUU  ...               612            377
36       TUT1       AAAUACU  ...               612            377
90       SFPQ       GUAAUGU  ...               612            377
92        QKI       ACUAAUA  ...               612            377
127     RBMS1       AAUAUAC  ...               612            377
128     RBMS3       AAUAUAC  ...               612            377
134     PCBP3      AUUUUCCU  ...               612            377
142    DAZAP1       UAGUUAA  ...               612            377
143      MSI1       UAGUUAA  ...               612            377
180     RBM28       AAGUAGA  ...               612            377
'''
middleton_somakmers[[x in somakmers_middleton for x in middleton_somakmers.binding_motif]]



0
4
9
0


Unnamed: 0,protein,binding_motif,Pval,Qval,target.hits,bkg.hits,target.seq.hits,bkg.seq.hits,target.seq.count,bkg.seq.count
1,HNRNPC,AUUUUUA,5.906104e-55,1.771831e-52,5042,2077,533,261,612,377
2,HNRNPCL1,AUUUUUA,5.906104e-55,1.771831e-52,5042,2077,533,261,612,377
3,HuR,UUAAUUU,6.661631e-54,1.985166e-51,4342,1745,523,240,612,377
4,ZCRB1,GAUUUAAU,4.601611e-51,1.366678e-48,7198,3182,575,303,612,377
6,KHDRBS1,UUAAAAA,2.151655e-39,6.347381e-37,4795,2077,532,270,612,377
...,...,...,...,...,...,...,...,...,...,...
262,HNRNPA1L2,UUAGGUA,4.101886e-07,1.633914e-05,1393,660,419,187,612,377
279,G3BP2,AGGAUAU,2.812262e-06,6.189752e-05,1558,758,452,204,612,377
281,YBX2,AACAACU,2.920139e-06,6.189752e-05,1702,835,454,226,612,377
282,SRSF1,AAGGACAU,3.035948e-06,6.189752e-05,5926,3141,583,315,612,377


In [203]:
###############

### Insert in backgrounds

forinsertion=posgenesdf.append(bggenes_df)

forinsertion.drop_duplicates(inplace=True, subset=['gene_name'])

vars_forinsertion=pd.DataFrame()

for idx in forinsertion[forinsertion.seqlength<3800].index:
    seqs=make_scanning(forinsertion.sequence_minuspa[idx], 150)
    genedf = pd.Series(seqs, name='varseq').to_frame()
    genedf['gene_name']=forinsertion.gene_name[idx]
    genedf['transcript']=idx
    genedf['subset']='wt scanning 150'
    genedf['changes']='no changes'
    genedf['positioninutr']=pd.Series(np.arange(0,150*len(seqs),150))
    vars_forinsertion=vars_forinsertion.append(genedf, ignore_index=True)
    
vars_forinsertion.dropna(inplace=True, subset=['varseq'])
vars_forinsertion.drop_duplicates(inplace=True, subset=['varseq'])



vars_singlemotif=pd.DataFrame()

for i in vars_forinsertion.index:
    for mtf in dendkmers_short+somakmers_short:
        toadd = vars_forinsertion.loc[i]
        sq = toadd['varseq']
        toadd['varseq'] = sq[:50] + mtf + sq[50+len(mtf):100] + mtf + sq[100+len(mtf):]
        toadd['changes'] = mtf + ' 2 times at position 50 and 100'
        vars_singlemotif=vars_singlemotif.append(toadd, ignore_index=True)

vars_singlemotif['subset']='motif inserted scanning 150'

vars_forinsertion.append(vars_singlemotif, ignore_index=True).to_pickle('./dataframes/design/vars_singlemotif.pkl')           

# 6993

vars_camk2aforinsertion=vars_forinsertion[vars_forinsertion.gene_name=='Camk2a']
# 22
vars_camk2aforinsertion['subset']='motifs in Camk2a'


vars_combmotif=pd.DataFrame()

for i in vars_camk2aforinsertion.index:
    toadd = vars_camk2aforinsertion.loc[i]
    toadd['changes']='no changes'
    vars_combmotif=vars_combmotif.append(toadd, ignore_index=True)

    for mtf in dendkmers_short+somakmers_short:
        toadd = vars_camk2aforinsertion.loc[i]
        sq = toadd['varseq']
        newsq = sq[:30] + mtf + sq[30+len(mtf):]
        toadd['varseq'] = newsq
        toadd['changes'] = mtf + ' 1 time at position 30'
        vars_combmotif=vars_combmotif.append(toadd, ignore_index=True)

        newsq2 = newsq[:90] + mtf + newsq[90+len(mtf):]
        toadd['varseq'] = newsq2
        toadd['changes'] = mtf + ' 2 times at position 30,90'
        vars_combmotif=vars_combmotif.append(toadd, ignore_index=True)

        newsq3 = newsq2[:60] + mtf + newsq2[60+len(mtf):]
        toadd['varseq'] = newsq3
        toadd['changes'] = mtf + ' 3 times at position 30,60,90'
        vars_combmotif=vars_combmotif.append(toadd, ignore_index=True)

        newsq4 = newsq3[:120] + mtf + newsq3[120+len(mtf):]
        toadd['varseq'] = newsq4
        toadd['changes'] = mtf + ' 4 times at position 30,60,90,120'
        vars_combmotif=vars_combmotif.append(toadd, ignore_index=True)

# 3190
vars_combmotif['subset'] = 'motifs inserted, short, Camk2a, up to 4 times'

vars_combmotif.to_pickle('./dataframes/design/vars_combmotif.pkl')           

###

vars_combmotif2=pd.DataFrame()

for i in vars_camk2aforinsertion.index:
    toadd = vars_camk2aforinsertion.loc[i]
    toadd['changes']='no changes'
    vars_combmotif2=vars_combmotif2.append(toadd, ignore_index=True)

    for mtf in dendkmers_long:
        toadd = vars_camk2aforinsertion.loc[i]
        sq = toadd['varseq']
        newsq = sq[:50] + mtf + sq[50+len(mtf):]
        toadd['varseq'] = newsq
        toadd['changes'] = mtf + ' at position 50'
        vars_combmotif2=vars_combmotif2.append(toadd, ignore_index=True)

        newsq2 = newsq[:100] + mtf + newsq[100+len(mtf):]
        toadd['varseq'] = newsq2
        toadd['changes'] = mtf + ' 2 times at position 50,100'
        vars_combmotif2=vars_combmotif2.append(toadd, ignore_index=True)

# 1958
vars_combmotif2['subset'] = 'motifs inserted, all, Camk2a'

vars_combmotif2.to_pickle('./dataframes/design/vars_combmotif2.pkl')           


##
vars_combmotif2sec=pd.DataFrame()

for i in vars_camk2aforinsertion.index:
    toadd = vars_camk2aforinsertion.loc[i]
    toadd['changes']='no changes'
    vars_combmotif2sec=vars_combmotif2sec.append(toadd, ignore_index=True)

    for mtf in dendkmers_long:
        toadd = vars_camk2aforinsertion.loc[i]
        sq = toadd['varseq']
        newsq = sq[:50] + mtf + sq[50+len(mtf):50+len(mtf)+3] + str(Seq(sq[38:47]).reverse_complement()) + sq[50+len(mtf) + 3 + 9:]
        toadd['varseq'] = newsq
        toadd['changes'] = mtf + ' at position 50 in 9 nt hairpin'
        vars_combmotif2sec=vars_combmotif2sec.append(toadd, ignore_index=True)

        newsq2 = newsq[:100] + mtf + newsq[100+len(mtf):100+len(mtf)+3] + str(Seq(newsq[88:97]).reverse_complement()) + newsq[100+len(mtf) + 3 + 9:]
        toadd['varseq'] = newsq2
        toadd['changes'] = mtf + ' 2 times at position 50,100 in 9 nt hairpin'
        vars_combmotif2sec=vars_combmotif2sec.append(toadd, ignore_index=True)

# 1958
vars_combmotif2sec['subset'] = 'motifs inserted with hairpin'
vars_combmotif2sec.to_pickle('./dataframes/design/vars_combmotif2sec.pkl')           


# de novo motifs (Ati)

middleton_neurite_denovo=pd.read_excel('./Datasets/middleton_denovo/dedent_denovo.xlsx')
middleton_soma_denovo=pd.read_excel('./Datasets/middleton_denovo/bk_denovo.xlsx')

zappulo_neurite_denovo=pd.read_excel('./Datasets/zappulo_denovo/neurite_denovo.xlsx')
zappulo_soma_denovo=pd.read_excel('./Datasets/zappulo_denovo/soma_denovo.xlsx')

denovo_motifs=list(middleton_neurite_denovo.Best_possible_Match.values) + \
    list(middleton_soma_denovo.Best_possible_Match.values) + \
        list(zappulo_neurite_denovo.Best_possible_Match.values) + \
            list(zappulo_soma_denovo.Best_possible_Match.values)


vars_denovomotifs=pd.DataFrame()

for i in vars_forinsertion[[x in ['Camk2a','Arc','Actb']+backgroundgenes for x in vars_forinsertion.gene_name]].index:
    for mtf in denovo_motifs:
        toadd = vars_forinsertion.loc[i]
        sq = toadd['varseq']
        toadd['varseq'] = sq[:30] + mtf + sq[30+len(mtf):]
        toadd['changes'] = mtf + ' at position 30'
        vars_denovomotifs=vars_denovomotifs.append(toadd, ignore_index=True)

# 4554
vars_denovomotifs['subset']='de novo motifs'       
vars_denovomotifs.to_pickle('./dataframes/design/vars_denovomotifs.pkl')           


motifs_comb=pd.concat([vars_forinsertion, vars_singlemotif, vars_combmotif, 
                       vars_combmotif2, vars_combmotif2sec, vars_denovomotifs], ignore_index=True)

# 18653

#%%

### Mutation of endriched motifs present in variants


def scanandreplace(sq, mtf):
    sqnew=sq
    if sq.find(mtf)>=0:
        for startpos in [m.start() for m in re.finditer(mtf, sq)]:
            newmtf=''.join([random.choice(['A','C','T','G']) for i in np.arange(len(mtf))])
            sqnew=sqnew[:startpos]+newmtf+sqnew[startpos + len(mtf):]
    return sqnew

vars_motifmut_posgenes = pd.DataFrame()
            
for i in vars_scan50_posgenes.index:
    for j in dendkmers_long:
        mutseq=scanandreplace(vars_scan50_posgenes.varseq[i], j)
        if mutseq!=vars_scan50_posgenes.varseq[i]:
            toadd = vars_scan50_posgenes.loc[i]
            toadd.loc['varseq']=mutseq
            toadd.loc['changes']=j + ' replaced by random'
            
            vars_motifmut_posgenes=vars_motifmut_posgenes.append(toadd, ignore_index=True)
            
# 681
            
vars_motifmut_posgenes.to_pickle('./dataframes/design/vars_motifmut_posgenes.pkl')           


vars_motifmut_top40 = pd.DataFrame()
            
for i in vars_scan50_top40.index:
    for j in dendkmers_long:
        mutseq=scanandreplace(vars_scan50_top40.varseq[i], j)
        if mutseq!=vars_scan50_top40.varseq[i]:
            toadd = vars_scan50_top40.loc[i]
            toadd.loc['varseq']=mutseq
            toadd.loc['changes']=j + ' replaced by random'
            
            vars_motifmut_top40=vars_motifmut_top40.append(toadd, ignore_index=True)
            
# 1010 
            
vars_motifmut_top40.to_pickle('./dataframes/design/vars_motifmut_top40.pkl')           



vars_motifmut_deDend = pd.DataFrame()
            
for i in vars_scan50_deDend.index:
    for j in dendkmers_long:
        mutseq=scanandreplace(vars_scan50_deDend.varseq[i], j)
        if mutseq!=vars_scan50_deDend.varseq[i]:
            toadd = vars_scan50_deDend.loc[i]
            toadd.loc['varseq']=mutseq
            toadd.loc['changes']=j + ' replaced by random'
            
            vars_motifmut_deDend=vars_motifmut_deDend.append(toadd, ignore_index=True)
            
# 6532 
            
vars_motifmut_deDend.to_pickle('./dataframes/design/vars_motifmut_deDend.pkl')           


vars_motifmut_zappulo_dend = pd.DataFrame()
            
for i in vars_scan50_zappulo_dend.index:
    for j in dendkmers_long:
        mutseq=scanandreplace(vars_scan50_zappulo_dend.varseq[i], j)
        if mutseq!=vars_scan50_zappulo_dend.varseq[i]:
            toadd = vars_scan50_zappulo_dend.loc[i]
            toadd.loc['varseq']=mutseq
            toadd.loc['changes']=j + ' replaced by random'
            
            vars_motifmut_zappulo_dend=vars_motifmut_zappulo_dend.append(toadd, ignore_index=True)
            
# 212 
            
vars_motifmut_zappulo_dend.to_pickle('./dataframes/design/vars_motifmut_zappulo_dend.pkl')           




vars_motifmut_asoverlap = pd.DataFrame()
            
for i in vars_scan50_asoverlap.index:
    for j in dendkmers_long:
        mutseq=scanandreplace(vars_scan50_asoverlap.varseq[i], j)
        if mutseq!=vars_scan50_asoverlap.varseq[i]:
            toadd = vars_scan50_asoverlap.loc[i]
            toadd.loc['varseq']=mutseq
            toadd.loc['changes']=j + ' replaced by random'
            
            vars_motifmut_asoverlap=vars_motifmut_asoverlap.append(toadd, ignore_index=True)
            
# 724 
            
vars_motifmut_asoverlap.to_pickle('./dataframes/design/vars_motifmut_asoverlap.pkl')           



vars_motifmut_asoverlapsoma = pd.DataFrame()
            
for i in vars_scan50_asoverlap.index:
    for j in somakmers_long:
        mutseq=scanandreplace(vars_scan50_asoverlap.varseq[i], j)
        if mutseq!=vars_scan50_asoverlap.varseq[i]:
            toadd = vars_scan50_asoverlap.loc[i]
            toadd.loc['varseq']=mutseq
            toadd.loc['changes']=j + ' replaced by random'
            
            vars_motifmut_asoverlapsoma=vars_motifmut_asoverlapsoma.append(toadd, ignore_index=True)
            
# 2745 
            
vars_motifmut_asoverlapsoma.to_pickle('./dataframes/design/vars_motifmut_asoverlapsoma.pkl')           




vars_motifmut_backgroundgenes = pd.DataFrame()
            
for i in vars_scan50_backgroundgenes.index:
    for j in somakmers_long:
        mutseq=scanandreplace(vars_scan50_backgroundgenes.varseq[i], j)
        if mutseq!=vars_scan50_backgroundgenes.varseq[i]:
            toadd = vars_scan50_backgroundgenes.loc[i]
            toadd.loc['varseq']=mutseq
            toadd.loc['changes']=j + ' replaced by random'
            
            vars_motifmut_backgroundgenes=vars_motifmut_backgroundgenes.append(toadd, ignore_index=True)
            
# 1062 
            
vars_motifmut_backgroundgenes.to_pickle('./dataframes/design/vars_motifmut_backgroundgenes.pkl')           


mut_scan = pd.concat([vars_motifmut_posgenes, vars_motifmut_top40, vars_motifmut_backgroundgenes, 
vars_motifmut_deDend, vars_motifmut_zappulo_dend, vars_motifmut_asoverlap, vars_motifmut_asoverlapsoma], ignore_index=True)

mut_scan['subset']='mut scanning 50'

# 12967
#%% structural motifs

# from Middleton: 
# B1: GAGGCAGGCGGATTTCTGAGTTCGAGGCCAGCCTGGTCTACAGAGTGAGTTCCAGGACAGCCAGGGCTACACAGAGAAACCCTGTCTC
# ((((((((....(((((((((((..(((...(((((.((........))..)))))...))).)))))...))))))...))))))))

# B2: GCTGGTGAGATGGCTCAGTGGGTAAGAGCACCCGACTGCTCTTCCGAAGGTCAGGAGTTCAAATCCCAGC
# (((((.((..((((((....((.(((((((......))))))))).........))).)))..)))))))

b1motif='GAGGCAGGCGGATTTCTGAGTTCGAGGCCAGCCTGGTCTACAGAGTGAGTTCCAGGACAGCCAGGGCTACACAGAGAAACCCTGTCTC'
b2motif='GCTGGTGAGATGGCTCAGTGGGTAAGAGCACCCGACTGCTCTTCCGAAGGTCAGGAGTTCAAATCCCAGC'
zipcode=posgenesdf[posgenesdf.gene_name=='Actb'].sequence.values[0][:54]

vars_secmutsbmotifs=pd.DataFrame()

for i in vars_forinsertion[[x in ['Camk2a','Arc','Actb']+backgroundgenes for x in vars_forinsertion.gene_name]].index:
    toadd = vars_forinsertion.loc[i]
    sq = toadd['varseq']
    toadd['varseq'] = sq[:30] + b1motif + sq[30+len(b1motif):]
    toadd['changes'] = 'B1 motif at position 30'
    vars_secmutsbmotifs=vars_secmutsbmotifs.append(toadd, ignore_index=True)
    
    toadd = vars_forinsertion.loc[i]
    sq = toadd['varseq']
    toadd['varseq'] = sq[:30] + b2motif + sq[30+len(b2motif):]
    toadd['changes'] = 'B2 motif at position 30'
    vars_secmutsbmotifs=vars_secmutsbmotifs.append(toadd, ignore_index=True)
    
    toadd = vars_forinsertion.loc[i]
    sq = toadd['varseq']
    toadd['varseq'] = zipcode + sq[len(zipcode):]
    toadd['changes'] = 'zipcode at position 0'
    vars_secmutsbmotifs=vars_secmutsbmotifs.append(toadd, ignore_index=True)
    
    toadd = vars_forinsertion.loc[i]
    sq = toadd['varseq']
    toadd['varseq'] = sq[:30] + zipcode + sq[30+len(zipcode):]
    toadd['changes'] = 'zipcode at position 30'
    vars_secmutsbmotifs=vars_secmutsbmotifs.append(toadd, ignore_index=True)

    toadd = vars_forinsertion.loc[i]
    sq = toadd['varseq']
    toadd['varseq'] = sq[:60] + zipcode + sq[60+len(zipcode):]
    toadd['changes'] = 'zipcode at position 60'
    vars_secmutsbmotifs=vars_secmutsbmotifs.append(toadd, ignore_index=True)

    toadd = vars_forinsertion.loc[i]
    sq = toadd['varseq']
    toadd['varseq'] = sq[:90] + zipcode + sq[90+len(zipcode):]
    toadd['changes'] = 'zipcode at position 90'
    vars_secmutsbmotifs=vars_secmutsbmotifs.append(toadd, ignore_index=True)


# RNAinverse

b1like = ['GGGUUUUAGCCCGAGGUUACUCGUAGCGUGGCUUUUGUAGGAGACGGUAGUGAAAGAAUCGCCCGAGUCGAGAUCUCUCAUAAGAUCC',
'CUCUCUCUGAUUACUGGUUAACUAAGUACAUUUCCGACGUCUUGAUCCGGACGGAGCCCUAUAGGUUAAUUACCGGUCCAAGAGGGAG',		
'UAUAAGCGCUUCCAUCCGCCGACGCGAAUUCUAUAACGGCCGUCGACCCACUUGUGCUUUUCAGUCGGACGUGGGUGCCACGUUUGUA',	
'UACGGGGCAAUGGCUUGUUCCACCAUUAGGAGGGGUCUGAGCAAAUACAAAAUCCCGGAUAGAGUGGAUAGGCAGGCGUGGUUUCGUA',		
'GACAUAAAGAGUCAGUUAGUGGUCAGCACGAUUCUGCGCUUUAAUAUGCCACAGAAGGAUGCCACUACAUCUAGUUGUAAUUUGUGUC',		
'GUGACGACGCCUUUUUGGCCGGCCUGUCUGGAUCUAGGCAAUCGCAGGCAAUGGAUAGUGGCUGUUGGACAUCGGAAUCAGUCGUUGC',		
'GAAUCACUUAUUCAUAAGUGUUGGCUGCGAACCUUUGUUGGAGGUUGAAUGGAAGGGAAGCGAUAACGAGUUUUAUGCCUAGUGGUUU',		
'GGGGGAAGGAGGUGGCGUUGUGGCUUACCACUAAACCCUUGAUGUCUAGCCGUUUGAAAGUAUCCGCAAUCACGCUAGAGCUUCUCUC',		
'AGCGAGUUAACGAUCCUAAUUCAGCGGUUUAUUCGUCGGAUCUGAGGCCAAACGGAAGUAUUAUGAGUUCCUAGGGUGACAACUUGCU',		
'GUAGGGACCGGGCUUAUGCUCGUACAACAGACCCUUAAGAUUGCACGCUGGAAGGGGACGUUCGCGAGACACGUAAGGGAGUCCCUAC']		

b2like = ['CCGGCUGCAUUUAUGACCAUGGAUCCGCGUGCGGGGACGCGGACCUCGAAAUUUUCACUAACGGUGUCGG',		
'CCGCAGCGAAUCGCAGUAUAAGCAGCGAUAACUGAAUGUCGCUCUGAGGUCCGUCUGAUGAGGUGUGUGG',		
'GGAUAACUCACGCCUAAAAUAGUGGAAGGUUGCGUUGCCUUCCCUCAAUCGAUGUGGUGCGCCAGUGUCC',		
'CUGUAAGUAUGCGUCCUAAACCAUUAGUUAGCAUUAUGACUGGGGCACACGGGUGGACCGCCGACUACGG',		
'GCUAUAGCACAUCAUCAAGAGAUGUAUUGCGACUUAGUAGUAUUCGGGUAAAGAGAUAGAUAGGCAUAGU',		
'GAGAUUAGACCACGCUCUCUUCAUAAGCGAUCAACUUUGCUUGGAUCUUUACACGGCAGUGCGCUGUCUC',		
'AGUUGGCUGUGCCAAUUCUUGCAGCCACUAACGACGUGGUGGCGCUUUUCACCUGUUCGGCUAAGCAGCU',		
'GUUUAACUAUCACAUCAUUUCUCGAGAUGGAGGUGCCCAUUUCAGUGUCUGUUCGAUCGUGUGAGUGAGC',		
'GGACGUAGGAAUUCCAAAAAGUAGAUUCCACUCGAUUGGAGUCACGUAUAAGUAUGGUAGUGGCUCGUCU',		
'GACGACACGGGCAUUGAAUAGUAUGUUUUGCUAAUAUAAGACGACAAGGAGGAGCGAUUGCGAGUUUGUC']


for i in vars_forinsertion[[x in ['Camk2a','Arc','Actb']+backgroundgenes for x in vars_forinsertion.gene_name]].index:
    for mtf in b1like:
        toadd = vars_forinsertion.loc[i]
        sq = toadd['varseq']
        toadd['varseq'] = sq[:30] + mtf.replace('U','T') + sq[30+len(mtf):]
        toadd['changes'] = 'B1 like motif at position 30'
        vars_secmutsbmotifs=vars_secmutsbmotifs.append(toadd, ignore_index=True)
    for mtf in b2like:
        toadd = vars_forinsertion.loc[i]
        sq = toadd['varseq']
        toadd['varseq'] = sq[:30] + mtf.replace('U','T') + sq[30+len(mtf):]
        toadd['changes'] = 'B2 like motif at position 30'
        vars_secmutsbmotifs=vars_secmutsbmotifs.append(toadd, ignore_index=True)

# 2139
vars_secmutsbmotifs['subset']='sec structure and zipcode'        
vars_secmutsbmotifs.to_pickle('./dataframes/design/vars_secmutsbmotifs.pkl')           





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [208]:

#%% assemble library

library_varseq = pd.concat([wt_scan, mut_scan, motifs_comb, vars_secmutsbmotifs], ignore_index=True)

library_varseq.to_pickle('./dataframes/design/library_varseq.pkl')           

from Bio import Restriction
from Bio.Seq import Seq

vars_to_drop=[]
for i in library_varseq.index:
    seq=library_varseq.varseq[i]
    if (len(Restriction.SbfI.search(Seq(seq)))>0) | (len(Restriction.AscI.search(Seq(seq)))>0):
        vars_to_drop.append(i)

library_varseq.drop(vars_to_drop, inplace=True)


#%%

bcspl=pd.read_csv('./Datasets/40000barcodes.csv', header=None)
bcspl.columns=['idx','barcode']

bcs=list(bcspl.barcode.values)
bcs2=list(bcspl.barcode.values)


primers=['GCCCCACGGAGGTGCCAC','CCTCCTCACGGCGACGCG','CTCCCGGGCATGCGAATT','TCAACCAGTCGCGGTCCA',
         'TGCGAGTTAGGGGACGGT','ACGGACGCGGGTATAGCA','CGAAATGGGCCGCATTGC','CACTGCGGCTGATGACGA',
            'GACAGATGCGCCGTGGAT','AGCCACCCGATCCAATGC','ATGGGGTTCGGTATGCGC','AAGGCTCCCCGAGACGAT']


'''

wt scanning 50 CGAAATGGGCCGCATTGC CACTGCGGCTGATGACGA
[the rest] GACAGATGCGCCGTGGAT AGCCACCCGATCCAATGC

three TGCGAGTTAGGGGACGGT ACGGACGCGGGTATAGCA

'''
def addbc(idx):
    if library_varseq.subset[idx]=='wt scanning 50':
        while True:
            bc=random.choice(bcs)
            vsnew=primers[6] + bc + library_varseq.varseq[idx] + primers[7]
            if (Restriction.SbfI.search(Seq(vsnew))==[])&(Restriction.AscI.search(Seq(vsnew))==[]):
                bcs.remove(bc)
                break
    else:
        while True:
            bc=random.choice(bcs2)
            vsnew=primers[8] + bc + library_varseq.varseq[idx] + primers[9]
            if (Restriction.SbfI.search(Seq(vsnew))==[])&(Restriction.AscI.search(Seq(vsnew))==[]):
                bcs2.remove(bc)
                break
    return vsnew


library_varseq['varseq_final']=library_varseq.index.map(lambda x: addbc(x))



vars_to_drop=[]
for i in library_varseq.index:
    seq=library_varseq.varseq_final[i]
    if (len(Restriction.SbfI.search(Seq(seq)))>0) | (len(Restriction.AscI.search(Seq(seq)))>0):
        vars_to_drop.append(i)


lengths=library_varseq.varseq_final.apply(lambda x: len(x))   


library_varseq['barcode']=library_varseq.varseq_final.apply(lambda x: x[16:30])
library_varseq['primer1']=library_varseq.varseq_final.apply(lambda x: x[:18])
library_varseq['primer2']=library_varseq.varseq_final.apply(lambda x: x[-18:])

library_varseq.to_pickle('./dataframes/design/library_varseq_FINAL.pkl')           

library_varseq.to_csv('./dataframes/design/library_varseq_FINAL.csv')           

library_varseq.varseq_final.to_csv('./dataframes/design/RNAlocneurons_library.csv', header=None)           



In [209]:
# make sure that there are no other characters than actg

noactg=[]
count=0
for i in library_varseq.varseq_final.values:
    for j in i:
        if (j!='A')&(j!='T')&(j!='C')&(j!='G'):
            noactg.append(i)
    count+=1
    
print(noactg)

[]
