In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn3_circles
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.patches as mpatches
from os import listdir 

In [2]:
# filter sequences according to specific rules for each library
def validity_test(seq,lib):
    """Check whether a peptide carries the fixed flanks expected for its library.

    Parameters
    ----------
    seq : str
        Peptide (protein) sequence to test.
    lib : str
        Library identifier as a string: '7', '9', '12', '16' or '20'.

    Returns
    -------
    bool
        True when ``seq`` starts and ends with the flanks registered for
        ``lib``; False otherwise (including for an empty sequence).

    Raises
    ------
    ValueError
        If ``lib`` is not one of the known libraries. The previous version
        returned the string 'Error' here, which is truthy and was therefore
        treated as "valid" by ``if validity_test(...)`` callers.
    """
    # (required prefix, required suffix) per library
    flanks = {
        '7':  ('SAC', 'CGGGS'),
        '9':  ('X', 'X'),      # flanks are unknown
        '12': ('S', 'GGGS'),
        '16': ('S', 'S'),
        '20': ('S', 'S'),
    }
    if lib not in flanks:
        raise ValueError('Unknown library: {!r}'.format(lib))
    prefix, suffix = flanks[lib]
    # startswith/endswith are safe on empty or very short sequences,
    # unlike direct indexing with seq[0] / seq[-1].
    return seq.startswith(prefix) and seq.endswith(suffix)

In [3]:
# Expected total peptide length (the value compared against 'ProteinLength')
# for each library ID. Library IDs are kept as strings.
library_lengths = {
    '7': 15,   # starts with 'ac', ends with 'cgggs'
    '9': 9,    # flanks unknown; 9 is probably wrong
    '12': 17,  # starts with 's', ends with 'gggs'
    '16': 18,  # starts and ends with 's'
    '20': 22,  # starts and ends with 's'
}

In [6]:
# Filter every raw NGS csv: label the samples, drop peptides with stop codons
# or missing protein, keep only peptides of the expected library length with
# valid flanks, merge duplicate peptides within each sample group, and write
# the result to All_NGS_filtered/.
GROUP_KEYS = ['cell_line', 'library', 'round', 'fraction']

for f in listdir('All_NGS_raw/'):
    if f[-3:] != 'csv': continue # ignore xlsx files
    print(f)
    seq_data0 = pd.read_csv('All_NGS_raw/{}'.format(f),sep=';')
    file_ID = f.split('.')[0]

    # add columns 'cell line', 'library', 'round', 'subcellular fraction'
    # parsed from the underscore-separated 'Sample' name
    label_rows = []
    for line in seq_data0['Sample']:
        lnls = line.split('_')
        lnls.remove('FK')
        lnls.remove('VACure')
        for optional_tag in ('iEC', 'PS'):
            if optional_tag in lnls: lnls.remove(optional_tag)
        # if the second token is not numeric (i.e. not the library number),
        # move it to the end so that index 1 holds the library
        if not lnls[1].isdigit():
            lnls.append(lnls.pop(1))
        label_rows.append((lnls[0], lnls[1], lnls[2], lnls[3]))
    for pos, column in enumerate(GROUP_KEYS):
        seq_data0[column] = [row[pos] for row in label_rows]

    # remove peptides with stop codon ('*') or with NaN value
    seq_data0 = seq_data0.dropna(subset=['Protein'])
    # regex=False: match a literal '*' (the old pattern '\*' relied on an
    # invalid string escape sequence)
    seq_data = seq_data0[~seq_data0['Protein'].str.contains('*', regex=False)].copy()

    # this is the slow part of the notebook; per-group results are collected
    # in lists and concatenated once instead of growing a DataFrame row by row
    group_outputs = []
    for g,dataf in tqdm(seq_data.groupby(by=GROUP_KEYS)):
        lib = g[1]
        seqlen = library_lengths[lib]
        if lib == '9': continue # currently unknown flanks
        tlen_data = dataf[dataf['ProteinLength']==seqlen].copy()
        if len(tlen_data) == 0:
            print(g,' 0')
            continue

        merged_rows = []
        for j,p in tlen_data.groupby('Protein'):
            if not validity_test(j,lib):
                continue
            # one output row per peptide: the first row of the group, with
            # Count/Percentage summed over all rows sharing this protein
            outdf = p.iloc[:1].copy()
            if len(p) > 1:
                outdf['Count'] = p['Count'].sum()
                outdf['Percentage'] = p['Percentage'].sum()
            outdf['seq_count'] = len(p)  # number of rows merged into this one
            merged_rows.append(outdf)

        if not merged_rows:
            print(g,' 0')
            continue
        output = pd.concat(merged_rows).sort_values('Count',ascending=False)
        group_outputs.append(output)

    df_final = pd.concat(group_outputs) if group_outputs else pd.DataFrame()

    # NOTE(review): the name is built from `f`, which still contains '.csv',
    # so files are written as '<name>.csv_filtered.csv'. `file_ID` looks like
    # the intended stem, but the name is kept unchanged here because other
    # code may read these files by name - confirm before changing.
    df_final.to_csv('All_NGS_filtered/{}_filtered.csv'.format(f),sep=',')

    # check count for each subgrouping (skipped when nothing survived the
    # filters, since an empty frame has no label columns to group by)
    if len(df_final) > 0:
        for g,df in df_final.groupby(GROUP_KEYS):
            print(g,len(df))


RN-1194_full_iEC_WT_ENG_KRIT1.csv


100%|████████████████████████████████████████████████████████████████████████████████| 135/135 [00:59<00:00,  2.25it/s]


RN-1208_full_HUVEC_HUAEC.csv


 19%|███████████████▍                                                                  | 17/90 [00:39<02:14,  1.84s/it]

('HUAEC', '16', '3', 'Nucleus')  0


 39%|███████████████████████████████▉                                                  | 35/90 [01:14<00:23,  2.35it/s]

('HUAEC', '7', '3', 'Nucleus')  0


 53%|███████████████████████████████████████████▋                                      | 48/90 [01:34<01:03,  1.50s/it]

('HUVEC', '12', '2', 'Cytosol')  0


 83%|████████████████████████████████████████████████████████████████████▎             | 75/90 [02:57<00:24,  1.67s/it]

('HUVEC', '7', '2', 'Cytosol')  0
('HUVEC', '7', '2', 'Debris')  0


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [02:58<00:00,  1.98s/it]


('HUVEC', '7', '3', 'Nucleus')  0
RN-1209_full_iENG_KO_mouse_kidney.csv


100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:13<00:00,  1.53it/s]


RN-1223_full_iENG_KO_mouse_PS.csv


100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:08<00:00,  2.62it/s]


RN-1224_full_inVivo_PD_HUVEC_Tie2_xenograft.csv


100%|██████████████████████████████████████████████████████████████████████████████████| 46/46 [00:25<00:00,  1.81it/s]


RN-1249_full_inVivo_PD_HUVEC_Tie2_kidney_liver.csv


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [16:25<00:00, 32.84s/it]


RN-1276_full_CCM2_Tie2LF.csv


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:37<00:00,  2.43it/s]


RN-1352_full_alk1_KO.csv


100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [00:13<00:00,  3.26it/s]


In [None]:
# Inspection cell: shows `tlen_data` as left behind by the last iteration of
# the filtering loop (the length-filtered rows of the last processed group).
# It only works after that loop has been run in the same kernel.
tlen_data

# get numbers for publication

In [24]:
# Load every raw csv whose file name contains 'tie' (case-insensitive), add
# the sample label columns, and stack everything into one frame `indf`.
loaded_frames = []
for f in listdir('All_NGS_raw/'):
    if 'tie' not in f.lower(): continue
    if f[-3:] != 'csv': continue # ignore xlsx files
    print(f)
    seq_data0 = pd.read_csv('All_NGS_raw/{}'.format(f),sep=';')
    file_ID = f.split('.')[0]

    # add columns 'cell line', 'library', 'round', 'subcellular fraction'
    # parsed from the underscore-separated 'Sample' name
    celllist = []
    liblist = []
    roundlist = []
    fraclist = []
    for line in seq_data0['Sample']:
        lnls = line.split('_')
        lnls.remove('FK')
        lnls.remove('VACure')
        for optional_tag in ('iEC', 'PS'):
            if optional_tag in lnls: lnls.remove(optional_tag)
        # if the second token is not numeric (i.e. not the library number),
        # move it to the end so that index 1 holds the library
        if not lnls[1].isdigit():
            lnls.append(lnls.pop(1))
        celllist.append(lnls[0])
        liblist.append(lnls[1])
        roundlist.append(lnls[2])
        fraclist.append(lnls[3])
    seq_data0['cell_line'] = celllist
    seq_data0['library'] = liblist
    seq_data0['round'] = roundlist
    seq_data0['fraction'] = fraclist

    # Compare against file_ID (name without extension): `f` always ends in
    # '.csv' here, so the previous `f == ...` check could never be true.
    # Note the 'tie' filter above currently skips this file anyway.
    if file_ID == 'RN-1223_full_iENG_KO_mouse_PS':
        seq_data0 = seq_data0[seq_data0['Sample'] != 'VACure_ENGKOmice_FK_PS_7_1_Cytosol'].copy() # this test was rerun in another file, so we remove this set
    loaded_frames.append(seq_data0)

# Combine all NGS data into one df with a single concat; fall back to an
# empty frame when no file matched (pd.concat rejects an empty list).
indf = pd.concat(loaded_frames, axis=0) if loaded_frames else pd.DataFrame()
indf = indf.reset_index(drop=True)

RN-1224_full_inVivo_PD_HUVEC_Tie2_xenograft.csv
RN-1249_full_inVivo_PD_HUVEC_Tie2_kidney_liver.csv
RN-1276_full_CCM2_Tie2LF.csv


In [25]:
# Remove unwanted samples: first everything labelled with the 'Heart'
# fraction, then everything from library '9'.
mydf0 = indf.loc[indf['fraction'].ne('Heart')].copy()
mydf1 = mydf0.loc[mydf0['library'].ne('9')].copy()

In [28]:
# reapply label columns
def _relabel_sample(sample):
    """Derive (cell_line, library, round, fraction) from one 'Sample' name.

    The name is split on '_' and the 'FK' and 'VACure' tokens are removed
    (``list.remove`` raises ValueError if either is absent). The remaining
    tokens are treated as a set, so token order and duplicates are ignored.

    - library: first of '7', '12', '16', '20' present in the tokens
    - round: first of '1', '2', '3' present in the tokens
    - fraction: 'Cytosol' / 'Nucleus' / 'Debris', given either as the full
      word or as the single letter 'C' / 'N' / 'D'; 'ND' when none is present
    - cell_line: all tokens left over, sorted and joined with '-'

    When no library or round token is found a message is printed and 'ND' is
    used as placeholder, so the result always has four fields. Previously
    nothing was recorded in that case, which left the label lists shorter than
    the frame and made the column assignment fail with a length mismatch.
    """
    tokens = sample.split('_')
    tokens.remove('FK')
    tokens.remove('VACure')
    remaining = set(tokens)

    library = 'ND'
    for candidate in ('7', '12', '16', '20'):
        if candidate in remaining:
            library = candidate
            remaining.remove(candidate)
            break
    else:
        print('no library found:',sample)

    round_no = 'ND'
    for candidate in ('1', '2', '3'):
        if candidate in remaining:
            round_no = candidate
            remaining.remove(candidate)
            break
    else:
        print('no round found:',sample)

    # checked in this order; only the first matching token is consumed
    fraction_tokens = (
        ('C', 'Cytosol'), ('Cytosol', 'Cytosol'),
        ('N', 'Nucleus'), ('Nucleus', 'Nucleus'),
        ('D', 'Debris'), ('Debris', 'Debris'),
    )
    fraction = 'ND'
    for token, fraction_name in fraction_tokens:
        if token in remaining:
            fraction = fraction_name
            remaining.remove(token)
            break

    return '-'.join(sorted(remaining)), library, round_no, fraction


relabelled = [_relabel_sample(e) for e in mydf1['Sample']]
cell_linels = [labels[0] for labels in relabelled]
libraryls = [labels[1] for labels in relabelled]
roundls = [labels[2] for labels in relabelled]
fracls = [labels[3] for labels in relabelled]

mydf1['cell_line'] = cell_linels
mydf1['library'] = libraryls
mydf1['round'] = roundls
mydf1['fraction'] = fracls

In [75]:
# Keep only the samples whose cell line label mentions 'Xenograft', then list
# the distinct cell line labels that remain.
is_xenograft = mydf1['cell_line'].str.contains('Xenograft')
mydf = mydf1.loc[is_xenograft].copy()
mydf['cell_line'].unique()

array(['Tie2LF-Xenograft', 'WT-Xenograft', 'Kidney-Xenograft',
       'Liver-Xenograft'], dtype=object)

In [81]:
# Show the 150 rows with the highest read count.
mydf.sort_values(by='Count', ascending=False).head(150)

Unnamed: 0,Sample,Count,Percentage,Sequence,Protein,ProteinLength,cell_line,library,round,fraction
522875,VACure_Xenograft_FK_Liver_16_3,225444,52.92460,TCC,S,1,Liver-Xenograft,16,3,ND
285165,VACure_Xenograft_FK_Kidney_12_3,216649,49.14600,TCC,S,1,Kidney-Xenograft,12,3,ND
298763,VACure_Xenograft_FK_Kidney_16_2,212778,46.95660,TCC,S,1,Kidney-Xenograft,16,2,ND
307150,VACure_Xenograft_FK_Kidney_16_3,209263,51.20940,TCC,S,1,Kidney-Xenograft,16,3,ND
515888,VACure_Xenograft_FK_Liver_16_2,207691,50.28980,TCC,S,1,Liver-Xenograft,16,2,ND
...,...,...,...,...,...,...,...,...,...,...
475950,VACure_Xenograft_FK_Liver_16_1,15912,4.76938,TCG,S,1,Liver-Xenograft,16,1,ND
26579,VACure_Xenograft_FK_Tie2LF_12_2_Nucleus,15754,12.86230,TCG,S,1,Tie2LF-Xenograft,12,2,Nucleus
307149,VACure_Xenograft_FK_Kidney_16_2,15485,3.42000,unmatched,,0,Kidney-Xenograft,16,2,ND
659791,VACure_Xenograft_FK_Liver_20_3,14709,3.49535,TCTAGTAGTTTTGGTATGTCGAGTCAGCTTATTAGTCATGGTGGAG...,SSSFGMSSQLISHGGGS,17,Liver-Xenograft,20,3,ND


In [68]:
# Total read count over all unfiltered xenograft rows.
mydf['Count'].sum()

16888323

In [45]:
# Number of distinct protein sequences in the unfiltered xenograft rows.
mydf['Protein'].nunique()

366991

In [57]:
# Load the combined NGS table from disk (comma-separated, the read_csv default).
mydf_fil = pd.read_csv('all_NGS_data_combined.csv')

In [64]:
# Restrict the combined table to cell lines whose label contains 'Xeno' and
# list the distinct labels.
xeno_rows = mydf_fil['cell_line'].str.contains('Xeno')
mydf_xeno = mydf_fil[xeno_rows]
mydf_xeno['cell_line'].unique()

array(['Kidney-Xenograft', 'Liver-Xenograft', 'Tie2LF-Xenograft',
       'WT-Xenograft'], dtype=object)

In [65]:
# Total read count over the xenograft rows of the combined table.
mydf_xeno['Count'].sum()

1983819

In [67]:
# Number of distinct protein sequences in the xenograft rows of the combined table.
mydf_xeno['Protein'].nunique()

174559