# Generation of non-sense and frame-shift mutated sequences

In [2]:
import pandas as pd
from bgvep.readers import Tabix
import re
from tqdm.notebook import trange, tqdm
import math
from Bio import SeqIO
import numpy as np

## 1. Prepare CPTAC dataset

In [4]:
####################
#Load CPTAC dataset
####################

print('Load CPTAC dataset')

CPTAC = ['BRCA', 'CCRCC', 'COAD', 'GBM', 'LUAD', 'OV', 'UCEC']
#Empty frame listing the expected columns; it goes first in the concat so it fixes the column order
cptac_df = pd.DataFrame(columns=['gene', 'cpct_aliquot', 'protein_expression', 'aliquot', 'sample',
       'Type', 'median', 'stdev', 'norm_protein_expression', 'log2fpkm',
       'log10fpkm', '#Uploaded_variation', 'Location', 'Allele', 'Feature',
       'protein_mutation', 'Protein_position', 'Amino_acids', 'Consequence',
       'Phenotype', 'Ubiquitinases_Mutated', 'Altered_E3_Ligases',
       'Raw_Residual', 'Stability_Change', 'ABS_Stability_Change'])
#Read every cohort, tag it with its name and concatenate once (instead of re-copying the growing table per cohort)
frames = [cptac_df]
for cancer in tqdm(CPTAC, total=len(CPTAC)):
    df = pd.read_csv("/workspace/projects/cptac_analysis/data/"+cancer+"/dataset_irls.gz", sep='\t')
    df['Dataset'] = cancer
    frames.append(df)
cptac_df = pd.concat(frames)
cptac_df.drop_duplicates(keep='first',inplace=True)

######################################################################################################
#Calculate the mean of rna values for the repeated mutations (log2fpkm, log10fpkm, Stability_Change)
######################################################################################################

print('Eliminate duplicated rna measures')

dupl_samples = ['C3L-00908', 'C3N-00545', 'C3N-01825']

cptac_dupl_df = cptac_df[cptac_df['sample'].isin(dupl_samples)]
#numeric_only=True is explicit: newer pandas raises on text columns instead of silently dropping them
#Rows without a transcript are averaged per gene/sample, rows with a transcript per mutation
df1 = cptac_dupl_df[cptac_dupl_df['Feature'].isnull()].groupby(['gene','sample'],as_index=False).mean(numeric_only=True)
df2 = cptac_dupl_df[~cptac_dupl_df['Feature'].isnull()].groupby(['gene','Feature','#Uploaded_variation','Location'],as_index=False).mean(numeric_only=True)
cptac_dupl2_df = pd.concat([df1,df2])
#Drop the per-measurement columns and bring back their averaged values through the merge
cptac_dupl3_df = cptac_dupl_df.drop(['log2fpkm','log10fpkm','Raw_Residual','Stability_Change','ABS_Stability_Change'],axis=1)
cptac_dupl4_df = pd.merge(cptac_dupl3_df,cptac_dupl2_df,how='left')
cptac_dupl4_df.drop_duplicates(keep='first',inplace=True)
cptac2_df = cptac_df[~(cptac_df['sample'].isin(dupl_samples))]
cptac_df = pd.concat([cptac2_df,cptac_dupl4_df],ignore_index=True)

#####################################################################################
#Prepare cptac_df table for the merge (select stop_gained and frameshift_variant)
#####################################################################################

#'Phenotype' is kept because mutate_seq reads it (the CCLE selection keeps it as well)
nsfs_mask = cptac_df['Phenotype'].isin(['stop_gained','frameshift_variant'])
cptac_mut_df = cptac_df.loc[nsfs_mask, ['gene','Feature','#Uploaded_variation','Location','protein_mutation','Protein_position','Amino_acids','sample','Dataset','Phenotype']]


Load CPTAC dataset


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


Eliminate duplicated rna measures


## 2. Prepare CCLE dataset

In [5]:
####################
#Load CCLE dataset
####################

print('Load CCLE dataset')

ccle_df = pd.read_csv("/workspace/projects/cptac_analysis/data/CCLE/dataset_irls.gz", sep='\t')

#############################################################################################################################
#Calculate the mean of protein values for the repeated mutations (protein_expression, norm_protein_expression, Raw_Residual, Stability_Change, ABS_Stability_Change)
#############################################################################################################################

print('Eliminate duplicated protein measures')

#numeric_only=True is explicit: newer pandas raises on text columns instead of silently dropping them
#WT rows are averaged per gene/sample, mutated rows per mutation and sample
df1 = ccle_df[ccle_df['Phenotype']=='WT'].groupby(['gene','sample','Phenotype'],as_index=False).mean(numeric_only=True)
df2 = ccle_df[ccle_df['Phenotype']!='WT'].groupby(['gene','Feature','sample','#Uploaded_variation','Location','Phenotype'],as_index=False).mean(numeric_only=True)
ccle2_df = pd.concat([df1,df2])
#Drop the per-measurement columns and bring back their averaged values through the merge
ccle3_df = ccle_df.drop(['protein_expression', 'norm_protein_expression', 'Raw_Residual', 'Stability_Change', 'ABS_Stability_Change'],axis=1)
ccle4_df = pd.merge(ccle3_df,ccle2_df,how='left')
ccle_df = ccle4_df.drop_duplicates(keep='first')

##################################################################################
#Prepare ccle_df table for the merge (select stop_gained and frameshift_variant)
##################################################################################

nsfs_mask = ccle_df['Phenotype'].isin(['stop_gained','frameshift_variant'])
ccle_mut_df = ccle_df.loc[nsfs_mask, ['gene','Feature','sample','#Uploaded_variation','Location','Phenotype']]


Load CCLE dataset
Eliminate duplicated protein measures


## 3. Import and prepare necessary tables

### 3.1 Import tables with wt seq (BioMart) from canonical transcripts

In [6]:
#Import wt sequences from canonical transcripts
seq_df = pd.read_csv("/workspace/users/msanchezg/notebooks/ens_canonical_sequences.tsv", sep="\t")

#Merge mutations_df with seq_df
#Repeated rows are removed first (as translation_wt_table does); otherwise the left merges
#below would emit one copy of every mutation per repeated transcript row
seq_merge_df = seq_df[['ensembl_transcript_id','coding']].drop_duplicates(keep='first')
seq_merge_df = seq_merge_df.rename(columns={'ensembl_transcript_id':'Feature','coding':'seq_wt'})

cptac_mut_df = pd.merge(cptac_mut_df, seq_merge_df, how='left')
ccle_mut_df = pd.merge(ccle_mut_df, seq_merge_df, how='left')

### 3.2 Tables with genes with ns and fs mutations

In [7]:
#Unique transcripts (Feature) with ns or fs mutations in each dataset, then both lists stacked in one table
genes_cptac_df, genes_ccle_df = (
    mut_df[['Feature']].drop_duplicates(subset='Feature', keep='first', ignore_index=True)
    for mut_df in (cptac_mut_df, ccle_mut_df)
)
genes_ns_fs_df = pd.concat([genes_cptac_df, genes_ccle_df], ignore_index=True)

### 3.3 Import CDS positions from BG-VEP 

In [8]:
def cds_pos_bgvep (mut_df, positions_df):
    '''Function to get cds positions for each mutation using bgvep annotation

    mut_df is not read; it is kept so existing calls keep working.
    positions_df must hold the chromosome in its first column and the genomic
    position in its second column. Every position is queried in BG-VEP
    (hg38, release 92) and the rows whose field 21 contains 'YES' are kept.

    Returns a DataFrame with columns Feature, Chromosome, Position, Reference,
    Alternate, cDNA_position, CDS_position and Strand, one row per kept BG-VEP row.
    '''

    col_names = ['Feature','Chromosome','Position','Reference','Alternate','cDNA_position','CDS_position','Strand']
    #Indices of the BG-VEP row fields copied into col_names, in the same order
    field_idx = (5, 0, 1, 2, 3, 8, 9, 16)

    rows = []
    #Open the reader once for the whole run instead of once per position
    with Tabix('hg38', '92') as reader:
        for position in tqdm(positions_df.values.tolist()):
            chrom = position[0]
            pos = int(position[1])
            for data in reader.get(chrom, pos, pos):
                if 'YES' in data[21]:
                    rows.append([data[i] for i in field_idx])

    #Build the table once; appending a one-row frame per hit re-copies the whole table each time
    cds_pos_df = pd.DataFrame(rows, columns=col_names)
    return (cds_pos_df)

In [9]:
def cds_pos_table (mut_df):
    '''Build the table of BG-VEP cds positions for the mutations in mut_df.

    Side effect: adds the columns 'chrom' and 'location2' (chromosome and first
    coordinate of 'Location') to mut_df itself.
    Returns the de-duplicated annotation table with columns Feature, location2,
    Reference, cDNA_position, CDS_position, Strand and chrom.
    '''
    location_parts = mut_df['Location'].str.split(':')
    mut_df['chrom'] = location_parts.str[0]
    #Keep only the start coordinate of 'start-end' locations
    mut_df['location2'] = location_parts.str[1].str.split('-').str[0]

    annotated = cds_pos_bgvep (mut_df, mut_df[['chrom','location2']])
    annotated = annotated.rename(columns={'Position':'location2'})
    #Same key format as in mut_df: position as text, chromosome with the 'chr' prefix
    annotated = annotated.assign(
        location2=annotated['location2'].astype(str),
        chrom='chr' + annotated['Chromosome'].astype(str),
    )
    annotated = annotated.drop(columns=['Chromosome','Alternate'])
    return annotated.drop_duplicates(keep='first')

In [None]:
#Add cds positions to datasets (takes a lot of time!)
cptac_cds_pos_df = cds_pos_table (cptac_mut_df)
#The CCLE table must be built from the CCLE mutations, not from the full CPTAC dataset
ccle_cds_pos_df = cds_pos_table (ccle_mut_df)

HBox(children=(FloatProgress(value=0.0, max=16421.0), HTML(value='')))

This step saves the XXXX_cds_pos_df tables. The function **cds_pos_bgvep** (called inside **cds_pos_table**) takes a long time to run, so we run it only once and load the saved tables afterwards.

In [None]:
#Write both annotation tables to disk so the slow BG-VEP lookup does not have to be repeated
for cds_pos_df, out_path in (
    (cptac_cds_pos_df, 'cds_pos_nsfs_cptac.tsv'),
    (ccle_cds_pos_df, 'cds_pos_nsfs_ccle.tsv'),
):
    cds_pos_df.to_csv(out_path, sep='\t', header=True, index=False)

In [3]:
#Load tables (the files written by the save cell; each dataset reads its own file)
#location2 is read as text because cds_pos_table stores it as text for the merge with the mutation tables
cptac_cds_pos_df = pd.read_csv('cds_pos_nsfs_cptac.tsv', sep='\t', dtype={'location2': str})
ccle_cds_pos_df = pd.read_csv('cds_pos_nsfs_ccle.tsv', sep='\t', dtype={'location2': str})

From here on, the notebook can be run after loading the saved XXXX_cds_pos_df tables.

In [None]:
#Merge mutations_df with cds_pos_df
#The join keys 'chrom' and 'location2' are rebuilt here from 'Location'. They otherwise exist
#only as a side effect of cds_pos_table, so without this the merge would match on 'Feature'
#alone whenever the cds tables were loaded from disk.
cds_merge_keys = ['Feature','chrom','location2']
merged_frames = []
for mut_df, cds_pos_df in ((cptac_mut_df, cptac_cds_pos_df), (ccle_mut_df, ccle_cds_pos_df)):
    location_parts = mut_df['Location'].str.split(':')
    keyed_mut_df = mut_df.assign(
        chrom=location_parts.str[0],
        location2=location_parts.str[1].str.split('-').str[0],
    )
    #Positions read back from a TSV may be integers; compare them as text like the mutation side
    keyed_cds_df = cds_pos_df.assign(location2=cds_pos_df['location2'].astype(str))
    merged_frames.append(pd.merge(keyed_mut_df, keyed_cds_df, how='left', on=cds_merge_keys))
cptac_mut_df, ccle_mut_df = merged_frames

In [None]:
#Keep only the mutations that have a usable CDS position (neither missing nor '-')
cptac_mut_df, ccle_mut_df = (
    mut_df[(mut_df['CDS_position'] != '-') & mut_df['CDS_position'].notnull()]
    for mut_df in (cptac_mut_df, ccle_mut_df)
)

## 4. Create mutated sequences

In [None]:
def mutate_seq (df):
    '''Function to generate the mutated sequences with nonsense and frameshift alterations

    df must contain the columns seq_wt, CDS_position, REF, ALT, Reference, Strand,
    Feature, sample, Location, #Uploaded_variation and Phenotype.
    NOTE(review): no cell visible in this part of the notebook creates REF and ALT;
    confirm where they are added before this function is called.

    REF/ALT/Reference are taken as written on the genomic forward strand and
    CDS_position as the 1-based coding position of the first coordinate in
    'Location' (that is how cds_pos_table queries BG-VEP). From that:
      * alleles of minus-strand transcripts are reverse-complemented;
      * substitutions and deletions affect len(REF) bases, which start at
        CDS_position on the plus strand and end at CDS_position on the minus strand;
      * insertions (REF == '-') go right after CDS_position on the plus strand and
        right before it on the minus strand, i.e. between the two coordinates of
        'Location'. This reproduces the annotated protein changes of the insertion
        rows in the merged table shown in this notebook (e.g. N974KTPX at CDS 2921).

    Returns (mut_check_df, seq_mut_df):
      mut_check_df  one diagnostic row per mutation (17 columns, 'nan' where not applicable)
      seq_mut_df    identifiers plus 'seq_mut': the mutated sequence, the untouched missing
                    value when seq_wt is missing, 'Short_seq_wt' when CDS_position is beyond
                    the sequence, or 'No_ref_match' when the reference bases do not match it
    '''

    complement_table = str.maketrans('ACGTNacgtn', 'TGCANtgcan')

    def on_minus_strand (strand):
        #Strand is text straight from BG-VEP but a number after a round trip through a TSV file
        try:
            return int(float(strand)) == -1
        except (TypeError, ValueError):
            return str(strand).strip() == '-'

    def to_transcript_strand (nt, minus):
        #Reverse complement for minus-strand transcripts, unchanged otherwise
        return nt.translate(complement_table)[::-1] if minus else nt

    fields = ['seq_wt','CDS_position','REF','ALT','Reference','Strand','Feature','sample','Location','#Uploaded_variation','Phenotype']

    mut_check_list =[]
    seq_mut_list =[]
    for seq_wt, cds_raw, ref, alt, ref_b, strand, feature, sample, location, variation, phenotype in df[fields].values.tolist():
        ids = [feature, sample, location, variation]
        seq_ids = ids + [phenotype, ref, alt]

        #No wt sequence: nothing to mutate, pass the missing value through
        if pd.isna(seq_wt):
            mut_check_list.append(ids + ['nan'] * 13)
            seq_mut_list.append(seq_ids + [seq_wt])
            continue

        minus = on_minus_strand(strand)
        cds_pos = int(cds_raw)

        if alt == '-':
            #Deletion: remove the reference bases
            ref_tx = to_transcript_strand(ref, minus)
            alt_tx = '-'
            new_bases = ''
            start = cds_pos - len(ref_tx) if minus else cds_pos - 1
            end = start + len(ref_tx)
            check_start, check_end = start, end
        elif ref == '-':
            #Insertion: nothing is removed; the BG-VEP base at CDS_position is the anchor that is checked
            ref_tx = to_transcript_strand(str(ref_b), minus)
            alt_tx = to_transcript_strand(alt, minus)
            new_bases = alt_tx
            start = end = cds_pos - 1 if minus else cds_pos
            check_start, check_end = cds_pos - 1, cds_pos
        else:
            #Substitution: replace all the reference bases with the alternate ones
            ref_tx = to_transcript_strand(ref, minus)
            alt_tx = to_transcript_strand(alt, minus)
            new_bases = alt_tx
            start = cds_pos - len(ref_tx) if minus else cds_pos - 1
            end = start + len(ref_tx)
            check_start, check_end = start, end

        if len(seq_wt) < cds_pos:
            mut_check = ids + ['nan','nan',ref,ref_b,ref_tx,'nan',alt,alt_tx,len(seq_wt),'nan',len(ref_tx),len(alt_tx),'nan']
            seq_mut = 'Short_seq_wt'
        else:
            #Bases actually present in the wt sequence where the reference allele should be
            ref_at_seq = seq_wt[check_start:check_end] if check_start >= 0 else ''
            if ref_at_seq != ref_tx:
                mut_check = ids + ['nan','nan',ref,ref_b,ref_tx,ref_at_seq,alt,alt_tx,len(seq_wt),'nan',len(ref_tx),len(alt_tx),'nan']
                seq_mut = 'No_ref_match'
            else:
                seq_mut = seq_wt[:start] + new_bases + seq_wt[end:]
                #Short windows around the edit, for checking the result by eye
                window_start = max(start - 1, 0)
                mut_check = ids + [seq_wt[window_start:end + 3], seq_mut[window_start:start + len(new_bases) + 3],
                                   ref,ref_b,ref_tx,ref_at_seq,alt,alt_tx,
                                   len(seq_wt),len(seq_mut),len(ref_tx),len(alt_tx),(len(seq_mut)-len(seq_wt))]

        mut_check_list.append(mut_check)
        seq_mut_list.append(seq_ids + [seq_mut])

    mut_check_df = pd.DataFrame(mut_check_list, columns=['Feature','sample','Location','#Uploaded_variation','seq_wt','seq_mut','ref','ref_bgveps','ref_at_strand','ref_at_seq','alt','alt_at_strand','length_seq_wt','length_seq_mut','length_ref','length_alt','length_seq_mut_minus_length_seq_wt'])
    seq_mut_df = pd.DataFrame(seq_mut_list, columns=['Feature','sample','Location','#Uploaded_variation','Phenotype','REF','ALT','seq_mut'])
    return (mut_check_df, seq_mut_df)

In [None]:
#Generate mutated sequences
#The check tables are named *_mut_check_df, the name the following display cells read
cptac_mut_check_df, cptac_seq_mut_df = mutate_seq(cptac_mut_df)
ccle_mut_check_df, ccle_seq_mut_df = mutate_seq(ccle_mut_df)
#Previous spelling kept as an alias for any code that still uses it
cptac_mutcheck_df, ccle_mutcheck_df = cptac_mut_check_df, ccle_mut_check_df

In [None]:
#Display the CPTAC check table to verify by eye that the in silico mutations were applied correctly.
#The name must match the one the first return value of mutate_seq was assigned to.
cptac_mut_check_df

In [None]:
#Display the CCLE check table to verify by eye that the in silico mutations were applied correctly.
#The name must match the one the first return value of mutate_seq was assigned to.
ccle_mut_check_df

In [20]:
#Add mutated sequence to cptac_mut_df and ccle_mut_df
#mutate_seq returns one row per input row, so repeated key rows are dropped before the
#left merge; otherwise a key occurring k times would produce k*k rows
cptac_mut_df = pd.merge(cptac_mut_df,cptac_seq_mut_df.drop_duplicates(keep='first'),how='left')
ccle_mut_df = pd.merge(ccle_mut_df,ccle_seq_mut_df.drop_duplicates(keep='first'),how='left')

Unnamed: 0,gene,Feature,#Uploaded_variation,Location,protein_mutation,Protein_position,Amino_acids,sample,Dataset,Location1,...,Ins,REF2,ALT2,POS2,seq_wt,Reference,cDNA_position,CDS_position,Strand,seq_mut
0,ABCB7,ENST00000645829,11BR016__G__C,chrX:75060263,S669*,669,S/*,11BR016,BRCA,chrX:75060263,...,0,G,C,75060263,ATGGCGCTGCTCGCGATGCATTCTTGGCGCTGGGCGGCCGCGGCGG...,G,2074,2006,-1,ATGGCGCTGCTCGCGATGCATTCTTGGCGCTGGGCGGCCGCGGCGG...
1,ABCC4,ENST00000645237,20BR006__A__T,chr13:95166186,L669*,669,L/*,20BR006,BRCA,chr13:95166186,...,0,A,T,95166186,ATGCTGCCCGTGTACCAGGAGGTGAAGCCCAACCCGCTGCAGGACG...,A,2138,2006,-1,ATGCTGCCCGTGTACCAGGAGGTGAAGCCCAACCCGCTGCAGGACG...
2,ABI1,ENST00000376142,18BR003__G__A,chr10:26823179,Q82*,82,Q/*,18BR003,BRCA,chr10:26823179,...,0,G,A,26823179,ATGGCAGAGCTGCAGATGTTACTAGAGGAGGAGATCCCGTCTGGCA...,G,316,244,-1,ATGGCAGAGCTGCAGATGTTACTAGAGGAGGAGATCCCGTCTGGCA...
3,ACACA,ENST00000616317,14BR008__ATACGTTTTGAAA__-,chr17:37274257-37274269,FSKRI311-315X,311-315,FSKRI/X,14BR008,BRCA,chr17:37274257,...,0,ATACGTTTTGAAA,-,37274257,ATGTGGTGGTCTACTCTGATGTCAATCTTGAGGGCTAGGTCTTTTT...,G,1426,945,-1,ATGTGGTGGTCTACTCTGATGTCAATCTTGAGGGCTAGGTCTTTTT...
4,ACBD6,ENST00000642319,11BR028__C__A,chr1:180397563,E206*,206,E/*,11BR028,BRCA,chr1:180397563,...,0,C,A,180397563,ATGGCTTCATCATTCCTGCCCGCGGGGGCCATCACCGGCGACAGCG...,C,1304,616,-1,ATGGCTTCATCATTCCTGCCCGCGGGGGCCATCACCGGCGACAGCG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16423,PTPRC,ENST00000442510,C3N-01825__-__AACTCCT,chr1:198748182-198748183,N974KTPX,974,N/KTPX,C3N-01825,UCEC,chr1:198748182,...,-1,-,AACTCCT,198748182,ATGACCATGTATTTGTGGCTTAAACTCTTGGCATTTGGCTTTGCCT...,A,3095,2921,1,ATGACCATGTATTTGTGGCTTAAACTCTTGGCATTTGGCTTTGCCT...
16424,RAB22A,ENST00000244040,C3N-01825__-__AAAAAAA,chr20:58354224-58354225,A149AKKX,149,A/AKKX,C3N-01825,UCEC,chr20:58354224,...,-1,-,AAAAAAA,58354224,ATGGCGCTGAGGGAGCTCAAAGTGTGTCTGCTCGGGGATACAGGTG...,C,727,446,1,ATGGCGCTGAGGGAGCTCAAAGTGTGTCTGCTCGGGGATACAGGTG...
16425,SLC7A2,ENST00000004531,C3N-01825__-__ATTCA,chr8:17548817-17548818,-264-265IX,264-265,-/IX,C3N-01825,UCEC,chr8:17548817,...,-1,-,ATTCA,17548817,ATGAAGATAGAAACAAGTGGTTATAACTCAGACAAACTAATTTGTC...,T,840,792,1,ATGAAGATAGAAACAAGTGGTTATAACTCAGACAAACTAATTTGTC...
16426,TAOK1,ENST00000261716,C3N-01825__-__TTCATTG,chr17:29510944-29510945,-552-553FIX,552-553,-/FIX,C3N-01825,UCEC,chr17:29510944,...,-1,-,TTCATTG,29510944,ATGCCATCAACTAACAGAGCAGGCAGCCTGAAGGACCCTGAAATTG...,C,2175,1656,1,ATGCCATCAACTAACAGAGCAGGCAGCCTGAAGGACCCTGAAATTG...


## 5. Translation

In [18]:
genetic_code = {'GAA': 'E', 'CGA': 'R', 'GTG': 'V', 'TAA': '*', 'CGT': 'R', 'ATA': 'I', 'GAC': 'D', 'TCG': 'S', 
                  'GAT': 'D', 'ATG': 'M', 'CTG': 'L', 'CTA': 'L', 'TAC': 'Y', 'GGA': 'G', 'CGG': 'R', 'AGC': 'S', 
                  'TCT': 'S', 'TGA': '*', 'AAA': 'K', 'ACC': 'T', 'ACA': 'T', 'TGC': 'C', 'AAG': 'K', 'GTC': 'V', 
                  'TCC': 'S', 'ACT': 'T', 'AGA': 'R', 'CTT': 'L', 'GCC': 'A', 'GTA': 'V', 'TAG': '*', 'CAA': 'Q', 
                  'CAC': 'H', 'GCT': 'A', 'TTA': 'L', 'CAT': 'H', 'CGC': 'R', 'TTC': 'F', 'ATT': 'I', 'GGC': 'G', 
                  'CAG': 'Q', 'AAC': 'N', 'CCC': 'P', 'GTT': 'V', 'AGG': 'R', 'TGT': 'C', 'CCG': 'P', 'GGG': 'G', 
                  'ATC': 'I', 'TTT': 'F', 'AAT': 'N', 'TCA': 'S', 'GAG': 'E', 'CCA': 'P', 'GCA': 'A', 'TAT': 'Y', 
                  'GGT': 'G', 'TGG': 'W', 'GCG': 'A', 'CTC': 'L', 'TTG': 'L', 'CCT': 'P', 'ACG': 'T', 'AGT': 'S'}

def translation(nt_seq_list):
    '''Function to translate in silico a list of cdna sequences

    nt_seq_list holds rows [sequence, id1, id2, id3]; the result holds one row
    [id1, id2, id3, protein] per input row, in the same order.

    Sequences that cannot be translated are passed through unchanged in place of
    the protein: anything that is not text (e.g. a missing value) and the labels
    'Sequence unavailable', 'No_ref_match' and 'Short_seq_wt'.
    Translation reads complete codons from the first base and stops after the
    first stop codon ('*' is kept); a trailing incomplete codon is ignored.
    Codons absent from genetic_code (those containing N or another ambiguity code)
    become 'X'; lower-case bases are accepted.
    '''
    untranslatable = ('Sequence unavailable', 'No_ref_match', 'Short_seq_wt')
    prot_seq_list = []
    for nt_seq in nt_seq_list:
        seq = nt_seq[0]
        ids = [nt_seq[1], nt_seq[2], nt_seq[3]]
        #isinstance covers every kind of missing value, not only the np.nan object itself
        if not isinstance(seq, str) or seq in untranslatable:
            prot_seq_list.append(ids + [seq])
            continue

        residues = []
        #Stopping the range two bases before the end yields complete codons only
        for i in range(0, len(seq) - 2, 3):
            aa = genetic_code.get(seq[i:i+3].upper(), 'X')
            residues.append(aa)
            if aa == '*':
                break
        prot_seq_list.append(ids + [''.join(residues)])
    return(prot_seq_list)

In [22]:
def translation_mut_table (df):
    '''Function to prepare a table with translated mutated sequences

    df must contain seq_mut, Feature, #Uploaded_variation and Location.
    Returns df with an added 'prot_seq' column holding the translation of seq_mut.
    '''
    key_cols = ['Feature','#Uploaded_variation','Location']
    #Translate each distinct sequence once. A mutation repeated in k rows would otherwise be
    #translated k times and then matched k*k times in the merge below.
    seq_mut_list = df[['seq_mut'] + key_cols].drop_duplicates(keep='first').values.tolist()
    prot_seq_mut_list = translation(seq_mut_list)
    prot_seq_mut_df = pd.DataFrame(prot_seq_mut_list, columns=key_cols + ['prot_seq'])
    prot_seq_mut_df = prot_seq_mut_df.drop_duplicates(keep='first')
    prot_mut_df = pd.merge(df,prot_seq_mut_df,how='left',on=key_cols)
    return (prot_mut_df)

In [None]:
#Translate the mutated sequences of both datasets (CPTAC and CCLE)
cptac_prot_mut_df, ccle_prot_mut_df = map(translation_mut_table, (cptac_mut_df, ccle_mut_df))

In [None]:
def translation_wt_table (df,seq_df):
    '''Function to prepare a table with translated wt sequences

    df      dataset table with Feature, #Uploaded_variation, Location and Phenotype
    seq_df  sequence table with ensembl_transcript_id and coding

    Returns one row per distinct mutation (rows whose Phenotype is not 'WT') with
    the columns Feature, #Uploaded_variation, Location and prot_seq, where prot_seq
    is the translation of the wt coding sequence of the transcript.
    '''
    key_cols = ['Feature','#Uploaded_variation','Location']

    #Only the identifiers that end up in the result are selected. This avoids requiring
    #columns that are never used (e.g. 'Dataset') and returning one copy per sample.
    wt_df = df.loc[df['Phenotype']!='WT', key_cols].drop_duplicates(keep='first')

    #Attach the wt coding sequence of each transcript
    seq_wt_df = seq_df[['ensembl_transcript_id','coding']].drop_duplicates(keep='first')
    seq_wt_df = seq_wt_df.rename(columns={'ensembl_transcript_id':'Feature','coding':'seq_wt'})
    wt_df = pd.merge(wt_df, seq_wt_df, how='left', on='Feature')

    #Translate wt sequences
    seq_wt_list = wt_df[['seq_wt'] + key_cols].values.tolist()
    prot_seq_wt_list = translation(seq_wt_list)
    prot_wt_df = pd.DataFrame(prot_seq_wt_list, columns=key_cols + ['prot_seq'])
    return (prot_wt_df)

In [None]:
#Create the tables with translated wt sequences for both datasets
cptac_prot_wt_df, ccle_prot_wt_df = (
    translation_wt_table (dataset_df, seq_df) for dataset_df in (cptac_df, ccle_df)
)

## 6. Search for c-term degrons in the generated sequences

In [30]:
def find_cdegron(prot_list, cdegron_motif):
    """This function finds all proteins containing a given c-terminal degron (cdegron)

    prot_list holds rows [protein sequence, Feature, #Uploaded_variation, Location].
    cdegron_motif is matched at the end of the sequence, ignoring everything from
    the first '*' on; 'X' in the motif stands for any residue.

    Returns a DataFrame with one row per matching protein and the columns
    Feature, #Uploaded_variation, Location, prot_seq and one column named after
    the motif, which is True in every row.
    """
    #Compile the pattern once instead of rebuilding it for every protein
    motif_re = re.compile((cdegron_motif + '$').replace('X', '.'))
    cdegron_seq_list = []
    for mutation in prot_list:
        seq = mutation[0]
        #Part of the protein before the first stop codon
        seq2 = str(seq).split('*')[0]
        if motif_re.search(seq2):
            cdegron_seq_list.append([mutation[1], mutation[2], mutation[3], seq, True])
    cdegron_df = pd.DataFrame(cdegron_seq_list,columns=['Feature','#Uploaded_variation','Location','prot_seq',cdegron_motif])
    return cdegron_df

In [32]:
cdegron_motifs = ['GG', 'RG', 'PG', 'XR', 'RXXG', 'EE', 'RXX', 'VX', 'AX', 'A']

def find_cdegron_list(prot_list, cdegron_motifs):
    """This function finds all cdegron motifs provided in a list

    prot_list holds rows [protein sequence, Feature, #Uploaded_variation, Location].

    Returns one row per protein that carries at least one of the motifs, with the
    columns Feature, #Uploaded_variation, Location, prot_seq and one boolean
    column per motif (True when the protein ends with that motif).
    """
    col_names=['Feature','#Uploaded_variation','Location','prot_seq']
    motif_frames = [find_cdegron(prot_list, motif) for motif in cdegron_motifs]
    all_cdegron_df = pd.concat([pd.DataFrame(columns=col_names)] + motif_frames, ignore_index=True)
    motif_cols = [col for col in all_cdegron_df.columns if col not in col_names]
    if all_cdegron_df.empty or not motif_cols:
        return all_cdegron_df

    #A protein found by several motifs has one row per motif, each with a single True.
    #Collapse them into one row with a real boolean per motif; any() keeps the flags
    #boolean, whereas summing them would turn them into counts.
    motif_flags = all_cdegron_df[motif_cols].fillna(False).astype(bool)
    all_cdegron_df = pd.concat([all_cdegron_df[col_names], motif_flags], axis=1)
    all_cdegron_df = all_cdegron_df.groupby(col_names, as_index=False, sort=False).any()
    return all_cdegron_df

In [None]:
def cdegron_table (prot_mut_df, prot_wt_df, cdegron_motifs,df,seq_df):
    '''This function creates a table with c-degrons annotated per sequence

    prot_mut_df     translated mutated sequences; needs gene, Feature, #Uploaded_variation,
                    Location and the protein in 'prot_seq' (or 'prot_seq_mut' if present)
    prot_wt_df      translated wt sequences (Feature, #Uploaded_variation, Location, prot_seq)
    cdegron_motifs  list of motifs to search for
    df              dataset table the sequences come from (gene, Feature, Phenotype, ...)
    seq_df          not used; kept so existing calls keep working

    Returns the wt rows (Phenotype 'WT', without mutation identifiers) followed by
    the mutated rows, each with one flag column per motif, without repeated rows.
    '''
    key_cols = ['Feature','#Uploaded_variation','Location']
    #translation_mut_table names the protein column 'prot_seq'; 'prot_seq_mut' is still accepted
    mut_seq_col = 'prot_seq_mut' if 'prot_seq_mut' in prot_mut_df.columns else 'prot_seq'

    def fill_motif_flags (table):
        #Proteins without a given motif get False; only the motif columns are touched
        motif_cols = [motif for motif in cdegron_motifs if motif in table.columns]
        if motif_cols:
            table[motif_cols] = table[motif_cols].fillna(False)
        return table

    #Search cdegron motifs in the mutated proteins
    prot_seq_mut_list = prot_mut_df[[mut_seq_col] + key_cols].values.tolist()
    cdegron_mut_df = find_cdegron_list(prot_seq_mut_list,cdegron_motifs)

    #Add proteins without cterm degrons to the table
    prot_seq_mut_df = prot_mut_df[['gene'] + key_cols + [mut_seq_col]]
    prot_seq_mut_df = prot_seq_mut_df.rename(columns={mut_seq_col:'prot_seq'})
    cdegron_mut_df = fill_motif_flags(pd.merge(prot_seq_mut_df,cdegron_mut_df,how='left'))

    #Find wt proteins with cterm degrons
    prot_wt_list = prot_wt_df[['prot_seq'] + key_cols].values.tolist()
    cdegron_wt_df = find_cdegron_list(prot_wt_list,cdegron_motifs)

    #Add proteins without cterm degrons to the table
    cdegron_wt_df = fill_motif_flags(pd.merge(prot_wt_df,cdegron_wt_df,how='left'))

    #Add 'gene' and 'Phenotype' to cdegron_wt_df (taken from the dataset passed in, not from a global)
    id_df = df.loc[df['Phenotype']!='WT', ['gene','Feature']].drop_duplicates(keep='first')
    cdegron_wt_df = pd.merge(cdegron_wt_df,id_df,how='left')

    cdegron_wt2_df = cdegron_wt_df.drop(columns=key_cols)
    cdegron_wt2_df['Phenotype'] = 'WT'

    #Add phenotype to cdegron_mut_df
    mutated = df['Phenotype'].isin(['stop_gained','frameshift_variant','stop_lost'])
    phenotype_mut = df.loc[mutated, key_cols + ['Phenotype']].drop_duplicates(keep='first')
    cdegron_mut_df = pd.merge(cdegron_mut_df,phenotype_mut,how='left')
    cdegron_mut_df = pd.merge(cdegron_mut_df,id_df,how='left')

    #concat wt and mut
    cdegron_df = pd.concat([cdegron_wt2_df,cdegron_mut_df])
    cdegron_df = cdegron_df.drop_duplicates(keep='first')
    return (cdegron_df)

In [None]:
#Annotate the c-degrons of both datasets
cptac_cdegron_df, ccle_cdegron_df = (
    cdegron_table (prot_mut, prot_wt, cdegron_motifs, dataset_df, seq_df)
    for prot_mut, prot_wt, dataset_df in (
        (cptac_prot_mut_df, cptac_prot_wt_df, cptac_df),
        (ccle_prot_mut_df, ccle_prot_wt_df, ccle_df),
    )
)

In [45]:
#Write the c-degron annotation of each dataset to a tab-separated file
for cdegron_out_df, out_path in (
    (cptac_cdegron_df, 'cdegron_wtnsfs_cptac.tsv'),
    (ccle_cdegron_df, 'cdegron_wtnsfs_ccle.tsv'),
):
    cdegron_out_df.to_csv(out_path, sep='\t', header=True, index=False)

## 7. Get c-terminal amino acid (last aa)

In [46]:
def get_last_aa(prot_list, aa):
    """This function finds all proteins whose last amino acid (c-terminal residue) is aa

    prot_list holds rows [protein sequence, Feature, #Uploaded_variation, Location].
    The last residue is the one right before the first '*' (or the last one when
    the sequence has no '*'). aa is used as a regular expression anchored at that end.

    Returns a DataFrame with one row per matching protein and the columns
    Feature, #Uploaded_variation, Location, prot_seq and last_aa (always aa).
    """
    #Compile the pattern once instead of rebuilding it for every protein
    aa_re = re.compile(aa + '$')
    aa_seq_list = []
    for mutation in prot_list:
        seq = mutation[0]
        #Part of the protein before the first stop codon
        seq2 = str(seq).split('*')[0]
        if aa_re.search(seq2):
            aa_seq_list.append([mutation[1], mutation[2], mutation[3], seq, aa])
    aa_df = pd.DataFrame(aa_seq_list,columns=['Feature','#Uploaded_variation','Location','prot_seq','last_aa'])
    return aa_df


In [47]:
aa_list = ['A','C','D','E','F','G','H','I','K','Q','L','M','N','P','R','S','T','V','W','Y']

def get_last_aa_list(prot_list, aa_list):
    """This function annotates the last amino acid of every protein in prot_list

    Each amino acid in aa_list is searched in turn with get_last_aa and the
    results are stacked, so rows are grouped by amino acid in aa_list order.
    Proteins whose last residue is not in aa_list are absent from the result.
    """
    aa_frames = [get_last_aa(prot_list, aa) for aa in aa_list]
    if not aa_frames:
        #pd.concat refuses an empty list; an empty aa_list gives an empty table
        return pd.DataFrame()
    #Concatenate once instead of re-copying the growing table for every amino acid
    all_aa_df = pd.concat(aa_frames,ignore_index=True)
    return all_aa_df

In [None]:
def last_aa_table (prot_mut_df,aa_list, prot_wt_df, df):
    '''This function annotates all c-terminal amino acids (last aa) from mutated and wt sequences

    prot_mut_df  translated mutated sequences; needs Feature, #Uploaded_variation, Location
                 and the protein in 'prot_seq' (or 'prot_seq_mut' if present)
    aa_list      amino acids to look for
    prot_wt_df   translated wt sequences (Feature, #Uploaded_variation, Location, prot_seq)
    df           dataset table the sequences come from (gene, Feature, Phenotype, ...)

    Returns the wt rows (Phenotype 'WT', without mutation identifiers) followed by
    the mutated rows, each with its 'last_aa', without repeated rows.
    '''
    key_cols = ['Feature','#Uploaded_variation','Location']
    #translation_mut_table names the protein column 'prot_seq'; 'prot_seq_mut' is still accepted
    mut_seq_col = 'prot_seq_mut' if 'prot_seq_mut' in prot_mut_df.columns else 'prot_seq'

    #Get last aa from mutated sequences
    prot_seq_mut_list = prot_mut_df[[mut_seq_col] + key_cols].values.tolist()
    aa_mut_df = get_last_aa_list(prot_seq_mut_list,aa_list)

    #Get last aa from wt sequences
    prot_wt_list = prot_wt_df[['prot_seq'] + key_cols].values.tolist()
    aa_wt_df = get_last_aa_list(prot_wt_list,aa_list)

    #Add 'gene' and 'Phenotype' (taken from the dataset passed in, not from a global)
    id_df = df.loc[df['Phenotype']!='WT', ['gene','Feature']].drop_duplicates(keep='first')
    aa_wt_df = pd.merge(aa_wt_df,id_df,how='left')

    aa_wt2_df = aa_wt_df.drop(columns=key_cols)
    aa_wt2_df['Phenotype'] = 'WT'

    mutated = df['Phenotype'].isin(['stop_gained','frameshift_variant'])
    phenotype_mut = df.loc[mutated, key_cols + ['Phenotype']]
    aa_mut_df = pd.merge(aa_mut_df,phenotype_mut,how='left')
    aa_mut_df = pd.merge(aa_mut_df,id_df,how='left')

    #concat wt and mut
    aa_df = pd.concat([aa_wt2_df,aa_mut_df])
    aa_df = aa_df.drop_duplicates(keep='first')
    return (aa_df)

In [None]:
#Annotate the last amino acid of the wt and mutated proteins of both datasets
cptac_aa_df, ccle_aa_df = (
    last_aa_table (prot_mut, aa_list, prot_wt, dataset_df)
    for prot_mut, prot_wt, dataset_df in (
        (cptac_prot_mut_df, cptac_prot_wt_df, cptac_df),
        (ccle_prot_mut_df, ccle_prot_wt_df, ccle_df),
    )
)

In [54]:
#Write the last-aa annotation of each dataset to a tab-separated file
for aa_out_df, out_path in (
    (cptac_aa_df, 'last_aa_cptac.tsv'),
    (ccle_aa_df, 'last_aa_ccle.tsv'),
):
    aa_out_df.to_csv(out_path, sep='\t', header=True, index=False)