# Relative mutation positions

In [1]:
import pandas as pd
from tqdm.notebook import trange, tqdm

## 1. Prepare CPTAC table

In [2]:
####################
#Load CPTAC dataset
####################

print('Load CPTAC datasets')

CPTAC = ['BRCA', 'CCRCC', 'COAD', 'GBM', 'LUAD', 'OV', 'UCEC']
cptac_df = pd.DataFrame(columns=['gene', 'cpct_aliquot', 'protein_expression', 'aliquot', 'sample',
       'Type', 'median', 'stdev', 'norm_protein_expression', 'log2fpkm',
       'log10fpkm', '#Uploaded_variation', 'Location', 'Allele', 'Feature',
       'protein_mutation', 'Protein_position', 'Amino_acids', 'Consequence',
       'Phenotype', 'Ubiquitinases_Mutated', 'Altered_E3_Ligases',
       'Raw_Residual', 'Stability_Change', 'ABS_Stability_Change'])
for cancer in tqdm(CPTAC, total=len(CPTAC),desc='Load CPTAC data'):
    df = pd.read_csv("/workspace/projects/cptac_analysis/data/"+cancer+"/dataset_irls.gz", sep='\t')
    Dataset = [cancer] * len(df)
    df['Dataset'] = Dataset
    frames = [cptac_df, df]
    cptac_df = pd.concat(frames)
cptac_df.drop_duplicates(keep='first',inplace=True) 

cptac_df['ID'] = cptac_df['gene'].astype(str)+cptac_df['Feature'].astype(str)+cptac_df['sample'].astype(str)+cptac_df['#Uploaded_variation'].astype(str)+cptac_df['Location'].astype(str)

######################################################################################################
#Calculate the mean of rna values for the repeated mutations (log2fpkm, log10fpkm, Stability_Change)
######################################################################################################

print('Eliminate duplicated rna measures')

dupl_samples = ['C3L-00908', 'C3N-00545', 'C3N-01825']

cptac_dupl_df = cptac_df[cptac_df['sample'].isin(dupl_samples)]
df1 = cptac_dupl_df[cptac_dupl_df['Feature'].isnull()].groupby(['gene','sample'],as_index=False).mean()
df2 = cptac_dupl_df[~cptac_dupl_df['Feature'].isnull()].groupby(['gene','Feature','#Uploaded_variation','Location'],as_index=False).mean()
cptac_dupl2_df = pd.concat([df1,df2])
cptac_dupl3_df = cptac_dupl_df.drop(['log2fpkm','log10fpkm','Raw_Residual','Stability_Change','ABS_Stability_Change'],axis=1)
cptac_dupl4_df = pd.merge(cptac_dupl3_df,cptac_dupl2_df,how='left')
cptac_dupl4_df.drop_duplicates(keep='first',inplace=True)
cptac2_df = cptac_df[~(cptac_df['sample'].isin(dupl_samples))]
cptac_df = pd.concat([cptac2_df,cptac_dupl4_df],ignore_index=True)

###########################################################
#Load cdegron table, merge and create cterm_degron column
###########################################################

print('Load cdegron table and merge')

cdegron_df = pd.read_csv(r'cdegron_wtnsfs_cptac.tsv', sep = '\t')
cptac_df = pd.merge(cptac_df,cdegron_df,how='left')

Load CPTAC datasets


HBox(children=(FloatProgress(value=0.0, description='Load CPTAC data', max=7.0, style=ProgressStyle(descriptio…


Eliminate duplicated rna measures
Load cdegron table and merge


## 2. Prepare CCLE table

In [5]:
###################
#Load CCLE dataset
####################

print('Load CCLE dataset')
ccle_df = pd.read_csv("/workspace/projects/cptac_analysis/data/CCLE/dataset_irls.gz", sep='\t')

ccle_df['ID'] = ccle_df['gene'].astype(str)+ccle_df['Feature'].astype(str)+ccle_df['sample'].astype(str)+ccle_df['#Uploaded_variation'].astype(str)+ccle_df['Location'].astype(str)

#############################################################################################################################
#Calculate the mean of protein values for the repeated mutations (protein_expression, norm_protein_expression, Raw_Residual, Stability_Change, ABS_Stability_Change)
#############################################################################################################################

print('Eliminate duplicated protein measures')

df1 = ccle_df[ccle_df['Feature'].isnull()].groupby(['gene','sample','Phenotype'],as_index=False).mean()
df2 = ccle_df[~ccle_df['Feature'].isnull()].groupby(['gene','Feature','sample','#Uploaded_variation','Location','Phenotype'],as_index=False).mean()
ccle2_df = pd.concat([df1,df2])
ccle3_df = ccle_df.drop(['protein_expression', 'norm_protein_expression', 'Raw_Residual', 'Stability_Change', 'ABS_Stability_Change'],axis=1)
ccle4_df = pd.merge(ccle3_df,ccle2_df,how='left')
ccle_df = ccle4_df.drop_duplicates(keep='first')

################################################################
#Import cterm degrons table, merge and add cterm_degron column
################################################################

print('Load cdegron table and merge')

cdegron_df = pd.read_csv(r'cdegron_wtnsfs_ccle.tsv', sep = '\t')
ccle_df = pd.merge(ccle_df,cdegron_df,how='left')

Load CCLE dataset
Eliminate duplicated protein measures
Load cdegron table and merge


## 3. Get relative mutation position

In [30]:
#Get relative mutation positions for CPTAC dataset (takes a lot of time, 1-5h)
df = cptac_df
phenotypes = ['stop_gained','frameshift_variant','missense_variant','synonymous_variant']
genes_mut_df = df[['gene']][(~df['prot_seq'].isnull())&(df['Phenotype'].isin(phenotypes))]
genes_mut_df = genes_mut_df.drop_duplicates(keep='first')
genes_mut_list = genes_mut_df['gene'].to_list()

#Prepare df all mutations
cptac_prots_mut_df = pd.DataFrame()
for prot in tqdm(genes_mut_list):
    prot_df = df[['ID','Phenotype','prot_seq','Protein_position','Stability_Change','gene']][df['gene']==prot]
    prot_df = prot_df.drop_duplicates(subset=['ID'])
    prot_wt_df = prot_df[prot_df['Phenotype']=='WT']
    prot_wt_list = prot_wt_df.values.tolist()

    prot_mut_df = prot_df[prot_df['Phenotype'].isin(phenotypes)]
    prot_mut_list = prot_mut_df.values.tolist()

    #Distance to cterm from end of sequence

    prot_length = len(prot_wt_list[0][2])

    pos_list = prot_mut_df['Protein_position'].tolist()
    pos1_list = []
    for pos in pos_list:
        pos1 = pos.split('-')[0]
        pos1 = int(pos1)/prot_length
        pos1_list.append(pos1)
    pos1_list
    prot_mut_df['Protein_position2'] = pos1_list

    cptac_prots_mut_df = pd.concat([cptac_prots_mut_df,prot_mut_df], ignore_index=True)

HBox(children=(FloatProgress(value=0.0, max=6421.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





In [34]:
#Get relative mutation positions for CCLE dataset (takes a lot of time, 1-5h)
df = ccle_df
phenotypes = ['stop_gained','frameshift_variant','missense_variant','synonymous_variant']
genes_mut_df = df[['gene']][(~df['prot_seq'].isnull())&(df['Phenotype'].isin(phenotypes))]
genes_mut_df = genes_mut_df.drop_duplicates(keep='first')
genes_mut_list = genes_mut_df['gene'].to_list()

#Prepare df all mutations
ccle_prots_mut_df = pd.DataFrame()
for prot in tqdm(genes_mut_list):
    prot_df = df[['ID','Phenotype','prot_seq','Protein_position','Stability_Change','gene']][df['gene']==prot]
    prot_df = prot_df.drop_duplicates(subset=['ID'])
    prot_wt_df = prot_df[prot_df['Phenotype']=='WT']
    prot_wt_list = prot_wt_df.values.tolist()

    prot_mut_df = prot_df[prot_df['Phenotype'].isin(phenotypes)]
    prot_mut_list = prot_mut_df.values.tolist()

    #Distance to cterm from end of sequence

    prot_length = len(prot_wt_list[0][2])

    pos_list = prot_mut_df['Protein_position'].tolist()
    pos1_list = []
    for pos in pos_list:
        pos1 = pos.split('-')[0]
        pos1 = int(pos1)/prot_length
        pos1_list.append(pos1)
    pos1_list
    prot_mut_df['Protein_position2'] = pos1_list

    ccle_prots_mut_df = pd.concat([ccle_prots_mut_df,prot_mut_df], ignore_index=True)

HBox(children=(FloatProgress(value=0.0, max=5253.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





In [35]:
#Save tables with relative mutation positions
cptac_prots_mut_df.to_csv(r'pos_rel_cptac.tsv', header = True, index = None, sep = '\t')
ccle_prots_mut_df.to_csv(r'pos_rel_ccle.tsv', header = True, index = None, sep = '\t')