Find which genes or proteins are in the Cancer Gene Census (CGC), are targets of FDA-approved drugs, and are detected in CysDB, using Cancer Gene Census and Human Protein Atlas annotations

In [None]:
import os, sys
import numpy as np
import pandas as pd
import csv
import matplotlib
import math
import string

In [None]:
def get_separated_df(df, xvar, new_xvar):
    """Split a comma-separated column into one row per item.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is NOT modified; a copy is worked on instead, so
        filtered slices can be passed safely.
    xvar : str
        Name of the column holding comma-separated strings.
    new_xvar : str
        Name of the column that receives the individual, whitespace-stripped
        items.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column plus ``new_xvar``; rows are
        repeated once per item (the original index labels are repeated too).
        Missing values in ``xvar`` come out as the literal string ``"nan"``
        because every item is passed through ``str()``.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    out = df.copy()
    out[new_xvar] = out[xvar].str.split(',')
    out = out.explode(new_xvar)
    # str() turns NaN into "nan"; callers filter on that exact string.
    out[new_xvar] = out[new_xvar].map(lambda item: str(item).strip())

    return out

In [None]:
# Current working directory; the relative input/output file names used in
# this notebook resolve against it.
cd = os.getcwd()
# Bare expression: echoes the path when run as a notebook cell.
cd

In [None]:
# Date stamp used as the prefix of the exported CSV file name.
date = '220916'

# Read CysDB IDs

In [None]:
c_df = pd.read_excel('Table_S1.xlsx', sheet_name = 'Fig1D')

In [None]:
p_df = pd.read_excel('Table_S1.xlsx', sheet_name = 'Fig1C')

In [None]:
id_c_df = c_df[c_df['identified'] == True]
id_c_ids = id_c_df['proteinid']

In [None]:
lig_c_df = c_df[c_df['ligandable'] == True]
lig_c_ids = lig_c_df['proteinid']

In [None]:
rxt_c_df = c_df[c_df['hyperreactive'] == True]
rxt_c_ids = rxt_c_df['proteinid']

In [None]:
id_p_df = p_df[p_df['identified'] == 'yes']
id_p_ids = id_p_df['proteinid']

In [None]:
lig_p_df = p_df[p_df['ligandable'] == 'yes']
lig_p_ids = lig_p_df['proteinid']

In [None]:
rxt_p_df = p_df[p_df['hyperreactive'] == 'yes']
rxt_p_ids = rxt_p_df['proteinid']

# Read Cancer Gene Census

In [None]:
# https://cancer.sanger.ac.uk/census
cgc_df = pd.read_csv('220903_cgc_dataset.csv')

# Merge CysDB IDs and CGC

In [None]:
cysdb_cgc_df = pd.merge(cgc_df, p_df[['proteinid', 'identified', 'ligandable', 'hyperreactive']], left_on = 'Uniprot_Mapped', right_on = 'proteinid', how = 'left')

In [None]:
id_cgc_df = cysdb_cgc_df[cysdb_cgc_df['identified'] == 'yes']

In [None]:
lig_cgc_df = cysdb_cgc_df[cysdb_cgc_df['ligandable'] == 'yes']

In [None]:
rxt_cgc_df = cysdb_cgc_df[cysdb_cgc_df['hyperreactive'] == 'yes']

In [None]:
id_cgc_som_df = get_separated_df(id_cgc_df, 'Tumour Types(Somatic)', 'Somatic')

In [None]:
cgc_som_df = get_separated_df(cgc_df, 'Tumour Types(Somatic)', 'Somatic')

# Not FDA Targeted but in CGC and Identified in CysDB

In [None]:
fda_cgc_df = cysdb_cgc_df[cysdb_cgc_df['FDA'] == 'yes']
fda_cgc_som_df = get_separated_df(fda_cgc_df, 'Tumour Types(Somatic)', 'Somatic')

In [None]:
no_fda_cgc_df = cysdb_cgc_df[cysdb_cgc_df['FDA'] != 'yes']
no_fda_cgc_som_df = get_separated_df(no_fda_cgc_df, 'Tumour Types(Somatic)', 'Somatic')

In [None]:
def get_mutation_df(df, xvar):
    """Tabulate how often each value of ``df[xvar]`` occurs.

    Returns a fresh DataFrame with two columns: ``xvar`` (the distinct
    values, in ``value_counts`` order, i.e. most frequent first) and
    ``'count'`` (the number of rows carrying each value).
    """
    tally = df[xvar].value_counts()

    table = pd.DataFrame()
    table[xvar] = list(tally.index)
    table['count'] = list(tally)

    return table

In [None]:
mutation_df = get_mutation_df(cgc_som_df[cgc_som_df['Somatic'] != "nan"], 'Somatic')

In [None]:
id_mutation_df = id_mutation_df.rename(columns = {'count': 'id_count'})

In [None]:
lig_mutation_df = lig_mutation_df.rename(columns = {'count': 'lig_count'})

In [None]:
rxt_mutation_df = rxt_mutation_df.rename(columns = {'count': 'rxt_count'})

In [None]:
merged_mutation_df = mutation_df.merge(id_mutation_df, on = 'Somatic')

In [None]:
merged_mutation_df = merged_mutation_df.merge(lig_mutation_df, on = 'Somatic')

In [None]:
merged_mutation_df = merged_mutation_df.merge(rxt_mutation_df, on = 'Somatic')

In [None]:
# create new dataframe with columns: Cancer, CGC_FDA, CysDB_LIG & CysDB_ID

def get_vals(fda_df, no_fda_df, cancers=None):
    """Build the per-tumour-type count table used for the stacked bar chart.

    Parameters
    ----------
    fda_df : pandas.DataFrame
        Rows for FDA-targeted proteins; must contain the columns
        ``'Somatic'`` and ``'Uniprot_Mapped'``.
    no_fda_df : pandas.DataFrame
        Rows for proteins that are not FDA-targeted; must contain
        ``'Somatic'``, ``'Uniprot_Mapped'``, ``'ligandable'`` and
        ``'identified'``.
    cancers : sequence of str, optional
        Tumour types to tabulate, matched exactly against ``'Somatic'``.
        Defaults to the notebook-level ``top_cancers`` list.

    Returns
    -------
    pandas.DataFrame
        One row per tumour type with the columns:

        * ``'Cancer'``    - the tumour type;
        * ``'CGC_FDA'``   - unique ``Uniprot_Mapped`` values in ``fda_df``;
        * ``'CysDB_LIG'`` - unique non-FDA proteins with ``ligandable == 'yes'``;
        * ``'CysDB_ID'``  - unique non-FDA proteins with ``identified == 'yes'``
          minus the ligandable count.
    """
    if cancers is None:
        # Backward compatible: the original read this notebook-level list.
        cancers = top_cancers
    cancers = list(cancers)

    fda = []
    no_fda_lig = []
    no_fda_lig_id = []

    for cancer in cancers:
        cancer_fda_df = fda_df[fda_df['Somatic'] == cancer]
        cancer_no_fda_df = no_fda_df[no_fda_df['Somatic'] == cancer]

        cancer_lig_df = cancer_no_fda_df[cancer_no_fda_df['ligandable'] == 'yes']
        cancer_id_df = cancer_no_fda_df[cancer_no_fda_df['identified'] == 'yes']

        # len(unique()) rather than nunique(): a missing accession still
        # counts as one value, exactly as in the original implementation.
        fda_count = len(cancer_fda_df['Uniprot_Mapped'].unique())
        lig_count = len(cancer_lig_df['Uniprot_Mapped'].unique())
        id_total = len(cancer_id_df['Uniprot_Mapped'].unique())

        fda.append(fda_count)
        no_fda_lig.append(lig_count)
        # NOTE(review): assumes every ligandable protein is also flagged as
        # identified; otherwise this difference can go negative -- confirm.
        no_fda_lig_id.append(id_total - lig_count)

    return pd.DataFrame({
        'Cancer': cancers,
        'CGC_FDA': fda,
        'CysDB_LIG': no_fda_lig,
        'CysDB_ID': no_fda_lig_id,
    })

In [None]:
# Tumour types to tabulate; get_vals reads this notebook-level list and
# matches each entry exactly against the 'Somatic' column.
top_cancers = ['AML', 'melanoma', 'NSCLC', 'T-ALL', 'colorectal']

In [None]:
# One row per entry of top_cancers with CGC_FDA / CysDB_LIG / CysDB_ID counts.
stacked_df = get_vals(fda_cgc_som_df, no_fda_cgc_som_df)

In [None]:
# totals of each row as column
# NOTE(review): hard-coded and positional -- the order must match top_cancers.
# The source of these numbers is not shown in this file (presumably the
# per-tumour-type CGC counts from mutation_df) -- confirm before re-running
# with a newer CGC export.
stacked_df['total'] = [74, 43, 25, 30, 27]

In [None]:
# Remainder of each total after removing the CysDB-covered, non-FDA proteins.
# NOTE(review): CGC_FDA is not subtracted here -- confirm that is intended.
stacked_df['CGC_Other'] = stacked_df['total'] - (stacked_df['CysDB_ID'] + stacked_df['CysDB_LIG'])

In [None]:
# Written to the working directory as '<date>_cgc_fda_cysdb.csv', without the index.
stacked_df.to_csv(date + '_cgc_fda_cysdb.csv', index = False)