In [1]:
import pandas as pd 
import numpy as np
from scipy import stats
import random
import statsmodels.stats.multitest as multi
import matplotlib.pyplot as plt

In [8]:
# Inputs
#   Census_all.tsv       - Cancer Gene Census (CGC); download from https://cancer.sanger.ac.uk/census
#   all_gene_targets.txt - every known alias for the MutSig gene names
#   sig_genes.txt        - MutSig per-gene results (columns used here: 'gene', 'q')
cgc=pd.read_csv("Census_all.tsv",sep='\t')
gene_key=pd.read_csv("all_gene_targets.txt",sep='\t')
mutsig_table=pd.read_csv("sig_genes.txt",sep='\t')

# Every gene MutSig reported on, and the subset with a q-value below 0.1.
expression_genes=mutsig_table['gene']
sig_genes=mutsig_table.loc[mutsig_table['q']<0.1,'gene']

# Restrict the alias table to genes MutSig reported on and renumber its rows.
gene_key=gene_key[gene_key['gene'].isin(expression_genes)].reset_index(drop=True)

# A gene_key row matches the CGC when any of its symbol columns holds a CGC gene symbol.
cgc_symbols=cgc['Gene Symbol']
symbol_columns=['gene','Hugo_Symbol','Approved name','Alias symbol','Previous symbol']
row_matches_cgc=gene_key[symbol_columns[0]].isin(cgc_symbols)
for symbol_column in symbol_columns[1:]:
    row_matches_cgc=row_matches_cgc|gene_key[symbol_column].isin(cgc_symbols)

# Number of distinct significant genes with at least one CGC-matching row.
row_is_significant=gene_key['gene'].isin(sig_genes)
sig_cgc=gene_key.loc[row_is_significant&row_matches_cgc,'gene'].nunique(dropna=False)

# Observed fraction of significant genes found in the CGC; the permutation test compares against it.
cutoff=sig_cgc/sig_genes.shape[0]

n_exceed=0
sim_range=np.arange(0,1001)

# NOTE(review): chip_whitelist is not used by the permutation test below; it is kept because
# other cells may read it. The path is specific to one machine.
chip_whitelist=pd.read_csv("~/Documents/for_abhishek/whitelists/Sidd_chip_gene_list.txt",sep='\t')

#permutation test: repeatedly draw a random gene set with as many genes as there are
#significantly mutated genes and count how often its CGC membership ratio is at least as
#large as the observed ratio (cutoff)

# Fixed seed so that a fresh kernel reproduces the same p-value.
PERMUTATION_SEED=0
perm_rng=np.random.default_rng(PERMUTATION_SEED)

# Loop-invariant work, computed once: the genes that can be drawn, the genes with at least one
# symbol column matching a CGC gene symbol, and the size of each simulated gene set.
cgc_symbols=cgc['Gene Symbol']
candidate_genes=gene_key['gene'].unique()
cgc_member_genes=gene_key.loc[gene_key['gene'].isin(cgc_symbols)|
                              gene_key['Hugo_Symbol'].isin(cgc_symbols)|
                              gene_key['Approved name'].isin(cgc_symbols)|
                              gene_key['Alias symbol'].isin(cgc_symbols)|
                              gene_key['Previous symbol'].isin(cgc_symbols),'gene'].unique()
n_sampled=len(sig_genes.index)

# Simulated ratios are collected instead of printed one per iteration.
sim_ratios=[]
for run in sim_range:
    # replace=False: a gene set cannot contain the same gene twice. Sampling with replacement
    # shrinks the effective set and biases the simulated ratio downwards.
    gene_set=perm_rng.choice(candidate_genes,size=n_sampled,replace=False)

    sim_cgc=int(pd.Index(gene_set).isin(cgc_member_genes).sum())

    membership_ratio=sim_cgc/n_sampled
    sim_ratios.append(membership_ratio)

    # ">=" so that ties with the observed ratio count towards the empirical p-value.
    if membership_ratio>=cutoff:
        n_exceed+=1

p_exceed=n_exceed/len(sim_range)

print([n_exceed,p_exceed])

#calculate fisher's exact test to identify if the number of sig genes also found in the CGC is at a 
#higher rate than expected

# A gene_key row matches the CGC when any of its symbol columns holds a CGC gene symbol.
cgc_symbols=cgc['Gene Symbol']
row_in_cgc=(gene_key['gene'].isin(cgc_symbols)|
            gene_key['Hugo_Symbol'].isin(cgc_symbols)|
            gene_key['Approved name'].isin(cgc_symbols)|
            gene_key['Alias symbol'].isin(cgc_symbols)|
            gene_key['Previous symbol'].isin(cgc_symbols))

# Classify each distinct gene once. Working per gene rather than per alias row matters for the
# "not in CGC" cells: a CGC gene usually also has alias rows that do not match, so negating the
# row mask would count it as a non-CGC gene as well.
universe_genes=gene_key['gene'].drop_duplicates()
gene_is_sig=universe_genes.isin(sig_genes)
gene_is_cgc=universe_genes.isin(gene_key.loc[row_in_cgc,'gene'])

# 2x2 contingency table over the genes in gene_key; the four cells partition that universe.
#                       significant    not significant
#   in CGC              sig_cgc        sig_c_cgc
#   not in CGC          cgc_c_sig      cgc_c_sig_c
sig_cgc=int((gene_is_sig&gene_is_cgc).sum())
sig_c_cgc=int((~gene_is_sig&gene_is_cgc).sum())
# Computed from the data instead of a hard-coded count.
cgc_c_sig=int((gene_is_sig&~gene_is_cgc).sum())
# Genes that are neither significant nor in the CGC.
cgc_c_sig_c=int((~gene_is_sig&~gene_is_cgc).sum())

# One-sided test: is the odds ratio of the table above greater than 1?
oddsratio, pvalue = stats.fisher_exact([[sig_cgc, sig_c_cgc], [cgc_c_sig, cgc_c_sig_c]],alternative='greater')

print([sig_cgc,sig_c_cgc,cgc_c_sig,cgc_c_sig_c])
print([oddsratio,pvalue])

140376
Index(['Gene Symbol', 'Name', 'Entrez GeneId', 'Genome Location', 'Tier',
       'Hallmark', 'Chr Band', 'Somatic', 'Germline', 'Tumour Types(Somatic)',
       'Tumour Types(Germline)', 'Cancer Syndrome', 'Tissue Type',
       'Molecular Genetics', 'Role in Cancer', 'Mutation Types',
       'Translocation Partner', 'Other Germline Mut', 'Other Syndrome',
       'Synonyms'],
      dtype='object')
    Gene Symbol                              Tumour Types(Somatic)
34        ASXL1                                          MDS, CMML
38          ATM                                              T-PLL
92          CBL                                     AML, JMML, MDS
124       CEP89                                    Spitzoid tumour
155       CSF1R             MDS, CML, AML, hemangioblastoma, CCRCC
185      DNMT3A                                                AML
230       FANCA                                                NaN
317        IDH2                                       glio