In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import numpy
import re
import math
import scipy.stats
from collections import Counter
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.multitest import fdrcorrection

# Locate the project root relative to where the notebook is running, then make the
# shared utility scripts importable.
# re.sub treats notebookPath as a regular expression and removes every occurrence of it
# from the working directory; if the notebook is launched from a directory that does not
# contain 'scripts/figure2', projectDir is simply the current working directory.
notebookPath = 'scripts/figure2'
projectDir = re.sub(notebookPath, '', os.getcwd())
sys.path.append(os.path.join(projectDir, 'scripts/utilityScripts'))

import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import clonality_analysis_util
import get_gene_and_cohort_list_utils
import configuration_util

# Lookup table of input file locations; every cell below reads its inputs through keys of
# this dict (e.g. 'IMPACT_BASE_MAF', 'CANCER_TYPE_INFO') instead of hard-coded paths.
filePathDict = configuration_util.get_all_files_path_dict()

In [2]:
#Set where to write the files
# Every panel table produced by this notebook (figure_2a.tsv ... figure_2e.tsv) is written
# into this directory, which is resolved relative to projectDir.
writeDir = os.path.join(projectDir, 'scripts/figure2/FIGURE2_PLOTTING_FILES/plotDataFiles/')

## Figure 2A
Type of driver mutations found in hypermutated tumors

In [3]:
#makes a dataframe of counts of oncogenic mutations
def make_counts_df(maf, dominantSignatureDict):
    """Count oncogenic mutations per sample, split by driver class.

    Only rows of ``maf`` with a non-null 'oncogenic' value are counted. Each of those
    rows is assigned to a class:

    * 'TsgTruncating' -- gene in the TSG list and a frameshift/nonsense consequence
    * 'TsgMissense'   -- gene in the TSG list and any other consequence
    * 'Oncogene'      -- gene in the oncogene list

    Parameters
    ----------
    maf : pandas.DataFrame
        Must provide the columns 'Tumor_Sample_Barcode', 'Hugo_Symbol',
        'Variant_Classification', 'oncogenic' and 'cancerType'.
    dominantSignatureDict : mapping
        Sample barcode -> dominant signature. Samples missing from the mapping get None.

    Returns
    -------
    pandas.DataFrame
        One row per distinct sample in ``maf`` (samples with no counted mutation get 0)
        with columns 'Tumor_Sample_Barcode', 'TsgTruncating', 'Oncogene', 'TsgMissense',
        'cancerType' and 'dominantSignature'.
    """
    truncatingConsequences = set(['Frame_Shift_Del', 'Frame_Shift_Ins', 'Nonsense_Mutation'])
    # The gene lists are the same for every mutation type, so fetch them once.
    oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()

    # If a sample appears with several cancer types the last row wins (dict(zip(...))).
    cancerTypeDict = dict(zip(maf['Tumor_Sample_Barcode'], maf['cancerType']))
    allIds = set(maf['Tumor_Sample_Barcode'])
    oncogenicMaf = maf[maf['oncogenic'].notnull()]

    def summarize_counts_for_mutation_type(oMaf, mutationType):
        """Return {sample: number of rows of ``oMaf`` in the given class} for all samples."""
        isTsg = oMaf['Hugo_Symbol'].isin(tsgs)
        isTruncating = oMaf['Variant_Classification'].isin(truncatingConsequences)

        if mutationType == 'tsgTrunc':
            sMaf = oMaf[isTsg & isTruncating]
        elif mutationType == 'oncogene':
            sMaf = oMaf[oMaf['Hugo_Symbol'].isin(oncogenes)]
        elif mutationType == 'tsgMissense':
            sMaf = oMaf[isTsg & ~isTruncating]
        else:
            # Fail loudly instead of crashing later on ``None['Tumor_Sample_Barcode']``.
            raise ValueError('unknown mutationType: %r' % (mutationType,))

        # Start every sample at zero so samples without a matching mutation are kept.
        counts = dict.fromkeys(allIds, 0)
        counts.update(Counter(sMaf['Tumor_Sample_Barcode']))
        return counts

    tsgTruncatingCounts = summarize_counts_for_mutation_type(oncogenicMaf, 'tsgTrunc')
    oncogenicCounts = summarize_counts_for_mutation_type(oncogenicMaf, 'oncogene')
    tsgMissenseCounts = summarize_counts_for_mutation_type(oncogenicMaf, 'tsgMissense')

    listOfDicts = [
        {'Tumor_Sample_Barcode': case,
         'TsgTruncating': tsgTruncatingCounts[case],
         'Oncogene': oncogenicCounts[case],
         'TsgMissense': tsgMissenseCounts[case],
         'cancerType': cancerTypeDict[case],
         'dominantSignature': dominantSignatureDict[case] if case in dominantSignatureDict else None}
        for case in allIds
    ]
    return pd.DataFrame(listOfDicts)

def sample_normal_maf(normalCounts, hyperCounts, N=25, randomState=None):
    """Resample non-hypermutated samples to mirror the hypermutated cancer-type mix.

    Each row of ``normalCounts`` gets the weight
    ``(# hypermutated rows of its cancer type) / (# normal rows of its cancer type)``,
    or 0 when its cancer type does not occur in ``hyperCounts``. Rows are then drawn
    with replacement using those weights.

    Parameters
    ----------
    normalCounts : pandas.DataFrame
        Per-sample table with a 'cancerType' column. It is NOT modified; the weights are
        attached to a copy.
    hyperCounts : pandas.DataFrame
        Per-sample table with a 'cancerType' column.
    N : number, default 25
        Passed to ``DataFrame.sample`` as ``frac``, i.e. the result has about
        ``N * len(normalCounts)`` rows.
    randomState : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample(random_state=...)``. The default None keeps the
        previous unseeded behaviour; pass a value for reproducible output.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, including the 'hypermutatedCount' weight column.
    """
    hyperCancerTypeCounter = Counter(hyperCounts['cancerType'])
    normalCancerTypeCounter = Counter(normalCounts['cancerType'])

    # Work on a copy so the caller's frame does not silently gain a column.
    weightedCounts = normalCounts.copy()
    weightedCounts['hypermutatedCount'] = weightedCounts['cancerType'].apply(
        lambda x: 1.0*hyperCancerTypeCounter[x]/normalCancerTypeCounter[x]
        if x in hyperCancerTypeCounter else 0)

    sampledCounts = weightedCounts.sample(frac=N, weights='hypermutatedCount',
                                          replace=True, random_state=randomState)
    return sampledCounts
   
def create_summary_df(df):
    """Summarise what fraction of counted mutations falls in each mutation class.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table (as produced by ``pd.melt``) with columns 'variable'
        (mutation class), 'value' (count) and 'burdenType'. Rows whose burdenType is
        'hypermutated' form one group; all other rows form the 'non-hypermutated' group.

    Returns
    -------
    pandas.DataFrame
        Two rows per mutation class (one per group) with columns 'frac', 'burdenType'
        and 'mutType'. 'frac' is the class total divided by the group total; it is NaN
        when the group total is zero.
    """
    isHypermutated = df['burdenType'] == 'hypermutated'
    # The group split and group totals do not depend on the class, so compute them once.
    groups = [('hypermutated', df[isHypermutated]),
              ('non-hypermutated', df[~isHypermutated])]
    groupTotals = dict((burdenType, sum(data['value'])) for burdenType, data in groups)

    listOfDicts = []
    # sorted() gives a stable row order instead of arbitrary set iteration order.
    for variable in sorted(set(df['variable'])):
        for burdenType, data in groups:
            classTotal = sum(data[data['variable'] == variable]['value'])
            groupTotal = groupTotals[burdenType]
            # The previous expression `classTotal/1.0*groupTotal` parsed as
            # (classTotal/1.0)*groupTotal, i.e. a product rather than a fraction.
            frac = float(classTotal)/float(groupTotal) if groupTotal else float('nan')
            listOfDicts.append({'frac': frac, 'burdenType': burdenType, 'mutType': variable})
    summaryDf = pd.DataFrame(listOfDicts)
    return summaryDf


In [4]:
# Load the IMPACT MAF together with the per-sample annotations used for figure 2A.
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
dominantSignatureDict = get_gene_and_cohort_list_utils.get_pan_impact_signature_mapping(
    filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
allHypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
allNormalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO'])

# Samples without an entry in cancerTypeDict get cancerType None.
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
hypermutatedMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(allHypermutantIds)]
nonHypermutatedMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(allNormalIds)]

#Use sampling to get dataframes that match the cancer type distributions
# NOTE: sample_normal_maf draws at random without a fixed seed, so the sampled rows
# (and therefore figure_2a.tsv) differ from run to run.
dfCountsNormal = make_counts_df(nonHypermutatedMaf, dominantSignatureDict)
dfCountsHypermutated = make_counts_df(hypermutatedMaf, dominantSignatureDict)
dfCountsNormalSampled = sample_normal_maf(dfCountsNormal, dfCountsHypermutated)

dfCountsHypermutated['burdenType'] = 'hypermutated'
dfCountsNormalSampled['burdenType'] = 'normal'
# The sampled frame carries an extra 'hypermutatedCount' column, so the two frames are
# not column-aligned. sort=False avoids the pandas FutureWarning about implicit column
# sorting; column order is irrelevant here because melt selects columns by name.
dfCombined = pd.concat([dfCountsHypermutated, dfCountsNormalSampled], sort=False)

meltedDf = pd.melt(dfCombined, id_vars=['Tumor_Sample_Barcode', 'burdenType'], value_vars=['Oncogene', 'TsgMissense', 'TsgTruncating'])
summaryDf = create_summary_df(meltedDf)

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)
  sigsDf = pd.read_table(impactSigsPath)
  df = pd.read_table(path)
  impactCancerTypeDf = pd.read_table(impactCancerTypeInfoPath)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [11]:
# Write the figure 2A summary (summaryDf from create_summary_df) as a tab-separated
# file without the index column.
writePath = os.path.join(writeDir, 'figure_2a.tsv')
summaryDf.to_csv(writePath, index=False, sep='\t')




## Figure 2B
Driver mutations in related vs unrelated genes

In [7]:
def summarize_related_unrelated_driver_frac(maf, relatedGenesD, tmbDict, relatedGeneSizeDict):
    """Per sample, count total and driver mutations in related vs. unrelated genes.

    A gene is "related" for a sample when it is in ``relatedGenesD[cancerType]``; a
    mutation is a "driver" when its 'oncogenic' value is non-null. Samples whose cancer
    type is not a key of ``relatedGeneSizeDict`` are skipped.

    Parameters
    ----------
    maf : pandas.DataFrame
        Must provide 'Tumor_Sample_Barcode', 'cancerType', 'Hugo_Symbol' and 'oncogenic'.
        The cancer type of a sample is read from its first row.
    relatedGenesD : mapping
        Cancer type -> collection of related gene symbols.
    tmbDict : mapping
        Sample barcode -> TMB; samples missing from it get TMB None.
    relatedGeneSizeDict : mapping
        Cancer type -> total size of that cancer type's related genes.

    Returns
    -------
    pandas.DataFrame
        One row per retained sample with columns 'Tumor_Sample_Barcode', 'cancerType',
        'TMB', 'nTotalRelated', 'nRelatedDriver', 'nTotalUnrelated', 'nUnrelatedDrivers'
        and 'relatedGeneSize'. Counts are floats. (The 'nRelatedDriver' /
        'nUnrelatedDrivers' spelling mismatch is kept because callers use these names.)
    """
    listOfDicts = []
    # One groupby pass instead of re-filtering the whole MAF once per sample.
    for case, caseMaf in maf.groupby('Tumor_Sample_Barcode'):
        cancerType = caseMaf['cancerType'].iloc[0]
        if cancerType not in relatedGeneSizeDict:
            continue

        relatedGeneSize = relatedGeneSizeDict[cancerType]
        tmb = tmbDict[case] if case in tmbDict else None #todo get the real TMB
        relatedGenes = relatedGenesD[cancerType]

        isRelated = caseMaf['Hugo_Symbol'].isin(relatedGenes)
        isDriver = caseMaf['oncogenic'].notnull()

        listOfDicts.append({'Tumor_Sample_Barcode': case, 'cancerType': cancerType,
                            'TMB': tmb,
                            'nTotalRelated': 1.0*caseMaf[isRelated].shape[0],
                            'nRelatedDriver': 1.0*caseMaf[isRelated & isDriver].shape[0],
                            'nTotalUnrelated': 1.0*caseMaf[~isRelated].shape[0],
                            'nUnrelatedDrivers': 1.0*caseMaf[~isRelated & isDriver].shape[0],
                            'relatedGeneSize': relatedGeneSize
                           })
    df = pd.DataFrame(listOfDicts)
    return df

def summarize_gene_type_lengths(geneLengthDict, geneTypes):
    """Sum the gene lengths of each gene group.

    Parameters
    ----------
    geneLengthDict : dict
        Gene symbol -> length. Genes missing from it contribute 0.
    geneTypes : dict
        Group name -> iterable of gene symbols.

    Returns
    -------
    dict
        Group name -> summed length of the genes in that group.
    """
    d = {}
    for geneType, genes in geneTypes.items():
        # Single-argument print(...) emits the same text on Python 2 and Python 3.
        print('summarizing %s' % (geneType,))
        d[geneType] = sum(geneLengthDict.get(gene, 0) for gene in genes)
    return d

In [8]:
#add related gene size information

allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
allHypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# .copy() makes the hypermutated subset an independent frame, so adding the cancerType
# column below no longer raises pandas' SettingWithCopyWarning.
hypermutationMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(allHypermutantIds)].copy()
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO'])
hypermutationMaf['cancerType'] = hypermutationMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

relatedGenesDict = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type(thresh = 1.0/30.0, impactMafPath = filePathDict['IMPACT_BASE_MAF'])

geneLengthInfo = pd.read_table(filePathDict['GENE_LENGTH_INFO'])
geneLengthDict = dict(zip(geneLengthInfo['hgnc_symbol'], geneLengthInfo['nt.length']))

#sum the lengths of each cancer type's related genes
geneTypeSizeDict = summarize_gene_type_lengths(geneLengthDict, relatedGenesDict)

tmbDict = get_gene_and_cohort_list_utils.get_all_tmb_info(tmbFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'])
df = summarize_related_unrelated_driver_frac(hypermutationMaf, relatedGenesDict, tmbDict, geneTypeSizeDict)


  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
  allImpactMutsMaf = pd.read_table(impactMafPath) #todo get this in a better way
  if self.run_code(code, result):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  oncMaf['caseGene'] = oncMaf['Tumor_Sample_Barcode'] + '_' + oncMaf['Hugo_Symbol']
  # This is added back by InteractiveShellApp.init_path()


summarizing Rhabdoid Cancer
summarizing Prostate Cancer
summarizing Melanoma
summarizing Colorectal Cancer
summarizing Head and Neck Cancer
summarizing Ovarian Cancer
summarizing Blood Cancer, NOS
summarizing Hepatobiliary Cancer
summarizing Penile Cancer
summarizing Small Cell Lung Cancer
summarizing Gestational Trophoblastic Disease
summarizing Renal Cell Carcinoma
summarizing Soft Tissue Sarcoma
summarizing Mature T and NK Neoplasms
summarizing Salivary Gland Cancer
summarizing Cancer of Unknown Primary
summarizing Angiomatoid Fibrous Histiocytoma
summarizing Pancreatic Cancer
summarizing Embryonal Tumor
summarizing Pheochromocytoma
summarizing Thyroid Cancer
summarizing Parathyroid Cancer
summarizing Breast Sarcoma
summarizing Mesothelioma
summarizing Nerve Sheath Tumor
summarizing Hodgkin Lymphoma
summarizing Uterine Sarcoma
summarizing Esophagogastric Cancer
summarizing Appendiceal Cancer
summarizing Soft Tissue Cancer
summarizing Primary CNS Melanocytic Tumors
summarizing Anal C

  df = pd.read_table(tmbFilePath)


In [12]:
# Write the per-sample related/unrelated driver counts for figure 2B.
# `df` here is whatever the most recently executed cell assigned to that name; it is
# expected to be the output of summarize_related_unrelated_driver_frac.
writePath = os.path.join(writeDir, 'figure_2b.tsv')
df.to_csv(writePath, index=False, sep='\t')

In [14]:
# Two-sample Kolmogorov-Smirnov test comparing, across samples, the fraction of mutations
# that are drivers in related genes against the same fraction in unrelated genes.
# NOTE(review): samples with nTotalRelated == 0 or nTotalUnrelated == 0 yield NaN/inf
# fractions that are passed to the test unchanged -- confirm whether they should be dropped.
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('test fraction of drivers related and unrelated %s' % (scipy.stats.ks_2samp(
    np.array(df['nRelatedDriver']/df['nTotalRelated'])
    ,np.array(df['nUnrelatedDrivers']/df['nTotalUnrelated'])),))

 test fraction of drivers related and unrelated Ks_2sampResult(statistic=0.8989021043000915, pvalue=0.0)


## Figure 2C
Summarize the rate of mutation in MSI tumors in genes in distinct pathways

In [15]:
def summarize_mutations_by_pathway(hyperMaf, normalMaf, pathwayDf, pathways = [], cancerTypes = []):
    """Count, per cancer type / pathway gene / cohort, how many samples carry mutations.

    For every requested cancer type, every requested pathway and every gene of that
    pathway that is also in the IMPACT gene list, two rows are produced: one for the
    ``normalMaf`` cohort (mutBurden 'normal') and one for the ``hyperMaf`` cohort
    (mutBurden 'hyper').

    Parameters
    ----------
    hyperMaf, normalMaf : pandas.DataFrame
        Must provide 'cancerType', 'Tumor_Sample_Barcode', 'Hugo_Symbol',
        'Variant_Classification' and 'oncogenic'.
    pathwayDf : pandas.DataFrame
        Must provide 'Pathway' and 'Gene'.
    pathways, cancerTypes : list
        Pathway names / cancer types to summarise. The list defaults are never mutated.

    Returns
    -------
    pandas.DataFrame
        Columns: 'mutBurden', 'pathway', 'nTotal' (distinct samples of that cancer type
        in the cohort), 'cancerType', 'gene', 'nTruncTsg' (samples with a truncating
        mutation; None unless the gene is a TSG), 'nTruncOncogene' (same count; None if
        the gene is a TSG), 'nVus' (samples with a non-truncating mutation whose
        'oncogenic' value is null) and 'mutBurdenPathway' ('<mutBurden>_<pathway>').
        An empty frame with these columns is returned when nothing is requested.
    """
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    impactGenes = get_gene_and_cohort_list_utils.get_im6_genes()
    truncatingConsequences = set(['Frame_Shift_Del', 'Frame_Shift_Ins', 'Nonsense_Mutation'])

    def summarize_gene(geneMaf, mutBurden, pathway, cancerType, gene, nTotal):
        """Build one output row from the mutations of one gene in one cohort."""
        isTruncating = geneMaf['Variant_Classification'].isin(truncatingConsequences)
        nTruncCases = len(set(geneMaf[isTruncating]['Tumor_Sample_Barcode']))
        nVusCases = len(set(geneMaf[~isTruncating & geneMaf['oncogenic'].isnull()]['Tumor_Sample_Barcode']))
        isTsg = gene in tsgs
        return {'mutBurden': mutBurden, 'pathway': pathway, 'nTotal': nTotal,
                'cancerType': cancerType, 'gene': gene,
                'nTruncTsg': nTruncCases if isTsg else None,
                'nVus': nVusCases,
                'nTruncOncogene': nTruncCases if not isTsg else None}

    # A pathway's gene set does not depend on the cancer type, so resolve it once.
    pathwayGeneSets = [(pathway, set(pathwayDf[pathwayDf['Pathway'] == pathway]['Gene']) & impactGenes) #only include impact genes
                       for pathway in pathways]

    listOfDicts = []
    for cancerType in cancerTypes:
        cancerTypeHyper = hyperMaf[hyperMaf['cancerType'] == cancerType]
        cancerTypeNormal = normalMaf[normalMaf['cancerType'] == cancerType]
        nHyper = len(set(cancerTypeHyper['Tumor_Sample_Barcode']))
        nNormal = len(set(cancerTypeNormal['Tumor_Sample_Barcode']))
        for pathway, pathwayGenes in pathwayGeneSets:
            for gene in pathwayGenes:
                listOfDicts.append(summarize_gene(
                    cancerTypeNormal[cancerTypeNormal['Hugo_Symbol'] == gene],
                    'normal', pathway, cancerType, gene, nNormal))
                listOfDicts.append(summarize_gene(
                    cancerTypeHyper[cancerTypeHyper['Hugo_Symbol'] == gene],
                    'hyper', pathway, cancerType, gene, nHyper))

    if not listOfDicts:
        # pd.DataFrame([]) has no columns, so the 'mutBurden' lookup below used to raise
        # KeyError for empty requests; return a correctly-shaped empty frame instead.
        return pd.DataFrame(columns=['cancerType', 'gene', 'mutBurden', 'nTotal', 'nTruncOncogene',
                                     'nTruncTsg', 'nVus', 'pathway', 'mutBurdenPathway'])

    df= pd.DataFrame(listOfDicts)
    df['mutBurdenPathway'] = df['mutBurden'] + '_' + df['pathway']
    return df

In [16]:
pathwayData = pd.read_csv(filePathDict['CANCER_PATHWAY_DATA'])
#add INPPL1 and JAK1 to PI3K signaling class
# (appended once; this cell previously repeated the same concat further down, which
# duplicated both rows in pathwayData)
pathwayData = pd.concat([pathwayData, pd.DataFrame([{'Gene': 'INPPL1', 'Pathway': 'PI3K'}, {'Gene': 'JAK1', 'Pathway': 'PI3K'}])])

allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])

cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
# The "hyper" cohort for this panel is the set of MSI cases (msiScoreThresh=10).
msiCases = get_gene_and_cohort_list_utils.get_msi_cases(msiInfoFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'], msiScoreThresh=10)
allMsiCasesMaf = allImpactMutsMaf[(allImpactMutsMaf['Tumor_Sample_Barcode'].isin(msiCases))]

allNormalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(allNormalIds)]


df = summarize_mutations_by_pathway(allMsiCasesMaf, normalMaf, pathwayData, pathways = ['WNT', 'PI3K'],
                                cancerTypes = ['Colorectal Cancer', 'Endometrial Cancer'])


  """
  msiInfoDf = pd.read_table(msiInfoFilePath)


In [17]:
# Write the per-gene pathway mutation summary for figure 2C.
# `df` here is whatever the most recently executed cell assigned to that name; it is
# expected to be the output of summarize_mutations_by_pathway.
writePath = os.path.join(writeDir, 'figure_2c.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2D
Summarizing composite mutation

In [18]:
# Build the per-mutation table for figure 2D. Only rows with a non-null 'oncogenic'
# value are kept; each is then annotated with cancer type, dominant signature,
# related/not-related gene status, gene class, a relabelled hypermutation attribution
# and a composite-mutation flag.
mafWithMutationAttribution = pd.read_table(filePathDict['MAF_WITH_MUTATION_ATTRIBUTION'])
mafWithMutationAttribution = mafWithMutationAttribution[mafWithMutationAttribution['oncogenic'].notnull()]
tsgs = get_gene_and_cohort_list_utils.get_tsgs()
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO'])
# Samples without an entry in cancerTypeDict get cancerType None.
mafWithMutationAttribution['cancerType'] = mafWithMutationAttribution['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
relatedGenesDict = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type(thresh = 1.0/30.0, impactMafPath = filePathDict['IMPACT_BASE_MAF'])

dominantSignatureDict = get_gene_and_cohort_list_utils.get_pan_impact_signature_mapping(
    filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
mafWithMutationAttribution['dominantSignature'] = mafWithMutationAttribution['Tumor_Sample_Barcode'].apply(lambda x:
                            dominantSignatureDict[x] if x in dominantSignatureDict else None)
# 'related' when the gene is in the related-gene list of the sample's cancer type;
# cancer types absent from relatedGenesDict always give 'not-related'.
mafWithMutationAttribution['related'] = mafWithMutationAttribution.apply(lambda row:
                'related' if row['cancerType'] in relatedGenesDict and row['Hugo_Symbol'] in relatedGenesDict[row['cancerType']]
                           else 'not-related', axis=1)
truncatingConsequences = set(['Frame_Shift_Del', 'Frame_Shift_Ins', 'Nonsense_Mutation'])
# Any gene not in the TSG list is labelled 'oncogene' here; any non-truncating TSG
# mutation is labelled 'tsg_missense'.
mafWithMutationAttribution['geneType'] = mafWithMutationAttribution.apply(lambda row:
    'tsg_truncating' if row['Hugo_Symbol'] in tsgs and row['Variant_Classification'] in truncatingConsequences
    else 'tsg_missense' if row['Hugo_Symbol'] in tsgs
    else 'oncogene', axis=1)

# Replace the raw attribution codes with display labels; unrecognised codes become None.
mafWithMutationAttribution['hypermutationInduced'] = mafWithMutationAttribution['hypermutationInduced'].apply(lambda x:
    'Almost certain' if x == 'hyperInduced'
    else 'Possible' if x == 'unclear'
    else 'Unlikely' if x == 'notHyperAttributable'
           else None)


# A mutation is flagged composite (1) when its sample/gene pair occurs more than once
# among the retained rows, otherwise 0.
mafWithMutationAttribution['geneCase'] = mafWithMutationAttribution.apply(lambda row: row['Tumor_Sample_Barcode'] + '_' + row['Hugo_Symbol'], axis=1)
mutCounts = mafWithMutationAttribution['geneCase'].value_counts()
mafWithMutationAttribution['isComposite'] = mafWithMutationAttribution['geneCase'].apply(lambda x:
            1 if x in mutCounts and mutCounts[x] > 1 else 0)
# Collapse the dominant-signature names into the groups used for plotting; any name
# not listed below (including a missing signature) becomes 'other'.
mafWithMutationAttribution['dominantSignatureAdj'] = mafWithMutationAttribution['dominantSignature'].apply(lambda x:
    '_MMR' if x == 'mean_1' or x == 'mean_MMR'
    else '_SMOKING' if x == 'mean_SMOKING'
    else '_APOBEC' if x == 'mean_APOBEC'
    else '_POLE' if x == 'mean_10'
    else '_TMZ' if x == 'mean_11'
    else '_UV' if x == 'mean_7'
    else 'other')


  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


In [19]:
# Keep only the columns needed for the figure 2D plot and write them as a
# tab-separated file without the index column.
writeMaf = mafWithMutationAttribution[['Hugo_Symbol', 'hypermutationInduced', 'isComposite', 'geneType', 'related', 'dominantSignatureAdj']]
writePath = os.path.join(writeDir, 'figure_2d.tsv')
writeMaf.to_csv(writePath, index=False, sep='\t')

## Figure 2E
Phasing mutations in hypermutated tumors

In [20]:
#load in phasing data, focus on hypermutated cases
phasingData = pd.read_table(filePathDict['PHASING_DATA'])
allHypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(
    hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# .copy() makes the hypermutated subset an independent frame, so the column assignments
# below ('adjPhase', 'label', 'isTrans') no longer raise pandas' SettingWithCopyWarning.
phasingHyper = phasingData[phasingData['Tumor_Sample_Barcode'].isin(allHypermutantIds)].copy()

#Whitelist cases: cases with purity = NA with flat genomes (rather than bad facets fits)
#The phasing algorithm marks these cases as trans or separate cells, but because we know the purity = NA comes from a flat genome we can mark them as trans
#this is important because many of our hypermutated cases fit this description
whitelistFlatGenomeCases = clonality_analysis_util.get_facets_whitelist()
phasingHyper['adjPhase'] = phasingHyper.apply(lambda row: 
    'trans' if row['phase'] == 'trans or separate cells' and row['Tumor_Sample_Barcode'] in whitelistFlatGenomeCases
    else row['phase'], axis=1)

# NOTE(review): called with default arguments here, whereas the other cells pass
# thresh=1.0/30.0 and impactMafPath explicitly -- confirm the defaults are equivalent.
dictionaryOfRelatedGenes = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type()

#Prepare data for plotting
# Genes get their own label when they have more than plotThresh cis/trans pairs in which
# both mutations carry an 'oncogenic' annotation.
plotThresh = 5
genesToHighlight = [key for key, value in dict(phasingHyper[(phasingHyper['oncogenic.1'].notnull()) & (phasingHyper['oncogenic.2'].notnull()) & (phasingHyper['adjPhase'].isin(['cis', 'trans']))][
    'Hugo_Symbol'].value_counts()).items() if value > plotThresh]

# Label priority: silent pair > VUS pair > highlighted gene > related tsg/oncogene >
# other tsg/oncogene > 'other'. A mutation counts as VUS when its 'oncogenic' value is
# not a string (i.e. missing).
phasingHyper['label'] = phasingHyper.apply(lambda row: 
                                           '1 or 2 silent' if (row['Variant_Classification.1'] == 'Silent') | (row['Variant_Classification.2'] == 'Silent')
                                           else '1 or 2 VUS'
                                               if (not isinstance(row['oncogenic.1'], str)) | (not isinstance(row['oncogenic.2'], str))
                                           else row['Hugo_Symbol'] if row['Hugo_Symbol'] in genesToHighlight
                                           
                                           else 'related_tsg' if row['Role'] == 'TSG' and row['CANCER_TYPE'] in dictionaryOfRelatedGenes and row['Hugo_Symbol'] in dictionaryOfRelatedGenes[row['CANCER_TYPE']]
                                           else 'related_oncogene' if row['Role'] != 'TSG' and row['CANCER_TYPE'] in dictionaryOfRelatedGenes and row['Hugo_Symbol'] in dictionaryOfRelatedGenes[row['CANCER_TYPE']]
                                           
                                           else 'other_tsg' if row['Role'] == 'TSG'
                                           else 'other_oncogene' if row['Role'] == 'Oncogene'
                                           else 'other', axis=1)

# isTrans: 1 for trans, 0 for cis, None for every other phase call.
phasingHyper['isTrans'] = phasingHyper['adjPhase'].apply(lambda x: 1 if x == 'trans' else 0 if x == 'cis' else None)
# NOTE(review): phasingHyperWrite (rows with a definite cis/trans call) is not used by
# any later cell shown in this notebook -- confirm whether it was meant to be written out.
phasingHyperWrite = phasingHyper[phasingHyper['isTrans'].notnull()]

  
  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [21]:
# Write the labelled phasing table for figure 2E.
# NOTE(review): this writes phasingHyper (all hypermutated pairs, including rows whose
# isTrans is null) rather than phasingHyperWrite, the cis/trans-only subset built in the
# previous cell -- confirm which one the plotting script expects.
writePath = os.path.join(writeDir, 'figure_2e.tsv')
phasingHyper.to_csv(writePath, index=False, sep='\t')

In [20]:
#significance test to show tsgs are significantly more likely to be mutated in trans
# The gene names below are hard-coded, while the gene labels in phasingHyper['label'] come
# from the data-driven genesToHighlight list -- re-check them if the input data changes.
tsgPhasing = phasingHyper[phasingHyper['label'].isin(['APC', 'ARID1A', 'B2M', 'PTEN', 
                                                      'TP53', 'other_tsg', 'related_tsg'])]
oncogenePhasing = phasingHyper[phasingHyper['label'].isin(['PIK3CA', 'TERT', 'other_oncogene', 'related_oncogene'])]
nTsgTrans = tsgPhasing[tsgPhasing['adjPhase'] == 'trans'].shape[0]
nTsgCis = tsgPhasing[tsgPhasing['adjPhase'] == 'cis'].shape[0]
nOncogeneTrans = oncogenePhasing[oncogenePhasing['adjPhase'] == 'trans'].shape[0]
nOncogeneCis = oncogenePhasing[oncogenePhasing['adjPhase'] == 'cis'].shape[0]

# 2x2 table: rows = TSG / oncogene pairs, columns = trans / cis.
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('Fishers exact test: %s' % (scipy.stats.fisher_exact([[nTsgTrans, nTsgCis], [nOncogeneTrans, nOncogeneCis]]),))

Fishers exact test: (9.457219251336898, 2.1550482224061378e-13)
