In [6]:
#IMPORTS and CONFIGURATION

import pandas as pd
import sys
import os
import pandas as pd
import numpy as np
import re
from collections import Counter
import scipy.stats as stats

notebookPath = 'scripts/figure1'
projectDir = re.sub(notebookPath, '', os.getcwd())
sys.path.append(os.path.join(projectDir, 'scripts/utilityScripts'))

import configuration_util
import get_gene_and_cohort_list_utils
import analysis_utils
import mutationSigUtils
import maf_analysis_utils

filePathDict = configuration_util.get_all_files_path_dict()

In [7]:
#Set where to write the files
writeDir = os.path.join(projectDir, 'scripts/figure1/FIGURE1_PLOTTING_FILES/plotDataFiles')

## FIGURE 1A
Cancer types represented in cohort of hypermutated tumors

In [8]:
#Figure 1a functions (used for figure 1b as well)

def summarize_n_hypermutated_and_signatures_of_cases(dominantSignatureDict, mutClassificationDir):
    nCasesSummaryL = []
    signaturesSummaryL = []
    for f in os.listdir(mutClassificationDir):
        cancerType = re.sub('.tsv', '', f)
        filePath = os.path.join(mutClassificationDir, f)
        df = pd.read_table(filePath)
        df['dominantSignature'] = df['Tumor_Sample_Barcode'].apply(lambda x: dominantSignatureDict[x] if x in dominantSignatureDict else None)
  
        hypermutatedDf = df[df['hypermutantClassification'] == 'Hypermutated']
        highMutBurdenDf = df[df['hypermutantClassification'] == 'highMutationBurden']
        
        nCasesSummaryL.append({'cancerType': cancerType,
        'nHypermutated': hypermutatedDf.shape[0], 'nTotal': df.shape[0], 'nHighMutBurden': highMutBurdenDf.shape[0]})
      
        #TODO make the code for plotting the signatures
        for signature in set(df[df['dominantSignature'].notnull()]['dominantSignature']):
            
            signatureName = re.sub('mean_', '', signature)
            signaturesSummaryL.append({'cancerType': cancerType,
                'signature': signatureName, 'nCases': df.shape[0], 
                'nHypermutatedCases': hypermutatedDf[hypermutatedDf['dominantSignature'] == signature].shape[0],
                'nHighMutationBurdenCases': highMutBurdenDf[highMutBurdenDf['dominantSignature'] == signature].shape[0],
                'nTotal': df.shape[0]})
        
    casesSummaryDf = pd.DataFrame(nCasesSummaryL)
    signaturesSummaryDf = pd.DataFrame(signaturesSummaryL)
    return casesSummaryDf, signaturesSummaryDf

In [11]:
#Summarize dominant signatures
dominantSignatureDict = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(impactSigsPath=filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
casesSummary, signaturesSummary = summarize_n_hypermutated_and_signatures_of_cases(dominantSignatureDict,
    mutClassificationDir = filePathDict['HYPERMUTATION_STATUS_IDS'])
minNCasesToDisplay = 250 #lump all cancers with fewer than this number of cases as other
casesSummary['cancerType'] = casesSummary.apply(lambda row: 'other' if row['nHypermutated'] + row['nHighMutBurden'] < 10 else row['cancerType'] ,axis=1)

#GET RID OF CANCER'S OF INDETERMINATE_ORIGIN
cancersOfIndeterminateOrigin = set(['Cancer_of_Unknown_Primary', 'Skin_Cancer,_Non-Melanoma'])  #remove cancers of indeterminate origin
casesSummary = casesSummary[~casesSummary['cancerType'].isin(cancersOfIndeterminateOrigin)]
casesSummary = casesSummary.groupby(['cancerType']).sum()
casesSummary['cancerType'] = casesSummary.index

#SUMMARIZE THE FRACTION THAT ARE HYPERMUTATED
casesSummary['fracHypermutated'] = casesSummary.apply(lambda row:
        (1.0*(row['nHypermutated'] + row['nHighMutBurden']))/row['nTotal'], axis=1)
casesSummary['fracHypermutatedOrdering'] = casesSummary.apply(lambda row:
        row['fracHypermutated'] if row['cancerType'] != 'other' else -1, axis=1)
casesSummary['label'] = casesSummary.apply(lambda row:
                                          row['cancerType'] + ' (n = ' + str(row['nHypermutated'] + row['nHighMutBurden']) + ')', axis=1)


  if __name__ == '__main__':


In [12]:
writePath = os.path.join(writeDir, 'figure_1a.tsv')
casesSummary.to_csv(writePath, index=False, sep='\t')

## FIGURE 1B
Signatures prominent in hypermutated tumors

In [None]:
#make sure to have run code cells from figure 1a first

In [13]:
dominantSignatureDict = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(impactSigsPath=filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
casesSummary, signaturesSummary = summarize_n_hypermutated_and_signatures_of_cases(dominantSignatureDict,
    mutClassificationDir = filePathDict['HYPERMUTATION_STATUS_IDS'])

#ADJUST THE SIGNATURES SUMMARY INFORMATION
minNHypermutatedCasesToDisplay = 10
signaturesSummary['cancerType'] = signaturesSummary.apply(
    lambda row: 'other' if row['nHypermutatedCases'] + row['nHighMutationBurdenCases'] < minNHypermutatedCasesToDisplay else row['cancerType'] ,axis=1)
signaturesSummary['orderingVal'] = signaturesSummary.apply(
    lambda row: -1 if row['cancerType'] == 'other' else row['nHypermutatedCases'], axis=1)

#RENAME SIGNATURES
signaturesRenameDict = {'1': 'MMR', 'SMOKING': 'SMOKING',
                        'MMR': 'MMR', 'APOBEC': 'APOBEC', '10': 'POLE',
                       '11': 'TMZ', '14': 'MMR', '7': 'UV'}
signaturesSummary['signature'] = signaturesSummary['signature'].apply(lambda x:
        signaturesRenameDict[x] if x in signaturesRenameDict else 'other')

#SUMMARIZE FRACTIONS
signaturesSummary['nHyperHigh'] = signaturesSummary['nHighMutationBurdenCases'] + signaturesSummary['nHypermutatedCases']
signaturesSummary['nCasesHyperHighType'] = signaturesSummary.apply(lambda row:
            sum(signaturesSummary[signaturesSummary['cancerType'] == row['cancerType']]['nHighMutationBurdenCases']) + 
            sum(signaturesSummary[signaturesSummary['cancerType'] == row['cancerType']]['nHypermutatedCases']), axis=1)
signaturesSummary['frac'] = signaturesSummary.apply(lambda row:
        (row['nHighMutationBurdenCases'] + row['nHypermutatedCases'] + 0.0)/row['nCasesHyperHighType'] , axis=1)

#LEAVE OUT 'CANCER'S OF INDETERMINATE ORIGIN'
cancersOfIndeterminateOrigin = set(['Cancer_of_Unknown_Primary', 'Skin_Cancer,_Non-Melanoma'])  #remove cancers of indeterminate origin
signaturesSummary = signaturesSummary[~signaturesSummary['cancerType'].isin(cancersOfIndeterminateOrigin)]

  if __name__ == '__main__':


In [14]:
writePath = os.path.join(writeDir, 'figure_1b.tsv')
signaturesSummary.to_csv(writePath, index=False, sep='\t')

In [13]:
#Values for text: number of cases for each signature aetiology; fraction
nTotalHypermutated = sum(signaturesSummary['nHyperHigh'])
for signature in ['POLE', 'MMR', 'APOBEC', 'SMOKING', 'UV', 'TMZ']:
    s= sum(signaturesSummary[signaturesSummary['signature'] == signature]['nHyperHigh'])
    print signature, ':', s, '/', nTotalHypermutated, ';', round(100.0*s/nTotalHypermutated, 2), '%'


POLE : 85 / 1978 ; 4.3 %
MMR : 889 / 1978 ; 44.94 %
APOBEC : 222 / 1978 ; 11.22 %
SMOKING : 286 / 1978 ; 14.46 %
UV : 379 / 1978 ; 19.16 %
TMZ : 61 / 1978 ; 3.08 %


## FIGURE 1C
Per cancer type summaries of the number of drivers in hypermutated and non-hypermutated tumors

In [15]:
def get_per_case_oncogenic_mut_info(muts):
    oncogenicMuts = muts[muts['oncogenic'].notnull()]
    nMutOncDict = dict(oncogenicMuts['Tumor_Sample_Barcode'].value_counts())
    return nMutOncDict

def get_per_case_hotspot_mut_info(muts):
    hotspotMuts = muts[muts['is-a-hotspot'] == 'Y']
    nHotspotDict = dict(hotspotMuts['Tumor_Sample_Barcode'].value_counts())
    return nHotspotDict

def summarize_putative_drivers(cohortsAndNames, perCaseOncogenicInfo, perCaseHotspotInfo):
    listOfDicts = []
    for cohortName, cohort in cohortsAndNames.items():
        for tid in cohort:
            nOncogenicMuts = None
            nHotspotMuts = None
            if tid in perCaseOncogenicInfo:
                nOncogenicMuts = perCaseOncogenicInfo[tid]
            if tid in perCaseHotspotInfo:
                nHotspotMuts = perCaseHotspotInfo[tid]

            listOfDicts.append({'Tumor_Sample_Barcode': tid, 'nOncMuts': nOncogenicMuts,
                                'nHotspots': nHotspotMuts, 'cohort': cohortName,
                               })
    df = pd.DataFrame(listOfDicts)
    return df
    

In [18]:
allImpactMuts = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
perCaseOncogenicInfo = get_per_case_oncogenic_mut_info(allImpactMuts)
perCaseHotspotInfo = get_per_case_hotspot_mut_info(allImpactMuts)

#we summarize the data as endometrial, colorectal, glioma and other
hypermutantStatusDir = filePathDict['HYPERMUTATION_STATUS_IDS']
normalEndometrial = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=hypermutantStatusDir, cancerType='Endometrial Cancer', hypermutantStatus = 'Normal')
normalColorectal = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=hypermutantStatusDir, cancerType='Colorectal Cancer', hypermutantStatus = 'Normal')
normalGlioma = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=hypermutantStatusDir, cancerType='Glioma', hypermutantStatus = 'Normal')
hyperEndometrial = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=hypermutantStatusDir, cancerType='Endometrial Cancer', hypermutantStatus = 'Hypermutated')
hyperColorectal = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=hypermutantStatusDir, cancerType='Colorectal Cancer', hypermutantStatus = 'Hypermutated')
hyperGlioma = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=hypermutantStatusDir, cancerType='Glioma', hypermutantStatus = 'Hypermutated')

#summarize all the other cases
otherNormal = set([])
otherHypermutated = set([])
for f in os.listdir(hypermutantStatusDir):
    cType = re.sub('_', ' ', f)[:-4]
    otherNormal = otherNormal | analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=hypermutantStatusDir, cancerType=cType, hypermutantStatus = 'Normal')
    otherHypermutated = otherHypermutated | analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=hypermutantStatusDir, cancerType=cType, hypermutantStatus = 'Hypermutated')
cohortsAndNames = {'normal_Endometrial': normalEndometrial, 'normal_Colorectal': normalColorectal, 'normal_Glioma': normalGlioma,
                  'hyper_Endometrial': hyperEndometrial, 'hyper_Colorectal': hyperColorectal, 'hyper_Glioma': hyperGlioma,
                   'normal_Other': otherNormal, 'hyper_Other': otherHypermutated}

#summarize the information
df = summarize_putative_drivers(cohortsAndNames, perCaseOncogenicInfo, perCaseHotspotInfo)
orderingValDict = {'normal_Colorectal': 1, 'hyper_Colorectal': 2, 'normal_Endometrial': 3, 'hyper_Endometrial': 4,
                  'normal_Glioma': 5, 'hyper_Glioma': 6, 'normal_Other': 7, 'hyper_Other': 8}
df['orderingVal'] = df['cohort'].apply(lambda x: orderingValDict[x])
df['cancerType'] = df['cohort'].apply(lambda x: x.split('_')[1])

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)
  df = pd.read_table(path)


In [19]:
writePath = os.path.join(writeDir, 'figure_1c.tsv')
df.to_csv(writePath, index=False, sep = '\t')

## FIGURE 1D
Observed vs Expected SNV mutations

In [20]:
#summarizing the number of consequential mutations in different cases
def get_n_consequential_mut_count(mafPath):
    
    allImpactMuts = pd.read_table(mafPath)
    im3Genes = get_gene_and_cohort_list_utils.get_im3_genes()
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
    allImpactMuts341 = allImpactMuts[allImpactMuts['Hugo_Symbol'].isin(im3Genes)]

    nmutDict = dict(allImpactMuts341[allImpactMuts341['Variant_Type'] == 'SNP']['Tumor_Sample_Barcode'].value_counts()) #n snps
    indelDict = dict(allImpactMuts341[allImpactMuts341['Variant_Type'].isin(['INS', 'DEL'])]['Tumor_Sample_Barcode'].value_counts()) 
    hotspotDict = dict(allImpactMuts341[allImpactMuts341['is-a-hotspot'] == 'Y']['Tumor_Sample_Barcode'].value_counts())
    oncogenicSNPDict = dict(allImpactMuts341[(allImpactMuts341['oncogenic'].notnull()) & (allImpactMuts341['Variant_Type'].isin(['SNP']))]['Tumor_Sample_Barcode'].value_counts())
    oncogenicINDELDict = dict(allImpactMuts341[(allImpactMuts341['oncogenic'].notnull()) & ~(allImpactMuts341['Variant_Type'].isin(['INS', 'DEL']))]['Tumor_Sample_Barcode'].value_counts())
    stopGainDictTSG = dict(allImpactMuts341[(allImpactMuts341['Consequence'] == 'stop_gained') & (allImpactMuts341['Hugo_Symbol'].isin(tsgs))]['Tumor_Sample_Barcode'].value_counts())
    stopGainDictOncogene = dict(allImpactMuts341[(allImpactMuts341['Consequence'] == 'stop_gained') & (allImpactMuts341['Hugo_Symbol'].isin(oncogenes))]['Tumor_Sample_Barcode'].value_counts())
    
    #add entries with the value 0 where necessary
    for case in set(expectedDf['case']):
        if case not in hotspotDict:
            hotspotDict[case] = 0
        if case not in oncogenicSNPDict:
            oncogenicSNPDict[case] = 0
        if case not in oncogenicINDELDict:
            oncogenicINDELDict[case] = 0
        if case not in stopGainDictTSG:
            stopGainDictTSG[case] = 0
        if case not in stopGainDictOncogene:
            stopGainDictOncogene[case] = 0
    return nmutDict, indelDict, hotspotDict, oncogenicSNPDict, oncogenicINDELDict, stopGainDictTSG, stopGainDictOncogene

#a function that returns the number of indels expected in a case as conditioned on its TMB, signatures, and msi score?
#Note this is not currently used but I can add it later
def get_expected_tsg_indels(nSnps, mmrFrac):
    INDEL_ONCOGENICITY = .55*.97 #55% of indels land in TSGs, 97% of indel are frame shifting
    INDEL_TO_SNP_RATIO_MSI = .3 #ratio of indels to snps in MSI cases
    INDEL_TO_SNP_RATIO_NON_MSI = .05 #in non MSI cases the ratio of indels to snps
    tsgIndelsExpected = 0
    indelsExpectedMMR = mmrFrac*(nSnps * INDEL_TO_SNP_RATIO_MSI)
    indelsExpectedNotMMR = indelsExpected = nSnps * INDEL_TO_SNP_RATIO_NON_MSI
    indelsExpected = indelsExpectedMMR + indelsExpectedNotMMR
    nOncogenicIndelsExpected = indelsExpected * INDEL_ONCOGENICITY
    return nOncogenicIndelsExpected

#summarizes the expected number of mutations
#Note that this includes the option to get an 'expected' number of indel mutations but we don't use it
def get_expected_information():
    listOfDicts = []
    cntr = 0
    
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    impactMsiCases = get_gene_and_cohort_list_utils.get_msi_cases(msiInfoFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'], msiScoreThresh=10)
    domSigDict = get_gene_and_cohort_list_utils.get_pan_impact_signature_mapping(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
    
    nmutDict, indelDict, hotspotDict, oncogenicSNPDict, oncogenicINDELDict, stopGainDictTSG, stopGainDictOncogene = get_n_consequential_mut_count(filePathDict['IMPACT_BASE_MAF'])
    for case in set(expectedDf['case']):

        if cntr %100 == 0: print cntr
        cntr +=1

        nmutCase = nmutDict[case]
        signature = None
        if case in impactMsiCases:
            signature = 'MSI'
        elif case in domSigDict:
            signature = domSigDict[case]

        caseExpectation = expectedDf[expectedDf['case'] == case]
        hotspotExpectation = nmutCase *sum(caseExpectation['hotspotChance'])
        oncogenicSNPExpectation = nmutCase *sum(caseExpectation['oncogenicChance'])
        truncatingExpectationTSG = nmutCase * sum(caseExpectation[caseExpectation['gene'].isin(tsgs)]['truncatingChance'])
        truncatingExpectationOncogene = nmutCase * sum(caseExpectation[~caseExpectation['gene'].isin(tsgs)]['truncatingChance'])

        mmrFrac = mmrSigFracDict[case] if case in mmrSigFracDict and signature == 'MSI' else 0
        oncogenicIndelExpectation = get_expected_tsg_indels(nmutCase, mmrFrac)
        oncogenicAllExpectation = oncogenicSNPExpectation + oncogenicIndelExpectation #todo flesh out

        observedHotspots = hotspotDict[case] if case in hotspotDict else 0
        observedOncogenicSNP = oncogenicSNPDict[case] if case in oncogenicSNPDict else 0
        observedOncogenicINDEL = oncogenicINDELDict[case] if case in oncogenicINDELDict else 0
        observedStopGainTSG = stopGainDictTSG[case] if case in stopGainDictTSG else 0
        observedStopGainOncogene = stopGainDictOncogene[case] if case in stopGainDictOncogene else 0

        listOfDicts.append({'obsHotspot':observedHotspots, 'obsOncogenicSNP': observedOncogenicSNP,
                            'obsIndel': indelDict[case] if case in indelDict else 0,
                            'obsOncogenicINDEL': observedOncogenicINDEL, 'obsOncogenic': observedOncogenicSNP + observedOncogenicINDEL,
                            'obsStopGainTSG': observedStopGainTSG, 'obsStopGainOncogene': observedStopGainOncogene,
                            'expectedTruncatingTSG': truncatingExpectationTSG, 'expectedTruncatingOncogene': truncatingExpectationOncogene,
                            'expectedHotspot': hotspotExpectation, 'expectedOncogenicSNP': oncogenicSNPExpectation,
                            'expectedOncogenicIndel': oncogenicIndelExpectation, 'expectedOncogenicAll': oncogenicAllExpectation,
                           'nmut': nmutCase, 'Tumor_Sample_Barcode': case})

    df = pd.DataFrame(listOfDicts)
    return df 


In [21]:
expectedDf = pd.read_table(filePathDict['EXPECTED_MUTATION_INFO_BY_GENE'])
signatureDf = pd.read_table(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
mmrSigFracDict = dict(zip(signatureDf['Tumor_Sample_Barcode'], signatureDf['mean_MMR']))
df = get_expected_information()

#add in cancer type 
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(filePathDict['CANCER_TYPE_INFO'])
df['cancerType'] = df['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
dominantSignatureDict = dominantSignatureDict = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(impactSigsPath=filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
df['dominantSignature'] = df['Tumor_Sample_Barcode'].apply(lambda x: dominantSignatureDict[x] if x in dominantSignatureDict else None)
df['dominantSignature'] = df['dominantSignature'].apply(lambda x: 
            'APOBEC' if x == 'mean_APOBEC'
            else 'MMR' if x in ['mean_1', 'mean_14', 'mean_MMR']
            else 'TMZ' if x == 'mean_11'
            else 'POLE' if x == 'mean_10'
            else 'UV' if x == 'mean_7'
            else 'SMOKING' if x == 'mean_SMOKING'
            else 'other')

  """Entry point for launching an IPython kernel.
  
  msiInfoDf = pd.read_table(msiInfoFilePath)
  sigsDf = pd.read_table(impactSigsPath)
  after removing the cwd from sys.path.
  exec(code_obj, self.user_global_ns, self.user_ns)


0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


  impactCancerTypeDf = pd.read_table(impactCancerTypeInfoPath)


In [22]:
writePath = os.path.join(writeDir, 'figure_1d.tsv')
df.to_csv(writePath, index=False, sep = '\t')

## Figure 1E
Observed vs expected number of INDEL mutations

In [23]:
def get_all_neutral_indel_rates(maf, neutralGenes, neutralGenePanelSize):
    maf = maf[(maf['Hugo_Symbol'].isin(neutralGenes)) & (maf['Variant_Type'].isin(['INS', 'DEL']))]
    counts = dict(maf['Tumor_Sample_Barcode'].value_counts())
    d = {}
    for key, value in counts.items():
        d[key] = (1.0*value)/neutralGenePanelSize
    return d

#TODO fix MLL2, KMT2B etc
def summarize_gene_type_lengths(geneLengthDict, geneTypes):
    d = {}
    for geneType, genes in geneTypes.items():
        print 'summarizing', geneType
        panelSize = sum([geneLengthDict[gene] if gene in geneLengthDict else 0 for gene in genes])
        d[geneType] = panelSize
    return d

#TODO calculate observed and expected based on gene size
def summarize_observed_and_expected_indels(maf, neutralRates, panelSizes, essentialGenes, tsgs, oncogenes):
    
    impactGenes = get_gene_and_cohort_list_utils.get_im6_genes()
    indelMaf = maf[(maf['Variant_Type'].isin(['INS', 'DEL']))]
    mutCounts = maf['Tumor_Sample_Barcode'].value_counts()
    indelCounts =  indelMaf['Tumor_Sample_Barcode'].value_counts()
    
    tsgCounts = indelMaf[indelMaf['Hugo_Symbol'].isin(tsgs)]['Tumor_Sample_Barcode'].value_counts()
    oncogeneCounts = indelMaf[(~indelMaf['Hugo_Symbol'].isin(tsgs)) & (indelMaf['Hugo_Symbol'].isin(oncogenes))]['Tumor_Sample_Barcode'].value_counts()
    
    cntr = 0
    listOfDicts = []
    for case in set(maf['Tumor_Sample_Barcode']):
        cntr += 1
        if cntr%50 == 0: print cntr,
            
        if case in neutralRates:
        
            caseMaf = indelMaf[indelMaf['Tumor_Sample_Barcode'] == case]
            nOncogeneMutations = oncogeneCounts[case] if case in oncogeneCounts else 0
            nTSGMutations = tsgCounts[case] if case in tsgCounts else 0
            nIndels = indelCounts[case] if case in indelCounts else 0
            
            oncogeneExpected = neutralRates[case] * panelSizes['Oncogene']
            tsgExpected = neutralRates[case] * panelSizes['TSG']

            tmb = mutCounts[case]/30.0 if case in mutCounts else None
            listOfDicts.append({'Tumor_Sample_Barcode': case, 
                               'OncogeneObs': nOncogeneMutations, 'TSGObs': nTSGMutations,
                                'OncogeneExp': oncogeneExpected, 'TSGExp': tsgExpected,
                               'tmb': tmb, 'nIndels': nIndels})
        
    return pd.DataFrame(listOfDicts)

In [24]:
#FYI THIS CELL TAKES APROXIMATELY 5 minutes to run

print 'loading exome hypermutant maf'
allExomeHypermutantMaf = pd.read_table(filePathDict['ALL_EXOME_HYPERMUTATOR_MAF'])

print 'summarizing gene information'
#get lists of genes and other information
neutralGenes = get_gene_and_cohort_list_utils.get_cancer_neutral_genes(depMapPath = filePathDict['DEP_MAP_DATA'])
essentialGenes = get_gene_and_cohort_list_utils.get_essential_genes(depMapPath = filePathDict['DEP_MAP_DATA'], mode='getEssentialGenes')
impactGenes = get_gene_and_cohort_list_utils.get_im6_genes()
tsgs = get_gene_and_cohort_list_utils.get_tsgs()
oncogenes = impactGenes - tsgs
geneLengthInfo = pd.read_table(filePathDict['GENE_LENGTH_INFO'])
geneLengthDict = dict(zip(geneLengthInfo['hgnc_symbol'], geneLengthInfo['nt.length']))

#get summary information about indels 
geneTypeSizeDict = summarize_gene_type_lengths(geneLengthDict, {'Essential': essentialGenes, 'Neutral': neutralGenes, 
                                            'TSG': tsgs, 'Oncogene': oncogenes})
neutralIndelRates = get_all_neutral_indel_rates(allExomeHypermutantMaf, neutralGenes, geneTypeSizeDict['Neutral'])

#create a dataframe that summarizes observed and expected rates
df = summarize_observed_and_expected_indels(allExomeHypermutantMaf, neutralIndelRates, geneTypeSizeDict, essentialGenes, tsgs, oncogenes)

#get relevant signature information
exomeRecaptureDomSigDict, tcgaDomSigDict = get_gene_and_cohort_list_utils.get_exome_signature_cohorts(
    filePathDict['TCGA_SIGNATURE_DECOMPOSITIONS'], filePathDict['EXOME_RECAPTURE_SIGNATURE_DECOMPOSITIONS'])
df['dominantSignature'] = df['Tumor_Sample_Barcode'].apply(lambda x: exomeRecaptureDomSigDict[x] if x in exomeRecaptureDomSigDict
                                                          else tcgaDomSigDict[x[:15]] if x[:15] in tcgaDomSigDict
                                                          else None)

df['dominantSignature'] = df['dominantSignature'].apply(lambda x: 
            'APOBEC' if x == 'Signature.APOBEC'
            else 'MMR' if x in ['Signature.1', 'Signature.14', 'Signature.MMR']
            else 'TMZ' if x == 'Signature.11'
            else 'POLE' if x == 'Signature.10'
            else 'UV' if x == 'Signature.7'
            else 'SMOKING' if x == 'Signature.SMOKING'
            else 'other')


loading exome hypermutant maf


  This is separate from the ipykernel package so we can avoid doing imports until
  interactivity=interactivity, compiler=compiler, result=result)


summarizing gene information


  if sys.path[0] == '':


summarizing TSG
summarizing Neutral
summarizing Oncogene
summarizing Essential
50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900

  tcgaSignatures = pd.read_table(tcgaSigPath)
  exomeRecaptureSignatures = pd.read_table(exomeSigPath)





In [25]:
writePath = os.path.join(writeDir, 'figure_1e.tsv')
df.to_csv(writePath, index=False, sep = '\t')