In [7]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
from collections import Counter
import re
import scipy

# Derive the project root by stripping this notebook's directory from the kernel's working
# directory (assumes the kernel is started in <project>/scripts/figure1).
notebookPath = 'scripts/figure1'
projectDir = re.sub(notebookPath, '', os.getcwd())
# The utility modules live under the project root. Joining onto notebookPath (as before)
# produced the relative path 'scripts/figure1/scripts/utilityScripts'.
sys.path.append(os.path.join(projectDir, 'scripts/utilityScripts'))

# NOTE(review): machine-specific absolute path kept so existing setups keep working;
# it should be removable once the project-relative entry above is confirmed sufficient.
sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import clonality_analysis_util
import get_gene_and_cohort_list_utils
import configuration_util
import mutation_modeling_util

# Re-import configuration_util (Python 2 builtin `reload`) before reading from it, then fetch
# the dict that the cells below index with file-label keys such as 'IMPACT_BASE_MAF'.
reload(configuration_util)
filePathDict = configuration_util.get_all_files_path_dict()




In [2]:
#specify where to write files
# Every cell below that saves a plotting table joins its file name onto this directory.
writeDir = os.path.join(projectDir, 'scripts/figure1/FIGURE1_PLOTTING_FILES/plotDataFiles')

## Figure S1(a)
Summaries of TMB distributions and hypermutation classifications<br>
Run code in <code>scripts/utilityScripts/plotAndDefineHypermutationThresholds.R</code>

## Figure S1(b)
Signature summaries of hypermutated cases by cancer type. Use data frame: <code>scripts/figure1/FIGURE1_PLOTTING_FILES/plotDataFiles/figure_1b.tsv</code>

## Figure S1(c)
Driver/VUS ratio by TMB

In [None]:
def count_fraction_drivers(maf):
    """Per-sample mutation and driver counts within the genes returned by get_im3_genes().

    maf must carry 'Hugo_Symbol', 'oncogenic' and 'Tumor_Sample_Barcode' columns.
    A mutation counts as a driver when its 'oncogenic' value is not null.

    Returns a DataFrame with one row per sample that has at least one mutation in
    those genes, with columns 'nMut', 'nDriver', 'dominantSignature' (None when the
    sample is absent from the signature-cohort lookup) and 'fracDriver' = nDriver/nMut.
    The sample barcode itself is not included in the output.
    """
    panelGenes = get_gene_and_cohort_list_utils.get_im3_genes()
    panelMaf = maf[maf['Hugo_Symbol'].isin(panelGenes)]
    panelDriverMaf = panelMaf[panelMaf['oncogenic'].notnull()]
    nDriverByCase = dict(panelDriverMaf['Tumor_Sample_Barcode'].value_counts())
    nMutByCase = dict(panelMaf['Tumor_Sample_Barcode'].value_counts())
    signatureByCase = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(impactSigsPath=filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])

    records = [
        {
            'nMut': nMutByCase.get(case, 0),
            'nDriver': nDriverByCase.get(case, 0),
            'dominantSignature': signatureByCase[case] if case in signatureByCase else None
        }
        for case in set(panelMaf['Tumor_Sample_Barcode'])
    ]

    summary = pd.DataFrame(records)
    summary['fracDriver'] = summary['nDriver']/summary['nMut']
    return summary

In [None]:
# Load the IMPACT MAF and compute the per-sample driver fraction.
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
df = count_fraction_drivers(allImpactMutsMaf)

#adjust the labels for plotting
# Any cohort label not listed here (including missing values) is plotted as 'other'.
signaturePlotLabels = {
    'insufficientMutBurden': 'low-TMB',
    'mean_APOBEC': 'APOBEC',
    'mean_MMR': 'MMR',
    'mean_SMOKING': 'SMOKING',
    'mean_10': 'POLE',
    'mean_11': 'TMZ',
    'mean_7': 'UV',
}
df['dominantSignature'] = df['dominantSignature'].apply(
    lambda cohortLabel: signaturePlotLabels.get(cohortLabel, 'other'))

In [None]:
# Save the Figure S1(c) plotting table as a tab-separated file without the index column.
writePath = os.path.join(writeDir, 'figureS1_c.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure S1(d) 
Distributions of observed and expected mutations in TCGA

In [4]:
def summarize_observed_and_expected_distributions(maf, expectedDf, essentialGenesDict, essentialGeneSizesDict):
    """Tabulate, per case, observed vs expected SNP counts for drivers, VUSs and essential-gene stop-gains.

    Parameters
    ----------
    maf : DataFrame with columns 'Tumor_Sample_Barcode', 'Consequence', 'Variant_Type',
        'oncogenic', 'Hugo_Symbol' and 'cancerTypeDepMap'.
    expectedDf : DataFrame with columns 'case', 'oncogenicChance' and 'truncatingChance'.
        Rows are matched to a case on the first 12 characters of both ids.
        The frame is NOT modified (the previous implementation truncated its 'case' column in place).
    essentialGenesDict : mapping of cancer type -> collection of essential gene symbols.
    essentialGeneSizesDict : mapping of cancer type -> base pairs covered by those genes.

    Returns
    -------
    DataFrame with one row per analysed case and columns 'Tumor_Sample_Barcode', 'snpPerMb',
    'oncogenicSNPExpected', 'impactVUSSNPExpected', 'oncogenicObserved', 'vusObserved',
    'truncatingEssentialExpected' and 'truncatingEssentialObserved'.

    A case is analysed only if it has at least one 'stop_gained' row (its cancer type is read
    from the first such row), that cancer type is a key of both dicts, and it has at least one SNP.
    NOTE(review): this means cases without any stop_gained row are dropped from the oncogenic/VUS
    comparison as well -- confirm that this is intended.
    """
    # Group the expectation rows once by the truncated case id instead of re-scanning the
    # whole table for every case. A separate key Series keeps the caller's frame untouched.
    expectedCaseIds = expectedDf['case'].apply(lambda x: x[:12])
    expectedByCase = {caseId: rows for caseId, rows in expectedDf.groupby(expectedCaseIds)}
    noExpectationRows = expectedDf.iloc[0:0]

    # Stop-gain rows are taken BEFORE the SNP filter below, exactly as in the original.
    stopGainSNPMaf = maf[maf['Consequence'] == 'stop_gained']
    stopGainByCase = {caseId: rows for caseId, rows in stopGainSNPMaf.groupby('Tumor_Sample_Barcode')}
    noStopGainRows = stopGainSNPMaf.iloc[0:0]

    #ONLY look at SNPs
    im3Genes = get_gene_and_cohort_list_utils.get_im3_genes()
    maf = maf[maf['Variant_Type'] == 'SNP']
    inIM3Genes = maf['Hugo_Symbol'].isin(im3Genes)
    isOncogenic = maf['oncogenic'].notnull()

    #get counts for various mutation types
    driverCounts = maf[isOncogenic & inIM3Genes]['Tumor_Sample_Barcode'].value_counts()
    vusCounts = maf[(~isOncogenic) & inIM3Genes]['Tumor_Sample_Barcode'].value_counts()
    mutCounts = maf['Tumor_Sample_Barcode'].value_counts()

    listOfDicts = []
    for case in set(maf['Tumor_Sample_Barcode']):
        mutCountCase = mutCounts[case] if case in mutCounts else 0

        #essential genes
        caseStopGainMaf = stopGainByCase.get(case, noStopGainRows)
        cancerType = None
        if caseStopGainMaf.shape[0] > 0:
            cancerType = caseStopGainMaf['cancerTypeDepMap'].iloc[0]

        #dont do analysis if not one of specified cancer types
        if cancerType not in essentialGeneSizesDict or cancerType not in essentialGenesDict:
            continue
        if mutCountCase == 0:
            continue

        #get the essential genes, and the base pairs they cover
        essentialGenes = essentialGenesDict[cancerType]
        caseStopGainEssential = caseStopGainMaf[caseStopGainMaf['Hugo_Symbol'].isin(essentialGenes)]
        essentialGeneBasePairs = essentialGeneSizesDict[cancerType]

        # Expected SNPs landing in the IMPACT genes, scaling the exome count by territory.
        # NOTE(review): the factor is .89/30 while the original comment says
        # "im genes = 830kb; tcga = 30mb" (i.e. 0.83) -- confirm which value is right.
        snpsExpectedIM3 = mutCountCase*(.89/30)
        caseExpectation = expectedByCase.get(case[:12], noExpectationRows)

        oncogenicSNPRate = sum(caseExpectation['oncogenicChance'])
        oncogenicSNPExpected = oncogenicSNPRate*snpsExpectedIM3
        impactVUSSNPExpected = (1-oncogenicSNPRate)*snpsExpectedIM3

        snpsExpectedEssential = mutCountCase*((1.0*essentialGeneBasePairs)/30e6)
        #we assume the expected truncating rate in essential genes will be approximately equal to the expected truncating rate in impact genes
        # NOTE(review): the original note reads "we have calculated expected rates for all genes,
        # only impact genes"; presumably expected rates exist only for the impact genes -- confirm.
        truncatingSNPRate = sum(caseExpectation['truncatingChance'])
        truncatingExpected = truncatingSNPRate*snpsExpectedEssential
        truncatingObservedEssential = caseStopGainEssential.shape[0]

        listOfDicts.append({
            'Tumor_Sample_Barcode': case,
            'snpPerMb': mutCountCase/30.0,
            'oncogenicSNPExpected': oncogenicSNPExpected,
            'impactVUSSNPExpected': impactVUSSNPExpected,
            'oncogenicObserved': driverCounts[case] if case in driverCounts else 0,
            'vusObserved': vusCounts[case] if case in vusCounts else 0,
            'truncatingEssentialExpected': truncatingExpected, 'truncatingEssentialObserved': truncatingObservedEssential
        })
    return pd.DataFrame(listOfDicts)
       
def summarize_essential_gene_sizes(essentialGenesDict, geneSizes):
    """Return {cancer type: summed size of its essential genes}.

    Genes missing from geneSizes are skipped, and NaN sizes are ignored by np.nansum,
    so a cancer type with no sized genes maps to 0.0.
    """
    return {
        cancerType: np.nansum([geneSizes[symbol] for symbol in essentialGenes if symbol in geneSizes])
        for cancerType, essentialGenes in essentialGenesDict.items()
    }
            

In [5]:
#Heavy duty cell: takes ~5-10 min to run
#TODO do we change this to go back to a simpler observed vs expected for essential genes
# Load the per-case expectation table and the exome hypermutator MAF, then annotate the MAF;
# the code below reads the 'cancerTypeDepMap' column from the annotated frame.
tcgaExpectedMutInfo = pd.read_table(filePathDict['TCGA_EXPECTED_MUTATION_INFO'])
exomeHypermutatorMaf = pd.read_table(filePathDict['ALL_EXOME_HYPERMUTATOR_MAF'])
exomeHypermutatorMafWithCancerType = maf_analysis_utils.annotate_maf_with_dep_map_cancer_type(exomeHypermutatorMaf, tcgaInfoPath=filePathDict['TCGA_CANCER_TYPE_INFO'], impactInfoPath=filePathDict['CANCER_TYPE_INFO'])

#specifically run analysis on certain cancer types where we can identify essential genes
cTypes = set(exomeHypermutatorMafWithCancerType['cancerTypeDepMap'])
# NOTE(review): `d` is not read again in this cell, and the captured run log prints the
# "analyzing <cancer type>" list twice, so this call may duplicate work done by the next
# line -- confirm it has no needed side effect before removing it.
d = get_gene_and_cohort_list_utils.get_cancer_type_specific_dep_map_data(cancerTypes = cTypes)
cancerTypeEssentialGenesDict = get_gene_and_cohort_list_utils.get_cancer_type_specific_essential_genes(cTypes, essentialThresh = -1)
# Gene symbol -> length lookup built from the 'hgnc_symbol' and 'nt.length' columns.
geneLengthInfo = pd.read_table(filePathDict['GENE_LENGTH_INFO'])
geneLengthDict = dict(zip(geneLengthInfo['hgnc_symbol'], geneLengthInfo['nt.length']))

essentialSizeDict = summarize_essential_gene_sizes(cancerTypeEssentialGenesDict, geneLengthDict)

df = summarize_observed_and_expected_distributions(exomeHypermutatorMafWithCancerType, tcgaExpectedMutInfo, cancerTypeEssentialGenesDict, essentialSizeDict)

# Keep only cases with a positive expected oncogenic SNP count.
df = df[df['oncogenicSNPExpected'] > 0]


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  interactivity=interactivity, compiler=compiler, result=result)


beginning annotation of exome maf; takes a long time


  tcgaDf = pd.read_table(tcgaInfoPath)
  impactCancerTypeDf = pd.read_table(impactCancerTypeInfoPath)


assigning cancer type info
assiging dep map cancer type info
analyzing  Brain Cancer
analyzing  Prostate Cancer
analyzing  Bladder Cancer
analyzing  Gastric Cancer
analyzing  other
analyzing  Esophageal Cancer
analyzing  Skin Cancer
analyzing  Lung Cancer
analyzing  Breast Cancer
analyzing  Endometrial/Uterine Cancer
analyzing  Colon/Colorectal Cancer
analyzing  Brain Cancer
analyzing  Prostate Cancer
analyzing  Bladder Cancer
analyzing  Gastric Cancer
analyzing  other
analyzing  Esophageal Cancer
analyzing  Skin Cancer
analyzing  Lung Cancer
analyzing  Breast Cancer
analyzing  Endometrial/Uterine Cancer
analyzing  Colon/Colorectal Cancer


  # This is added back by InteractiveShellApp.init_path()


In [9]:
# Save the Figure S1(d) observed-vs-expected table as a tab-separated file without the index.
writePath = os.path.join(writeDir, 'figureS1_d.tsv')
df.to_csv(writePath, index=False, sep='\t')

In [8]:
#p values: comparing observed and expected distributions:
# `import scipy` alone does not guarantee that the stats submodule is loaded, so import it
# explicitly instead of relying on another module having done so.
import scipy.stats

# Two-sample Kolmogorov-Smirnov tests on the per-case observed vs expected counts.
# The single-string print form produces the same text under Python 2 and Python 3.
ksOncogenic = scipy.stats.ks_2samp(
    np.array(df['oncogenicObserved']),
    np.array(df['oncogenicSNPExpected']))
print('oncogenic mutations: {}'.format(ksOncogenic))

ksVus = scipy.stats.ks_2samp(
    np.array(df['vusObserved']),
    np.array(df['impactVUSSNPExpected']))
print('vus mutations: {}'.format(ksVus))

oncogenic mutations: Ks_2sampResult(statistic=0.521885521885522, pvalue=2.6429664081916436e-36)
vus mutations: Ks_2sampResult(statistic=0.08417508417508418, pvalue=0.23279104277249027)


## Figure S1(e)
Fraction SNVs, INDELs etc by aetiology

In [None]:
def summarize_mutation_information_by_signature(maf):
    """Fraction of indels and of nonsense mutations among the mutations of each dominant signature.

    maf must carry 'dominantSignature', 'Variant_Type' and 'Variant_Classification' columns.

    Returns a DataFrame with one row per signature and columns 'signature', 'fracIndel'
    (share of rows whose Variant_Type is INS or DEL) and 'fracTruncating' (share of rows
    classified 'Nonsense_Mutation').

    Rows with a missing dominantSignature are skipped: groupby drops null keys, whereas the
    previous per-signature equality filter matched nothing for a null and divided by zero.
    """
    indelTypes = ['INS', 'DEL']
    listOfDicts = []
    # One pass over the frame; every group is non-empty, so nMuts is never zero.
    for signature, signatureMaf in maf.groupby('dominantSignature'):
        nMuts = 1.0*signatureMaf.shape[0]
        nIndels = signatureMaf['Variant_Type'].isin(indelTypes).sum()
        nTruncating = (signatureMaf['Variant_Classification'] == 'Nonsense_Mutation').sum()

        listOfDicts.append({'signature': signature, 'fracIndel': nIndels/nMuts,
                           'fracTruncating': nTruncating/nMuts
                           })

    df = pd.DataFrame(listOfDicts)
    return df

In [None]:
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
domSigDict = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(impactSigsPath = filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
hypermutatedIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# .copy() gives an independent frame, so the column assignments below are not chained
# assignments on a slice of allImpactMutsMaf (which raises SettingWithCopyWarning).
hypermutatedMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(hypermutatedIds)].copy()

#summarize dominant signatures
# Cohort label -> plotting label; anything unlisted (including samples missing from
# domSigDict) becomes 'other'.
hypermutatorSignatureLabels = {
    'mean_APOBEC': 'APOBEC',
    'mean_MMR': 'MMR',
    'mean_1': 'MMR',
    'mean_SMOKING': 'SMOKING',
    'mean_10': 'POLE',
    'mean_14': 'POLE_MMR',
    'mean_11': 'TMZ',
    'mean_7': 'UV',
}
hypermutatedMaf['dominantSignature'] = hypermutatedMaf['Tumor_Sample_Barcode'].apply(lambda x:
    domSigDict[x] if x in domSigDict else None)
hypermutatedMaf['dominantSignature'] = hypermutatedMaf['dominantSignature'].apply(lambda x:
    hypermutatorSignatureLabels.get(x, 'other'))

#summarize indels
# 0/1 flags per mutation row, computed column-wise.
indels = ['INS', 'DEL']
hypermutatedMaf['isTruncating'] = (hypermutatedMaf['Variant_Classification'] == 'Nonsense_Mutation').astype(int)
hypermutatedMaf['isIndel'] = hypermutatedMaf['Variant_Type'].isin(indels).astype(int)


In [None]:
# Keep only the three columns used for plotting and save them as a tab-separated file.
hypermutatedMafWrite = hypermutatedMaf[['isIndel', 'isTruncating', 'dominantSignature']]
writePath = os.path.join(writeDir, 'figureS1_e.tsv')
hypermutatedMafWrite.to_csv(writePath, index=False, sep='\t')

## Figure S1(f)
Propensity of each COSMIC signature to cause truncating, oncogenic and hotspot mutations

In [3]:
def calculate_mut_susceptibility_of_genes_by_signature(dfAllPossibleMutations, mutType):
    """Expected chance of a mutType mutation under each of Signature.1 .. Signature.30 alone.

    For every signature a decomposition is built with weight 1 on that signature and 0 on
    the other 29, converted to quad-nucleotide fractions, and combined with the per-context
    chances summarised from dfAllPossibleMutations for mutType.

    Returns a DataFrame with columns 'Signature_Name', 'frac' and 'mutType', one row per signature.
    """
    signatureNames = ['Signature.%d' % n for n in range(1, 31)]
    spectraD = mutationSigUtils.convert_spectrum_file_to_dict_of_dicts(spectrumFile=filePathDict['SIGNATURE_SPECTRUM'])

    rows = []
    for activeSignature in signatureNames:
        #PRETEND we got a case with 100% of the active signature on the decomposition
        pureDecomposition = dict.fromkeys(signatureNames, 0)
        pureDecomposition[activeSignature] = 1
        quadNucFractions = mutation_modeling_util.get_quadnuc_fracs_given_decomposition(pureDecomposition, spectraD)

        # NOTE(review): this call takes no per-signature input, so it looks loop-invariant;
        # left inside the loop until the helper is confirmed to be side-effect free.
        mutationChances = mutation_modeling_util.summarize_mutation_chances(dfAllPossibleMutations, mutType)

        p = mutation_modeling_util.get_expected_mut_chance_given_quadnuc_fractions(quadNucFractions, mutationChances)
        rows.append({'Signature_Name': activeSignature, 'frac': p, 'mutType': mutType})

    return pd.DataFrame(rows)

In [None]:
# Load the all-possible-mutations summary and compute, for each mutation class, the
# per-signature susceptibility table.
dfAllPossibleMutations = pd.read_table(filePathDict['ALL_POSSIBLE_MUTATION_SUMMARY'])
dfAllPossibleMutations = mutation_modeling_util.add_zero_cols_to_counts_df(dfAllPossibleMutations)
dfTruncating = calculate_mut_susceptibility_of_genes_by_signature(dfAllPossibleMutations, 'truncating')
dfOncogneic = calculate_mut_susceptibility_of_genes_by_signature(dfAllPossibleMutations, 'oncogenic')
dfHotspot = calculate_mut_susceptibility_of_genes_by_signature(dfAllPossibleMutations, 'hotspot')

#set colors for plotting
# Signature name -> colour group; signatures not listed are grouped as 'OTHER'.
signatureColorNames = {
    'Signature.10': 'POLE',
    'Signature.11': 'TMZ',
    'Signature.7': 'UV',
    'Signature.3': 'BRCA',
    'Signature.4': 'SMOKING',
    'Signature.14': 'POLE_MMR',
    'Signature.1': 'AGING',
}
signatureColorNames.update(dict.fromkeys(
    ['Signature.6', 'Signature.15', 'Signature.20', 'Signature.21', 'Signature.26'], 'MMR'))
signatureColorNames.update(dict.fromkeys(['Signature.2', 'Signature.13'], 'APOBEC'))

df = pd.concat([dfTruncating, dfOncogneic, dfHotspot])
df['colorName'] = df['Signature_Name'].apply(
    lambda signatureName: signatureColorNames.get(signatureName, 'OTHER'))

In [None]:
# Save the Figure S1(f) signature-susceptibility table. This cell previously wrote to
# 'figureS1_d.tsv', which overwrote the Figure S1(d) table saved earlier in the notebook.
# NOTE(review): point any downstream plotting script that read panel (f) from the old
# file name at 'figureS1_f.tsv'.
writePath = os.path.join(writeDir, 'figureS1_f.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure S1(g)
Nucleosome positioning does not explain enrichment of drivers in hypermutated tumors

In [None]:
def find_nearest(array, value):
    """Return the element of array with the smallest absolute difference from value.

    On ties the element that appears first in array wins (np.argmin semantics).
    """
    candidates = np.asarray(array)
    distances = np.abs(candidates - value)
    return candidates[distances.argmin()]


In [None]:
# Load the nucleosome positions (headerless BED-style table) and strip the 'chr' prefix
# before matching on the MAF's 'Chromosome' values.
nucleosomeBed = pd.read_table(filePathDict['NUCLEOSOME_DYAD_POSITIONS'], header=None)
nucleosomeBed = nucleosomeBed.rename(columns = {0: 'Chromosome', 1: 'Nucleosome_Start_Position', 2: 'Nucleosome_End_Position'})
nucleosomeBed['Chromosome'] = nucleosomeBed['Chromosome'].apply(lambda x: re.sub('chr', '', x))
impactMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])

#Figure out the closest nucleosome
# Store one numpy array of start positions per chromosome, so find_nearest does not have to
# convert a Python list to an array again for every mutation row.
chromosomeDicts = {}
for chromosome, chromosomeBed in nucleosomeBed.groupby('Chromosome'):
    chromosomeDicts[chromosome] = chromosomeBed['Nucleosome_Start_Position'].values
# A chromosome present in the MAF but absent from the nucleosome table raises KeyError here.
impactMaf['closestNucleosome'] = impactMaf.apply(lambda row:
        find_nearest(chromosomeDicts[str(row['Chromosome'])], row['Start_Position']), axis = 1)
impactMaf['closestNucleosomeDistance'] = impactMaf['Start_Position'] - impactMaf['closestNucleosome']
nucleosomeCloseThresh = 1000
impactMaf['isCloseToNucleosome'] = (impactMaf['closestNucleosomeDistance'].abs() < nucleosomeCloseThresh).astype(int)

#Make curved plot for fraction of mutations
hypermutationIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# .copy() so that adding 'truncatingType' below is not a chained assignment on a slice of impactMaf.
hypermutationMaf = impactMaf[impactMaf['Tumor_Sample_Barcode'].isin(hypermutationIds)].copy()
tsgs = get_gene_and_cohort_list_utils.get_tsgs()
oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
truncatingConsequences = set(['Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins'])
# Label truncating mutations by gene class; a TSG match takes precedence, everything else is None.
hypermutationMaf['truncatingType'] = hypermutationMaf.apply(lambda row:
        'truncatingTSG' if row['Hugo_Symbol'] in tsgs and row['Variant_Classification'] in truncatingConsequences
        else 'truncatingOncogene' if row['Hugo_Symbol'] in oncogenes and row['Variant_Classification'] in truncatingConsequences
        else None, axis=1)


In [None]:
#FYI this is written as a csv because of errors that were present when writing as a tsv
# Saves every column of the annotated hypermutator MAF, comma-separated and without the index.
writePath = os.path.join(writeDir, 'figureS1_g.csv')
hypermutationMaf.to_csv(writePath, index=False, sep=',')

## Figure S1(h)
Expression does not explain enrichment of drivers in hypermutated tumors

In [None]:
#currently this analysis is performed in impact it could be done in tcga as well
#returns one data frame with a row per case: mutation rates in expressed vs not-expressed
#genes, overall and split by driver / VUS / truncating mutations
def _mutations_per_bp(nMutations, nBasePairs):
    """Return nMutations / nBasePairs, or NaN when there is no territory to normalise by."""
    if not nBasePairs:
        return np.nan
    return nMutations/nBasePairs


def summarize_mutation_rate_by_expression(maf, expressedGenes, geneSizeDict, tmbDict, impactGenes=None):
    """Per-case mutation rates in expressed vs not-expressed panel genes.

    Parameters
    ----------
    maf : DataFrame with columns 'cancerType', 'dominantSignature', 'Tumor_Sample_Barcode',
        'Hugo_Symbol', 'oncogenic' and 'Variant_Classification'.
    expressedGenes : mapping of GTEx tissue name -> collection of expressed gene symbols.
        It must contain every tissue listed in the mapping below.
    geneSizeDict : mapping of gene symbol -> gene size (NaN sizes are ignored).
    tmbDict : mapping of case id -> TMB; cases not present get None.
    impactGenes : optional collection of panel gene symbols; defaults to
        get_gene_and_cohort_list_utils.get_im6_genes().

    Returns
    -------
    DataFrame with one row per case and columns 'Tumor_Sample_Barcode', 'tmb', 'Signature' and
    'mutRate{,Driver,VUS,Truncating}{Expressed,NotExpressed}'. Each rate is a mutation count
    divided by the summed size of the panel genes that are (or are not) expressed in the
    tissues mapped to the case's cancer type; it is NaN when that territory is empty.

    Only cancer types in the mapping below and the six analysis signatures are kept.
    """
    # NOTE(review): the key 'Prostate' differs in form from the other keys ('... Cancer');
    # confirm it matches the cancerType label actually used in the MAF.
    impactToGtexMapping = {
        'Endometrial Cancer': ['Uterus'],
        'Colorectal Cancer': ['Colon - Sigmoid', 'Colon - Transverse'],
        'Non-Small Cell Lung Cancer': ['Lung'],
        'Bladder Cancer': ['Bladder'],
        'Melanoma': ['Skin - Sun Exposed (Lower leg)', 'Skin - Not Sun Exposed (Suprapubic)'],
        'Esophagogastric Cancer': ['Esophagus - Muscularis', 'Esophagus - Mucosa', 'Esophagus - Gastroesophageal Junction'],
        'Prostate': ['Prostate'],
        'Glioma': ['Brain - Anterior cingulate cortex (BA24)',
                     'Brain - Putamen (basal ganglia)',
                     'Brain - Cerebellum',
                     'Brain - Cerebellar Hemisphere',
                     'Brain - Hypothalamus',
                     'Brain - Amygdala',
                     'Brain - Hippocampus',
                     'Brain - Cortex',
                     'Brain - Spinal cord (cervical c-1)',
                     'Brain - Nucleus accumbens (basal ganglia)',
                     'Brain - Caudate (basal ganglia)',
                     'Brain - Frontal Cortex (BA9)',
                     'Brain - Substantia nigra']
    }

    if impactGenes is None:
        impactGenes = get_gene_and_cohort_list_utils.get_im6_genes()
    impactGenes = set(impactGenes)

    #collapse all multiple gtex tissue types
    # set().union(...) accepts any iterable per tissue, not only sets.
    impactMapping = {}
    for impactCancerType, gtexCancerTypes in impactToGtexMapping.items():
        impactMapping[impactCancerType] = set().union(*[expressedGenes[x] for x in gtexCancerTypes])

    maf = maf[maf['cancerType'].isin(list(impactToGtexMapping.keys()))]
    analysisSignatures = ['mean_10', 'mean_11', 'mean_APOBEC', 'mean_7', 'mean_MMR', 'mean_SMOKING']
    maf = maf[maf['dominantSignature'].isin(analysisSignatures)]

    truncatingConsequences = set(['Frame_Shift_Del', 'Frame_Shift_Ins', 'Nonsense_Mutation'])

    listOfDicts = []
    for cancerType, cancerTypeMaf in maf.groupby('cancerType'):

        #we calculate expressed genes for each cancer type
        sys.stdout.write('%s ' % cancerType)
        cancerTypeExpressedGenes = impactMapping[cancerType]
        bpExpressedGenes = 1.0*np.nansum([size for gene, size in geneSizeDict.items()
                                          if gene in impactGenes and gene in cancerTypeExpressedGenes])
        # Territory of panel genes NOT expressed in this tissue. The previous code reused the
        # "expressed" filter here, so both denominators were identical.
        bpNotExpressedGenes = 1.0*np.nansum([size for gene, size in geneSizeDict.items()
                                             if gene in impactGenes and gene not in cancerTypeExpressedGenes])

        # Only the cases of THIS cancer type. The previous code looped over every case in the
        # MAF, emitting each case once per cancer type with the wrong expressed-gene set.
        for case, caseMaf in cancerTypeMaf.groupby('Tumor_Sample_Barcode'):
            isExpressed = caseMaf['Hugo_Symbol'].isin(cancerTypeExpressedGenes)
            #mutTypeAnalysis
            isDriver = caseMaf['oncogenic'].notnull()
            isTruncating = caseMaf['Variant_Classification'].isin(truncatingConsequences)

            listOfDicts.append({'Tumor_Sample_Barcode': case,
                                'tmb': tmbDict[case] if case in tmbDict else None,
                                'Signature': caseMaf['dominantSignature'].iloc[0],

                                'mutRateExpressed': _mutations_per_bp(isExpressed.sum(), bpExpressedGenes),
                                'mutRateNotExpressed': _mutations_per_bp((~isExpressed).sum(), bpNotExpressedGenes),
                                'mutRateDriverExpressed': _mutations_per_bp((isDriver & isExpressed).sum(), bpExpressedGenes),
                                'mutRateDriverNotExpressed': _mutations_per_bp((isDriver & ~isExpressed).sum(), bpNotExpressedGenes),
                                'mutRateVUSExpressed': _mutations_per_bp((~isDriver & isExpressed).sum(), bpExpressedGenes),
                                'mutRateVUSNotExpressed': _mutations_per_bp((~isDriver & ~isExpressed).sum(), bpNotExpressedGenes),
                                'mutRateTruncatingExpressed': _mutations_per_bp((isTruncating & isExpressed).sum(), bpExpressedGenes),
                                'mutRateTruncatingNotExpressed': _mutations_per_bp((isTruncating & ~isExpressed).sum(), bpNotExpressedGenes)
                               })
    df = pd.DataFrame(listOfDicts)
    return df
            
            

In [None]:
#note this cell takes ~3 minutes to run
#get summary data as needed
expressedGenes = get_gene_and_cohort_list_utils.get_expressed_genes_from_gtex(
    gtexDataPath = filePathDict['GTEX_DATA'], thresh=1)
geneLengthInfo = pd.read_table(filePathDict['GENE_LENGTH_INFO'])
geneLengthDict = dict(zip(geneLengthInfo['hgnc_symbol'], geneLengthInfo['nt.length']))
tmbDict = get_gene_and_cohort_list_utils.get_all_tmb_info(tmbFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'])

#prepare the maf as needed
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
dominantSignatureDict = get_gene_and_cohort_list_utils.get_pan_impact_signature_mapping(
    filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
allHypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# NOTE(review): allNormalIds is not used anywhere below in this notebook section.
allNormalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
# .copy() so that adding 'dominantSignature' is not a chained assignment on a slice of
# allImpactMutsMaf (which raises SettingWithCopyWarning).
hypermutatedMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(allHypermutantIds)].copy()
hypermutatedMaf['dominantSignature'] = hypermutatedMaf['Tumor_Sample_Barcode'].apply(lambda x: dominantSignatureDict[x] if x in dominantSignatureDict else None)


df = summarize_mutation_rate_by_expression(hypermutatedMaf, expressedGenes, geneLengthDict, tmbDict)
# Signature column -> plotting label; anything unlisted becomes None.
expressionSignatureLabels = {
    'mean_10': 'POLE',
    'mean_MMR': 'MMR',
    'mean_11': 'TMZ',
    'mean_7': 'UV',
    'mean_APOBEC': 'APOBEC',
    'mean_SMOKING': 'SMOKING',
}
df['Signature'] = df['Signature'].apply(lambda x: expressionSignatureLabels.get(x))

In [None]:
# Save the expression-stratified mutation-rate table as a tab-separated file without the index.
writePath = os.path.join(writeDir, 'figureS1_h.tsv')
df.to_csv(writePath, index=False, sep='\t')