In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import math
import re
import sys
from collections import Counter
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.multitest import fdrcorrection

notebookPath = 'scripts/figure2'
projectDir = re.sub(notebookPath, '', os.getcwd())
sys.path.append(os.path.join(projectDir, 'scripts/utilityScripts'))

sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import configuration_util
filePathDict = configuration_util.get_all_files_path_dict()
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import clonality_analysis_util
import get_gene_and_cohort_list_utils
import attribute_mutations_to_signatures


In [2]:
#Set where to write the files
writeDir = os.path.join(projectDir, 'scripts/figure2/FIGURE2_PLOTTING_FILES/plotDataFiles/')

## Figure 2S(i)

In [10]:
def make_counts_df(maf, dominantSignatureDict):
    
    def summarize_counts_for_mutation_type(oMaf, mutationType):
        
        oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
        tsgs = get_gene_and_cohort_list_utils.get_tsgs()
        truncatingConsequences = set(['Frame_Shift_Del', 'Frame_Shift_Ins', 'Nonsense_Mutation'])
        
        sMaf = None
        counts = None
        if mutationType == 'tsgTrunc':
            sMaf = oncogenicMaf[oncogenicMaf['Hugo_Symbol'].isin(tsgs) &
                (oncogenicMaf['Variant_Classification'].isin(truncatingConsequences))]
        if mutationType == 'oncogene':
            sMaf = oncogenicMaf[oncogenicMaf['Hugo_Symbol'].isin(oncogenes)]
        if mutationType == 'tsgMissense':
            sMaf = oncogenicMaf[oncogenicMaf['Hugo_Symbol'].isin(tsgs) &
                                (~oncogenicMaf['Variant_Classification'].isin(truncatingConsequences))]
            
        counts = Counter(sMaf['Tumor_Sample_Barcode'])
        counts = [(case, count) for case, count in counts.items()] + [(case, 0) for
            case in set(allIds) - set(sMaf['Tumor_Sample_Barcode'])] 
        return dict(counts)

    
    cancerTypeDict = dict(zip(maf['Tumor_Sample_Barcode'], maf['cancerType']))
    allIds = set(maf['Tumor_Sample_Barcode'])
    oncogenicMaf = maf[maf['oncogenic'].notnull()]
    
    tsgTruncatingCounts = summarize_counts_for_mutation_type(oncogenicMaf, 'tsgTrunc')
    oncogenicCounts = summarize_counts_for_mutation_type(oncogenicMaf, 'oncogene')
    tsgMissenseCounts = summarize_counts_for_mutation_type(oncogenicMaf, 'tsgMissense')
    listOfDicts = []
    for case in allIds:
        listOfDicts.append({'Tumor_Sample_Barcode': case, 'TsgTruncating': tsgTruncatingCounts[case],
                            'Oncogene': oncogenicCounts[case], 'TsgMissense': tsgMissenseCounts[case],
                            'cancerType': cancerTypeDict[case], 
                            'dominantSignature': dominantSignatureDict[case] if case in dominantSignatureDict else None
                           })
    return pd.DataFrame(listOfDicts)

In [12]:
#set up a maf of hypermutated cancers with cancer type annotation
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO'])
allHypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
dominantSignatureDict = get_gene_and_cohort_list_utils.get_pan_impact_signature_mapping(
    filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
hypermutatedMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(allHypermutantIds)]

#get counts of mutations
dfCountsHypermutated = make_counts_df(hypermutatedMaf, dominantSignatureDict)
meltedDf2 = pd.melt(dfCountsHypermutated, id_vars=['dominantSignature'], value_vars=['Oncogene', 'TsgMissense', 'TsgTruncating'])
meltedDf2['dominantSignatureAdj'] = meltedDf2['dominantSignature'].apply(lambda x:
    '_MMR' if x == 'mean_1' or x == 'mean_MMR'
    else '_SMOKING' if x == 'mean_SMOKING'
    else '_APOBEC' if x == 'APOBEC'
    else '_POLE' if x == 'mean_10'
    else '_TMZ' if x == 'mean_11'
    else '_UV' if x == 'mean_7'
    else 'other')

meltedDf2['frac'] = meltedDf2.apply(lambda row:
    1.0*row['value']/sum(meltedDf2[meltedDf2['dominantSignatureAdj'] == row['dominantSignatureAdj']]['value']), axis=1)


  
  sigsDf = pd.read_table(impactSigsPath)


In [14]:
writePath = os.path.join(writeDir, 'figureS2_i.tsv')
meltedDf2.to_csv(writePath, index=False, sep='\t')

## Figure 2S(b)
Mutation attribution for hypermutated tumors

In [25]:
mafWithMutationAttribution = pd.read_table(filePathDict['MAF_WITH_MUTATION_ATTRIBUTION'])
tsgs = get_gene_and_cohort_list_utils.get_tsgs()
oncogenicConsequences = ['Likely Oncogenic', 'Oncogenic', 'Predicted Oncogenic']
mafWithMutationAttribution['geneType'] = mafWithMutationAttribution.apply(lambda row:
        'TSG_truncating' if row['Hugo_Symbol'] in tsgs and row['Variant_Classification'] in ['Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins'] and row['oncogenic'] in oncogenicConsequences 
        else 'TSG_missense' if row['Hugo_Symbol'] in tsgs and row['oncogenic'] in oncogenicConsequences
        else 'Oncogene' if row['oncogenic'] in oncogenicConsequences
        else 'VUS', axis=1
)
mafWithMutationAttribution['hypermutationInduced'] = mafWithMutationAttribution['hypermutationInduced'].apply(lambda x:
    'Almost certain' if x == 'hyperInduced'
    else 'Possible' if x == 'unclear'
    else 'Unlikely' if x == 'notHyperAttributable'
            else None)

  """Entry point for launching an IPython kernel.


In [26]:
writeMaf = mafWithMutationAttribution[['geneType', 'hypermutationInduced', 'Hugo_Symbol', 'HGVSp_Short']]
writePath = os.path.join(writeDir, 'figureS2_b.tsv')
writeMaf.to_csv(writePath, index=False, sep='\t')

In [28]:
#Percent truncating mutations due to signatures
tsgTruncMaf = mafWithMutationAttribution[(mafWithMutationAttribution['geneType'] == 'TSG_truncating')]
print (tsgTruncMaf[tsgTruncMaf['hypermutationInduced'] != 'Unlikely'].shape[0]*100.)/tsgTruncMaf.shape[0]

78.3807312826


## Figure 2S(c)
Summary of all possible drivers in IMPACT

In [33]:
def summarize_all_possible_drivers(allPossibleMutations):
    genes = set(allPossibleMutations['Hugo_Symbol'])
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    total = 0
    tsgTrunc = 0
    tsgOncogenic = 0
    oncogene = 0
    for gene in genes:
        geneGroup = dict(allPossibleMutations[allPossibleMutations['Hugo_Symbol'] == gene].iloc[0])
        if gene in tsgs:
            tsgTrunc += np.nansum([value for key, value in geneGroup.items() if 'truncating' in key])
            tsgOncogenic += np.nansum([value for key, value in geneGroup.items() if 'oncogenic' in key])
            total += np.nansum([value for key, value in geneGroup.items() if 'nonSilent' in key])
        else:
            total += np.nansum([value for key, value in geneGroup.items() if 'nonSilent' in key])
            oncogene += np.nansum([value for key, value in geneGroup.items() if 'oncogenic' in key])
    vus = total - oncogene - tsgOncogenic
    tsgMissense = tsgOncogenic - tsgTrunc
    df = pd.DataFrame([{'mutationType':'VUS', 'number': vus}, 
                        {'mutationType':'TSG_MISSENSE', 'number': tsgMissense},
                       {'mutationType': 'TSG_TRUNCATING', 'number': tsgTrunc},
                      {'mutationType':'ONCOGENE', 'number':oncogene}])
    return df
        

In [34]:
dfAllPossibleMutations = pd.read_table(filePathDict['ALL_POSSIBLE_MUTATION_SUMMARY'])
df = summarize_all_possible_drivers(dfAllPossibleMutations)


  """Entry point for launching an IPython kernel.


In [36]:
writePath = os.path.join(writeDir, 'figureS2_c.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2S(d)

In [15]:
def summarize_obs_vs_expected_by_gene_type(maf, relatedGenesDict, expectedDf, cancerTypeDict):
    impact341Genes = get_gene_and_cohort_list_utils.get_im3_genes()
    expectedDf = expectedDf[expectedDf['gene'].isin(impact341Genes)]
    maf = maf[maf['Hugo_Symbol'].isin(impact341Genes)]
    oncSnpMaf = maf[(maf['oncogenic'].notnull()) & (maf['Variant_Type'] == 'SNP')]
    mutCounts = dict(maf['Tumor_Sample_Barcode'].value_counts())
    listOfDicts = []
    cntr = 0
    for case, cancerType in cancerTypeDict.items():
        if case in set(maf['Tumor_Sample_Barcode']) and case in set(expectedDf['case']):
        
            if cntr%50 == 0: print cntr,
            caseMaf = oncSnpMaf[oncSnpMaf['Tumor_Sample_Barcode'] == case]
            caseExpectation = expectedDf[expectedDf['case'] == case]
            relatedGenes = relatedGenesDict[cancerType]

            relatedGenesMaf = caseMaf[caseMaf['Hugo_Symbol'].isin(relatedGenes)]
            notRelatedGenesMaf = caseMaf[~caseMaf['Hugo_Symbol'].isin(relatedGenes)]
            relatedGenesExpected = caseExpectation[caseExpectation['gene'].isin(relatedGenes)]
            notRelatedGenesExpected = caseExpectation[~caseExpectation['gene'].isin(relatedGenes)]

            listOfDicts.append({'cancerType': cancerType, 'Tumor_Sample_Barcode': case,
                                'nmut_IM341': mutCounts[case],
                'nRelatedObserved': relatedGenesMaf.shape[0],
                'nUnrelatedObserved': notRelatedGenesMaf.shape[0],
                'nRelatedExpected': mutCounts[case]*sum(relatedGenesExpected['oncogenicChance']),
                'nUnrelatedExpected': mutCounts[case]*sum(notRelatedGenesExpected['oncogenicChance'])
            })

            cntr += 1
        
    df = pd.DataFrame(listOfDicts)
    return df

In [24]:
#Takes ~5 minutes to run
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
relatedGenesDict = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type()
dominantSignatureDict = get_gene_and_cohort_list_utils.get_pan_impact_signature_mapping(
    filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])

#Load in data as needed
cancerTypeDict = dict(get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO']))
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
allImpactMutsMaf['hypermutationStatus'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x:
    'hypermutated' if x in hypermutantIds else 'normal' if x in normalIds else 'Intermediate')
hypermutantMaf = allImpactMutsMaf[allImpactMutsMaf['hypermutationStatus'] == 'hypermutated']

expectedDf = pd.read_table(filePathDict['EXPECTED_MUTATION_INFO_BY_GENE'])
df = summarize_obs_vs_expected_by_gene_type(hypermutantMaf, relatedGenesDict, expectedDf, cancerTypeDict)
df['dominantSignature'] = df['Tumor_Sample_Barcode'].apply(lambda x: dominantSignatureDict[x] if x in dominantSignatureDict else None)

df['dominantSignature'] = df['dominantSignature'].apply(lambda x: 
            'APOBEC' if x == 'mean_APOBEC'
            else 'MMR' if x == 'mean_MMR'
            else 'TMZ' if x == 'mean_11'
            else 'POLE' if x == 'mean_10'
            else 'other')

In [25]:
writePath = os.path.join(writeDir, 'figureS2_ii.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2S(iii)

In [27]:
df = pd.read_table('/Users/friedman/Desktop/hypermutationProjectFinal/tables/table5.tsv')

#LOAD data from 'table' that contains dnds scores
df = pd.read_table(filePathDict['TABLE_5'])
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
relatedGenesDict = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type()
df['related'] = df.apply(lambda row:
                'related' if row['CANCER_TYPE'] in relatedGenesDict and row['GENE'] in relatedGenesDict[row['CANCER_TYPE']]
                           else 'not-related', axis=1)
df['dndsIsSignificantHyper'] = df['DNDS_HYPER_SCORE'].apply(lambda x: 1 if x <.05 else 0)
df['cancerTypeGene'] = df.apply(lambda row: row['CANCER_TYPE'] + '_' + row['related'], axis=1)

  after removing the cwd from sys.path.
  import sys
  


In [31]:
writePath = os.path.join(writeDir, 'figureS2_iii.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2S(iv)

In [33]:
#samples a normal maf to have mutations drawn from similar distribution as hypermutated cases
def sample_normal_maf(normalMaf, hyperMaf, N=25):
    
    hyperCancerTypeCounter = Counter(hyperMaf['cancerType'])
    normalCancerTypeCounter = Counter(normalMaf['cancerType'])
    
    normalMaf['weight'] = normalMaf['cancerType'].apply(lambda x: 
        (1.0*normalMaf.shape[0]*hyperCancerTypeCounter[x])/(hyperMaf.shape[0]*normalCancerTypeCounter[x]))
    sampledCounts = normalMaf.sample(frac=N, weights='weight', replace=True)
    return sampledCounts

def summarize_mutational_prevalence(hyperMaf, normalMafSampled):
    
    nCasesHyper = len(set(hyperMaf['Tumor_Sample_Barcode']))
    nCasesNormal = len(set(normalMaf['Tumor_Sample_Barcode']))
    
    oncoMafNormal = normalMafSampled[normalMafSampled['oncogenic'].notnull()]
    oncoMafHyper = hyperMaf[hyperMaf['oncogenic'].notnull()]
    
    listOfDicts = []
    
    cntr = 0
    for gene in set(oncoMafNormal['Hugo_Symbol']) | set(oncoMafHyper['Hugo_Symbol']):
        
        geneMafNormal = oncoMafNormal[oncoMafNormal['Hugo_Symbol'] == gene]
        geneMafHyper = oncoMafHyper[oncoMafHyper['Hugo_Symbol'] == gene]
        normalCount = len(set(geneMafNormal['Tumor_Sample_Barcode']))
        hyperCount = len(set(geneMafHyper['Tumor_Sample_Barcode']))
        
        listOfDicts.append({'gene': gene,
            'countNormal': normalCount,
            'countHyper': hyperCount
        })
        
        cntr += 1
        if cntr%50 == 0: print cntr
        
    df = pd.DataFrame(listOfDicts)
    df['fracNormal'] = df['countNormal'].apply(lambda x: (1.0*x)/nCasesNormal)
    df['fracHyper'] = df['countHyper'].apply(lambda x:  (1.0*x)/nCasesHyper)
    
    return df

In [None]:
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
msiCases = get_gene_and_cohort_list_utils.get_msi_cases(msiInfoFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'], msiScoreThresh=10)
apobecCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_APOBEC')
poleCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_10')
hypermutatedIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(normalIds)]

#summarize the data for different signature cohorts
#note more signature cohorts could be added here
listOfDfs = []
caseMapping = {'POLE': poleCases, 'APOBEC': apobecCases, 'MSI': msiCases}
for sigType, cases in caseMapping.items():
    print 'sampling', sigType
    hypermutatedMaf = allImpactMutsMaf[(allImpactMutsMaf['Tumor_Sample_Barcode'].isin(cases))]
    samples = sample_normal_maf(normalMaf, hypermutatedMaf, N=15)
    print 'summarizing'
    df = summarize_mutational_prevalence(hypermutatedMaf, samples)
    df['signature'] = sigType
    listOfDfs.append(df)


combinedDf = pd.concat(listOfDfs)
combinedDf['ratio'] = combinedDf['fracHyper']/combinedDf['fracNormal']
combinedDf['ddrGene'] = combinedDf['gene'].apply(lambda x: True if x in ['POLE', 'MLH1', 'MSH2', 'MSH6', 'PMS2'] else False)


In [36]:
writePath = os.path.join(writeDir, 'figureS2_iv.tsv')
combinedDf.to_csv(writePath, index=False, sep='\t')

## Figure 2S(v)

In [None]:
#TODO will adapt code later: 9/21/2020

#specfically look at pentanucleotide context in POLE cases and its relation to mutationa susceptibility


poleMafWithPentaContext = pd.read_table(filePathDict['PENTANUCLEOTIDE_CONTEXT_ANNOTATED_MAF'])
poleMafWithPentaContext['pentaChange'] = poleMafWithPentaContext.apply(lambda row: 
                                    mutationSigUtils.create_strand_specific_pentanucleotide_change(row['Ref_Tri.1'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)




## Figure 2S(vi)

In [None]:
#Todo: adapt the code for cumulative prevalence curves for mutations

## Figure 2S(vii)

In [37]:
def compare_mutation_prevalences_across_multiple_cancer_types(
        allMsiCasesMaf,
        cancerTypes = ['Endometrial Cancer', 'Colorectal Cancer', 'Esophagogastric Cancer', 'Prostate Cancer']):
    
    listOfDfs = []
    compsDone = []
    for c1 in cancerTypes:
        for c2 in cancerTypes:
            comp = '/'.join(sorted((c1, c2)))
            if c1 != c2 and comp not in compsDone:
                cancerTypesMaf = allMsiCasesMaf[
                    allMsiCasesMaf['cancerType'].isin([c1, c2]) & (allMsiCasesMaf['Tumor_Sample_Barcode'].isin(msiCases))]
                df = analysis_utils.make_comparissons(cancerTypesMaf, mode='gene', cancerType1 = c1, cancerType2 = c2)
                df['comp'] = comp            
                compsDone.append(comp)
                listOfDfs.append(df)
    combinedDf = pd.concat(listOfDfs)
    return combinedDf

In [38]:
#NOTE THE CODE FOR THESE SUPPLEMENTAL FIGURES IS VERY SIMILAR TO 2C
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
msiCases = get_gene_and_cohort_list_utils.get_msi_cases(msiInfoFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'], msiScoreThresh=10)

#We have to match MSI alleles (sometimes they have different names etc)
#the value correctedAllele is the proper allele for us to work with
msiSummary = pd.read_table(filePathDict['MICROSATELLITE_INFORMATION'])
allMsiCasesMaf = allImpactMutsMaf[(allImpactMutsMaf['Tumor_Sample_Barcode'].isin(msiCases))]
allMsiCasesMaf['allele'] = allMsiCasesMaf.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short']), axis=1)
msiSummary['allele'] = msiSummary.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short']), axis=1)
neverObservedSites, msiSitesToNameMapping, mafMsiSiteToNameMapping =  analysis_utils.standardize_allele_names(msiSummary, allMsiCasesMaf)
msiSummary['correctedAllele'] = msiSummary['allele'].apply(lambda x: mafMsiSiteToNameMapping[x] if x in mafMsiSiteToNameMapping else None)
allMsiCasesMaf['correctedAllele'] = allMsiCasesMaf['allele'].apply(lambda x:
                                                                 mafMsiSiteToNameMapping[x] if x in mafMsiSiteToNameMapping else None)

df = compare_mutation_prevalences_across_multiple_cancer_types(allMsiCasesMaf)

  
  if __name__ == '__main__':
  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [42]:
writePath = os.path.join(writeDir, 'figureS2_vii.tsv')
df.to_csv(writePath, index=False, sep='\t')


## Figure 2S(viii)

In [44]:
poleCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_10')
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
poleEndometrialColorectalMaf = allImpactMutsMaf[
    (allImpactMutsMaf['cancerType'].isin(['Endometrial Cancer', 'Colorectal Cancer'])) & 
    (allImpactMutsMaf['Tumor_Sample_Barcode'].isin(poleCases))]
df = analysis_utils.make_comparissons(poleEndometrialColorectalMaf, mode = 'gene', mutationType='pole')

  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  maf['allele'] = maf['Hugo_Symbol']


In [46]:
writePath = os.path.join(writeDir, 'figureS2_viii.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2S(ix)

In [47]:
def summarize_inactivation_method(maf, hyperIds, normalIds, cancerTypes = ['']):
    listOfDicts = []
    for cancerType in cancerTypes:
        cancerTypeMaf = maf[maf['cancerType'] == cancerType]
        hyperMaf = cancerTypeMaf[cancerTypeMaf['Tumor_Sample_Barcode'].isin(hyperIds)]
        normalMaf = cancerTypeMaf[cancerTypeMaf['Tumor_Sample_Barcode'].isin(normalIds)]
    
        listOfDicts.append({'nHyperLOH': len(set(hyperMaf[hyperMaf['lossType'] == 'LOH']['geneCase'])),
                            'nHyperComposite': len(set(hyperMaf[hyperMaf['lossType'] == 'composite_mutation']['geneCase'])),
                            'nNormalLOH': len(set(normalMaf[normalMaf['lossType'] == 'LOH']['geneCase'])),
                            'nNormalComposite': len(set(normalMaf[normalMaf['lossType'] == 'composite_mutation']['geneCase'])),
                            'cancerType': cancerType
                           })
        
    df = pd.DataFrame(listOfDicts)
    df['totalHyper'] = df['nHyperLOH'] + df['nHyperComposite']
    df['totalNormal'] = df['nNormalLOH'] + df['nNormalComposite']
        
    return df

In [None]:
#Takes ~2 minutes
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
allImpactMutsMaf['varUuid'] = allImpactMutsMaf.apply(lambda row:
    row['Tumor_Sample_Barcode'] + '_' + str(row['Start_Position']) + '_' + str(row['Tumor_Seq_Allele2']), axis=1)
oncogenicMutIds = set(allImpactMutsMaf[allImpactMutsMaf['oncogenic'].notnull()]['varUuid'])
mafWithClonalityAnnotation = pd.read_csv(filePathDict['IMPACT_MAF_WITH_ADJUSTED_CLONALITY_ANNOTATION'])
mafWithClonalityAnnotation['varUuid'] = mafWithClonalityAnnotation.apply(lambda row:
    row['Tumor_Sample_Barcode'] + '_' + str(row['Start_Position']) + '_' + str(row['Tumor_Seq_Allele2']), axis=1)

tsgs = get_gene_and_cohort_list_utils.get_tsgs()
tsgOncogenicMaf = mafWithClonalityAnnotation[(mafWithClonalityAnnotation['varUuid'].isin(oncogenicMutIds)) &
                                             (mafWithClonalityAnnotation['Hugo_Symbol'].isin(tsgs))]
tsgOncogenicMaf['geneCase'] = tsgOncogenicMaf.apply(lambda row: row['Hugo_Symbol'] + '_' + row['Tumor_Sample_Barcode'], axis=1)
occurenceDict = dict(tsgOncogenicMaf['geneCase'].value_counts())                                                                                     
tsgOncogenicMaf['isMultiplet'] = tsgOncogenicMaf['geneCase'].apply(lambda x: True if occurenceDict[x] > 1 else False)
tsgOncogenicMaf['isLOH'] = tsgOncogenicMaf['lcn'].apply(lambda x: True if x == 0 else False)
tsgOncogenicMaf['lossType'] = tsgOncogenicMaf.apply(lambda row: 'LOH' if row['isLOH'] == True
    else 'composite_mutation' if row['isMultiplet'] == True else False, axis=1)
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info()
tsgOncogenicMaf['cancerType'] = tsgOncogenicMaf['Tumor_Sample_Barcode'].apply(lambda x:
    cancerTypeDict[x] if x in cancerTypeDict else None)

df = summarize_inactivation_method(tsgOncogenicMaf, hypermutantIds, normalIds, cancerTypes =
    ['Endometrial Cancer', 'Colorectal Cancer', 'Melanoma',
     'Prostate Cancer', 'Glioma', 'Bladder Cancer', 'Non-Small Cell Lung Cancer'])

In [49]:
writePath = os.path.join(writeDir, 'figureS2_ix.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2S(x)

In [3]:
def convert_df_to_np_array(df):
    dfReduced = df[['Tumor_Sample_Barcode', 'shuffleColumn']]
    #npArr = dfReduced.to_numpy()
    npArr = dfReduced.values
    return npArr

def shuffle_arr(arr):
    nameCol = arr[:,0]
    valueCol = arr[:,1]
    np.random.shuffle(nameCol)
    return np.column_stack((nameCol, valueCol))

def summarize_shuffle_results(shuffledArr, genes):
    zippedList = zip(shuffledArr[:,0], shuffledArr[:,1])

    df = pd.DataFrame([])
    df['Tumor_Sample_Barcode'] = shuffledArr[:,0]
    df['Hugo_Symbol'] = shuffledArr[:,1]
    df['geneCase'] = df.apply(lambda row: row['Tumor_Sample_Barcode'] + '_' + row['Hugo_Symbol'], axis=1)
    
    l = [geneCase.split('_')[1] for geneCase, count in df['geneCase'].value_counts().items() if count > 1]
    
    return Counter(l)

def do_n_iterations_of_shuffling(maf, n, genes, mode='geneDist'):
    npArr = convert_df_to_np_array(maf)
    listOfDicts = []
    for i in range(n):
        if i%10 == 0: print 'on iteration ', i,
        shuffledArr = shuffle_arr(npArr)
        shuffleResults = summarize_shuffle_results(shuffledArr, genes)
        shuffleResults['iter'] = i
        listOfDicts.append(shuffleResults)
        i += 1
    return pd.DataFrame(listOfDicts)

def do_permutation_test_across_cancer_types(maf, cancerTypes = [], n =10):
    listOfDfs = []
    for cancerType in cancerTypes:
        cTypeMaf = maf[maf['cancerType'] == cancerType]
        observedMultipletDict = dict(cTypeMaf[cTypeMaf['isMultiplet'] == True].drop_duplicates(subset=['caseGene'])['Hugo_Symbol'].value_counts())
        permTestSummary = do_n_iterations_of_shuffling(cTypeMaf, n, genes = set(cTypeMaf['Hugo_Symbol']), mode='geneDist')
        df = summarize_permutation_test_results(permTestSummary, observedMultipletDict, n)
        df['cancerType'] = cancerType
        listOfDfs.append(df)
    cDf = pd.concat(listOfDfs)
    return cDf

def summarize_significant_genes_by_cancer_type(results, cancerTypes = []):
    tsgs = set(['ERRFI1', 'ASXL2', 'PMAIP1', 'ACTG1', 'SUFU', 'FBXO11', 'MEN1', 'FAM58A', 'B2M', 'RB1', 'DUSP22', 'SESN1', 'GPS2', 'RAD51D', 'SMG1', 'CDC73', 'MAP3K1', 'SMARCB1', 'INPP4B', 'PARK2', 'SMAD4', 'CBFB', 'CDH1', 'PPP6C', 'SETDB1', 'SETDB2', 'NF2', 'CDKN2B', 'CDKN2C', 'CDKN2A', 'DDX3X', 'PIK3R1', 'BARD1', 'PDS5B', 'KLF4', 'SPRED1', 'VHL', 'SMAD2', 'PMS1', 'PMS2', 'SETD2', 'GATA3', 'TBL1XR1', 'MUTYH', 'SOCS1', 'FAM175A', 'ROBO1', 'ARID1B', 'ARID1A', 'TCF7L2', 'STK11', 'FOXA1', 'PTEN', 'FAT1', 'FAS', 'CYLD', 'MAX', 'SH2D1A', 'APC', 'NTHL1', 'CTCF', 'KDM5C', 'KMT2C', 'ZFHX3', 'FOXP1', 'PIGA', 'CDKN1B', 'CDKN1A', 'FUBP1', 'MSH2', 'ID3', 'TNFRSF14', 'TRAF3', 'EP400', 'BRIP1', 'ARID4A', 'ARID4B', 'XRCC2', 'DAXX', 'SDHAF2', 'ASXL1', 'AMER1', 'RASA1', 'EGR1', 'MST1', 'SOX17', 'RUNX1', 'PIK3R3', 'NCOR1', 'NF1', 'JAK1', 'PTPRD', 'CHEK2', 'CHEK1', 'SMC1A', 'TMEM127', 'STAG1', 'RAD51', 'TCF3', 'STAG2', 'ARID2', 'RAD50', 'RNF43', 'PARP1', 'BLM', 'CUX1', 'RECQL', 'RAD21', 'PTPN2', 'PTPN1', 'SLX4', 'INHA', 'PAX5', 'IRF1', 'TP53', 'HLA-A', 'IRF8', 'CBL', 'TOP1', 'SHQ1', 'PRDM1', 'NSD1', 'ATXN2', 'CREBBP', 'HDAC4', 'SESN2', 'PPP2R1A', 'EPHA7', 'ATM', 'EPHA3', 'POT1', 'SMAD3', 'MOB3B', 'TBX3', 'POLE', 'ATR', 'FANCD2', 'FH', 'BCORL1', 'SOX9', 'IKZF3', 'TSC1', 'TP63', 'MRE11A', 'SDHC', 'BTG1', 'POLD1', 'CIITA', 'SMC3', 'SAMHD1', 'RTEL1', 'ECT2L', 'PIK3R2', 'CRBN', 'FANCC', 'NBN', 'FANCA', 'HLA-B', 'RECQL4', 'DUSP4', 'ERCC2', 'FBXW7', 'TGFBR2', 'TGFBR1', 'MSH3', 'RBM15', 'TET1', 'TET3', 'SESN3', 'MGA', 'LTB', 'FOXL2', 'SH2B3', 'BCOR', 'HIST1H1D', 'ATRX', 'EP300', 'RAD51C', 'RAD51B', 'HIST1H1B', 'TNFAIP3', 'DICER1', 'ARID5B', 'LATS2', 'FOXO1', 'KEAP1', 'EZH2', 'SP140', 'NKX3-1', 'PBRM1', 'PALB2', 'CIC', 'BRCA1', 'DTX1', 'FLCN', 'SPEN', 'CD58', 'ERCC3', 'ERCC4', 'MSH6', 'BCL11B', 'BMPR1A', 'ERF', 'BRCA2', 'NOTCH2', 'EED', 'MITF', 'ELF3', 'SMARCA4', 'BBC3', 'ANKRD11', 'CEBPA', 'BCL2L11', 'AXIN2', 'AXIN1', 'CDK12', 'ESCO2', 'MLH1', 'SDHB', 'MED12', 'HNF1A', 'RYBP', 'ATP6V1B2', 'DNMT3B', 'KMT2B', 'KMT2A', 'DNMT3A', 'NFKBIA', 'TRAF5', 'KMT2D', 'SPOP', 'RBM10', 'P2RY8', 'TP53BP1', 'TSC2', 'KDM6A', 'EPCAM', 'PHOX2B', 'NPM1', 'BCL10', 'LATS1', 'HOXB13', 'ARID3A', 'PTPRT', 'PTPRS', 'INPPL1', 'NOTCH4', 'TET2', 'NOTCH1', 'CASP8', 'NOTCH3', 'GRIN2A', 'MAP2K4', 'WT1', 'BACH2', 'SDHA', 'BAP1', 'PTCH1', 'SDHD'])
    listOfDicts = []
    for cancerType in cancerTypes:
        print 'analyzing', cancerType
        cTypeResults = results[results['cancerType'] == cancerType]
        signifResults = cTypeResults[cTypeResults['pVal'] < .05]
        for gene in set(signifResults['gene']):
            geneResults = signifResults[signifResults['gene'] == gene]
            t = 'TSG'
            if gene not in tsgs:
                t = 'Oncogene'
            
            listOfDicts.append({'gene': gene, 'geneType': t, 'cancerType': cancerType})
    return pd.DataFrame(listOfDicts)

def summarize_permutation_test_results(permTestTable, observedMultipletDict, n):
    permTestTable = permTestTable.fillna(0)
    listOfDicts = []
    cntr = 0
    
    impactGenes = get_gene_and_cohort_list_utils.get_im6_genes()
    for gene in set(permTestTable.columns.values) - set(['iter']):
        nCases = 0
        if gene in observedMultipletDict:
            nCases = observedMultipletDict[gene]
        nCasesPermAbove = permTestTable[permTestTable[gene] >= nCases].shape[0]
        pVal = (1.0*nCasesPermAbove)/n
        listOfDicts.append({'nObs': nCases, 'permAbove': nCasesPermAbove, 'pVal': pVal, 'gene': gene})

    return pd.DataFrame(listOfDicts)

In [4]:
nIter = 300 
#Load in data and summarize it
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

#mark information so we can do the permutation test
allHypermutantMutMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(hypermutantIds)]
oncmutsMaf = allHypermutantMutMaf[allHypermutantMutMaf['oncogenic'].notnull()]
oncmutsMaf['caseGene'] = oncmutsMaf['Tumor_Sample_Barcode'] + '_' + oncmutsMaf['Hugo_Symbol']
mutCountsDict = dict(oncmutsMaf['caseGene'].value_counts())
oncmutsMaf['mutationCount'] = oncmutsMaf.apply(lambda row: mutCountsDict[row['caseGene']], axis=1)
oncmutsMaf['isMultiplet'] = oncmutsMaf['mutationCount'].apply(lambda x: True if x > 1 else False)

#do permutation test
oncmutsMaf['shuffleColumn'] = oncmutsMaf['Hugo_Symbol']
cTypes =  ['Endometrial Cancer', 'Colorectal Cancer', 'Prostate Cancer', 
                                       'Bladder Cancer', 'Esophagogastric Cancer', 'Glioma', 'Non-Small Cell Lung Cancer', 'Small Bowel Cancer'] 

df = do_permutation_test_across_cancer_types(oncmutsMaf, cancerTypes = cTypes, n = nIter) 

#summarize the data
sumDf = summarize_significant_genes_by_cancer_type(df, cancerTypes = cTypes)
counts = Counter(sumDf['gene'])
sumDf['geneCount'] = sumDf['gene'].apply(lambda x: counts[x])

tsgs = get_gene_and_cohort_list_utils.get_tsgs()
df['geneType'] = df['gene'].apply(lambda x: 'tsg' if x in tsgs else 'oncogene')

  This is separate from the ipykernel package so we can avoid doing imports until
  interactivity=interactivity, compiler=compiler, result=result)
  impactCancerTypeDf = pd.read_table(impactCancerTypeInfoPath)


NameError: name 'hypermutantIds' is not defined

In [174]:
writePath = os.path.join(writeDir, 'figureS2_x.tsv')
df.to_csv(writePath, index=False, sep='\t')









## SUPPLEMENTARY FIGURE: gene prevalene normal vs hyper by signature type

In [None]:
def make_comparissons(maf, mode = 'gene', mutationType='msi',
        cancerType1 = 'Endometrial Cancer', cancerType2 = 'Colorectal Cancer'):
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
    indelClassifications = ['Frame_Shift_Del', 'Frame_Shift_Ins']  #TODO actually only include MSI indels not just random indels
    truncatingClassifications = ['']
    
    if mutationType == 'msi':
        maf = maf[maf['Variant_Classification'].isin(indelClassifications)]
        maf = maf[maf['correctedAllele'].notnull()]
    elif mutationType == 'pole':
        maf = maf[maf['Variant_Classification'].isin(['Nonsense_Mutation'])] 
    else:
        pass
    
    if mode == 'gene':
        maf['allele'] = maf['Hugo_Symbol']
    else:
        maf['allele'] = maf['Hugo_Symbol'] + '_' + maf['HGVSp_Short']
    
    c1Maf = maf[maf['cancerType'] == cancerType1]
    c2Maf = maf[maf['cancerType'] == cancerType2]
    
    nC1 = 1.0*len(set(c1Maf['Tumor_Sample_Barcode']))
    nC2 = 1.0*len(set(c2Maf['Tumor_Sample_Barcode']))
    
    listOfDicts = []
    for allele in set(maf['allele']):
        allele = str(allele)
        aMafC1 = c1Maf[c1Maf['allele'] == allele]
        aMafC2 = c2Maf[c2Maf['allele'] == allele]
        
        gene = ''
        if mode == 'gene':
            gene = allele
        else:
            gene = allele.split('_')[0]
        
        
            
        geneType = 'tsg' if gene in tsgs else 'oncogene' if gene in oncogenes else None
        c1Count = len(set(aMafC1['Tumor_Sample_Barcode']))
        c2Count = len(set(aMafC2['Tumor_Sample_Barcode']))
        listOfDicts.append({'Allele': allele, 'GeneType': geneType, 'Gene': gene,
                            'N_C1': c1Count, 'N_C2': c2Count, 'total_C1': nC1, 'total_C2': nC2,
                            'c1_cancerType': cancerType1, 'c2_cancerType': cancerType2,
                           'perCase_c1': c1Count/nC1, 'perCase_c2': c2Count/nC2})
    
    df = pd.DataFrame(listOfDicts)    
    df['n_NotPresent_c1'] = df['N_C1'].apply(lambda x: nC1 - x)
    df['n_NotPresent_c2'] = df['N_C2'].apply(lambda x: nC2 - x)
    
    #get fisher's test results
    df['p_proportions_z_score'] = df.apply(lambda row: proportions_ztest(np.array([row['N_C1'], row['N_C2']]),
                                                                         np.array([nC1, nC2]))[1], axis=1)
    
    df = df[df['p_proportions_z_score'].notnull()] #remove null z scores, otherwise the qVal wont calculate
    fdrDict = dict(zip(df['Allele'], fdrcorrection(df['p_proportions_z_score'])[1]))
    df['qVal'] = df['Allele'].apply(lambda x: fdrDict[x])
    
    wntGenes = get_gene_and_cohort_list_utils.get_pathway_genes('WNT')
    pi3kGenes = get_gene_and_cohort_list_utils.get_pathway_genes('PI3K') | set(['INPPL1', 'JAK1']) #manually add these guys
    df['pathway'] = df['Gene'].apply(lambda x: 'WNT' if x in wntGenes else 'PI3K' if x in pi3kGenes else 'OTHER')
    return df

In [None]:
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])

In [None]:
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
msiCases = get_gene_and_cohort_list_utils.get_msi_cases(msiInfoFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'], msiScoreThresh=10)
hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])


In [None]:
poleCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_10')
apobecCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_APOBEC')


In [None]:
apobecMaf = allImpactMutsMaf[
    allImpactMutsMaf['Tumor_Sample_Barcode'].isin(apobecCases) &
    (allImpactMutsMaf['Tumor_Sample_Barcode'].isin(hypermutantIds))]

In [None]:
#Counter(apobecMaf.drop_duplicates(subset=['Tumor_Sample_Barcode'])['cancerType'])

df = make_comparissons(apobecMaf, mode = 'gene', mutationType='',
        cancerType1 = 'Bladder Cancer', cancerType2 = 'Breast Cancer')



In [None]:
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/apobecCompTest.tsv', index=False, sep='\t')

## Observed and expcted by related vs unrelated gene class

In [None]:
def summarize_obs_vs_expected_by_gene_type(maf, relatedGenesDict, expectedDf, cancerTypeDict):
    impact341Genes = get_gene_and_cohort_list_utils.get_im3_genes()
    expectedDf = expectedDf[expectedDf['gene'].isin(impact341Genes)]
    maf = maf[maf['Hugo_Symbol'].isin(impact341Genes)]
    oncSnpMaf = maf[(maf['oncogenic'].notnull()) & (maf['Variant_Type'] == 'SNP')]
    mutCounts = dict(maf['Tumor_Sample_Barcode'].value_counts())
    listOfDicts = []
    cntr = 0
    for case, cancerType in cancerTypeDict.items():
        if case in set(maf['Tumor_Sample_Barcode']) and case in set(expectedDf['case']):
        
            if cntr%50 == 0: print cntr,
            caseMaf = oncSnpMaf[oncSnpMaf['Tumor_Sample_Barcode'] == case]
            caseExpectation = expectedDf[expectedDf['case'] == case]
            relatedGenes = relatedGenesDict[cancerType]

            relatedGenesMaf = caseMaf[caseMaf['Hugo_Symbol'].isin(relatedGenes)]
            notRelatedGenesMaf = caseMaf[~caseMaf['Hugo_Symbol'].isin(relatedGenes)]
            relatedGenesExpected = caseExpectation[caseExpectation['gene'].isin(relatedGenes)]
            notRelatedGenesExpected = caseExpectation[~caseExpectation['gene'].isin(relatedGenes)]

            listOfDicts.append({'cancerType': cancerType, 'Tumor_Sample_Barcode': case,
                                'nmut_IM341': mutCounts[case],
                'nRelatedObserved': relatedGenesMaf.shape[0],
                'nUnrelatedObserved': notRelatedGenesMaf.shape[0],
                'nRelatedExpected': mutCounts[case]*sum(relatedGenesExpected['oncogenicChance']),
                'nUnrelatedExpected': mutCounts[case]*sum(notRelatedGenesExpected['oncogenicChance'])
            })

            cntr += 1
        
    df = pd.DataFrame(listOfDicts)
    return df

In [None]:
expectedDf = pd.read_table(filePathDict['EXPECTED_MUTATION_INFO_BY_GENE'])
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
relatedGenesDict = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type()

In [None]:
cancerTypeDict = dict(get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO']))
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
allImpactMutsMaf['hypermutationStatus'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x:
    'hypermutated' if x in hypermutantIds else 'normal' if x in normalIds else 'Intermediate')
hypermutantMaf = allImpactMutsMaf[allImpactMutsMaf['hypermutationStatus'] == 'hypermutated']

In [None]:
df = summarize_obs_vs_expected_by_gene_type(hypermutantMaf, relatedGenesDict, expectedDf, cancerTypeDict)

In [None]:
#dominantSignatureDict = get_gene_and_cohort_list_utils.get_pan_impact_signature_mapping(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
df['dominantSignature'] = df['Tumor_Sample_Barcode'].apply(lambda x:
        dominantSignatureDict[x] if x in dominantSignatureDict else None)
df['dominantSignature'] = df['dominantSignature'].apply(lambda x: 
        'APOBEC' if x == 'mean_APOBEC'
        else 'TMZ' if x == 'mean_11'
        else 'MMR' if x in ['mean_1', 'mean_MMR']
        #else 'UV' if x == 'mean_7'
        else 'POLE' if x == 'mean_10'
        #else 'SMOKING' if x == 'mean_SMOKING'
        else 'other')

In [None]:
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/obsExpectedRelatedGenesSup.tsv', index=False, sep='\t')

## Supplementary figure: enrichment of mutations in hypermutated tumors

In [None]:
#samples a normal maf to have mutations drawn from similar distribution as hypermutated cases
def sample_normal_maf(normalMaf, hyperMaf, N=25):
    
    hyperCancerTypeCounter = Counter(hyperMaf['cancerType'])
    normalCancerTypeCounter = Counter(normalMaf['cancerType'])
    
    normalMaf['weight'] = normalMaf['cancerType'].apply(lambda x: 
        (1.0*normalMaf.shape[0]*hyperCancerTypeCounter[x])/(hyperMaf.shape[0]*normalCancerTypeCounter[x]))
    sampledCounts = normalMaf.sample(frac=N, weights='weight', replace=True)
    return sampledCounts

def summarize_mutational_prevalence(hyperMaf, normalMafSampled):
    
    nCasesHyper = len(set(hyperMaf['Tumor_Sample_Barcode']))
    nCasesNormal = len(set(normalMaf['Tumor_Sample_Barcode']))
    
    oncoMafNormal = normalMafSampled[normalMafSampled['oncogenic'].notnull()]
    oncoMafHyper = hyperMaf[hyperMaf['oncogenic'].notnull()]
    
    listOfDicts = []
    
    cntr = 0
    for gene in set(oncoMafNormal['Hugo_Symbol']) | set(oncoMafHyper['Hugo_Symbol']):
        
        geneMafNormal = oncoMafNormal[oncoMafNormal['Hugo_Symbol'] == gene]
        geneMafHyper = oncoMafHyper[oncoMafHyper['Hugo_Symbol'] == gene]
        normalCount = len(set(geneMafNormal['Tumor_Sample_Barcode']))
        hyperCount = len(set(geneMafHyper['Tumor_Sample_Barcode']))
        
        listOfDicts.append({'gene': gene,
            'countNormal': normalCount,
            'countHyper': hyperCount
        })
        
        cntr += 1
        if cntr%50 == 0: print cntr
        
    df = pd.DataFrame(listOfDicts)
    df['fracNormal'] = df['countNormal'].apply(lambda x: (1.0*x)/nCasesNormal)
    df['fracHyper'] = df['countHyper'].apply(lambda x:  (1.0*x)/nCasesHyper)
    
    return df
        

In [None]:
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

msiCases = get_gene_and_cohort_list_utils.get_msi_cases(msiInfoFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'], msiScoreThresh=10)
apobecCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_APOBEC')
poleCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_10')

hypermutatedIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(normalIds)]


In [None]:
listOfDfs = []
caseMapping = {'POLE': poleCases, 'APOBEC': apobecCases, 'MSI': msiCases}
"""for sigType, cases in caseMapping.items():
    print 'sampling', sigType
    hypermutatedMaf = allImpactMutsMaf[(allImpactMutsMaf['Tumor_Sample_Barcode'].isin(cases))]
    samples = sample_normal_maf(normalMaf, hypermutatedMaf, N=15)
    print 'summarizing'
    df = summarize_mutational_prevalence(hypermutatedMaf, samples)
    df['signature'] = sigType
    listOfDfs.append(df)
"""

#combinedDf = pd.concat(listOfDfs)
combinedDf['ratio'] = combinedDf['fracHyper']/combinedDf['fracNormal']
combinedDf['ddrGene'] = combinedDf['gene'].apply(lambda x: True if x in ['POLE', 'MLH1', 'MSH2', 'MSH6', 'PMS2'] else False)


In [None]:
combinedDf.to_csv('~/Desktop/WORK/dataForLocalPlotting/percentComps.tsv', index=False, sep='\t')

## DNDS supplemental figure

In [None]:
df = pd.read_table('/Users/friedman/Desktop/hypermutationProjectFinal/tables/table5.tsv')
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])

In [None]:
relatedGenesDict = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type()

In [None]:
#relatedGenesDict
df['related'] = df.apply(lambda row:
                'related' if row['CANCER_TYPE'] in relatedGenesDict and row['GENE'] in relatedGenesDict[row['CANCER_TYPE']]
                           else 'not-related', axis=1)
df['dndsIsSignificantHyper'] = df['DNDS_HYPER_SCORE'].apply(lambda x: 1 if x <.05 else 0)
df['cancerTypeGene'] = df.apply(lambda row: row['CANCER_TYPE'] + '_' + row['related'], axis=1)

In [None]:
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/dndsSupplement.tsv',index=False, sep='\t')