In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import math
import re
import sys
import scipy
from collections import Counter
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.multitest import fdrcorrection

# Derive the project root by removing the notebook's sub-path from the current
# working directory, then make the shared utility modules importable.
# NOTE(review): re.sub treats notebookPath as a regular expression and replaces
# every match in the cwd, not only a trailing one — confirm the notebook is
# always launched from <project>/scripts/figure2.
notebookPath = 'scripts/figure2'
projectDir = re.sub(notebookPath, '', os.getcwd())
sys.path.append(os.path.join(projectDir, 'scripts/utilityScripts'))

# NOTE(review): machine-specific absolute path, appended after the
# projectDir-derived entry above — consider removing it so the notebook does not
# depend on one user's directory layout.
sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import configuration_util
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import clonality_analysis_util
import get_gene_and_cohort_list_utils
import attribute_mutations_to_signatures

# Lookup of input-file keys (e.g. 'IMPACT_BASE_MAF') to paths, used by every cell below.
filePathDict = configuration_util.get_all_files_path_dict()

In [2]:
#Set where to write the files: the directory (under projectDir) that receives the
#figureS2_*.tsv plotting tables written by the cells below
writeDir = os.path.join(projectDir, 'scripts/figure2/FIGURE2_PLOTTING_FILES/plotDataFiles/')

## Figure 2S(a)
Driver mutation types in hypermutated tumors by signature

In [3]:
def make_counts_df(maf, dominantSignatureDict):
    """Count oncogenic driver mutations per tumor, split by driver class.

    Parameters
    ----------
    maf : pandas.DataFrame
        Mutation table. The columns read here are 'Tumor_Sample_Barcode',
        'cancerType', 'oncogenic', 'Hugo_Symbol' and 'Variant_Classification'.
    dominantSignatureDict : dict
        Maps a tumor barcode to its dominant signature label. Tumors missing
        from the mapping get None.

    Returns
    -------
    pandas.DataFrame
        One row per distinct barcode in `maf` with the columns
        'Tumor_Sample_Barcode', 'TsgTruncating', 'Oncogene', 'TsgMissense',
        'cancerType' and 'dominantSignature'.
    """
    # The gene lists do not depend on the mutation type, so fetch them once
    # instead of once per summarize call.
    oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    truncatingConsequences = set(['Frame_Shift_Del', 'Frame_Shift_Ins', 'Nonsense_Mutation'])

    cancerTypeDict = dict(zip(maf['Tumor_Sample_Barcode'], maf['cancerType']))
    allIds = set(maf['Tumor_Sample_Barcode'])
    # Only rows carrying an oncogenic annotation are counted as drivers.
    oncogenicMaf = maf[maf['oncogenic'].notnull()]

    def summarize_counts_for_mutation_type(oMaf, mutationType):
        """Return {barcode: number of `mutationType` rows in oMaf}, zero-filled for every id in allIds."""
        isTsg = oMaf['Hugo_Symbol'].isin(tsgs)
        isTruncating = oMaf['Variant_Classification'].isin(truncatingConsequences)
        if mutationType == 'tsgTrunc':
            sMaf = oMaf[isTsg & isTruncating]
        elif mutationType == 'oncogene':
            sMaf = oMaf[oMaf['Hugo_Symbol'].isin(oncogenes)]
        elif mutationType == 'tsgMissense':
            # Every non-truncating TSG row lands here, not only missense variants.
            sMaf = oMaf[isTsg & ~isTruncating]
        else:
            raise ValueError('unknown mutationType: %r' % (mutationType,))

        # Start every tumor at zero so cases without this mutation type still get a row.
        counts = dict((case, 0) for case in allIds)
        counts.update(Counter(sMaf['Tumor_Sample_Barcode']))
        return counts

    tsgTruncatingCounts = summarize_counts_for_mutation_type(oncogenicMaf, 'tsgTrunc')
    oncogenicCounts = summarize_counts_for_mutation_type(oncogenicMaf, 'oncogene')
    tsgMissenseCounts = summarize_counts_for_mutation_type(oncogenicMaf, 'tsgMissense')

    listOfDicts = [
        {'Tumor_Sample_Barcode': case,
         'TsgTruncating': tsgTruncatingCounts[case],
         'Oncogene': oncogenicCounts[case],
         'TsgMissense': tsgMissenseCounts[case],
         'cancerType': cancerTypeDict[case],
         'dominantSignature': dominantSignatureDict[case] if case in dominantSignatureDict else None}
        for case in allIds]
    return pd.DataFrame(listOfDicts)

In [4]:
#set up a maf of hypermutated cancers with cancer type annotation
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO'])
allHypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
dominantSignatureDict = get_gene_and_cohort_list_utils.get_pan_impact_signature_mapping(
    filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
hypermutatedMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(allHypermutantIds)]

#get counts of mutations
dfCountsHypermutated = make_counts_df(hypermutatedMaf, dominantSignatureDict)
#long format: one row per (tumor, driver class) with the count in 'value'
meltedDf2 = pd.melt(dfCountsHypermutated, id_vars=['dominantSignature'], value_vars=['Oncogene', 'TsgMissense', 'TsgTruncating'])
#collapse the raw signature names into the plotting groups; anything unlisted (including missing) becomes 'other'
meltedDf2['dominantSignatureAdj'] = meltedDf2['dominantSignature'].apply(lambda x:
    '_MMR' if x == 'mean_1' or x == 'mean_MMR'
    else '_SMOKING' if x == 'mean_SMOKING'
    else '_APOBEC' if x == 'mean_APOBEC'
    else '_POLE' if x == 'mean_10'
    else '_TMZ' if x == 'mean_11'
    else '_UV' if x == 'mean_7'
    else 'other')

#each row's share of all driver mutations in its signature group; the group totals are
#computed once with a grouped transform instead of re-filtering the frame for every row
meltedDf2['frac'] = 1.0*meltedDf2['value']/meltedDf2.groupby('dominantSignatureAdj')['value'].transform('sum')


  
  interactivity=interactivity, compiler=compiler, result=result)
  impactCancerTypeDf = pd.read_table(impactCancerTypeInfoPath)
  df = pd.read_table(path)
  sigsDf = pd.read_table(impactSigsPath)


In [5]:
# Save meltedDf2 as figureS2_a.tsv under writeDir (tab-separated, row index omitted).
writePath = os.path.join(writeDir, 'figureS2_a.tsv')
meltedDf2.to_csv(writePath, index=False, sep='\t')

## Figure 2S(b)
Mutation attribution for hypermutated tumors

In [6]:
mafWithMutationAttribution = pd.read_table(filePathDict['MAF_WITH_MUTATION_ATTRIBUTION'])
tsgs = get_gene_and_cohort_list_utils.get_tsgs()
oncogenicConsequences = ['Likely Oncogenic', 'Oncogenic', 'Predicted Oncogenic']

# Classify each mutation. Rows without an oncogenic call are VUS; oncogenic rows
# outside the TSG list are 'Oncogene'; oncogenic TSG rows are split by whether the
# variant is truncating.
mafWithMutationAttribution['geneType'] = mafWithMutationAttribution.apply(lambda row:
        'VUS' if row['oncogenic'] not in oncogenicConsequences
        else 'Oncogene' if row['Hugo_Symbol'] not in tsgs
        else 'TSG_truncating' if row['Variant_Classification'] in ['Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins']
        else 'TSG_missense', axis=1
)

# Replace the attribution codes with the labels used in the figure; any other
# value (including missing) becomes None.
mafWithMutationAttribution['hypermutationInduced'] = mafWithMutationAttribution['hypermutationInduced'].apply(
    lambda x, labels={'hyperInduced': 'Almost certain',
                      'unclear': 'Possible',
                      'notHyperAttributable': 'Unlikely'}: labels.get(x))

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Keep only the columns written for panel (b) and save them as figureS2_b.tsv
# under writeDir (tab-separated, row index omitted).
writeMaf = mafWithMutationAttribution[['geneType', 'hypermutationInduced', 'Hugo_Symbol', 'HGVSp_Short']]
writePath = os.path.join(writeDir, 'figureS2_b.tsv')
writeMaf.to_csv(writePath, index=False, sep='\t')

In [8]:
#Percent truncating mutations due to signatures
#i.e. percent of TSG_truncating rows whose hypermutationInduced label is anything other than 'Unlikely'
#NOTE(review): rows whose label is missing also count as "not Unlikely" here - confirm that is intended
tsgTruncMaf = mafWithMutationAttribution[(mafWithMutationAttribution['geneType'] == 'TSG_truncating')]
#the whole expression is parenthesised so the line prints the same value under Python 2 and Python 3
print((tsgTruncMaf[tsgTruncMaf['hypermutationInduced'] != 'Unlikely'].shape[0]*100.)/tsgTruncMaf.shape[0])

78.3807312826


## Figure 2S(c)
Summary of all possible drivers in IMPACT

In [9]:
def summarize_all_possible_drivers(allPossibleMutations):
    """Tally the possible mutations across genes into four classes.

    For each distinct 'Hugo_Symbol' the first matching row is used. Columns are
    selected by substring of their name: 'nonSilent' columns feed the overall
    total, 'oncogenic' columns feed the oncogene or TSG driver tally depending
    on whether the gene is in the TSG list, and 'truncating' columns are
    counted for TSGs only. NaN entries are ignored by the sums.

    Returns a DataFrame with columns 'mutationType' and 'number' and one row
    each for VUS, TSG_MISSENSE, TSG_TRUNCATING and ONCOGENE.
    """
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()

    def sum_columns_containing(geneRow, tag):
        # NaN-ignoring sum over every column whose name contains `tag`
        return np.nansum([value for key, value in geneRow.items() if tag in key])

    total = 0
    tsgTrunc = 0
    tsgOncogenic = 0
    oncogene = 0
    for gene in set(allPossibleMutations['Hugo_Symbol']):
        isThisGene = allPossibleMutations['Hugo_Symbol'] == gene
        geneRow = dict(allPossibleMutations[isThisGene].iloc[0])
        nOncogenic = sum_columns_containing(geneRow, 'oncogenic')
        total += sum_columns_containing(geneRow, 'nonSilent')
        if gene in tsgs:
            tsgTrunc += sum_columns_containing(geneRow, 'truncating')
            tsgOncogenic += nOncogenic
        else:
            oncogene += nOncogenic

    # Whatever is non-silent but not oncogenic is a VUS; oncogenic TSG mutations
    # that are not truncating are reported as TSG_MISSENSE.
    vus = total - oncogene - tsgOncogenic
    tsgMissense = tsgOncogenic - tsgTrunc
    return pd.DataFrame([
        {'mutationType':'VUS', 'number': vus},
        {'mutationType':'TSG_MISSENSE', 'number': tsgMissense},
        {'mutationType': 'TSG_TRUNCATING', 'number': tsgTrunc},
        {'mutationType':'ONCOGENE', 'number':oncogene},
    ])
        

In [10]:
# Load the per-gene summary of all possible mutations and collapse it into the
# four-class table used for panel (c).
dfAllPossibleMutations = pd.read_table(filePathDict['ALL_POSSIBLE_MUTATION_SUMMARY'])
df = summarize_all_possible_drivers(dfAllPossibleMutations)


  """Entry point for launching an IPython kernel.


In [11]:
# Save df as figureS2_c.tsv under writeDir (tab-separated, row index omitted).
writePath = os.path.join(writeDir, 'figureS2_c.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2S(d)
Fraction drivers in related genes

In [12]:
def summarize_obs_vs_expected_by_gene_type(maf, relatedGenesDict, expectedDf, cancerTypeDict):
    """Compare observed and expected oncogenic SNP counts in related vs unrelated genes.

    Both tables are first restricted to the gene list returned by
    get_im3_genes(). A case is summarized only if it has at least one row in the
    restricted maf and at least one row in the restricted expectedDf.

    Parameters
    ----------
    maf : pandas.DataFrame
        Mutation table; reads 'Hugo_Symbol', 'oncogenic', 'Variant_Type' and
        'Tumor_Sample_Barcode'.
    relatedGenesDict : dict
        Cancer type -> collection of genes considered related to it. A cancer
        type missing from this mapping raises KeyError.
    expectedDf : pandas.DataFrame
        Per-case, per-gene expectations; reads 'gene', 'case' and
        'oncogenicChance'.
    cancerTypeDict : dict
        Case barcode -> cancer type; defines which cases are visited.

    Returns
    -------
    pandas.DataFrame
        One row per summarized case with 'cancerType', 'Tumor_Sample_Barcode',
        'nmut_IM341', 'nRelatedObserved', 'nUnrelatedObserved',
        'nRelatedExpected' and 'nUnrelatedExpected'. The expected counts are
        the case's mutation count times the summed 'oncogenicChance'.
    """
    impact341Genes = get_gene_and_cohort_list_utils.get_im3_genes()
    expectedDf = expectedDf[expectedDf['gene'].isin(impact341Genes)]
    maf = maf[maf['Hugo_Symbol'].isin(impact341Genes)]
    oncSnpMaf = maf[(maf['oncogenic'].notnull()) & (maf['Variant_Type'] == 'SNP')]
    mutCounts = dict(maf['Tumor_Sample_Barcode'].value_counts())

    # Split both tables by case once. Rebuilding the membership sets and
    # re-filtering the full tables inside the loop made the run time grow with
    # cases x rows. Row order inside each group is preserved, so the sums below
    # are accumulated in the same order as before.
    oncSnpByCase = dict((case, caseRows) for case, caseRows in oncSnpMaf.groupby('Tumor_Sample_Barcode'))
    expectedByCase = dict((case, caseRows) for case, caseRows in expectedDf.groupby('case'))
    noOncSnps = oncSnpMaf.iloc[0:0]  # empty frame for cases without oncogenic SNPs

    listOfDicts = []
    cntr = 0
    for case, cancerType in cancerTypeDict.items():
        if case not in mutCounts or case not in expectedByCase:
            continue

        if cntr%50 == 0:
            # progress indicator on one line; written directly so it behaves the
            # same under Python 2 and Python 3
            sys.stdout.write('%d ' % cntr)
        caseMaf = oncSnpByCase.get(case, noOncSnps)
        caseExpectation = expectedByCase[case]
        relatedGenes = relatedGenesDict[cancerType]

        isRelatedObserved = caseMaf['Hugo_Symbol'].isin(relatedGenes)
        isRelatedExpected = caseExpectation['gene'].isin(relatedGenes)
        relatedGenesExpected = caseExpectation[isRelatedExpected]
        notRelatedGenesExpected = caseExpectation[~isRelatedExpected]

        listOfDicts.append({'cancerType': cancerType, 'Tumor_Sample_Barcode': case,
                            'nmut_IM341': mutCounts[case],
            'nRelatedObserved': caseMaf[isRelatedObserved].shape[0],
            'nUnrelatedObserved': caseMaf[~isRelatedObserved].shape[0],
            'nRelatedExpected': mutCounts[case]*sum(relatedGenesExpected['oncogenicChance']),
            'nUnrelatedExpected': mutCounts[case]*sum(notRelatedGenesExpected['oncogenicChance'])
        })

        cntr += 1

    df = pd.DataFrame(listOfDicts)
    return df

In [48]:
#Takes ~5 minutes to run
# NOTE(review): this cell's execution count (48) is out of sequence with its
# neighbours — re-run the notebook top to bottom before trusting saved outputs.
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
relatedGenesDict = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type()
dominantSignatureDict = get_gene_and_cohort_list_utils.get_pan_impact_signature_mapping(
    filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])

#Load in data as needed
cancerTypeDict = dict(get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO']))
# Annotate each mutation with its tumor's cancer type (None when the tumor is not in the mapping).
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# Three-way status: tumors in neither id list are labelled 'Intermediate'.
allImpactMutsMaf['hypermutationStatus'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x:
    'hypermutated' if x in hypermutantIds else 'normal' if x in normalIds else 'Intermediate')
hypermutantMaf = allImpactMutsMaf[allImpactMutsMaf['hypermutationStatus'] == 'hypermutated']

expectedDf = pd.read_table(filePathDict['EXPECTED_MUTATION_INFO_BY_GENE'])
df = summarize_obs_vs_expected_by_gene_type(hypermutantMaf, relatedGenesDict, expectedDf, cancerTypeDict)
df['dominantSignature'] = df['Tumor_Sample_Barcode'].apply(lambda x: dominantSignatureDict[x] if x in dominantSignatureDict else None)

# Collapse raw signature names into plotting labels; everything unlisted becomes 'other'.
# NOTE(review): the Figure 2S(a) cell also maps 'mean_1' to MMR, while here only
# 'mean_MMR' does — confirm the difference is intentional.
df['dominantSignature'] = df['dominantSignature'].apply(lambda x: 
            'APOBEC' if x == 'mean_APOBEC'
            else 'MMR' if x == 'mean_MMR'
            else 'TMZ' if x == 'mean_11'
            else 'POLE' if x == 'mean_10'
            else 'other')

  
  app.launch_new_instance()


0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450


In [13]:
# Save df as figureS2_d.tsv under writeDir (tab-separated, row index omitted).
writePath = os.path.join(writeDir, 'figureS2_d.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2S(e)
DNDS scores by related vs unrelated genes

In [14]:
#LOAD data from 'table' that contains dnds scores
#(read only through filePathDict so the cell does not depend on a machine-specific absolute path)
df = pd.read_table(filePathDict['TABLE_5'])
#NOTE(review): allImpactMutsMaf is reloaded here but not used in this cell - confirm it can be dropped
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
relatedGenesDict = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type()
#a gene is 'related' when it appears in the related-gene list of the row's own cancer type
df['related'] = df.apply(lambda row:
                'related' if row['CANCER_TYPE'] in relatedGenesDict and row['GENE'] in relatedGenesDict[row['CANCER_TYPE']]
                           else 'not-related', axis=1)
#1 when DNDS_HYPER_SCORE is below 0.05, else 0 (missing scores give 0)
df['dndsIsSignificantHyper'] = df['DNDS_HYPER_SCORE'].apply(lambda x: 1 if x <.05 else 0)
df['cancerTypeGene'] = df.apply(lambda row: row['CANCER_TYPE'] + '_' + row['related'], axis=1)

  """Entry point for launching an IPython kernel.
  after removing the cwd from sys.path.
  """
  allImpactMutsMaf = pd.read_table(impactMafPath) #todo get this in a better way
  if self.run_code(code, result):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  oncMaf['caseGene'] = oncMaf['Tumor_Sample_Barcode'] + '_' + oncMaf['Hugo_Symbol']


In [15]:
# Save df as figureS2_e.tsv under writeDir (tab-separated, row index omitted).
writePath = os.path.join(writeDir, 'figureS2_e.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2S(f)
Comparing the enrichment of mutations in hypermutated tumors of various aetiologies

In [16]:
#samples a normal maf to have mutations drawn from similar distribution as hypermutated cases
def sample_normal_maf(normalMaf, hyperMaf, N=25, random_state=None):
    """Resample rows of normalMaf so their cancer-type mix follows hyperMaf.

    Each normal row is weighted by the ratio of its cancer type's share of rows
    in hyperMaf to that type's share of rows in normalMaf. Rows are then drawn
    with replacement; cancer types absent from hyperMaf get weight 0 and are
    never drawn.

    Parameters
    ----------
    normalMaf, hyperMaf : pandas.DataFrame
        Mutation tables with a 'cancerType' column. normalMaf is not modified.
    N : number
        Passed to DataFrame.sample as `frac`: the result has N times as many
        rows as normalMaf.
    random_state : optional
        Forwarded to DataFrame.sample; pass a value for a reproducible draw.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, including the 'weight' column used for the draw.

    Raises
    ------
    ZeroDivisionError
        If hyperMaf has no rows while normalMaf is non-empty.
    """
    hyperCancerTypeCounter = Counter(hyperMaf['cancerType'])
    normalCancerTypeCounter = Counter(normalMaf['cancerType'])
    nNormalRows = normalMaf.shape[0]
    nHyperRows = hyperMaf.shape[0]

    weights = normalMaf['cancerType'].apply(lambda x:
        (1.0*nNormalRows*hyperCancerTypeCounter[x])/(nHyperRows*normalCancerTypeCounter[x]))
    # assign() returns a new frame, so the caller's table (often itself a slice
    # of a larger maf) is left untouched and no SettingWithCopyWarning is raised.
    weightedMaf = normalMaf.assign(weight=weights)
    sampledCounts = weightedMaf.sample(frac=N, weights='weight', replace=True, random_state=random_state)
    return sampledCounts

def summarize_mutational_prevalence(hyperMaf, normalMafSampled):
    """Per-gene fraction of cases carrying an oncogenic mutation in each cohort.

    Parameters
    ----------
    hyperMaf : pandas.DataFrame
        Mutations of the hypermutated cohort.
    normalMafSampled : pandas.DataFrame
        Mutations of the (re)sampled normal cohort.
        Both tables need 'Tumor_Sample_Barcode', 'Hugo_Symbol' and 'oncogenic'.

    Returns
    -------
    pandas.DataFrame
        One row per gene with an oncogenic mutation in either cohort:
        'gene', 'countNormal', 'countHyper' (distinct cases with an oncogenic
        mutation in the gene) and 'fracNormal', 'fracHyper' (those counts
        divided by the number of distinct cases in the respective table).
        Empty, with those columns, when neither cohort has an oncogenic row.
    """
    nCasesHyper = len(set(hyperMaf['Tumor_Sample_Barcode']))
    # Use the table that was passed in. The denominator previously came from the
    # notebook-level global `normalMaf`, which made the function depend on
    # kernel state and mismatched the numerator (counted on the sampled table).
    nCasesNormal = len(set(normalMafSampled['Tumor_Sample_Barcode']))

    oncoMafNormal = normalMafSampled[normalMafSampled['oncogenic'].notnull()]
    oncoMafHyper = hyperMaf[hyperMaf['oncogenic'].notnull()]

    # Distinct cases per gene, computed in one pass per cohort instead of
    # filtering the whole table once per gene.
    normalCaseCounts = oncoMafNormal.groupby('Hugo_Symbol')['Tumor_Sample_Barcode'].nunique().to_dict()
    hyperCaseCounts = oncoMafHyper.groupby('Hugo_Symbol')['Tumor_Sample_Barcode'].nunique().to_dict()

    listOfDicts = []

    cntr = 0
    for gene in set(oncoMafNormal['Hugo_Symbol']) | set(oncoMafHyper['Hugo_Symbol']):
        listOfDicts.append({'gene': gene,
            'countNormal': int(normalCaseCounts.get(gene, 0)),
            'countHyper': int(hyperCaseCounts.get(gene, 0))
        })

        cntr += 1
        if cntr%50 == 0: print(cntr)

    if not listOfDicts:
        # nothing oncogenic in either cohort: return an empty table with the usual columns
        return pd.DataFrame(columns=['gene', 'countNormal', 'countHyper', 'fracNormal', 'fracHyper'])

    df = pd.DataFrame(listOfDicts)
    df['fracNormal'] = df['countNormal'].apply(lambda x: (1.0*x)/nCasesNormal)
    df['fracHyper'] = df['countHyper'].apply(lambda x:  (1.0*x)/nCasesHyper)

    return df

In [17]:
# Load the full mutation table, annotate cancer types, and collect the case
# lists for each signature cohort plus the non-hypermutated ("normal") cases.
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
msiCases = get_gene_and_cohort_list_utils.get_msi_cases(msiInfoFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'], msiScoreThresh=10)
apobecCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_APOBEC')
poleCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_10')
# NOTE(review): hypermutatedIds is not used in this cell.
hypermutatedIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# NOTE(review): normalMaf is a slice of allImpactMutsMaf, and sample_normal_maf
# adds a 'weight' column to it — the source of the SettingWithCopyWarning in
# this cell's output.
normalMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(normalIds)]

#summarize the data for different signature cohorts
#note more signature cohorts could be added here
listOfDfs = []
caseMapping = {'POLE': poleCases, 'APOBEC': apobecCases, 'MSI': msiCases}
for sigType, cases in caseMapping.items():
    print 'sampling', sigType
    hypermutatedMaf = allImpactMutsMaf[(allImpactMutsMaf['Tumor_Sample_Barcode'].isin(cases))]
    # The draw is unseeded, so the sampled normal cohort differs between runs.
    samples = sample_normal_maf(normalMaf, hypermutatedMaf, N=15)
    print 'summarizing'
    df = summarize_mutational_prevalence(hypermutatedMaf, samples)
    df['signature'] = sigType
    listOfDfs.append(df)


combinedDf = pd.concat(listOfDfs)
# Enrichment of each gene in the hypermutated cohort relative to the sampled
# normals; a gene with fracNormal == 0 gives inf (or NaN when both are 0).
combinedDf['ratio'] = combinedDf['fracHyper']/combinedDf['fracNormal']
# Flag the five genes listed here (POLE, MLH1, MSH2, MSH6, PMS2).
combinedDf['ddrGene'] = combinedDf['gene'].apply(lambda x: True if x in ['POLE', 'MLH1', 'MSH2', 'MSH6', 'PMS2'] else False)


  """Entry point for launching an IPython kernel.
  msiInfoDf = pd.read_table(msiInfoFilePath)
  sigsDf = pd.read_table(impactSigsPath)


sampling POLE


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


summarizing
50
100
150
200
250
300
sampling APOBEC
summarizing
50
100
150
200
250
300
350
sampling MSI
summarizing
50
100
150
200
250
300
350


In [18]:
# Save combinedDf as figureS2_f.tsv under writeDir (tab-separated, row index omitted).
writePath = os.path.join(writeDir, 'figureS2_f.tsv')
combinedDf.to_csv(writePath, index=False, sep='\t')

## Figure 2S(g)
POLE mutational hotspots compared to mutations of equal mutability

In [21]:
#specifically look at pentanucleotide context in POLE cases and its relation to mutational susceptibility
poleMafWithPentaContext = pd.read_table(filePathDict['PENTANUCLEOTIDE_CONTEXT_ANNOTATED_MAF'])
poleMafWithPentaContext['pentaChange'] = poleMafWithPentaContext.apply(lambda row:
                                    mutationSigUtils.create_strand_specific_pentanucleotide_change(row['Ref_Tri.1'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)
poleMafWithPentaContext['allele'] = poleMafWithPentaContext.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short']), axis=1)
tsgs = get_gene_and_cohort_list_utils.get_tsgs()

#NOTE(review): 'Inconclusive' counts as a driver for non-TSG genes but not for TSGs - confirm this is intended
poleMafWithPentaContext['mutationType'] = poleMafWithPentaContext.apply(lambda row:
                'tsg_driver' if row['Hugo_Symbol'] in tsgs and row['oncogenic'] in ['Oncogenic', 'Likely Oncogenic', 'Predicted Oncogenic']
                else 'oncogene_driver' if row['oncogenic'] in ['Oncogenic', 'Likely Oncogenic', 'Predicted Oncogenic', 'Inconclusive']
                else 'VUS', axis=1)
poleMafWithPentaContext['uniqueMutationType'] = poleMafWithPentaContext.apply(lambda row: str(row['allele']) + ';' + str(row['mutationType']), axis=1)

#for each of the 10 most common pentanucleotide changes, the mean and (population) standard
#deviation of how many times each allele with that change occurs
meanDict = {}
stdDict = {}
for penta, count in Counter(poleMafWithPentaContext['pentaChange']).most_common(10):
    pentaMaf = poleMafWithPentaContext[poleMafWithPentaContext['pentaChange'] == penta]
    counts = [count for value, count in Counter(pentaMaf['allele']).items()]
    mean = np.mean(counts)
    std = np.std(counts)
    meanDict[penta] = mean
    stdDict[penta] = std

poleMafWithPentaContext['std'] = poleMafWithPentaContext['pentaChange'].apply(lambda x: stdDict[x] if x in stdDict else None)
poleMafWithPentaContext['mean'] = poleMafWithPentaContext['pentaChange'].apply(lambda x: meanDict[x] if x in meanDict else None)
#keep only mutations in one of the top-10 contexts; copy so that adding the 'count' column
#below does not assign onto a slice of poleMafWithPentaContext (SettingWithCopyWarning)
writeMaf = poleMafWithPentaContext[poleMafWithPentaContext['mean'].notnull()].copy()
writeMaf['count'] = 1
#writeMaf['driverType'] = df['oncogenic'].apply(lambda x: 'oncogenic' if x in ['Likely Oncogenic', 'Oncogenic', 'Predicted Oncogenic'] else 'not_driver')

#occurrences of each allele, then its distance from the context mean in units of the context std
#(a context whose alleles all occur equally often has std 0, giving inf/NaN here)
df = writeMaf.groupby(['pentaChange', 'std', 'mean', 'allele', 'Hugo_Symbol', 'uniqueMutationType'])['count'].sum().reset_index()
df['deviation'] = df.apply(lambda row: (row['count'] - row['mean'])/row['std'], axis=1)

df['geneType'] = df['Hugo_Symbol'].apply(lambda x: 'TSG' if x in tsgs else 'Oncogene')
df['mutationType'] = df['uniqueMutationType'].apply(lambda x: x.split(';')[1])


  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [20]:
# Save df as figureS2_g.tsv under writeDir (tab-separated, row index omitted).
writePath = os.path.join(writeDir, 'figureS2_g.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2S(h)
Hypermutated tumors disproportionately mutate common hotspots

In [25]:
def assign_hotspot_freq_dict(df, nCases=47000):
    """Map every hotspot allele to its frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Hotspot table; reads 'ref', 'Hugo_Symbol', 'Amino_Acid_Position'
        (concatenated as text, so it must hold strings) and 'Var_AA', a
        '|'-separated list of '<altAminoAcid>:<count>' entries.
    nCases : number
        Denominator used to turn a count into a frequency. Defaults to 47000,
        the value that was previously hard-coded.
        NOTE(review): presumably the size of the cohort the hotspot counts
        come from — confirm.

    Returns
    -------
    dict
        '<gene>:<ref><position><alt>' -> count / nCases. When the same name
        occurs more than once the last occurrence wins.
    """
    d = {}
    for index, row in df.iterrows():
        # everything but the alternate amino acid is shared by all entries of the row
        namePrefix = row['Hugo_Symbol'] + ':' + row['ref'] + row['Amino_Acid_Position']
        for entry in row['Var_AA'].split('|'):
            altAminoAcid, count = entry.split(':')
            d[namePrefix + altAminoAcid] = float(count)/nCases
    return d

#returns a dictionary mapping a hotspot to its pancan rank order 
def rank_hotspots(hotspotIncidenceD):
    """Rank hotspots by frequency, most frequent first.

    `hotspotIncidenceD` maps a hotspot name to its frequency. The result maps
    each name to a 0-based rank, where rank 0 is the highest frequency. The
    sort is stable, so equally frequent hotspots keep the order in which the
    input dictionary yields them.
    """
    byDescendingFreq = sorted(hotspotIncidenceD.items(), key=lambda item: item[1], reverse=True)
    return dict((allele, rank) for rank, (allele, freq) in enumerate(byDescendingFreq))

#given a df and hotspots, for the 3 hypermutator cancer types assigns hotspots percentiles
def assign_percentiles_to_df(df):
    """Add per-cancer-type frequency percentiles for related and unrelated hotspots.

    For each of Endometrial_Cancer, Colorectal_Cancer and Glioma the rows are
    split on the boolean column '<ct>_RelatedHotspot', and 'freq' is converted
    to a percentile rank within each half. Two columns are added in place:
    '<ct>_related_percentile' (None for unrelated rows) and
    '<ct>_not_related_percentile' (None for related rows). Percentiles are
    looked up by the 'hotspot' name. The same, modified `df` is returned.
    """
    for ct in ['Endometrial_Cancer', 'Colorectal_Cancer', 'Glioma']:
        flagColumn = ct + '_RelatedHotspot'
        relatedRows = df[df[flagColumn] == True]
        unrelatedRows = df[df[flagColumn] == False]
        relatedPct = dict(zip(relatedRows['hotspot'], relatedRows['freq'].rank(pct=True)))
        unrelatedPct = dict(zip(unrelatedRows['hotspot'], unrelatedRows['freq'].rank(pct=True)))

        def related_percentile(row):
            # unrelated hotspots carry no value in the "related" column
            if row[flagColumn] == False:
                return None
            return relatedPct[row['hotspot']]

        def not_related_percentile(row):
            # related hotspots carry no value in the "not related" column
            if row[flagColumn] == True:
                return None
            return unrelatedPct[row['hotspot']]

        df[ct + '_related_percentile'] = df.apply(related_percentile, axis=1)
        df[ct + '_not_related_percentile'] = df.apply(not_related_percentile, axis=1)
    return df 

def assign_hotspot_mutation_utilization_curves(hotspotMafDf, hotspotRankingD, cancerTypeName):
    """Build cumulative hotspot-usage curves for related and unrelated genes.

    Parameters
    ----------
    hotspotMafDf : pandas.DataFrame
        Hotspot mutations with a 'hotspotName' column and a boolean
        'isCtypeCancerGene' column. It is not modified.
    hotspotRankingD : dict
        Hotspot name -> rank. Mutations whose hotspot has no rank are dropped.
    cancerTypeName : str
        Written to the 'cancerType' column of every output row.

    Returns
    -------
    pandas.DataFrame
        Columns 'hotspot', 'class' ('related' / 'unrelated'), 'val',
        'percentile' and 'cancerType'. Each class starts with an origin row
        (val 0, percentile 0). Then, for each distinct hotspot in ascending
        rank order, 'percentile' is its position divided by the number of
        distinct hotspots in the class, and 'val' is the running fraction of
        the class's mutations seen so far.
    """
    def ranked_subset(isRelated):
        # Copy the slice so that adding 'hotspotRank' does not trigger a
        # SettingWithCopyWarning and never touches the caller's frame.
        subset = hotspotMafDf[hotspotMafDf['isCtypeCancerGene'] == isRelated].copy()
        subset['hotspotRank'] = subset['hotspotName'].apply(
            lambda x: hotspotRankingD[x] if x in hotspotRankingD else None)
        #FILTER out hotspots that we do not have a ranking for
        return subset[subset['hotspotRank'].notnull()]

    def normalize_hotspot_rankings(df):
        # list of (hotspot, percentile) tuples, ordered by ascending rank
        distinctHotspots = df.drop_duplicates(subset=['hotspotName'])
        sortedHotspotRanks = sorted(zip(distinctHotspots['hotspotName'], distinctHotspots['hotspotRank']),
                                    key=lambda x: x[1])
        nHotspotsUnderConsideration = len(sortedHotspotRanks)
        return [(name, 1.0*position/nHotspotsUnderConsideration)
                for position, (name, rank) in enumerate(sortedHotspotRanks)]

    def cumulative_usage(subset, className):
        # One pass to count how often each hotspot occurs, instead of
        # re-filtering the table once per hotspot.
        nMutations = subset.shape[0]
        occurrences = subset['hotspotName'].value_counts().to_dict()
        rows = []
        runningSum = 0
        for hotspot, percentile in normalize_hotspot_rankings(subset):
            runningSum += 1.0*int(occurrences[hotspot])/nMutations
            rows.append({
             'hotspot': hotspot,
             'class': className,
             'val': runningSum,
             'percentile': percentile
            })
        return rows

    # origin points so that each curve starts at (0, 0)
    listOfDicts = [{'hotspot':'NA_origin_for_path_Related', 'class': 'related', 'val':0, 'percentile': 0},
                  {'hotspot':'NA_origin_for_path_Unrelated', 'class': 'unrelated', 'val':0, 'percentile': 0}]
    listOfDicts.extend(cumulative_usage(ranked_subset(True), 'related'))
    listOfDicts.extend(cumulative_usage(ranked_subset(False), 'unrelated'))

    df = pd.DataFrame(listOfDicts)
    df['cancerType'] = cancerTypeName
    return df

def prepare_cancer_type_maf_for_analysis(maf, cType, relatedGenesDict):
    """Select hotspot mutations and annotate them for the utilization analysis.

    Parameters
    ----------
    maf : pandas.DataFrame
        Mutation table; reads 'is-a-hotspot', 'Hugo_Symbol' and 'HGVSp_Short'.
        It is not modified.
    cType : str
        Key into relatedGenesDict naming the cancer type of interest.
    relatedGenesDict : dict
        Cancer type -> collection of genes considered related to it.

    Returns
    -------
    pandas.DataFrame
        The rows of `maf` with 'is-a-hotspot' == 'Y', plus 'hotspotName'
        ('<gene>:<protein change without the leading "p.">') and
        'isCtypeCancerGene' (True when the gene is related to cType).
    """
    def hotspot_name(gene, hgvsp):
        # Remove only a leading 'p.' prefix. str.strip('p.') strips any run of
        # 'p' and '.' characters from BOTH ends, which truncated protein
        # changes ending in 'p' (e.g. 'p.Y772_A775dup' became 'Y772_A775du').
        proteinChange = str(hgvsp)
        if proteinChange.startswith('p.'):
            proteinChange = proteinChange[2:]
        return str(gene) + ':' + proteinChange

    # Copy the slice so the new columns are not assigned onto a view of `maf`
    # (this is what raised SettingWithCopyWarning).
    cancerTypeHypermutatorHotspots = maf[maf['is-a-hotspot'] == 'Y'].copy()
    cancerTypeHypermutatorHotspots['hotspotName'] = [
        hotspot_name(gene, hgvsp)
        for gene, hgvsp in zip(cancerTypeHypermutatorHotspots['Hugo_Symbol'],
                               cancerTypeHypermutatorHotspots['HGVSp_Short'])]
    cancerTypeHypermutatorHotspots['isCtypeCancerGene'] = cancerTypeHypermutatorHotspots['Hugo_Symbol'].apply(lambda x: True if x in relatedGenesDict[cType] else False)
    return cancerTypeHypermutatorHotspots

In [23]:
relatedGenesDict = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type()

# Load all mutations, annotate cancer type, and keep the hypermutated tumors.
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO'])
allHypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
hypermutatedMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(allHypermutantIds)]

# Hotspot frequencies and their rank order (rank 0 = most frequent).
hotspotsDf = pd.read_table(filePathDict['HOTSPOT_DATA'])
hotspotIncidenceD = assign_hotspot_freq_dict(hotspotsDf)
hotspotRankingD = rank_hotspots(hotspotIncidenceD)

# NOTE(review): all three calls receive the same hypermutatedMaf (every cancer
# type); only the related-gene list differs between them — confirm that the maf
# is not meant to be restricted to the named cancer type first.
# The cancer type is spelled with a space here because it is a relatedGenesDict key.
endometrialHypermutatorHotspots = prepare_cancer_type_maf_for_analysis(hypermutatedMaf, 'Endometrial Cancer', relatedGenesDict)
colorectalHypermutatorHotspots = prepare_cancer_type_maf_for_analysis(hypermutatedMaf, 'Colorectal Cancer', relatedGenesDict)
gliomaHypermutatorHotspots = prepare_cancer_type_maf_for_analysis(hypermutatedMaf, 'Glioma', relatedGenesDict)


# The underscore spelling is the label written to the output 'cancerType' column.
dfColo = assign_hotspot_mutation_utilization_curves(colorectalHypermutatorHotspots, hotspotRankingD, 'Colorectal_Cancer')
dfEndo = assign_hotspot_mutation_utilization_curves(endometrialHypermutatorHotspots, hotspotRankingD, 'Endometrial_Cancer')
dfGlio = assign_hotspot_mutation_utilization_curves(gliomaHypermutatorHotspots, hotspotRankingD, 'Glioma')

df = pd.concat([dfColo, dfEndo, dfGlio])



  This is separate from the ipykernel package so we can avoid doing imports until
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

In [24]:
# Save df as figureS2_h.tsv under writeDir (tab-separated, row index omitted).
writePath = os.path.join(writeDir, 'figureS2_h.tsv')
df.to_csv(writePath, index=False, sep='\t')

In [29]:
#P values: 2 sample Kolmogorov-Smirnov statistic
#compares the cumulative 'val' curve of related vs unrelated hotspots within each cancer type
for label, ctName in [('endometrial', 'Endometrial_Cancer'),
                      ('colorectal', 'Colorectal_Cancer'),
                      ('glioma', 'Glioma')]:
    isCancerType = df['cancerType'] == ctName
    relatedVals = np.array(df[isCancerType & (df['class'] == 'related')]['val'])
    unrelatedVals = np.array(df[isCancerType & (df['class'] == 'unrelated')]['val'])
    #formatted into one string so the line prints identically under Python 2 and Python 3
    print('%s %s' % (label, scipy.stats.ks_2samp(relatedVals, unrelatedVals)))


endometrial Ks_2sampResult(statistic=0.12602338464555346, pvalue=0.0002191371595835052)
colorectal Ks_2sampResult(statistic=0.14175255268649345, pvalue=3.0240663648390716e-05)
glioma Ks_2sampResult(statistic=0.048874294037975, pvalue=0.5260843067270379)


## Figure 2S(i)
Rate of gene mutation compared between distinct cancer types of identical aetiologies

In [26]:
def compare_mutation_prevalences_across_multiple_cancer_types(
        allMsiCasesMaf,
        cancerTypes = ['Endometrial Cancer', 'Colorectal Cancer', 'Esophagogastric Cancer', 'Prostate Cancer']):
    """Run the gene-level comparison for every unordered pair of cancer types.

    For each pair of distinct entries in `cancerTypes` (each pair handled once,
    in first-encounter order) the maf is restricted to those two cancer types
    and to barcodes in `msiCases`, then passed to
    analysis_utils.make_comparissons with mode='gene'. The pair is recorded in
    a 'comp' column as the two names sorted and joined with '/'. The per-pair
    results are concatenated and returned.

    NOTE(review): `msiCases` is not a parameter; it is read from the notebook's
    global namespace, so the cell defining it must run first.
    """
    perPairDfs = []
    pairsDone = []
    for c1 in cancerTypes:
        for c2 in cancerTypes:
            comp = '/'.join(sorted((c1, c2)))
            if c1 == c2 or comp in pairsDone:
                continue
            inEitherCancerType = allMsiCasesMaf['cancerType'].isin([c1, c2])
            isMsiCase = allMsiCasesMaf['Tumor_Sample_Barcode'].isin(msiCases)
            pairMaf = allMsiCasesMaf[inEitherCancerType & isMsiCase]
            pairDf = analysis_utils.make_comparissons(pairMaf, mode='gene', cancerType1 = c1, cancerType2 = c2)
            pairDf['comp'] = comp
            pairsDone.append(comp)
            perPairDfs.append(pairDf)
    return pd.concat(perPairDfs)

In [29]:
# Load all mutations, annotate cancer type, and collect the MSI cases
# (cases returned by get_msi_cases with msiScoreThresh=10).
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
msiCases = get_gene_and_cohort_list_utils.get_msi_cases(msiInfoFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'], msiScoreThresh=10)

#We have to match MSI alleles (sometimes they have different names etc)
#the value correctedAllele is the proper allele for us to work with
msiSummary = pd.read_table(filePathDict['MICROSATELLITE_INFORMATION'])
# NOTE(review): allMsiCasesMaf is a slice of allImpactMutsMaf; the column
# assignments on it below are assignments onto a slice (pandas warns with
# SettingWithCopyWarning for this pattern) — taking a .copy() here would avoid that.
allMsiCasesMaf = allImpactMutsMaf[(allImpactMutsMaf['Tumor_Sample_Barcode'].isin(msiCases))]
# allele key = '<gene>_<protein change>' in both tables so they can be matched
allMsiCasesMaf['allele'] = allMsiCasesMaf.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short']), axis=1)
msiSummary['allele'] = msiSummary.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short']), axis=1)
neverObservedSites, msiSitesToNameMapping, mafMsiSiteToNameMapping =  analysis_utils.standardize_allele_names(msiSummary, allMsiCasesMaf)
# alleles without an entry in the mapping get None
msiSummary['correctedAllele'] = msiSummary['allele'].apply(lambda x: mafMsiSiteToNameMapping[x] if x in mafMsiSiteToNameMapping else None)
allMsiCasesMaf['correctedAllele'] = allMsiCasesMaf['allele'].apply(lambda x:
                                                                 mafMsiSiteToNameMapping[x] if x in mafMsiSiteToNameMapping else None)

# Pairwise gene-level comparison across the function's default cancer types.
msiDf = compare_mutation_prevalences_across_multiple_cancer_types(allMsiCasesMaf)
msiDf['signature'] = 'MSI'

#Also do it for pole
poleCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_10')
poleEndometrialColorectalMaf = allImpactMutsMaf[
    (allImpactMutsMaf['cancerType'].isin(['Endometrial Cancer', 'Colorectal Cancer'])) & 
    (allImpactMutsMaf['Tumor_Sample_Barcode'].isin(poleCases))]
# NOTE(review): unlike the MSI call, no cancerType1/cancerType2 are passed here —
# presumably make_comparissons derives them from mutationType='pole'; confirm.
poleDf = analysis_utils.make_comparissons(poleEndometrialColorectalMaf, mode = 'gene', mutationType='pole')
poleDf['signature'] = 'POLE'
# same 'comp' label format as the MSI comparisons: names sorted and joined with '/'
poleDf['comp'] = '/'.join(sorted(('Endometrial Cancer', 'Colorectal Cancer')))

#and for apobec
apobecCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_APOBEC')
apobecBreastBladderMaf = allImpactMutsMaf[
    (allImpactMutsMaf['cancerType'].isin(['Breast Cancer', 'Bladder Cancer'])) & 
    (allImpactMutsMaf['Tumor_Sample_Barcode'].isin(apobecCases))]
apobecDf = analysis_utils.make_comparissons(apobecBreastBladderMaf, mode = 'gene', mutationType='apobec',
                                            cancerType1='Bladder Cancer', cancerType2='Breast Cancer')
apobecDf['signature'] = 'APOBEC'
apobecDf['comp'] = '/'.join(sorted(('Breast Cancer', 'Bladder Cancer')))

combinedDf = pd.concat([msiDf, apobecDf, poleDf])

combinedDf['cohort'] = combinedDf['signature']+ '_' + combinedDf['comp']


  """Entry point for launching an IPython kernel.
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [31]:
#Write combinedDf to <writeDir>/figureS2_i.tsv as a tab-separated file without the index column
writePath = os.path.join(writeDir, 'figureS2_i.tsv')
combinedDf.to_csv(writePath, index=False, sep='\t')

## Figure 2S(j)
Manner of inactivation in hypermutated tumors

In [32]:
def summarize_inactivation_method(maf, hyperIds, normalIds, cancerTypes = ['']):
    """Count distinct gene/case pairs per loss type, cohort and cancer type.

    For every entry of ``cancerTypes`` two rows are produced, first one for
    samples in ``normalIds`` (type 'Normal') and then one for samples in
    ``hyperIds`` (type 'Hyper'). Each row holds the number of distinct
    'geneCase' values whose 'lossType' is 'LOH' (nLOH) or
    'composite_mutation' (nComposite), plus their sum in 'total'.

    ``maf`` must provide the columns 'cancerType', 'Tumor_Sample_Barcode',
    'lossType' and 'geneCase'.
    """
    def distinct_gene_cases(cohortMaf, lossLabel):
        # number of unique gene/case identifiers carrying the given loss label
        return len(set(cohortMaf[cohortMaf['lossType'] == lossLabel]['geneCase']))

    records = []
    for cancerType in cancerTypes:
        inCancerType = maf[maf['cancerType'] == cancerType]
        # Normal is emitted before Hyper to keep the row order of the output
        for cohortLabel, cohortIds in (('Normal', normalIds), ('Hyper', hyperIds)):
            cohortMaf = inCancerType[inCancerType['Tumor_Sample_Barcode'].isin(cohortIds)]
            records.append({'nLOH': distinct_gene_cases(cohortMaf, 'LOH'),
                            'nComposite': distinct_gene_cases(cohortMaf, 'composite_mutation'),
                            'type': cohortLabel,
                            'cancerType': cancerType
                           })

    summary = pd.DataFrame(records)
    summary['total'] = summary['nLOH'] + summary['nComposite']
    return summary

In [33]:
#Takes ~2 minutes
#Build a per-variant id (sample_position_altAllele) so oncogenic calls in the base maf
#can be matched to rows of the clonality-annotated maf
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
allImpactMutsMaf['varUuid'] = allImpactMutsMaf.apply(lambda row:
    row['Tumor_Sample_Barcode'] + '_' + str(row['Start_Position']) + '_' + str(row['Tumor_Seq_Allele2']), axis=1)
oncogenicMutIds = set(allImpactMutsMaf[allImpactMutsMaf['oncogenic'].notnull()]['varUuid'])
mafWithClonalityAnnotation = pd.read_csv(filePathDict['IMPACT_MAF_WITH_ADJUSTED_CLONALITY_ANNOTATION'])
mafWithClonalityAnnotation['varUuid'] = mafWithClonalityAnnotation.apply(lambda row:
    row['Tumor_Sample_Barcode'] + '_' + str(row['Start_Position']) + '_' + str(row['Tumor_Seq_Allele2']), axis=1)

tsgs = get_gene_and_cohort_list_utils.get_tsgs()
#.copy() makes the subset an independent frame, so the columns added below are not written to a
#slice of mafWithClonalityAnnotation (the source of the SettingWithCopyWarning)
tsgOncogenicMaf = mafWithClonalityAnnotation[(mafWithClonalityAnnotation['varUuid'].isin(oncogenicMutIds)) &
                                             (mafWithClonalityAnnotation['Hugo_Symbol'].isin(tsgs))].copy()
tsgOncogenicMaf['geneCase'] = tsgOncogenicMaf.apply(lambda row: row['Hugo_Symbol'] + '_' + row['Tumor_Sample_Barcode'], axis=1)
occurenceDict = dict(tsgOncogenicMaf['geneCase'].value_counts())                                                                                     
#a gene/case pair with more than one oncogenic mutation is a multiplet
tsgOncogenicMaf['isMultiplet'] = tsgOncogenicMaf['geneCase'].apply(lambda x: True if occurenceDict[x] > 1 else False)
#lcn == 0 is treated as loss of heterozygosity
tsgOncogenicMaf['isLOH'] = tsgOncogenicMaf['lcn'].apply(lambda x: True if x == 0 else False)
#LOH takes precedence over composite mutation; rows with neither get the value False
tsgOncogenicMaf['lossType'] = tsgOncogenicMaf.apply(lambda row: 'LOH' if row['isLOH'] == True
    else 'composite_mutation' if row['isMultiplet'] == True else False, axis=1)
#pass the configured path explicitly, as the other cells of this notebook do, instead of relying on the helper's default
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
tsgOncogenicMaf['cancerType'] = tsgOncogenicMaf['Tumor_Sample_Barcode'].apply(lambda x:
    cancerTypeDict[x] if x in cancerTypeDict else None)

normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
df = summarize_inactivation_method(tsgOncogenicMaf, hypermutantIds, normalIds, cancerTypes =
    ['Endometrial Cancer', 'Colorectal Cancer', 'Melanoma',
     'Prostate Cancer', 'Glioma', 'Bladder Cancer', 'Non-Small Cell Lung Cancer'])

  
  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] 

In [34]:
#Write df to <writeDir>/figureS2_j.tsv as a tab-separated file without the index column
writePath = os.path.join(writeDir, 'figureS2_j.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2S(k)
Permutation test to identify genes enriched for composite mutation

In [35]:
def convert_df_to_np_array(df):
    """Return the 'Tumor_Sample_Barcode' and 'shuffleColumn' columns of ``df`` as a 2-column array.

    Column 0 of the result is the sample barcode, column 1 the value to shuffle against.
    """
    shuffleInputColumns = ['Tumor_Sample_Barcode', 'shuffleColumn']
    return df[shuffleInputColumns].values

def shuffle_arr(arr):
    """Return a new 2-column array: column 0 of ``arr`` randomly permuted, column 1 unchanged.

    ``arr`` itself is left untouched. The previous implementation called
    ``np.random.shuffle`` on ``arr[:,0]``, which is a view, so every call
    scrambled the caller's array in place as a side effect.

    Randomness comes from numpy's global RNG (``np.random``); seed it with
    ``np.random.seed`` for reproducible permutations.
    """
    # np.random.permutation shuffles a copy of the column and returns it
    shuffledNames = np.random.permutation(arr[:,0])
    return np.column_stack((shuffledNames, arr[:,1]))

def summarize_shuffle_results(shuffledArr, genes):
    """Count, per gene, how many samples carry that gene more than once in ``shuffledArr``.

    Parameters
    ----------
    shuffledArr : numpy.ndarray
        2-column array; column 0 is the sample barcode, column 1 the gene symbol.
    genes : collection
        Unused; kept so existing callers keep working.

    Returns
    -------
    collections.Counter
        gene -> number of distinct samples in which the (sample, gene) pair
        occurs at least twice.

    The pairs are counted as tuples. The earlier version concatenated
    'sample_gene' into one string and recovered the gene with
    ``split('_')[1]``, which returns the wrong token as soon as a barcode or
    gene symbol contains an underscore, and it built a DataFrame with a
    row-wise apply on every permutation.
    """
    # .tolist() yields plain Python objects, so the Counter keys are ordinary strings
    sampleGenePairs = zip(shuffledArr[:,0].tolist(), shuffledArr[:,1].tolist())
    pairCounts = Counter(sampleGenePairs)
    return Counter(gene for (sample, gene), count in pairCounts.items() if count > 1)

def do_n_iterations_of_shuffling(maf, n, genes, mode='geneDist'):
    npArr = convert_df_to_np_array(maf)
    listOfDicts = []
    for i in range(n):
        if i%100 == 0: print 'on iteration ', i,
        shuffledArr = shuffle_arr(npArr)
        shuffleResults = summarize_shuffle_results(shuffledArr, genes)
        shuffleResults['iter'] = i
        listOfDicts.append(shuffleResults)
        i += 1
    return pd.DataFrame(listOfDicts)

def do_permutation_test_across_cancer_types(maf, cancerTypes = [], n =10):
    """Run the multiplet permutation test separately for each cancer type and stack the results.

    For each cancer type the observed number of cases with a multiplet is
    counted per gene (one row per 'caseGene' among rows with isMultiplet True),
    ``n`` shuffles are run on that cancer type's rows, and the two are combined
    by ``summarize_permutation_test_results``. The per-type result gets a
    'cancerType' column before concatenation.
    """
    perCancerTypeResults = []
    for cancerType in cancerTypes:
        cTypeMaf = maf[maf['cancerType'] == cancerType]

        # observed: count each case/gene multiplet once, then tally by gene
        multipletRows = cTypeMaf[cTypeMaf['isMultiplet'] == True]
        onePerCaseGene = multipletRows.drop_duplicates(subset=['caseGene'])
        observedMultipletDict = dict(onePerCaseGene['Hugo_Symbol'].value_counts())

        genesInCohort = set(cTypeMaf['Hugo_Symbol'])
        permTestSummary = do_n_iterations_of_shuffling(cTypeMaf, n, genes = genesInCohort, mode='geneDist')

        results = summarize_permutation_test_results(permTestSummary, observedMultipletDict, n)
        results['cancerType'] = cancerType
        perCancerTypeResults.append(results)
    return pd.concat(perCancerTypeResults)

def summarize_significant_genes_by_cancer_type(results, cancerTypes = []):
    tsgs = set(['ERRFI1', 'ASXL2', 'PMAIP1', 'ACTG1', 'SUFU', 'FBXO11', 'MEN1', 'FAM58A', 'B2M', 'RB1', 'DUSP22', 'SESN1', 'GPS2', 'RAD51D', 'SMG1', 'CDC73', 'MAP3K1', 'SMARCB1', 'INPP4B', 'PARK2', 'SMAD4', 'CBFB', 'CDH1', 'PPP6C', 'SETDB1', 'SETDB2', 'NF2', 'CDKN2B', 'CDKN2C', 'CDKN2A', 'DDX3X', 'PIK3R1', 'BARD1', 'PDS5B', 'KLF4', 'SPRED1', 'VHL', 'SMAD2', 'PMS1', 'PMS2', 'SETD2', 'GATA3', 'TBL1XR1', 'MUTYH', 'SOCS1', 'FAM175A', 'ROBO1', 'ARID1B', 'ARID1A', 'TCF7L2', 'STK11', 'FOXA1', 'PTEN', 'FAT1', 'FAS', 'CYLD', 'MAX', 'SH2D1A', 'APC', 'NTHL1', 'CTCF', 'KDM5C', 'KMT2C', 'ZFHX3', 'FOXP1', 'PIGA', 'CDKN1B', 'CDKN1A', 'FUBP1', 'MSH2', 'ID3', 'TNFRSF14', 'TRAF3', 'EP400', 'BRIP1', 'ARID4A', 'ARID4B', 'XRCC2', 'DAXX', 'SDHAF2', 'ASXL1', 'AMER1', 'RASA1', 'EGR1', 'MST1', 'SOX17', 'RUNX1', 'PIK3R3', 'NCOR1', 'NF1', 'JAK1', 'PTPRD', 'CHEK2', 'CHEK1', 'SMC1A', 'TMEM127', 'STAG1', 'RAD51', 'TCF3', 'STAG2', 'ARID2', 'RAD50', 'RNF43', 'PARP1', 'BLM', 'CUX1', 'RECQL', 'RAD21', 'PTPN2', 'PTPN1', 'SLX4', 'INHA', 'PAX5', 'IRF1', 'TP53', 'HLA-A', 'IRF8', 'CBL', 'TOP1', 'SHQ1', 'PRDM1', 'NSD1', 'ATXN2', 'CREBBP', 'HDAC4', 'SESN2', 'PPP2R1A', 'EPHA7', 'ATM', 'EPHA3', 'POT1', 'SMAD3', 'MOB3B', 'TBX3', 'POLE', 'ATR', 'FANCD2', 'FH', 'BCORL1', 'SOX9', 'IKZF3', 'TSC1', 'TP63', 'MRE11A', 'SDHC', 'BTG1', 'POLD1', 'CIITA', 'SMC3', 'SAMHD1', 'RTEL1', 'ECT2L', 'PIK3R2', 'CRBN', 'FANCC', 'NBN', 'FANCA', 'HLA-B', 'RECQL4', 'DUSP4', 'ERCC2', 'FBXW7', 'TGFBR2', 'TGFBR1', 'MSH3', 'RBM15', 'TET1', 'TET3', 'SESN3', 'MGA', 'LTB', 'FOXL2', 'SH2B3', 'BCOR', 'HIST1H1D', 'ATRX', 'EP300', 'RAD51C', 'RAD51B', 'HIST1H1B', 'TNFAIP3', 'DICER1', 'ARID5B', 'LATS2', 'FOXO1', 'KEAP1', 'EZH2', 'SP140', 'NKX3-1', 'PBRM1', 'PALB2', 'CIC', 'BRCA1', 'DTX1', 'FLCN', 'SPEN', 'CD58', 'ERCC3', 'ERCC4', 'MSH6', 'BCL11B', 'BMPR1A', 'ERF', 'BRCA2', 'NOTCH2', 'EED', 'MITF', 'ELF3', 'SMARCA4', 'BBC3', 'ANKRD11', 'CEBPA', 'BCL2L11', 'AXIN2', 'AXIN1', 'CDK12', 'ESCO2', 'MLH1', 'SDHB', 'MED12', 'HNF1A', 'RYBP', 'ATP6V1B2', 
'DNMT3B', 'KMT2B', 'KMT2A', 'DNMT3A', 'NFKBIA', 'TRAF5', 'KMT2D', 'SPOP', 'RBM10', 'P2RY8', 'TP53BP1', 'TSC2', 'KDM6A', 'EPCAM', 'PHOX2B', 'NPM1', 'BCL10', 'LATS1', 'HOXB13', 'ARID3A', 'PTPRT', 'PTPRS', 'INPPL1', 'NOTCH4', 'TET2', 'NOTCH1', 'CASP8', 'NOTCH3', 'GRIN2A', 'MAP2K4', 'WT1', 'BACH2', 'SDHA', 'BAP1', 'PTCH1', 'SDHD'])
    listOfDicts = []
    for cancerType in cancerTypes:
        print 'analyzing', cancerType
        cTypeResults = results[results['cancerType'] == cancerType]
        signifResults = cTypeResults[cTypeResults['pVal'] < .05]
        for gene in set(signifResults['gene']):
            geneResults = signifResults[signifResults['gene'] == gene]
            t = 'TSG'
            if gene not in tsgs:
                t = 'Oncogene'
            
            listOfDicts.append({'gene': gene, 'geneType': t, 'cancerType': cancerType})
    return pd.DataFrame(listOfDicts)

def summarize_permutation_test_results(permTestTable, observedMultipletDict, n):
    """Turn the per-iteration permutation counts into one empirical p-value per gene.

    Parameters
    ----------
    permTestTable : pandas.DataFrame
        One row per permutation; one column per gene holding that gene's
        multiplet count in the permutation (missing values mean 0), plus an
        'iter' column that is ignored.
    observedMultipletDict : dict
        gene -> observed number of cases with a multiplet. Genes absent from
        the dict are treated as observed 0 times.
    n : int
        Number of permutations, used as the denominator of the p-value.

    Returns
    -------
    pandas.DataFrame
        One row per gene with 'gene', 'nObs', 'permAbove' (permutations whose
        count is >= nObs) and 'pVal' (permAbove / n; NaN when n is 0).

    Genes that were observed as multiplets but never produced a multiplet in
    any permutation have no column in ``permTestTable``. They used to be left
    out of the result entirely even though they are the most enriched genes;
    they are now reported with permAbove = 0. Rows are emitted in sorted gene
    order so the output is deterministic.
    """
    permTestTable = permTestTable.fillna(0)
    permutedGenes = set(permTestTable.columns.values) - set(['iter'])
    allGenes = permutedGenes | (set(observedMultipletDict) - set(['iter']))

    listOfDicts = []
    for gene in sorted(allGenes):
        nCases = observedMultipletDict.get(gene, 0)
        if gene in permutedGenes:
            nCasesPermAbove = int((permTestTable[gene] >= nCases).sum())
        else:
            # no column means the permuted count was 0 in every iteration
            nCasesPermAbove = permTestTable.shape[0] if nCases <= 0 else 0
        pVal = (1.0*nCasesPermAbove)/n if n else float('nan')
        listOfDicts.append({'nObs': nCases, 'permAbove': nCasesPermAbove, 'pVal': pVal, 'gene': gene})

    return pd.DataFrame(listOfDicts)

In [36]:
#note there may exist a difficult to reproduce error where with enough shuffling iterations we cause an error
#this cell takes a long time
nIter = 500 
#seed for numpy's global RNG: the shuffles below draw from np.random, so fixing the seed makes
#the permutation table (and any error it triggers) repeatable from run to run
permutationSeed = 0
#Load in data and summarize it
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

#mark information so we can do the permutation test
#(hypermutantIds comes from the Figure 2S(j) cell earlier in the notebook)
allHypermutantMutMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(hypermutantIds)]
#.copy() makes the subset an independent frame, so the columns added below are not written to a
#slice of allImpactMutsMaf (the source of the SettingWithCopyWarning)
oncmutsMaf = allHypermutantMutMaf[allHypermutantMutMaf['oncogenic'].notnull()].copy()
oncmutsMaf['caseGene'] = oncmutsMaf['Tumor_Sample_Barcode'] + '_' + oncmutsMaf['Hugo_Symbol']
mutCountsDict = dict(oncmutsMaf['caseGene'].value_counts())
#number of oncogenic mutations sharing the row's case/gene pair; more than one makes it a multiplet
oncmutsMaf['mutationCount'] = oncmutsMaf['caseGene'].map(mutCountsDict)
oncmutsMaf['isMultiplet'] = oncmutsMaf['mutationCount'] > 1

#do permutation test
oncmutsMaf['shuffleColumn'] = oncmutsMaf['Hugo_Symbol']
cTypes =  ['Endometrial Cancer', 'Colorectal Cancer', 'Prostate Cancer', 'Melanoma',
                                       'Bladder Cancer', 'Esophagogastric Cancer', 'Glioma', 'Non-Small Cell Lung Cancer', 'Small Bowel Cancer'] 

np.random.seed(permutationSeed)
df = do_permutation_test_across_cancer_types(oncmutsMaf, cancerTypes = cTypes, n = nIter) 

#summarize the data
sumDf = summarize_significant_genes_by_cancer_type(df, cancerTypes = cTypes)
counts = Counter(sumDf['gene'])
#geneCount = number of cancer types in which the gene is significant
sumDf['geneCount'] = sumDf['gene'].apply(lambda x: counts[x])

tsgs = get_gene_and_cohort_list_utils.get_tsgs()
df['geneType'] = df['gene'].apply(lambda x: 'tsg' if x in tsgs else 'oncogene')

df['geneCancerType'] = df.apply(lambda row: str(row['gene']) + '_' +
        str(row['cancerType']), axis=1)

  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in 

on iteration  0 on iteration  100 on iteration  200 on iteration  300 on iteration  400 on iteration  0 on iteration  100 on iteration  200 on iteration  300 on iteration  400 on iteration  0 on iteration  100 on iteration  200 on iteration  300 on iteration  400 on iteration  0 on iteration  100 on iteration  200 on iteration  300 on iteration  400 on iteration  0 on iteration  100 on iteration  200 on iteration  300 on iteration  400 on iteration  0 on iteration  100 on iteration  200 on iteration  300 on iteration  400 on iteration  0 on iteration  100 on iteration  200 on iteration  300 on iteration  400 on iteration  0 on iteration  100 on iteration  200 on iteration  300 on iteration  400 on iteration  0 on iteration  100 on iteration  200 on iteration  300 on iteration  400 analyzing Endometrial Cancer
analyzing Colorectal Cancer
analyzing Prostate Cancer
analyzing Melanoma
analyzing Bladder Cancer
analyzing Esophagogastric Cancer
analyzing Glioma
analyzing Non-Small Cell Lung C

In [37]:
#Write df to <writeDir>/figureS2_k.tsv as a tab-separated file without the index column
writePath = os.path.join(writeDir, 'figureS2_k.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure 2S(l)
Double hit mutations violating the infinite sites hypothesis 

In [38]:
#TODO move this to the supplement
def summarize_double_hit_occurence_data(maf, lossAnnotatedMaf):
    """Summarize, per allele, how often oncogenic double-hit mutations occur.

    Only rows of ``maf`` with isOncogenic True are considered. An allele is
    reported when at least one of those rows has isDoubleHit True and a
    'doubleHitValidates' value other than 'other_SNPs_unbalanced'.

    Returns a DataFrame with one row per such allele:
      gene           - text before the first '_' of the allele name
      allele         - the allele name
      nGene          - oncogenic rows in that gene
      nAllele        - oncogenic rows of that gene carrying the allele
      geneType       - 'TSG' if the gene is in get_tsgs(), else 'Oncogene'
      nDoubleHit     - qualifying double-hit rows for the allele
      nBiallelicLoss - nDoubleHit plus the number of distinct samples in
                       ``lossAnnotatedMaf`` with a lossType other than False
                       in that gene
    """
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    driverMaf = maf[maf['isOncogenic'] == True]

    isDoubleHit = driverMaf['isDoubleHit'] == True
    isNotUnbalancedSnp = driverMaf['doubleHitValidates'] != 'other_SNPs_unbalanced'
    doubleHitValidMaf = driverMaf[isDoubleHit & isNotUnbalancedSnp]
    doubleHitCounts = dict(doubleHitValidMaf['allele'].value_counts())

    records = []
    for allele in set(doubleHitValidMaf['allele']):
        gene = allele.split('_')[0]
        nDoubleHit = doubleHitCounts[allele]
        geneMaf = driverMaf[driverMaf['Hugo_Symbol'] == gene]
        nAllele = geneMaf[geneMaf['allele'] == allele].shape[0]

        #note this does not include biallelic loss via 2x mutation
        lossInGene = (lossAnnotatedMaf['lossType'] != False) & (lossAnnotatedMaf['Hugo_Symbol'] == gene)
        nCasesBiallelicLoss = len(set(lossAnnotatedMaf[lossInGene]['Tumor_Sample_Barcode']))

        if gene in tsgs:
            geneType = 'TSG'
        else:
            geneType = 'Oncogene'

        records.append({
            'gene': gene, 'allele': allele, 'nGene': geneMaf.shape[0], 'nAllele': nAllele,
            'geneType': geneType, 'nDoubleHit': nDoubleHit, 'nBiallelicLoss': nDoubleHit + nCasesBiallelicLoss
        })
    return pd.DataFrame(records)

In [39]:
#TAKES APPROXIMATELY 10 minutes
#Load the maf with clonality annotation as its crucial to have clonality info and the normal maf
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
allImpactMutsMaf['varUuid'] = allImpactMutsMaf.apply(lambda row:
    row['Tumor_Sample_Barcode'] + '_' + str(row['Start_Position']) + '_' + str(row['Tumor_Seq_Allele2']), axis=1)
oncogenicMutIds = set(allImpactMutsMaf[allImpactMutsMaf['oncogenic'].notnull()]['varUuid'])
mafWithClonalityAnnotation = pd.read_csv(filePathDict['IMPACT_MAF_WITH_ADJUSTED_CLONALITY_ANNOTATION'])
mafWithClonalityAnnotation['varUuid'] = mafWithClonalityAnnotation.apply(lambda row:
    row['Tumor_Sample_Barcode'] + '_' + str(row['Start_Position']) + '_' + str(row['Tumor_Seq_Allele2']), axis=1)
mafWithClonalityAnnotation['isOncogenic'] = mafWithClonalityAnnotation['varUuid'].apply(lambda x: True if x in oncogenicMutIds else False)

#mark cases with whether they have flat genomes and their median vaf
flatGenomeCases = clonality_analysis_util.get_facets_whitelist()
#pass the configured id directory explicitly, as the Figure 2S(j) cell does, instead of relying on the helper's default
hypermutantCases = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
#.copy() makes the subset an independent frame, so the columns added below are not written to a
#slice of mafWithClonalityAnnotation (the source of the SettingWithCopyWarning)
hypermutantMaf = mafWithClonalityAnnotation[mafWithClonalityAnnotation['Tumor_Sample_Barcode'].isin(hypermutantCases)].copy()
hypermutantMaf['flatGenome'] = hypermutantMaf['Tumor_Sample_Barcode'].apply(lambda x: True if x in flatGenomeCases else False)
hypermutantMaf = maf_analysis_utils.mark_cases_with_median_clonal_vaf_of_case(hypermutantMaf)

#annotate double hit mutations
hypermutantMaf['isDoubleHit'] = hypermutantMaf.apply(lambda row:
    clonality_analysis_util.is_mut_double_hit(row, row['flatGenome'], 
 doubleFactor=2), axis=1)
hypermutantMaf = clonality_analysis_util.annotate_double_hit_mutations(hypermutantMaf)

#focus specifically on double hit drivers
#.copy() for the same reason as above: driverMaf gets its own columns
driverMaf = hypermutantMaf[hypermutantMaf['isOncogenic'] == True].copy()
driverMaf['geneCase'] = driverMaf.apply(lambda row: row['Hugo_Symbol'] + '_' + row['Tumor_Sample_Barcode'], axis=1)
occurenceDict = dict(driverMaf['geneCase'].value_counts())                                                                                     
driverMaf['isMultiplet'] = driverMaf['geneCase'].apply(lambda x: True if occurenceDict[x] > 1 else False)
driverMaf['isLOH'] = driverMaf['lcn'].apply(lambda x: True if x == 0 else False)
#LOH takes precedence over composite mutation; rows with neither get the value False
driverMaf['lossType'] = driverMaf.apply(lambda row: 'LOH' if row['isLOH'] == True
    else 'composite_mutation' if row['isMultiplet'] == True else False, axis=1)

hypermutantMaf['allele'] = hypermutantMaf.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short']), axis=1)
driverMaf['allele'] = driverMaf.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short']), axis=1)

#get a dataframe for plotting that summarizes double hit information
df = summarize_double_hit_occurence_data(hypermutantMaf, driverMaf)

  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  caseMaf['clonal'] = caseMaf['ccf_Mcopies'].apply(lambda x: 1 if x > clonalThresh else 0)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


100
200
300
400
500
600
700
800
900
1000
1100
1200
1300


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  maf['medianClonalVaf'] = maf['Tumor_Sample_Barcode'].apply(lambda x: vafMapping[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  maf['geneCase'] = maf.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['Tumor_Sample_Barcode']), axis=1)
A value is trying to be set on a copy of a slice from a Data

In [41]:
#Write df to <writeDir>/figureS2_l.tsv as a tab-separated file without the index column
writePath = os.path.join(writeDir, 'figureS2_l.tsv')
df.to_csv(writePath, index=False, sep='\t')