In [25]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import numpy
import re
import math
from collections import Counter
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.multitest import fdrcorrection

# Derive the project root by stripping this notebook's directory from the working directory
# (assumes the kernel is started from <project>/scripts/figure2 -- TODO confirm).
notebookPath = 'scripts/figure2'
projectDir = re.sub(notebookPath, '', os.getcwd())
# Anchor the utility-module search path at the project root. Joining onto the relative
# notebookPath resolved to 'scripts/figure2/scripts/utilityScripts' under the working directory,
# which is presumably not where the shared helpers live.
sys.path.append(os.path.join(projectDir, 'scripts/utilityScripts'))

import analysis_utils
import mutationSigUtils
import maf_analysis_utils
import clonality_analysis_util
import get_gene_and_cohort_list_utils
import configuration_util

# Mapping from logical file names (e.g. 'IMPACT_BASE_MAF') to paths; used by every cell below
filePathDict = configuration_util.get_all_files_path_dict()

In [43]:
# Directory that receives every plot-data file written by this notebook
writeDir = os.path.join(
    projectDir,
    'scripts/figure2/FIGURE2_PLOTTING_FILES/plotDataFiles/'
)

## Figure 2A

In [5]:
#makes a dataframe of counts of oncogenic mutations
def make_counts_df(maf, dominantSignatureDict, oncogenes=None, tsgs=None):
    """Count each sample's oncogenic mutations, split by gene class and consequence.

    Only rows whose 'oncogenic' value is non-null are counted. Three categories are reported:
      * 'TsgTruncating': gene in `tsgs` and Variant_Classification is a frameshift or nonsense
      * 'TsgMissense':   gene in `tsgs` and any other Variant_Classification
      * 'Oncogene':      gene in `oncogenes`

    Parameters
    ----------
    maf : pandas.DataFrame
        Must contain 'Tumor_Sample_Barcode', 'cancerType', 'oncogenic', 'Hugo_Symbol' and
        'Variant_Classification'.
    dominantSignatureDict : mapping
        Sample barcode -> dominant signature; samples missing from it get None.
    oncogenes, tsgs : collection of gene symbols, optional
        Gene lists to use. When omitted they are loaded from get_gene_and_cohort_list_utils
        (the previous behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per distinct sample in `maf` (in order of first appearance, so the result is
        reproducible) with the three counts, 'cancerType' and 'dominantSignature'. Samples
        without any qualifying mutation get a count of 0.
    """
    # Load the gene lists once instead of once per mutation category
    if oncogenes is None:
        oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
    if tsgs is None:
        tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    truncatingConsequences = set(['Frame_Shift_Del', 'Frame_Shift_Ins', 'Nonsense_Mutation'])

    # If a sample appears with several cancer types the last row wins (dict(zip(...)) semantics)
    cancerTypeDict = dict(zip(maf['Tumor_Sample_Barcode'], maf['cancerType']))
    allIds = pd.unique(maf['Tumor_Sample_Barcode'])
    oncogenicMaf = maf[maf['oncogenic'].notnull()]

    isTsg = oncogenicMaf['Hugo_Symbol'].isin(tsgs)
    isOncogene = oncogenicMaf['Hugo_Symbol'].isin(oncogenes)
    isTruncating = oncogenicMaf['Variant_Classification'].isin(truncatingConsequences)

    def count_per_case(mask):
        # A Counter returns 0 for samples that have no mutation in this category
        return Counter(oncogenicMaf.loc[mask, 'Tumor_Sample_Barcode'])

    tsgTruncatingCounts = count_per_case(isTsg & isTruncating)
    oncogenicCounts = count_per_case(isOncogene)
    tsgMissenseCounts = count_per_case(isTsg & ~isTruncating)

    listOfDicts = []
    for case in allIds:
        listOfDicts.append({'Tumor_Sample_Barcode': case, 'TsgTruncating': tsgTruncatingCounts[case],
                            'Oncogene': oncogenicCounts[case], 'TsgMissense': tsgMissenseCounts[case],
                            'cancerType': cancerTypeDict[case],
                            'dominantSignature': dominantSignatureDict[case] if case in dominantSignatureDict else None
                           })
    return pd.DataFrame(listOfDicts)

def sample_normal_maf(normalCounts, hyperCounts, N=25, random_state=None):
    """Resample the non-hypermutated samples so their cancer types mirror the hypermutated cohort.

    Each row of `normalCounts` is weighted by
        (# hypermutated samples of its cancer type) / (# normal samples of its cancer type),
    so the total weight of a cancer type is proportional to how often it occurs in `hyperCounts`.
    Cancer types absent from `hyperCounts` get weight 0 and are never drawn.

    Parameters
    ----------
    normalCounts, hyperCounts : pandas.DataFrame
        Both must contain a 'cancerType' column. `normalCounts` is not modified.
    N : number
        Passed to DataFrame.sample as `frac`: the result has N * len(normalCounts) rows, drawn
        with replacement.
    random_state : optional
        Forwarded to DataFrame.sample; pass a value to make the draw reproducible. The default
        (None) keeps the previous behaviour of using numpy's global random state.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, including the 'hypermutatedCount' weight column.
    """
    hyperCancerTypeCounter = Counter(hyperCounts['cancerType'])
    normalCancerTypeCounter = Counter(normalCounts['cancerType'])

    # Work on a copy so the weight column is not silently added to the caller's frame
    weightedCounts = normalCounts.copy()
    weightedCounts['hypermutatedCount'] = weightedCounts['cancerType'].apply(lambda x:
            1.0*hyperCancerTypeCounter[x]/normalCancerTypeCounter[x]
        if x in hyperCancerTypeCounter else 0)
    sampledCounts = weightedCounts.sample(frac=N, weights='hypermutatedCount', replace=True,
                                          random_state=random_state)
    return sampledCounts
   
def create_summary_df(df):
    """Summarise, per burden group, what fraction of the summed 'value' each variable accounts for.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with columns 'variable', 'value' and 'burdenType'. Rows whose
        burdenType is 'hypermutated' form one group; every other row forms the
        'non-hypermutated' group.

    Returns
    -------
    pandas.DataFrame
        Two rows per distinct variable (hypermutated first, then non-hypermutated) with columns
        'frac', 'burdenType' and 'mutType'. 'frac' is the variable's summed value divided by the
        group's total summed value, or NaN when the group total is 0.
    """
    isHypermutated = df['burdenType'] == 'hypermutated'
    # The two groups do not depend on the variable, so split the frame once up front
    groups = [('hypermutated', df[isHypermutated]), ('non-hypermutated', df[~isHypermutated])]
    totals = dict((burdenType, data['value'].sum()) for burdenType, data in groups)

    listOfDicts = []
    # pd.unique keeps first-appearance order, giving a reproducible row order
    for variable in pd.unique(df['variable']):
        for burdenType, data in groups:
            variableSum = data.loc[data['variable'] == variable, 'value'].sum()
            total = totals[burdenType]
            # Divide by the group total. The earlier `x/1.0*total` parsed as `(x/1.0)*total`
            # and therefore multiplied instead of dividing.
            frac = 1.0*variableSum/total if total != 0 else float('nan')
            listOfDicts.append({'frac': frac, 'burdenType': burdenType, 'mutType': variable})
    summaryDf = pd.DataFrame(listOfDicts)
    return summaryDf


In [None]:
# Load all IMPACT mutation calls plus the per-sample annotations used to define the two cohorts
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
dominantSignatureDict = get_gene_and_cohort_list_utils.get_pan_impact_signature_mapping(
    filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
allHypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
allNormalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO'])

# Samples without a known cancer type get None
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
hypermutatedMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(allHypermutantIds)]
nonHypermutatedMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(allNormalIds)]

#Use sampling to get dataframes that match the cancer type distributions
dfCountsNormal = make_counts_df(nonHypermutatedMaf, dominantSignatureDict)
dfCountsHypermutated = make_counts_df(hypermutatedMaf, dominantSignatureDict)

# Make the random draw reproducible: put the rows in a fixed order and seed numpy's global
# generator, which DataFrame.sample falls back to when no random_state is passed.
FIGURE_2A_SAMPLING_SEED = 42
dfCountsNormal = dfCountsNormal.sort_values('Tumor_Sample_Barcode').reset_index(drop=True)
np.random.seed(FIGURE_2A_SAMPLING_SEED)
dfCountsNormalSampled = sample_normal_maf(dfCountsNormal, dfCountsHypermutated)

dfCountsHypermutated['burdenType'] = 'hypermutated'
dfCountsNormalSampled['burdenType'] = 'normal'
dfCombined = pd.concat([dfCountsHypermutated, dfCountsNormalSampled])

# Long format: one row per (sample, mutation category), then collapse to per-group fractions
meltedDf = pd.melt(dfCombined, id_vars=['Tumor_Sample_Barcode', 'burdenType'], value_vars=['Oncogene', 'TsgMissense', 'TsgTruncating'])
summaryDf = create_summary_df(meltedDf)

In [7]:
# Save the Figure 2A summary as a tab-separated file, leaving out the index column
writePath = os.path.join(writeDir, 'figure_2a.tsv')
summaryDf.to_csv(writePath, sep='\t', index=False)

## Figure 2B

In [11]:
def summarize_related_unrelated_driver_frac(maf, relatedGenesD, tmbDict):
    """Per sample, compute the fraction of mutations that are drivers in related vs unrelated genes.

    A mutation is "related" when its Hugo_Symbol is in relatedGenesD[<sample's cancer type>], and
    is a "driver" when its 'oncogenic' value is non-null.

    Parameters
    ----------
    maf : pandas.DataFrame
        Must contain 'Tumor_Sample_Barcode', 'cancerType', 'hypermutationStatus', 'Hugo_Symbol'
        and 'oncogenic'. The cancer type and hypermutation status of a sample are read from its
        first row.
    relatedGenesD : mapping
        Cancer type -> collection of related gene symbols. A cancer type that is missing from the
        mapping is treated as having no related genes (previously this raised KeyError).
    tmbDict : mapping
        Sample barcode -> TMB; samples missing from it get None.

    Returns
    -------
    pandas.DataFrame
        One row per sample (sorted by barcode) with 'Tumor_Sample_Barcode', 'cancerType', 'TMB',
        'hypermutationStatus', 'fracDriverRelated' and 'fracDriverUnrelated'. A fraction is None
        when the sample has no mutations in that gene group.
    """
    listOfDicts = []
    # groupby visits each sample's rows once instead of re-filtering the whole frame per sample
    for case, caseMaf in maf.groupby('Tumor_Sample_Barcode'):
        cancerType = caseMaf['cancerType'].iloc[0]
        tmb = tmbDict[case] if case in tmbDict else None #todo get the real TMB
        relatedGenes = relatedGenesD[cancerType] if cancerType in relatedGenesD else []
        hypermutationStatus = caseMaf['hypermutationStatus'].iloc[0]

        isRelated = caseMaf['Hugo_Symbol'].isin(relatedGenes)
        isDriver = caseMaf['oncogenic'].notnull()

        nTotalRelated = int(isRelated.sum())
        nTotalUnrelated = int((~isRelated).sum())
        nRelatedDrivers = int((isRelated & isDriver).sum())
        nUnrelatedDrivers = int((~isRelated & isDriver).sum())

        # None (rather than 0) marks "no mutations in this group", keeping it distinct from
        # "mutations present but none are drivers"
        ratioRelated = None if nTotalRelated == 0 else 1.0*nRelatedDrivers/nTotalRelated
        ratioUnrelated = None if nTotalUnrelated == 0 else 1.0*nUnrelatedDrivers/nTotalUnrelated

        listOfDicts.append({'Tumor_Sample_Barcode': case, 'cancerType': cancerType,
                            'TMB': tmb, 'hypermutationStatus': hypermutationStatus,
                            'fracDriverRelated': ratioRelated, 'fracDriverUnrelated': ratioUnrelated})
    df = pd.DataFrame(listOfDicts)
    return df

In [12]:
# No cell visible in this notebook adds a 'hypermutationStatus' column to allImpactMutsMaf, so on
# a fresh kernel the filter below fails unless the MAF file itself provides it. Fall back to the
# hypermutant ID list in that case; every selected row is hypermutated by construction.
if 'hypermutationStatus' in allImpactMutsMaf.columns:
    hypermutationMaf = allImpactMutsMaf[allImpactMutsMaf['hypermutationStatus'] == 'hypermutated']
else:
    hypermutantIdsFor2B = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(
        hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
    hypermutationMaf = allImpactMutsMaf[
        allImpactMutsMaf['Tumor_Sample_Barcode'].isin(hypermutantIdsFor2B)].copy()
    hypermutationMaf['hypermutationStatus'] = 'hypermutated'
relatedGenesDict = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type(thresh = 1.0/30.0, impactMafPath = filePathDict['IMPACT_BASE_MAF'])
tmbDict = get_gene_and_cohort_list_utils.get_all_tmb_info(tmbFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'])
df = summarize_related_unrelated_driver_frac(hypermutationMaf, relatedGenesDict, tmbDict)


In [13]:
# Save the Figure 2B per-sample driver fractions as a tab-separated file without the index
writePath = os.path.join(writeDir, 'figure_2b.tsv')
df.to_csv(writePath, sep='\t', index=False)

## Figure 2C

In [16]:
def _allele_name(row):
    # Allele identifier in the form '<gene>_<protein change>'; missing values become 'nan'
    return str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short'])

# Reload the MAF so this section does not depend on columns added by earlier sections
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
msiCases = get_gene_and_cohort_list_utils.get_msi_cases(msiInfoFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'], msiScoreThresh=10)

#We have to match MSI alleles (sometimes they have different names etc)
#the value correctedAllele is the proper allele for us to work with
msiSummary = pd.read_table(filePathDict['MICROSATELLITE_INFORMATION'])
# .copy() so the columns added below go onto an independent frame rather than a slice of
# allImpactMutsMaf (avoids pandas' SettingWithCopyWarning)
allMsiCasesMaf = allImpactMutsMaf[(allImpactMutsMaf['Tumor_Sample_Barcode'].isin(msiCases))].copy()
allMsiCasesMaf['allele'] = allMsiCasesMaf.apply(_allele_name, axis=1)
msiSummary['allele'] = msiSummary.apply(_allele_name, axis=1)
neverObservedSites, msiSitesToNameMapping, mafMsiSiteToNameMapping =  analysis_utils.standardize_allele_names(msiSummary, allMsiCasesMaf)
# Alleles without an entry in the mapping get None
msiSummary['correctedAllele'] = msiSummary['allele'].apply(lambda x: mafMsiSiteToNameMapping[x] if x in mafMsiSiteToNameMapping else None)
allMsiCasesMaf['correctedAllele'] = allMsiCasesMaf['allele'].apply(lambda x:
                                                                 mafMsiSiteToNameMapping[x] if x in mafMsiSiteToNameMapping else None)

#Figure 2C is specifically about MSI endometrial vs MSI colorectal
# allMsiCasesMaf is already restricted to msiCases, so only the cancer-type filter is needed here
msiCasesEndometrialColorectalMaf = allMsiCasesMaf[
    allMsiCasesMaf['cancerType'].isin(['Endometrial Cancer', 'Colorectal Cancer'])]
df = analysis_utils.make_comparissons(msiCasesEndometrialColorectalMaf)

In [17]:
# Save the Figure 2C endometrial-vs-colorectal comparison as a tab-separated file without the index
writePath = os.path.join(writeDir, 'figure_2c.tsv')
df.to_csv(writePath, sep='\t', index=False)

## Figure 2D

In [None]:
#load in phasing data, focus on hypermutated cases
phasingData = pd.read_table(filePathDict['PHASING_DATA'])
allHypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(
    hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# .copy() so the columns added below go onto an independent frame rather than a slice of
# phasingData (avoids pandas' SettingWithCopyWarning)
phasingHyper = phasingData[phasingData['Tumor_Sample_Barcode'].isin(allHypermutantIds)].copy()

#Whitelist cases: cases with purity = NA with flat genomes (rather than bad facets fits)
#The phasing algorithm marks these cases as trans or separate cells, but because we know the purity = NA comes from a flat genome we can mark them as trans
#this is important because many of our hypermutated cases fit this description
whitelistFlatGenomeCases = clonality_analysis_util.get_facets_whitelist()
phasingHyper['adjPhase'] = phasingHyper.apply(lambda row:
    'trans' if row['phase'] == 'trans or separate cells' and row['Tumor_Sample_Barcode'] in whitelistFlatGenomeCases
    else row['phase'], axis=1)

# NOTE(review): called with its defaults here, whereas the Figure 2B cell passes thresh and
# impactMafPath explicitly -- confirm the defaults are the intended values.
dictionaryOfRelatedGenes = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type()

#Prepare data for plotting
# Genes get their own label when more than plotThresh rows have both mutations oncogenic and a
# resolved (cis/trans) phase
plotThresh = 5
phasedOncogenicPairs = phasingHyper[(phasingHyper['oncogenic.1'].notnull()) &
                                    (phasingHyper['oncogenic.2'].notnull()) &
                                    (phasingHyper['adjPhase'].isin(['cis', 'trans']))]
genesToHighlight = [key for key, value in dict(phasedOncogenicPairs['Hugo_Symbol'].value_counts()).items()
                    if value > plotThresh]

def label_phased_pair(row):
    """Assign the plotting label for one phased mutation pair; the first matching rule wins."""
    if row['Variant_Classification.1'] == 'Silent' or row['Variant_Classification.2'] == 'Silent':
        return '1 or 2 silent'
    # A non-string 'oncogenic' value (e.g. NaN) means the mutation has no oncogenic annotation
    if not isinstance(row['oncogenic.1'], str) or not isinstance(row['oncogenic.2'], str):
        return '1 or 2 VUS'
    gene = row['Hugo_Symbol']
    if gene in genesToHighlight:
        return gene
    cancerType = row['CANCER_TYPE']
    if cancerType in dictionaryOfRelatedGenes and gene in dictionaryOfRelatedGenes[cancerType]:
        # Any related gene whose Role is not 'TSG' is labelled as a related oncogene
        return 'related_tsg' if row['Role'] == 'TSG' else 'related_oncogene'
    if row['Role'] == 'TSG':
        return 'other_tsg'
    if row['Role'] == 'Oncogene':
        return 'other_oncogene'
    return 'other'

phasingHyper['label'] = phasingHyper.apply(label_phased_pair, axis=1)

# 1 = trans, 0 = cis, missing for any other phase call
phasingHyper['isTrans'] = phasingHyper['adjPhase'].apply(lambda x: 1 if x == 'trans' else 0 if x == 'cis' else None)
phasingHyperWrite = phasingHyper[phasingHyper['isTrans'].notnull()]

In [45]:
# Write the rows with a resolved cis/trans call. phasingHyperWrite is built in the preceding cell
# for this purpose but was never used; the unfiltered phasingHyper was written instead.
writePath = os.path.join(writeDir, 'figure_2d.tsv')
phasingHyperWrite.to_csv(writePath, index=False, sep='\t')

## Figure 2E

In [46]:
def summarize_double_hit_occurence_data(maf, lossAnnotatedMaf, tsgs=None):
    """Summarise, per allele, how often oncogenic mutations occur as validated double hits.

    Parameters
    ----------
    maf : pandas.DataFrame
        Must contain 'isOncogenic', 'isDoubleHit', 'doubleHitValidates', 'allele' and
        'Hugo_Symbol'. Only rows with isOncogenic == True are considered.
    lossAnnotatedMaf : pandas.DataFrame
        Must contain 'lossType', 'Hugo_Symbol' and 'Tumor_Sample_Barcode'. Rows whose lossType
        is not False count as biallelic loss for that gene.
    tsgs : collection of gene symbols, optional
        Tumor suppressor genes; loaded from get_gene_and_cohort_list_utils when omitted (the
        previous behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per allele that has at least one double hit whose 'doubleHitValidates' is not
        'other_SNPs_unbalanced', in order of first appearance, with columns:
        'gene' (text before the first '_' of the allele), 'allele', 'nGene' (driver rows in the
        gene), 'nAllele' (driver rows of the allele), 'geneType' ('TSG' or 'Oncogene'),
        'nDoubleHit', and 'nBiallelicLoss' (nDoubleHit plus the number of distinct samples with
        a loss annotation in the gene).
    """
    if tsgs is None:
        tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    driverMaf = maf[maf['isOncogenic'] == True]
    doubleHitValidMaf = driverMaf[(driverMaf['isDoubleHit'] == True) & (driverMaf['doubleHitValidates'] != 'other_SNPs_unbalanced')]

    # Tally everything in one pass over each frame instead of re-filtering them for every allele
    doubleHitCounts = Counter(doubleHitValidMaf['allele'])
    geneCounts = Counter(driverMaf['Hugo_Symbol'])
    geneAlleleCounts = Counter(zip(driverMaf['Hugo_Symbol'], driverMaf['allele']))

    #note this does not include biallelic loss via 2x mutation
    lossRows = lossAnnotatedMaf[lossAnnotatedMaf['lossType'] != False]
    lossCasesByGene = {}
    for lossGene, barcode in zip(lossRows['Hugo_Symbol'], lossRows['Tumor_Sample_Barcode']):
        lossCasesByGene.setdefault(lossGene, set()).add(barcode)

    listOfDicts = []
    for allele in pd.unique(doubleHitValidMaf['allele']):
        gene = allele.split('_')[0]
        doubleHitCount = doubleHitCounts[allele]
        nCasesBiallelicLoss = len(lossCasesByGene.get(gene, ()))

        geneType = 'TSG' if gene in tsgs else 'Oncogene'
        listOfDicts.append({
            'gene': gene, 'allele': allele, 'nGene': geneCounts[gene], 'nAllele': geneAlleleCounts[(gene, allele)],
            'geneType': geneType, 'nDoubleHit': doubleHitCount, 'nBiallelicLoss': doubleHitCount + nCasesBiallelicLoss
        })
    return pd.DataFrame(listOfDicts)
        
        

In [48]:
#TAKES APPROXIMATELY 5 minutes
def _build_var_uuid(mafDf):
    # Per-variant key '<sample>_<start position>_<alt allele>', built column-wise; it yields the
    # same strings as the earlier row-wise apply without iterating the MAF in Python
    return (mafDf['Tumor_Sample_Barcode'] + '_' + mafDf['Start_Position'].astype(str)
            + '_' + mafDf['Tumor_Seq_Allele2'].astype(str))

def _allele_label(row):
    # Allele identifier in the form '<gene>_<protein change>'; missing values become 'nan'
    return str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short'])

#Load the maf with clonality annotation as its crucial to have clonality info and the normal maf
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
allImpactMutsMaf['varUuid'] = _build_var_uuid(allImpactMutsMaf)
oncogenicMutIds = set(allImpactMutsMaf[allImpactMutsMaf['oncogenic'].notnull()]['varUuid'])
mafWithClonalityAnnotation = pd.read_csv(filePathDict['IMPACT_MAF_WITH_ADJUSTED_CLONALITY_ANNOTATION'])
mafWithClonalityAnnotation['varUuid'] = _build_var_uuid(mafWithClonalityAnnotation)
# A variant is oncogenic when the base MAF carries an 'oncogenic' annotation for the same key
mafWithClonalityAnnotation['isOncogenic'] = mafWithClonalityAnnotation['varUuid'].isin(oncogenicMutIds)

#mark cases with whether they have flat genomes and their median vaf
flatGenomeCases = clonality_analysis_util.get_facets_whitelist()
# Pass the configured ID directory explicitly, as the Figure 2A and 2D cells do
hypermutantCases = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(
    hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# .copy() so the column assignments below do not target a slice of the parent frame (these
# assignments are what triggered pandas' SettingWithCopyWarning)
hypermutantMaf = mafWithClonalityAnnotation[mafWithClonalityAnnotation['Tumor_Sample_Barcode'].isin(hypermutantCases)].copy()
hypermutantMaf['flatGenome'] = hypermutantMaf['Tumor_Sample_Barcode'].apply(lambda x: x in flatGenomeCases)
hypermutantMaf = maf_analysis_utils.mark_cases_with_median_clonal_vaf_of_case(hypermutantMaf)

#annotate double hit mutations
hypermutantMaf['isDoubleHit'] = hypermutantMaf.apply(lambda row:
    clonality_analysis_util.is_mut_double_hit(row, row['flatGenome'],
                                              doubleFactor=2), axis=1)
hypermutantMaf = clonality_analysis_util.annotate_double_hit_mutations(hypermutantMaf)

#focus specifically on double hit drivers
driverMaf = hypermutantMaf[hypermutantMaf['isOncogenic'] == True].copy()
driverMaf['geneCase'] = driverMaf.apply(lambda row: row['Hugo_Symbol'] + '_' + row['Tumor_Sample_Barcode'], axis=1)
occurenceDict = dict(driverMaf['geneCase'].value_counts())
# A gene hit by more than one driver mutation in the same sample is a "multiplet"
driverMaf['isMultiplet'] = driverMaf['geneCase'].apply(lambda x: True if occurenceDict[x] > 1 else False)
# lcn == 0 is treated as loss of heterozygosity
driverMaf['isLOH'] = driverMaf['lcn'] == 0
# LOH takes precedence over composite mutation; False marks "no loss detected"
driverMaf['lossType'] = driverMaf.apply(lambda row: 'LOH' if row['isLOH'] == True
    else 'composite_mutation' if row['isMultiplet'] == True else False, axis=1)

hypermutantMaf['allele'] = hypermutantMaf.apply(_allele_label, axis=1)
driverMaf['allele'] = driverMaf.apply(_allele_label, axis=1)

#get a dataframe for plotting that summarizes double hit information
df = summarize_double_hit_occurence_data(hypermutantMaf, driverMaf)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [49]:
# Save the Figure 2E double-hit summary as a tab-separated file without the index column
writePath = os.path.join(writeDir, 'figure_2e.tsv')
df.to_csv(writePath, sep='\t', index=False)