In [38]:
import pandas as pd
import sys
import argparse
import os
import pandas as pd
import numpy as np
import math
import re
from collections import Counter
import scipy.stats as stats

notebookPath = 'scripts/figure3'
projectDir = re.sub(notebookPath, '', os.getcwd())
sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
sys.path.append(os.path.join(projectDir, 'scripts/utilityScripts'))

import configuration_util
import get_gene_and_cohort_list_utils
import analysis_utils
import mutationSigUtils
import maf_analysis_utils

filePathDict = configuration_util.get_all_files_path_dict()

In [31]:
#Set where to write the files
writeDir = os.path.join(projectDir, 'scripts/figure3/FIGURE3_PLOTTING_FILES/plotDataFiles')

## Mutations in essential genes figure

In [33]:
def summarize_essential_gene_mutation_burden(maf, cancerTypeEssentialGenesDict):
    impactGenes = get_gene_and_cohort_list_utils.get_im6_genes()
    listOfDicts = []
    cntr = 0
    for case in set(maf['Tumor_Sample_Barcode']):
        if cntr %50== 0: print round((1.0*cntr)/len(set(maf['Tumor_Sample_Barcode'])), 2),
        cntr +=1
        caseMaf = maf[maf['Tumor_Sample_Barcode'] == case]
        cancerType = caseMaf['cancerTypeDepMap'].iloc[0]
        essentialGenes = cancerTypeEssentialGenesDict[cancerType]
        essentialGeneMaf = caseMaf[caseMaf['Hugo_Symbol'].isin(essentialGenes)]
        neutralGeneMaf = caseMaf[~caseMaf['Hugo_Symbol'].isin(set(essentialGenes) | set(impactGenes))]
        essentialGeneCompositeMaf = essentialGeneMaf[essentialGeneMaf['isComposite'] == 1]
        neutralGeneCompositeMaf = neutralGeneMaf[neutralGeneMaf['isComposite'] == 1]
        
        listOfDicts.append({'Tumor_Sample_Barcode': case,
                            'nNeutralTruncating': neutralGeneMaf.shape[0],
                            'nEssentialTruncatingGenes': len(set(essentialGeneMaf['Hugo_Symbol'])),
                            'nNeutralTruncatingGenes': len(set(neutralGeneMaf['Hugo_Symbol'])),
                            'nEssentialDoubleTruncatingGenes': len(set(essentialGeneCompositeMaf['Hugo_Symbol'])),
                            'nNeutralDoubleTruncatingGenes': len(set(neutralGeneCompositeMaf['Hugo_Symbol'])),
                           })
    return pd.DataFrame(listOfDicts)

In [34]:
exomeHypermutatorMaf = pd.read_table(filePathDict['ALL_EXOME_HYPERMUTATOR_MAF'])
exomeHypermutatorMafWithCancerType = maf_analysis_utils.annotate_maf_with_dep_map_cancer_type(exomeHypermutatorMaf, tcgaInfoPath=filePathDict['TCGA_CANCER_TYPE_INFO'], impactInfoPath=filePathDict['CANCER_TYPE_INFO'])

#specifically run analysis on certain cancer types where we can identify essential genes
cTypes = set(exomeHypermutatorMafWithCancerType['cancerTypeDepMap'])
d = get_gene_and_cohort_list_utils.get_cancer_type_specific_dep_map_data(cancerTypes = cTypes)
cancerTypeEssentialGenesDict = get_gene_and_cohort_list_utils.get_cancer_type_specific_essential_genes(cTypes, essentialThresh = -1)

exomeHypermutatorMafWithCancerType = exomeHypermutatorMafWithCancerType[exomeHypermutatorMafWithCancerType['cancerTypeDepMap'] != 'other']
exomeHypermutatorMafWithCancerType = exomeHypermutatorMafWithCancerType[exomeHypermutatorMafWithCancerType['cancerTypeDepMap'] != 'other']
truncatingConsequences = set(['Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins'])
truncatingMaf = exomeHypermutatorMafWithCancerType[exomeHypermutatorMafWithCancerType['Variant_Classification'].isin(truncatingConsequences)]

truncatingMaf['geneCase'] = truncatingMaf.apply(lambda row: row['Tumor_Sample_Barcode'] + '_' + row['Hugo_Symbol'], axis=1)
mutCounts = truncatingMaf['geneCase'].value_counts()
truncatingMaf['isComposite'] = truncatingMaf['geneCase'].apply(lambda x:
            1 if x in mutCounts and mutCounts[x] > 1 else 0)

df = summarize_essential_gene_mutation_burden(truncatingMaf, cancerTypeEssentialGenesDict)

0.0 0.07 0.15 0.22 0.3 0.37 0.44 0.52 0.59 0.67 0.74 0.81 0.89 0.96


In [35]:
writePath = os.path.join(writeDir, 'figure_3z1.tsv')
df.to_csv(writePath, index=False, sep='\t')


## Ongoing evolution late in tumor development

In [43]:
def identify_parallel_evolution(maf, trunkId, branchId, pid):
    oncoMaf = maf[maf['oncogenic'].notnull()]
    otherBranchesMaf = oncoMaf[(~oncoMaf['adjLabel'].isin([trunkId, branchId])) &
                    (oncoMaf['pid'] == pid)]
    branchMaf = oncoMaf[oncoMaf['adjLabel'] == branchId]
    if len(set(otherBranchesMaf['Hugo_Symbol']) & set(branchMaf['Hugo_Symbol'])) > 0:
        return True
    else:
        return False
    
#todo this could return something non-boolean
def identify_tsg_second_hit(maf, trunkId, branchId):
    oncoMaf = maf[maf['oncogenic'].notnull()]
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    oncoTsgMaf = oncoMaf[oncoMaf['Hugo_Symbol'].isin(tsgs)]
    trunkOncoTsg = oncoTsgMaf[oncoTsgMaf['adjLabel'] == trunkId].drop_duplicates(subset=['varUuid']) #NOTE all mutations are at least duplicated in the truncal maf, fix it
    branchOncoTsg = oncoTsgMaf[oncoTsgMaf['adjLabel'] == branchId]
    
    oneHitTrunkTsgs = [gene for gene, count in dict(trunkOncoTsg['Hugo_Symbol'].value_counts()).items() if count == 1]
    secondHitTsgs = set(branchOncoTsg['Hugo_Symbol']) & set(oneHitTrunkTsgs)
    if len(secondHitTsgs) > 0:
        return True
    else:
        return False
    
def identify_denovo_biallelic_inactivation(maf, trunkId, branchId):
    oncoMaf = maf[maf['oncogenic'].notnull()]
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    oncoTsgMaf = oncoMaf[oncoMaf['Hugo_Symbol'].isin(tsgs)]
    trunkOncoTsg = oncoTsgMaf[oncoTsgMaf['adjLabel'] == trunkId].drop_duplicates(subset=['varUuid']) #NOTE all mutations are at least duplicated in the truncal maf, fix it
    branchOncoTsg = oncoTsgMaf[oncoTsgMaf['adjLabel'] == branchId]
    
    doubleHitBranchTsgs = [gene for gene, count in dict(branchOncoTsg['Hugo_Symbol'].value_counts()).items() if count > 1]
    denovoBranchBiallelic = set(doubleHitBranchTsgs) - set(trunkOncoTsg['Hugo_Symbol'])
    if len(denovoBranchBiallelic) > 0:
        return True
    else:
        return False

def summarize_mut_branch_info(maf, relatedGenesDict):
    listOfDicts = []
    for pid in set(maf['pid']):
        patientMaf = maf[maf['pid'] == pid]
        trunkLabel = pid + '_trunk'
        branchNumber = 1
        for branch in set(patientMaf['adjLabel']) - set([trunkLabel]):
            trunkMaf = mutsLabeledByBranch[mutsLabeledByBranch['adjLabel'] == trunkLabel]
            branchMaf = mutsLabeledByBranch[mutsLabeledByBranch['Tumor_Sample_Barcode'] == branch]
            
            cancerType = branchMaf['cancerType'].iloc[0]
            relatedGenes = relatedGenesDict[cancerType]
            
            #Number of related and unrelated drivers
            nRelatedDriverTrunk = trunkMaf[(trunkMaf['oncogenic'].notnull()) & (trunkMaf['Hugo_Symbol'].isin(relatedGenes))].shape[0]
            nUnrelatedDriverTrunk = trunkMaf[(trunkMaf['oncogenic'].notnull()) & (~trunkMaf['Hugo_Symbol'].isin(relatedGenes))].shape[0]    
            nRelatedDriverBranch = branchMaf[(branchMaf['oncogenic'].notnull()) & (branchMaf['Hugo_Symbol'].isin(relatedGenes))].shape[0]
            nUnrelatedDriverBranch = branchMaf[(branchMaf['oncogenic'].notnull()) & (~branchMaf['Hugo_Symbol'].isin(relatedGenes))].shape[0]    
            
            convergentEvolution = identify_parallel_evolution(mutsLabeledByBranch, trunkLabel, branch, pid)
            tsgSecondHit = identify_tsg_second_hit(mutsLabeledByBranch, trunkLabel, branch)
            denovoBiallelicInactivation = identify_denovo_biallelic_inactivation(mutsLabeledByBranch, trunkLabel, branch)
            
            listOfDicts.append({'cancerType': cancerType, 'branchNumber': branchNumber,
                               'trunkId': trunkLabel, 'nRelatedTrunk': nRelatedDriverTrunk, 'nUnrelatedTrunk': nUnrelatedDriverTrunk,
                                'nRelatedBranch': nRelatedDriverBranch, 'nUnrelatedBranch': nUnrelatedDriverBranch,
                                'branchId': branch, 'convergentEvolution': convergentEvolution,
                                'secondHitTSG': tsgSecondHit, 'denovoBiallelicInactivation': denovoBiallelicInactivation
                               })
            
            branchNumber += 1
            
    df = pd.DataFrame(listOfDicts)
    
    cancerTypeOrdering = {'Bladder Cancer': '1', 'Colorectal Cancer': '2', 'Endometrial Cancer': '3',
                         'Esophagogastric Cancer': '4', 'Glioma': '5', 'Prostate Cancer': '6'}
    
    df['orderingVal'] = df.apply(lambda row: str(cancerTypeOrdering[row['cancerType']]) +
                re.sub("[^0-9]", '', str(row['trunkId'])) + str(row['branchNumber']), axis=1)
    return df
            
            

In [44]:
mutsLabeledByBranch = pd.read_table(filePathDict['BRANCH_ANNOTATED_MAF'])
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = dict(get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO']))
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

mutsLabeledByBranch['barcodeOld'] = mutsLabeledByBranch['adjLabel'].apply(lambda x: x.split('_')[0])
mutsLabeledByBranch['cancerType'] = mutsLabeledByBranch['barcodeOld'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
allImpactMutsMaf['hypermutationStatus'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x:
    'hypermutated' if x in hypermutantIds else 'normal' if x in normalIds else 'Intermediate')

relatedGenesDict = get_gene_and_cohort_list_utils.get_related_genes_by_cancer_type(thresh = 1.0/30.0, impactMafPath = filePathDict['IMPACT_BASE_MAF'])

df = summarize_mut_branch_info(mutsLabeledByBranch, relatedGenesDict)

In [45]:
writePath = os.path.join(writeDir, 'figure_3z2.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Clonality figure

In [47]:
def summarize_clonality_info(maf):
    tmzIds = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_11')
    poleIds = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_10') | get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_14')
    mmrIds = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_MMR')
    #aging ids in hypermutators are actually MMR
    agingIds = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_1')
    endogenousIds = poleIds | mmrIds | agingIds
    hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])

    hypermutantMaf = maf[maf['Tumor_Sample_Barcode'].isin(hypermutantIds)]
    tmzGliomaIds = set(hypermutantMaf[(hypermutantMaf['Tumor_Sample_Barcode'].isin(tmzIds)) & (hypermutantMaf['cancerType'] == 'Glioma')]['Tumor_Sample_Barcode'])
    
    oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    
    hypermutantMaf['driverType'] = hypermutantMaf['oncogenic'].apply(lambda x: 'driver' if len(str(x)) > 4 else 'VUS')
    hypermutantMaf['geneType'] = hypermutantMaf['Hugo_Symbol'].apply(lambda x: 'TSG' if x in tsgs else 'Oncogene' if x in oncogenes else None)
    hypermutantMaf['mutationType'] = hypermutantMaf['Tumor_Sample_Barcode'].apply(lambda x: 'Endogenous' if x in endogenousIds else 'TMZ-glioma' if x in tmzGliomaIds else None)
    
    hypermutantMaf = hypermutantMaf[hypermutantMaf['HGVSp_Short'].notnull()]
    
    return hypermutantMaf

In [48]:
"""allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
allImpactMutsMaf['varUuid'] = allImpactMutsMaf.apply(lambda row: str(row['Tumor_Sample_Barcode']) + '_' +
                               str(row['Start_Position']) + '_' + str(row['Tumor_Seq_Allele2']), axis=1)
oncogenicDict = dict(zip(allImpactMutsMaf['varUuid'], allImpactMutsMaf['oncogenic']))

clonalityMaf = pd.read_csv(filePathDict['IMPACT_MAF_WITH_ADJUSTED_CLONALITY_ANNOTATION'])
clonalityMaf['varUuid'] = clonalityMaf.apply(lambda row: str(row['Tumor_Sample_Barcode']) + '_' +
                               str(row['Start_Position']) + '_' + str(row['Tumor_Seq_Allele2']), axis=1)
clonalityMaf['oncogenic'] = clonalityMaf['varUuid'].apply(lambda x: oncogenicDict[x] if x in oncogenicDict else None)

cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(filePathDict['CANCER_TYPE_INFO'])
clonalityMaf['cancerType'] = clonalityMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
"""
summaryMaf = summarize_clonality_info(clonalityMaf)

summaryMaf['clonal'] = summaryMaf['isClonal'].apply(lambda x: 1 if x == True else 0 if x == False else None)
summaryMaf = summaryMaf[['clonal', 'Tumor_Sample_Barcode', 'driverType', 'geneType', 'mutationType', 'Hugo_Symbol']]

  sigsDf = pd.read_table(impactSigsPath)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [49]:
writePath = os.path.join(writeDir, 'figure_3z3.tsv')
summaryMaf.to_csv(writePath, index=False, sep='\t')