In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re
import math
from collections import Counter

# Make the project's utility scripts importable before importing them below.
# NOTE(review): this is an absolute, machine-specific path.
sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import configuration_util

# Mapping from a symbolic name (e.g. 'IMPACT_BASE_MAF', 'CANCER_TYPE_INFO') to a path;
# the cells below index it instead of spelling out file locations.
filePathDict = configuration_util.get_all_files_path_dict()
import analysis_utils
import get_gene_and_cohort_list_utils

## FIGURE 2B: RELATED VS UNRELATED

In [14]:
#
####
############

#summarize d driver/d vus by gene type

def summarize_related_unrelated_driver_frac(maf, relatedGenesD, tmbDict):
    """Per case, compute the fraction of mutations that are drivers in related vs unrelated genes.

    A mutation is counted as a driver when its 'oncogenic' value is non-null.
    A gene is "related" for a case when it is listed in ``relatedGenesD`` under
    the case's cancer type; every other gene is "unrelated".

    Parameters
    ----------
    maf : pandas.DataFrame
        Mutation table with the columns 'Tumor_Sample_Barcode', 'cancerType',
        'hypermutationStatus', 'Hugo_Symbol' and 'oncogenic'.
    relatedGenesD : dict
        Maps a cancer type to a collection of related gene symbols. A cancer
        type that is missing from the dict is treated as having no related genes.
    tmbDict : dict
        Maps a case id to its TMB. Cases missing from the dict get TMB None.

    Returns
    -------
    pandas.DataFrame
        One row per case with 'Tumor_Sample_Barcode', 'cancerType', 'TMB',
        'hypermutationStatus', 'fracDriverRelated' and 'fracDriverUnrelated'.
        A fraction is None when the case has no mutations in that gene class.
    """
    listOfDicts = []
    # groupby hands over each case's rows in a single pass instead of
    # re-scanning the whole MAF once per case.
    for case, caseMaf in maf.groupby('Tumor_Sample_Barcode'):
        # cancer type and hypermutation status are per-case attributes, so the first row suffices
        cancerType = caseMaf['cancerType'].iloc[0]
        hypermutationStatus = caseMaf['hypermutationStatus'].iloc[0]
        relatedGenes = relatedGenesD.get(cancerType, [])

        isRelated = caseMaf['Hugo_Symbol'].isin(relatedGenes)
        isDriver = caseMaf['oncogenic'].notnull()

        nTotalRelated = float(isRelated.sum())
        nTotalUnrelated = float((~isRelated).sum())
        nRelatedDrivers = float((isRelated & isDriver).sum())
        nUnrelatedDrivers = float((~isRelated & isDriver).sum())

        ratioRelated = None if nTotalRelated == 0 else nRelatedDrivers/nTotalRelated
        ratioUnrelated = None if nTotalUnrelated == 0 else nUnrelatedDrivers/nTotalUnrelated

        listOfDicts.append({'Tumor_Sample_Barcode': case, 'cancerType': cancerType,
                            'TMB': tmbDict.get(case), 'hypermutationStatus': hypermutationStatus,
                            'fracDriverRelated': ratioRelated, 'fracDriverUnrelated': ratioUnrelated})

    return pd.DataFrame(listOfDicts)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
  df = pd.read_table(tmbFilePath)


In [15]:
# Load the IMPACT MAF and annotate every mutation with its case's cancer type
# (None when the case is not in the cancer type table) and hypermutation status.
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = dict(get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO']))
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# Cases in neither id list are labelled 'Intermediate'.
allImpactMutsMaf['hypermutationStatus'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x:
    'hypermutated' if x in hypermutantIds else 'normal' if x in normalIds else 'Intermediate')

hypermutationMaf = allImpactMutsMaf[allImpactMutsMaf['hypermutationStatus'] == 'hypermutated']
# NOTE: get_related_genes_by_cancer_type is defined in a later cell of this notebook
# (supplementary-figure section); that cell must be run before this one.
# Related genes are derived from the full MAF; the summary itself uses hypermutated cases only.
relatedGenesDict = get_related_genes_by_cancer_type(allImpactMutsMaf, thresh = 1.0/30.0)
tmbDict = get_gene_and_cohort_list_utils.get_all_tmb_info(tmbFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'])
df = summarize_related_unrelated_driver_frac(hypermutationMaf, relatedGenesDict, tmbDict)

# NOTE(review): a later cell also writes to relatedUnrelated.tsv — confirm which table the file should hold.
df.to_csv('~/Desktop/WORK/dataForLocalPlotting/relatedUnrelated.tsv', index=False, sep='\t')


  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


## OTHER ANALYSES

In [2]:
def get_n_consequential_mut_count(allImpactMuts=None, cases=None):
    """Count hotspot, oncogenic and stop-gain mutations per case within the im3Genes gene list.

    Parameters
    ----------
    allImpactMuts : pandas.DataFrame, optional
        Mutation table with the columns 'Hugo_Symbol', 'Tumor_Sample_Barcode',
        'is-a-hotspot', 'oncogenic' and 'Consequence'. When omitted the table
        is read from the path built on the notebook-level ``pathPrefix``
        (the original behaviour).
    cases : iterable, optional
        Case ids that must appear in every returned dict; cases without a
        matching mutation get the value 0. When omitted the cases are taken
        from the notebook-level ``expectedDf['case']`` (the original behaviour).

    Returns
    -------
    tuple of dict
        (hotspotDict, oncogenicDict, stopGainDict), each mapping a case id to
        a mutation count.
    """
    if allImpactMuts is None:
        allImpactMuts = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/data_mutations_extended_annotated_sigContext_nov19_2019.maf')
    if cases is None:
        cases = set(expectedDf['case'])

    im3Genes = set(['ABL1', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BAP1', 'BARD1', 'BBC3', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CARD11', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79B', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSF1R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EP300', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERG', 'ESR1', 'ETV1', 'ETV6', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXP1', 'FUBP1', 'GATA1', 'GATA2', 'GATA3', 'GNA11', 'GNAQ', 'GNAS', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3B', 'HNF1A', 'HRAS', 'ICOSLG', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INPP4A', 'INPP4B', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAPK1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MITF', 'MLH1', 'MLL', 'MLL2', 'MLL3', 'MPL', 'MRE11A', 'MSH2', 'MSH6', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOR1', 'NF1', 'NF2', 'NFE2L2', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTRK1', 'NTRK2', 'NTRK3', 'PAK1', 'PAK7', 'PALB2', 'PARK2',
'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLE', 'PPP2R1A', 'PRDM1', 'PRKAR1A', 'PTCH1', 'PTEN', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAC1', 'RAD50', 'RAD51', 'RAD51B', 'RAD51C', 'RAD51D', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RUNX1', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SETD2', 'SF3B1', 'SH2D1A', 'SHQ1', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SOCS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SRC', 'STAG2', 'STK11', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TBX3', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP63', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'VHL', 'VTCN1', 'WT1', 'XIAP', 'XPO1', 'YAP1', 'YES1'])
    allImpactMuts341 = allImpactMuts[allImpactMuts['Hugo_Symbol'].isin(im3Genes)]

    def count_by_case(rowMask):
        # number of selected mutations per Tumor_Sample_Barcode
        return dict(allImpactMuts341[rowMask]['Tumor_Sample_Barcode'].value_counts())

    hotspotDict = count_by_case(allImpactMuts341['is-a-hotspot'] == 'Y')
    oncogenicDict = count_by_case(allImpactMuts341['oncogenic'].notnull())
    stopGainDict = count_by_case(allImpactMuts341['Consequence'] == 'stop_gained')

    # value_counts only reports cases with at least one hit; add explicit zeros for the rest
    for case in cases:
        for countDict in (hotspotDict, oncogenicDict, stopGainDict):
            countDict.setdefault(case, 0)
    return hotspotDict, oncogenicDict, stopGainDict

In [3]:
def enumerate_related_and_weakly_related_genes(df, mode='dnds', cancerTypes = ['Endometrial Cancer', 'Colorectal Cancer'],
                                               strongThresh=.01, weakThresh=.1):
    """Build nested gene sets per cancer type from dN/dS q-values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns 'cancerType', 'gene_name', 'qglobal_cv.Normal'
        and 'qglobal_cv.Hypermutated'.
    mode : str
        Only 'dnds' is implemented.
    cancerTypes : iterable of str
        Cancer types to report; values are matched against df['cancerType'] verbatim.
    strongThresh : float
        q-value cutoff (inclusive) for the 'normal' and strong hypermutated sets.
    weakThresh : float
        q-value cutoff (inclusive) for the weak hypermutated set.

    Returns
    -------
    dict
        cancerType -> {'normal', 'normalAndHyperStrong', 'normalAndHyperStrongAndWeak'},
        each a set of gene names; every set contains the previous one.

    Raises
    ------
    NotImplementedError
        For mode 'counts', which previously fell through and returned None.
    ValueError
        For any other mode, which previously only printed a message and returned None.
    """
    if mode == 'dnds':
        dictOfDicts = {}
        for cancerType in cancerTypes:
            cancerTypeDf = df[df['cancerType'] == cancerType]
            normal = set(cancerTypeDf[cancerTypeDf['qglobal_cv.Normal'] <= strongThresh]['gene_name'])
            normalAndHyperStrong = normal | set(cancerTypeDf[cancerTypeDf['qglobal_cv.Hypermutated'] <= strongThresh]['gene_name'])
            normalAndHyperStrongAndWeak = normalAndHyperStrong | set(cancerTypeDf[cancerTypeDf['qglobal_cv.Hypermutated'] <= weakThresh]['gene_name'])
            dictOfDicts[cancerType] = {'normal': normal, 'normalAndHyperStrong': normalAndHyperStrong, 'normalAndHyperStrongAndWeak': normalAndHyperStrongAndWeak}
        return dictOfDicts

    if mode == 'counts':
        # the original branch only assigned two unused thresholds and returned nothing
        raise NotImplementedError("mode 'counts' is not implemented")

    raise ValueError('error invalid mode specified')


In [4]:
def get_median_expected_rate(expectedDf, mode='oncogenic'):
    """Return the median, across cases, of the per-case summed expected rate.

    Parameters
    ----------
    expectedDf : pandas.DataFrame
        Table with a 'case' column and, for mode 'oncogenic', an
        'oncogenicChance' column.
    mode : str
        Selects the column to sum; only 'oncogenic' is supported.

    Returns
    -------
    float
        NaN-ignoring median of the per-case sums.

    Raises
    ------
    ValueError
        For an unsupported mode (previously this ended in a lookup of column None).

    Unlike the original loop, this version prints no progress counter.
    """
    columnByMode = {'oncogenic': 'oncogenicChance'}
    if mode not in columnByMode:
        raise ValueError('unsupported mode: %r' % (mode,))
    col = columnByMode[mode]

    # One grouped pass instead of re-filtering the table per case.
    # skipna=False mirrors the builtin sum(): a case with a missing value gets
    # a NaN total, which nanmedian then ignores.
    rates = [chances.sum(skipna=False) for _, chances in expectedDf.groupby('case')[col]]
    return np.nanmedian(rates)

In [5]:
def summarize_obs_vs_expected_divergence_explained_by_gene_classes(expectedDf, mutsDf, geneDicts,
                                                                    mode='oncogenic', fallbackExpectedRate=0.05):
    """Per case, compare observed oncogenic SNP counts with the expected count and split them by gene class.

    Only SNPs in the im3Genes list are considered. The expected count of a
    case is its number of such SNPs times the summed 'oncogenicChance' of the
    case in ``expectedDf``; cases absent from ``expectedDf`` use
    ``fallbackExpectedRate`` instead.

    Parameters
    ----------
    expectedDf : pandas.DataFrame
        Table with the columns 'case' and 'oncogenicChance'.
    mutsDf : pandas.DataFrame
        Mutation table with the columns 'Hugo_Symbol', 'Variant_Type',
        'Tumor_Sample_Barcode', 'oncogenic' and 'cancerType'.
    geneDicts : dict
        cancer type (spaces replaced by underscores) -> dict with the gene
        collections 'normal', 'normalAndHyperStrong' and 'normalAndHyperStrongAndWeak'.
    mode : str
        Only 'oncogenic' is supported.
    fallbackExpectedRate : float
        Per-mutation expected rate for cases missing from ``expectedDf``
        (default 0.05, the value previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        One row per case that has at least one oncogenic SNP in the gene list.

    Raises
    ------
    ValueError
        For an unsupported mode (previously this failed with a TypeError
        inside the loop).
    """
    if mode != 'oncogenic':
        raise ValueError("unsupported mode %r: only 'oncogenic' is implemented" % (mode,))

    listOfDicts = []

    im3Genes = set(['ABL1', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BAP1', 'BARD1', 'BBC3', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CARD11', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79B', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSF1R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EP300', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERG', 'ESR1', 'ETV1', 'ETV6', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXP1', 'FUBP1', 'GATA1', 'GATA2', 'GATA3', 'GNA11', 'GNAQ', 'GNAS', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3B', 'HNF1A', 'HRAS', 'ICOSLG', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INPP4A', 'INPP4B', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAPK1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MITF', 'MLH1', 'MLL', 'MLL2', 'MLL3', 'MPL', 'MRE11A', 'MSH2', 'MSH6', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOR1', 'NF1', 'NF2', 'NFE2L2', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTRK1', 'NTRK2', 'NTRK3', 'PAK1', 'PAK7', 'PALB2', 'PARK2',
'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLE', 'PPP2R1A', 'PRDM1', 'PRKAR1A', 'PTCH1', 'PTEN', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAC1', 'RAD50', 'RAD51', 'RAD51B', 'RAD51C', 'RAD51D', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RUNX1', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SETD2', 'SF3B1', 'SH2D1A', 'SHQ1', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SOCS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SRC', 'STAG2', 'STK11', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TBX3', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP63', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'VHL', 'VTCN1', 'WT1', 'XIAP', 'XPO1', 'YAP1', 'YES1'])
    mutsDf = mutsDf[mutsDf['Hugo_Symbol'].isin(im3Genes)]
    mutsDf = mutsDf[mutsDf['Variant_Type'] == 'SNP']
    # SNP count per case in the gene list, taken BEFORE restricting to oncogenic mutations
    nmutDict = dict(mutsDf['Tumor_Sample_Barcode'].value_counts())
    mutsDf = mutsDf[mutsDf['oncogenic'].notnull()]

    # Summed oncogenic chance per case, computed once up front.
    # skipna=False mirrors the builtin sum() (a missing value makes the total NaN).
    expectedRateByCase = dict(
        (case, chances.sum(skipna=False))
        for case, chances in expectedDf.groupby('case')['oncogenicChance'])

    for cntr, (case, caseDf) in enumerate(mutsDf.groupby('Tumor_Sample_Barcode')):
        if cntr % 100 == 0:
            sys.stdout.write('%d ' % cntr)  # progress indicator

        # geneDicts is keyed by the underscore-separated cancer type name
        cancerType = '_'.join(caseDf['cancerType'].iloc[0].split(' '))
        nmutObs = caseDf.shape[0]

        #this can be added if needed
        #.drop_duplicates(subset=['Hugo_Symbol'])
        geneClasses = geneDicts[cancerType]
        nmutNormal = caseDf[caseDf['Hugo_Symbol'].isin(geneClasses['normal'])].shape[0]
        nmutNormalAndHyperStrong = caseDf[caseDf['Hugo_Symbol'].isin(geneClasses['normalAndHyperStrong'])].shape[0]
        nmutNormalAndHyperAll = caseDf[caseDf['Hugo_Symbol'].isin(geneClasses['normalAndHyperStrongAndWeak'])].shape[0]

        nmutCase = nmutDict[case]
        expected = nmutCase * expectedRateByCase.get(case, fallbackExpectedRate)

        listOfDicts.append({'nmut': nmutCase, 'Tumor_Sample_Barcode': case, 'cancerType': cancerType,
                            'dif': nmutObs - expected, 'observed': nmutObs, 'expected': expected,
                            'nmutNormal': nmutNormal, 'nmutNormalAndHyperStrong': nmutNormalAndHyperStrong,
                            'nmutNormalAndHyperAll': nmutNormalAndHyperAll})

    df = pd.DataFrame(listOfDicts)
    return df
    

In [5]:
# Expected-mutation table; other cells read its 'case', 'gene', 'oncogenicChance',
# 'hotspotChance' and 'truncatingChance' columns.
expectedDf = pd.read_table(filePathDict['EXPECTED_MUTATION_INFO_BY_GENE'])


  """Entry point for launching an IPython kernel.


In [4]:
# Per-case counts of hotspot, oncogenic and stop-gain mutations.
# The function reads the notebook-level expectedDf, so that table must be loaded first.
hotspotDict, oncogenicDict, stopGainDict = get_n_consequential_mut_count()

  if self.run_code(code, result):


In [6]:
# dN/dS results table consumed by enumerate_related_and_weakly_related_genes.
# NOTE(review): pathPrefix is not defined in any visible cell of this notebook.
dndsDf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/dndsHypermutants.tsv')

In [5]:
# Annotated IMPACT mutation table.
# NOTE(review): pathPrefix is not defined in any visible cell; the saved output of this cell is
# "NameError: name 'pathPrefix' is not defined".
allImpactMuts = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/data_mutations_extended_annotated_sigContext_nov19_2019.maf')


NameError: name 'pathPrefix' is not defined

In [34]:
# Attach each case's cancer type to its mutations; cases missing from the lookup get None.
# NOTE(review): depends on pathPrefix, which is not defined in any visible cell.
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/cancerTypeInfo_asOfNov192019.txt')
allImpactMuts['cancerType'] = allImpactMuts['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)


In [58]:
# Cancer types to analyse, in the underscore-separated form passed to the dN/dS helper.
#cTypes = ['Endometrial_Cancer', 'Colorectal_Cancer', 'Prostate_Cancer', 'Esophagogastric_Cancer', 'Bladder_Cancer']
cTypes = ['Endometrial_Cancer', 'Colorectal_Cancer']
dictOfDicts = enumerate_related_and_weakly_related_genes(dndsDf, mode='dnds', cancerTypes=cTypes)
# Same names in space-separated form, used to filter the MAF's 'cancerType' column.
# Only the first two underscore-separated parts are kept, so this assumes two-word names.
cTypesIMForm = [i.split('_')[0] + ' ' + i.split('_')[1] for i in cTypes]

In [None]:
# Observed vs expected oncogenic mutation counts, restricted to the selected cancer types.
dfDiv = summarize_obs_vs_expected_divergence_explained_by_gene_classes(expectedDf, allImpactMuts[allImpactMuts['cancerType'].isin(cTypesIMForm)],
                                                        dictOfDicts, mode='oncogenic')

In [101]:
# Save the observed-vs-expected table as a TSV for local plotting.
# NOTE(review): absolute, machine-specific output path.
dfDiv.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/observedVsExpectedByType.tsv', index=False, sep='\t')


In [102]:
# Median per-case expected oncogenic rate; the saved output of this cell is 0.0564...,
# which is where the hard-coded 0.05 used in the divergence summary comes from.
get_median_expected_rate(expectedDf, mode='oncogenic')

25 50 75 100 125 150 175 200 225 250 275 300 325 350 375 400 425 450 475 500 525 550 575 600 625 650 675 700 725 750 775 800 825 850 875 900 925 950 975 1000 1025 1050 1075 1100 1125 1150 1175 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425 1450 1475 1500 1525 1550

0.05644723732825163




In [3]:
#
###
######
#############
#####
###
#

In [58]:
def summarize_relative_mutation_prevalence_hyper_vs_nonhyper(oncMutsDf, cancerTypes, dndsDict):
    """Compare, per gene and cancer type, how often hypermutated vs normal cases carry a mutation.

    For every gene mutated in at least one 'Normal' case of a cancer type, the
    fraction of 'Hypermutated' cases and the fraction of 'Normal' cases with a
    mutation in that gene are computed. Genes without an entry in the cancer
    type's dN/dS dict are left out of the result.

    Parameters
    ----------
    oncMutsDf : pandas.DataFrame
        Mutation table with the columns 'hypermutantStatus', 'cancerType',
        'Tumor_Sample_Barcode' and 'Hugo_Symbol'.
    cancerTypes : iterable of str
        Space-separated cancer type names as they appear in oncMutsDf.
    dndsDict : dict
        underscore-separated cancer type -> {gene: dN/dS score}.

    Returns
    -------
    pandas.DataFrame
        Rows with 'Gene', 'ratio' (hyper fraction / normal fraction),
        'cancerType', 'dndsScore' and 'percentHyper'.
    """
    def count_distinct_cases(geneRows):
        # number of different cases carrying at least one of these mutations
        return geneRows.drop_duplicates(subset=['Tumor_Sample_Barcode']).shape[0]

    rows = []
    for cancerType in cancerTypes:
        inCancerType = oncMutsDf['cancerType'] == cancerType
        hyperRows = oncMutsDf[(oncMutsDf['hypermutantStatus'] == 'Hypermutated') & inCancerType]
        normalRows = oncMutsDf[(oncMutsDf['hypermutantStatus'] == 'Normal') & inCancerType]

        # the dN/dS dict is keyed by the underscore-separated cancer type name
        geneScores = dndsDict[cancerType.replace(' ', '_')]

        hyperCaseCount = float(len(set(hyperRows['Tumor_Sample_Barcode'])))
        normalCaseCount = float(len(set(normalRows['Tumor_Sample_Barcode'])))

        for gene in set(normalRows['Hugo_Symbol']):
            fracHyper = count_distinct_cases(hyperRows[hyperRows['Hugo_Symbol'] == gene]) / hyperCaseCount
            fracNormal = count_distinct_cases(normalRows[normalRows['Hugo_Symbol'] == gene]) / normalCaseCount

            if gene not in geneScores:
                continue
            rows.append({'Gene': gene, 'ratio': fracHyper/fracNormal,
                         'cancerType': cancerType, 'dndsScore': geneScores[gene],
                         'percentHyper': fracHyper})
    return pd.DataFrame(rows)
    

In [13]:
def convert_dnds_data_to_dict(dndsDf, cancerTypes):
    """Turn the dN/dS table into a nested lookup: cancer type -> {gene name: 'qglobal_cv.Normal' value}.

    Every requested cancer type gets an entry; a cancer type with no rows in
    dndsDf maps to an empty dict.
    """
    lookup = {}
    for ct in cancerTypes:
        ctRows = dndsDf.loc[dndsDf['cancerType'] == ct, ['gene_name', 'qglobal_cv.Normal']]
        lookup[ct] = {gene: qval for gene, qval in zip(ctRows['gene_name'], ctRows['qglobal_cv.Normal'])}
    return lookup
        
    

In [8]:
# Load the dN/dS table and the IMPACT MAF from local offline copies.
# NOTE(review): this rebinds allImpactMutsMaf, which other cells load from filePathDict['IMPACT_BASE_MAF'].
dfDNDSHyper = pd.read_table('~/Desktop/offlineFilesForVirus/dndsHypermutants.tsv')
allImpactMutsMaf = pd.read_table('/Users/friedman/Desktop/offlineFilesForVirus/data_mutations_extended_annotated_sigContext_nov19_2019.maf')

  """Entry point for launching an IPython kernel.
  
  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Ids of all hypermutated cases.
# NOTE(review): depends on pathPrefix, which is not defined in any visible cell.
allHypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=pathPrefix + '/juno/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds')


In [None]:
# Re-import the utility module so edits made to it on disk are picked up (Python 2 builtin reload).
reload(get_gene_and_cohort_list_utils)
# Ids of all non-hypermutated ("normal") cases.
# NOTE(review): depends on pathPrefix, which is not defined in any visible cell.
allNormalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=pathPrefix + '/juno/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds')


In [7]:
# Attach each case's cancer type to its mutations; cases missing from the lookup get None.
# NOTE(review): depends on pathPrefix, which is not defined in any visible cell.
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/cancerTypeInfo_asOfNov192019.txt')
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)


In [63]:
# Nested lookup cancer type -> {gene: 'qglobal_cv.Normal' value} for the listed cancer types.
dndsDictForm = convert_dnds_data_to_dict(dfDNDSHyper, ['Endometrial_Cancer', 'Colorectal_Cancer', 'Bladder_Cancer', 'Prostate_Cancer', 'Esophagogastric_Cancer', 'Glioma'])

## Gene type and number of mutations explained

In [6]:
def create_prevalence_curves(maf, cancerTypes):
    """Build cumulative-prevalence curves of oncogenic mutations over genes ranked by frequency.

    For each cancer type the oncogenic mutations (non-null 'oncogenic') are
    split by mutation burden (non-Hypermutated / Hypermutated case ids) and by
    gene type (oncogene / tsg). Within each of the four subsets the genes are
    ranked by mutation count, and for rank n the curve records the fraction of
    the subset's mutations that fall in the n more frequent genes (so the
    first gene has frac 0).

    Parameters
    ----------
    maf : pandas.DataFrame
        Mutation table with the columns 'oncogenic', 'Tumor_Sample_Barcode'
        and 'Hugo_Symbol'.
    cancerTypes : iterable of str
        Cancer types passed to get_ids_by_hypermutant_status.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all curves with the columns 'Hugo_Symbol', 'n',
        'frac', 'geneType', 'cancerType', 'mutBurdenType' and 'cohort'.
    """
    def rank_genes_by_count(subMaf):
        # genes of subMaf, most frequently mutated first
        return [gene for gene, _ in Counter(subMaf['Hugo_Symbol']).most_common()]

    def build_curve(subMaf, rankedGenes, geneType, cancerType, mutBurdenType):
        mutsPerGene = Counter(subMaf['Hugo_Symbol'])
        nTotal = subMaf.shape[0]
        points = []
        mutsSoFar = 0.0
        for rank, gene in enumerate(rankedGenes):
            # the point is recorded before this gene's own mutations are added
            points.append({
                'Hugo_Symbol': gene, 'n': rank, 'frac': mutsSoFar/nTotal,
                'geneType': geneType, 'cancerType': cancerType, 'mutBurdenType': mutBurdenType,
                'cohort': geneType + '_' + mutBurdenType
            })
            mutsSoFar += mutsPerGene[gene]
        return pd.DataFrame(points)

    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
    geneClasses = (('oncogene', oncogenes), ('tsg', tsgs))

    oncogenicMaf = maf[maf['oncogenic'].notnull()]  # only look at oncogenic mutations
    curves = []
    for cancerType in cancerTypes:
        hypermutatedIds = get_gene_and_cohort_list_utils.get_ids_by_hypermutant_status(
            hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'], cancerType=cancerType, hypermutantStatus = 'Hypermutated')
        nonHypermutatedIds = get_gene_and_cohort_list_utils.get_ids_by_hypermutant_status(
            hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'], cancerType=cancerType, hypermutantStatus = 'Normal')

        # curves are emitted normal first, then hypermutated; oncogenes before tsgs within each
        burdenGroups = (('non-Hypermutated', nonHypermutatedIds), ('Hypermutated', hypermutatedIds))
        for mutBurdenType, caseIds in burdenGroups:
            burdenMaf = oncogenicMaf[oncogenicMaf['Tumor_Sample_Barcode'].isin(caseIds)]
            for geneType, genes in geneClasses:
                geneTypeMaf = burdenMaf[burdenMaf['Hugo_Symbol'].isin(genes)]
                curves.append(build_curve(geneTypeMaf, rank_genes_by_count(geneTypeMaf),
                                          geneType, cancerType, mutBurdenType))

    return pd.concat(curves)
    
    

In [8]:
# Load the IMPACT MAF and attach each case's cancer type (None when the case is not in the lookup).
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(
    impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO'])
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x:
    cancerTypeDict[x] if x in cancerTypeDict else None)

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Cancer types for which prevalence curves are built.
cancerTypesToAnalyze = ['Colorectal Cancer', 'Endometrial Cancer',
    'Bladder Cancer', 'Glioma', 'Prostate Cancer', 'Esophagogastric Cancer']
# NOTE: `df` is reused by several cells in this notebook for different tables.
df = create_prevalence_curves(allImpactMutsMaf, cancerTypesToAnalyze)

  df = pd.read_table(path)


In [10]:
# Save the prevalence curves as a TSV for local plotting.
# NOTE(review): absolute, machine-specific output path.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/plotGeneFractions.tsv', index=False, sep='\t')

## Supplementary figure: percent drivers related/unrelated depending on relatedness threshold

In [13]:
def enumerate_related_genes(maf, thresh):
    """Return the genes carrying an oncogenic mutation in more than `thresh` of the cases in `maf`.

    Parameters
    ----------
    maf : pandas.DataFrame
        Mutation table with the columns 'Tumor_Sample_Barcode', 'Hugo_Symbol'
        and 'oncogenic'. The table is not modified.
    thresh : float
        Minimum fraction of cases (exclusive) that must carry an oncogenic
        mutation in a gene for the gene to be reported.

    Returns
    -------
    list of str
        Gene symbols; the denominator is the number of distinct cases in
        `maf`, including cases without any oncogenic mutation.
    """
    nCases = len(set(maf['Tumor_Sample_Barcode']))
    oncMaf = maf[maf['oncogenic'].notnull()]
    # De-duplicate on the (case, gene) pair directly. The previous version
    # added a concatenated helper column to the filtered slice, which
    # triggered pandas' SettingWithCopyWarning.
    oncCaseGenePairs = oncMaf.drop_duplicates(subset=['Tumor_Sample_Barcode', 'Hugo_Symbol'])
    geneCounter = Counter(oncCaseGenePairs['Hugo_Symbol'])

    return [gene for gene, count in geneCounter.items() if (1.0*count)/nCases > thresh]
    
def summarize_related_gene_fracs(maf, cancerTypes, thresholds=(0.1, 0.05, .02, .01)):
    """Fraction of oncogenic mutations in related genes, per cancer type, threshold and cohort.

    Related genes are derived from the normal cases of each cancer type with
    enumerate_related_genes at every threshold; the fraction of oncogenic
    mutations that fall in those genes is then reported separately for the
    normal and the hypermutated cases.

    Parameters
    ----------
    maf : pandas.DataFrame
        Mutation table with the columns 'cancerType', 'hypermutationStatus',
        'oncogenic', 'Hugo_Symbol' and 'Tumor_Sample_Barcode'.
    cancerTypes : iterable of str
        Cancer types to summarize.
    thresholds : iterable of float
        Relatedness thresholds to evaluate (defaults to the values that were
        previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Two rows (normal, hypermutated) per cancer type and threshold.
        'fracRelated' is None when the cohort has no oncogenic mutations.
    """
    def frac_related(oncMafSubset, relatedGenes):
        nOnc = oncMafSubset.shape[0]
        if nOnc == 0:
            # avoid ZeroDivisionError for a cohort without oncogenic mutations
            return None
        nRelated = oncMafSubset[oncMafSubset['Hugo_Symbol'].isin(relatedGenes)].shape[0]
        return nRelated/(1.0*nOnc)

    listOfDicts = []
    for ct in cancerTypes:
        # the cohort split does not depend on the threshold, so do it once per cancer type
        ctMaf = maf[maf['cancerType'] == ct]
        normalMaf = ctMaf[ctMaf['hypermutationStatus'] == 'normal']
        hypermutantMaf = ctMaf[ctMaf['hypermutationStatus'] == 'hypermutated']
        normalMafOnc = normalMaf[normalMaf['oncogenic'].notnull()]
        hypermutantMafOnc = hypermutantMaf[hypermutantMaf['oncogenic'].notnull()]

        for thresh in thresholds:
            relatedGenes = enumerate_related_genes(normalMaf, thresh)

            listOfDicts.append({'cancerType': ct, 'hypermutantStats': 'normal',
                                'cohort': str(thresh) + '_' +  ct + '_normal', 'cohortThresh': str(thresh) + '_normal',
                                'fracRelated': frac_related(normalMafOnc, relatedGenes), 'thresh': thresh})
            listOfDicts.append({'cancerType': ct, 'hypermutantStats': 'hypermutated',
                                'cohort': str(thresh) + '_' + ct + '_hypermutated', 'cohortThresh': str(thresh) + '_hypermutated',
                                'fracRelated': frac_related(hypermutantMafOnc, relatedGenes), 'thresh': thresh})

    df = pd.DataFrame(listOfDicts)
    return df
    
#returns a dictionary of related/unrelated genes
def get_related_genes_by_cancer_type(maf, thresh = 1.0/30.0):
    """Map every cancer type present in `maf` to its list of related genes.

    For each distinct value of maf['cancerType'] the related genes are
    computed by enumerate_related_genes from that cancer type's rows whose
    'hypermutationStatus' is 'normal', using `thresh` as the cutoff.
    """
    isNormal = maf['hypermutationStatus'] == 'normal'
    relatedByCancerType = {}
    for cancerType in set(maf['cancerType']):
        normalCancerTypeMaf = maf[(maf['cancerType'] == cancerType) & isNormal]
        relatedByCancerType[cancerType] = enumerate_related_genes(normalCancerTypeMaf, thresh)
    return relatedByCancerType


In [9]:
# Annotate the already-loaded MAF with cancer type and hypermutation status.
# The read below is commented out, so allImpactMutsMaf must come from an earlier cell.
#allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = dict(get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO']))
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# Cases in neither id list are labelled 'Intermediate'.
allImpactMutsMaf['hypermutationStatus'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x:
    'hypermutated' if x in hypermutantIds else 'normal' if x in normalIds else 'Intermediate')



  df = pd.read_table(path)


In [115]:
# Fraction of oncogenic mutations in related genes for six cancer types.
# NOTE: rebinds `df`, which other cells use for different tables.
df = summarize_related_gene_fracs(allImpactMutsMaf,
        ['Endometrial Cancer', 'Colorectal Cancer', 'Glioma', 'Prostate Cancer',
         'Esophagogastric Cancer', 'Bladder Cancer'])
#allImpactMutsMaf.columns.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [116]:
# Save the related-gene fractions as a TSV for local plotting.
# NOTE(review): absolute, machine-specific output path.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/relatedGeneFracs.tsv', index=False, sep='\t')

In [44]:
# Collapse the cancer type to two levels: 'Glioma' is kept, everything else becomes 'Other'.
# NOTE: modifies whichever table `df` currently refers to, in place.
df['cancerTypeAdj'] = df['cancerType'].apply(lambda x: x if x in
        ['Glioma'] else 'Other')

In [45]:
# NOTE(review): this writes to the same relatedUnrelated.tsv path as the Figure 2B cell. Run in file
# order, `df` here is the related-gene-fraction table — confirm which table this file should hold.
df.to_csv('~/Desktop/WORK/dataForLocalPlotting/relatedUnrelated.tsv', index=False, sep='\t')

## Supplementary figure: percent drivers in TSGs by TMB

In [117]:
# Gene lists (tumor suppressors, oncogenes) and the per-case TMB lookup used by the TSG-fraction cell.
tsgs = get_gene_and_cohort_list_utils.get_tsgs()
oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
tmbDict = get_gene_and_cohort_list_utils.get_all_tmb_info(tmbFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'])

  df = pd.read_table(tmbFilePath)


In [127]:
# Case ids per signature cohort, selected by signature column name.
# NOTE(review): the variable names imply mean_11 = TMZ and mean_10 = POLE — confirm against the helper.
tmzIds = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_11')
mmrIds = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_MMR')
poleIds = get_gene_and_cohort_list_utils.get_impact_signature_cohort(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], 'mean_10')


In [128]:
# Per case: fraction of oncogenic (driver) mutations that fall in tumor suppressor genes,
# together with the case's TMB and signature cohort (TMZ / MMR / POLE / other).
oncMaf = allImpactMutsMaf[allImpactMutsMaf['oncogenic'].notnull()]
listOfDicts = []
cntr = 0

nOncCntr = Counter(oncMaf['Tumor_Sample_Barcode'])
tsgMaf = oncMaf[oncMaf['Hugo_Symbol'].isin(tsgs)]
nTsgCntr = Counter(tsgMaf['Tumor_Sample_Barcode'])
for case, nDriver in nOncCntr.items():
    if cntr % 500 == 0:
        sys.stdout.write('%d ' % cntr)  # progress indicator
    # a Counter yields 0 for cases without any TSG driver
    nTsgDriver = 1.0*nTsgCntr[case]

    # the first matching cohort wins: TMZ, then MMR, then POLE
    signatureCohort = 'other'
    if case in tmzIds: signatureCohort = 'TMZ'
    elif case in mmrIds: signatureCohort = 'MMR'
    elif case in poleIds: signatureCohort = 'POLE'
    # .get leaves tmb as None for cases missing from tmbDict instead of raising KeyError,
    # matching the guarded lookup in summarize_related_unrelated_driver_frac
    listOfDicts.append({'Tumor_Sample_Barcode': case, 'fracTsg': nTsgDriver/nDriver, 'tmb': tmbDict.get(case),
                       'nDriver': nDriver, 'nTsgDriver': nTsgDriver, 'signatureCohort': signatureCohort})
    cntr += 1
df = pd.DataFrame(listOfDicts)

0 500 1000 1500 2000 2500 3000 3500 4000 4500 5000 5500 6000 6500 7000 7500 8000 8500 9000 9500 10000 10500 11000 11500 12000 12500 13000 13500 14000 14500 15000 15500 16000 16500 17000 17500 18000 18500 19000 19500 20000 20500 21000 21500 22000 22500 23000 23500 24000 24500 25000 25500 26000 26500 27000 27500 28000 28500 29000 29500 30000 30500 31000 31500 32000 32500 33000 33500 34000 34500 35000 35500


In [129]:
# Save the per-case TSG driver fractions as a TSV for local plotting.
# NOTE(review): absolute, machine-specific output path.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/tsgGeneFracs.tsv', index=False, sep='\t')

### Understanding possible driver mutations

In [3]:
# Expected-mutation table (the same load also appears in an earlier cell of this notebook).
expectedDf = pd.read_table(filePathDict['EXPECTED_MUTATION_INFO_BY_GENE'])

  """Entry point for launching an IPython kernel.


In [18]:
# Total 'oncogenicChance' summed over all rows of expectedDf (every case), shown separately
# for tumor suppressor genes and for oncogenes.
tsgs = get_gene_and_cohort_list_utils.get_tsgs()
oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
print sum(expectedDf[expectedDf['gene'].isin(tsgs)]['oncogenicChance'])
print sum(expectedDf[expectedDf['gene'].isin(oncogenes)]['oncogenicChance'])


79.89552231
11.7206735652


In [21]:
# For each signature cohort, print the share of the expected truncating chance that falls in
# tumor suppressor genes, relative to TSGs plus oncogenes.
signaturesOfInterest = ['mean_MMR', 'mean_10', 'mean_11', 'mean_SMOKING', 'mean_7', 'mean_APOBEC']
for sig in signaturesOfInterest:
    sigCases = get_gene_and_cohort_list_utils.get_impact_signature_cohort(
        filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'], sig)
    expectedDfSig = expectedDf[expectedDf['case'].isin(sigCases)]
    
    # earlier variant of the same computation based on 'oncogenicChance', kept for reference
    #tsgOncogeneicSum = 1.0*sum(expectedDfSig[expectedDfSig['gene'].isin(tsgs)]['oncogenicChance'])
    #oncogeneOncogenicSum = 1.0*sum(expectedDfSig[expectedDfSig['gene'].isin(oncogenes)]['oncogenicChance'])
    #tsgFrac = tsgOncogeneicSum/(tsgOncogeneicSum + oncogeneOncogenicSum)
    
    # NOTE: despite the "Oncogenic" in their names, both sums are over 'truncatingChance'.
    # The division raises ZeroDivisionError when a cohort has no rows in expectedDf.
    tsgOncogeneicSum = 1.0*sum(expectedDfSig[expectedDfSig['gene'].isin(tsgs)]['truncatingChance'])
    oncogeneOncogenicSum = 1.0*sum(expectedDfSig[expectedDfSig['gene'].isin(oncogenes)]['truncatingChance'])
    tsgFrac = tsgOncogeneicSum/(tsgOncogeneicSum + oncogeneOncogenicSum)
    
    print sig, tsgFrac
    
    
    

mean_MMR 0.594180571963
mean_10 0.613140760188
mean_11 0.559559648132
mean_SMOKING 0.588946915015
mean_7 0.568592567065
mean_APOBEC 0.626794478839
