In [2]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re
import scipy.stats as stats

from collections import Counter

pathPrefix = '/Users/friedman/Desktop/mnt'

sys.path.append(pathPrefix + '/juno/work/taylorlab/friedman/myUtils')
import analysis_utils 
import get_gene_and_cohort_list_utils

In [None]:
def get_n_consequential_mut_count(cases=None, includeNmut=False):
    """Count, per tumor sample, several classes of mutations in a fixed gene list.

    Reads the annotated MAF under ``pathPrefix``, keeps only rows whose
    ``Hugo_Symbol`` is in the hard-coded gene list below (presumably the
    341-gene IMPACT panel, judging by the variable names -- TODO confirm), and
    tallies rows per ``Tumor_Sample_Barcode`` for:

    * all mutations,
    * hotspot mutations (``is-a-hotspot == 'Y'``),
    * oncogenic mutations (``oncogenic`` is not null),
    * stop-gain mutations (``Consequence == 'stop_gained'``).

    Parameters
    ----------
    cases : iterable of sample ids, optional
        Samples that must be present in every returned dict; samples with no
        matching rows get an explicit 0.  When omitted, the ids are taken from
        the notebook-level ``expectedDf['case']`` (the original behaviour), so
        that cell must have been run first.
    includeNmut : bool, optional
        When True the total-mutation dict is returned as an extra first
        element.  The default (False) keeps the original 3-tuple.

    Returns
    -------
    (hotspotDict, oncogenicDict, stopGainDict), or
    (nmutDict, hotspotDict, oncogenicDict, stopGainDict) when ``includeNmut``
    is True.  Each dict maps Tumor_Sample_Barcode -> count.
    """
    mafPath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/data_mutations_extended_annotated_sigContext_nov19_2019.maf'
    allImpactMuts = pd.read_table(mafPath)
    im3Genes = set(['ABL1', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BAP1', 'BARD1', 'BBC3', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CARD11', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79B', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSF1R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EP300', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERG', 'ESR1', 'ETV1', 'ETV6', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXP1', 'FUBP1', 'GATA1', 'GATA2', 'GATA3', 'GNA11', 'GNAQ', 'GNAS', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3B', 'HNF1A', 'HRAS', 'ICOSLG', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INPP4A', 'INPP4B', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAPK1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MITF', 'MLH1', 'MLL', 'MLL2', 'MLL3', 'MPL', 'MRE11A', 'MSH2', 'MSH6', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOR1', 'NF1', 'NF2', 'NFE2L2', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTRK1', 'NTRK2', 'NTRK3', 'PAK1', 'PAK7', 'PALB2', 'PARK2', 
'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLE', 'PPP2R1A', 'PRDM1', 'PRKAR1A', 'PTCH1', 'PTEN', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAC1', 'RAD50', 'RAD51', 'RAD51B', 'RAD51C', 'RAD51D', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RUNX1', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SETD2', 'SF3B1', 'SH2D1A', 'SHQ1', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SOCS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SRC', 'STAG2', 'STK11', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TBX3', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP63', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'VHL', 'VTCN1', 'WT1', 'XIAP', 'XPO1', 'YAP1', 'YES1'])
    allImpactMuts341 = allImpactMuts[allImpactMuts['Hugo_Symbol'].isin(im3Genes)]

    def _count_by_sample(muts):
        # One entry per sample that has at least one row in `muts`.
        return dict(muts['Tumor_Sample_Barcode'].value_counts())

    nmutDict = _count_by_sample(allImpactMuts341)
    hotspotDict = _count_by_sample(allImpactMuts341[allImpactMuts341['is-a-hotspot'] == 'Y'])
    oncogenicDict = _count_by_sample(allImpactMuts341[allImpactMuts341['oncogenic'].notnull()])
    stopGainDict = _count_by_sample(allImpactMuts341[allImpactMuts341['Consequence'] == 'stop_gained'])

    if cases is None:
        # Backward-compatible fallback: the original implementation always read
        # the sample ids from the notebook-level expectedDf.
        cases = expectedDf['case']

    # value_counts() omits samples with zero matching rows, so add explicit
    # zeros for every requested case to keep downstream lookups from failing.
    for case in set(cases):
        for countDict in (nmutDict, hotspotDict, oncogenicDict, stopGainDict):
            countDict.setdefault(case, 0)

    if includeNmut:
        return nmutDict, hotspotDict, oncogenicDict, stopGainDict
    return hotspotDict, oncogenicDict, stopGainDict

In [4]:
def summarize_dominant_signatures_of_cases(mutClassificationDir = '/Users/friedman/Desktop/hypermutationStatusIds/'):
    """Map every hypermutated sample to its dominant signature.

    Each regular ``*.tsv`` file directly inside ``mutClassificationDir`` is
    read as a tab-separated table that must provide the columns
    ``Tumor_Sample_Barcode``, ``hypermutantClassification`` and
    ``dominantSignature``.  Rows whose ``hypermutantClassification`` equals
    ``'Hypermutated'`` contribute one ``barcode -> dominantSignature`` entry.

    Anything that is not a regular ``.tsv`` file (for example ``.DS_Store`` or
    a sub-directory) is skipped instead of being handed to the parser.  Files
    are visited in sorted name order, so if a barcode occurs in several files
    the entry from the last file name wins deterministically.

    Parameters
    ----------
    mutClassificationDir : str
        Directory holding the per-cohort classification tables.

    Returns
    -------
    dict
        Tumor_Sample_Barcode -> dominantSignature for hypermutated samples.
    """
    d = {}
    for f in sorted(os.listdir(mutClassificationDir)):
        filePath = os.path.join(mutClassificationDir, f)
        # Only real .tsv tables are parsed; stray files would otherwise crash
        # read_table or the column lookup below.
        if not f.endswith('.tsv') or not os.path.isfile(filePath):
            continue
        df = pd.read_table(filePath)
        hypermutatedDf = df[df['hypermutantClassification'] == 'Hypermutated']
        d.update(zip(hypermutatedDf['Tumor_Sample_Barcode'], hypermutatedDf['dominantSignature']))
    return d

In [None]:
# TODO: run this code on all impact cases.
# Table of expected per-gene mutation information; later cells read its
# 'case', 'hotspotChance' and 'oncogenicChance' columns.
expectedMutInfoPath = pathPrefix + '/juno/work/taylorlab/friedman/hypermutationAnalysisProj/mutSimulation/expectedMutationTables/allHypermutatorsExpectedGeneMutInfo.tsv'
expectedDf = pd.read_table(expectedMutInfoPath)


In [None]:
# Build one row per case holding the observed hotspot / oncogenic / stop-gain
# counts next to the expected hotspot / oncogenic counts.  The expectation is
# the case's mutation count multiplied by the summed per-row chance columns of
# expectedDf for that case.
#
# NOTE(review): nmutDict, hotspotDict, oncogenicDict and stopGainDict are not
# assigned by any visible cell; dicts with these names are built inside
# get_n_consequential_mut_count, so they have to be bound at notebook level
# before this cell can run on a fresh kernel.
listOfDicts = []
cntr = 0
for case in set(expectedDf['case']):

    # Progress indicator; print(...) with a single argument behaves the same
    # on Python 2 and Python 3.
    if cntr %100 == 0: print(cntr)
    cntr +=1

    # A case without any counted mutation has no nmutDict entry; treat it as 0
    # instead of raising KeyError.
    nmutCase = nmutDict.get(case, 0)
    caseExpectation = expectedDf[expectedDf['case'] == case]
    hotspotExpectation = nmutCase *sum(caseExpectation['hotspotChance'])
    oncogenicExpectation = nmutCase *sum(caseExpectation['oncogenicChance'])

    observedHotspots = hotspotDict[case]
    observedOncogenic = oncogenicDict[case]
    observedStopGain = stopGainDict[case]

    listOfDicts.append({'obsHotspot':observedHotspots, 'obsOncogenic': observedOncogenic, 'obsStopGain': observedStopGain,
                        'expectedHotspot': hotspotExpectation, 'expectedOncogenic': oncogenicExpectation,
                       'nmut': nmutCase, 'Tumor_Sample_Barcode': case})

df = pd.DataFrame(listOfDicts)
    

In [None]:
# Cancer-type lookup; a later cell indexes it by Tumor_Sample_Barcode.
cancerTypeInfoPath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/cancerTypeInfo_asOfNov192019.txt'
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(
    impactCancerTypeInfoPath=cancerTypeInfoPath)


In [None]:
# Dominant-signature lookup; a later cell indexes it by Tumor_Sample_Barcode.
impactSignatureCallsPath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/impactSignatureCalls_Nov20_2019.tsv'
domSigDict = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(
    impactSigsPath=impactSignatureCallsPath)

In [None]:
# Annotate every sample with its dominant signature and cancer type; samples
# missing from a lookup get None.
sampleIds = df['Tumor_Sample_Barcode']
df['dominantSignature'] = sampleIds.map(lambda tsb: domSigDict.get(tsb))
df['cancerType'] = sampleIds.map(lambda tsb: cancerTypeDict.get(tsb))


In [None]:
# Collapse the raw signature labels into a handful of named groups; every
# label without a group (including missing values) becomes 'Other'.
#
# The rename table gets its own name so that it does not overwrite
# domSigDict, the sample -> signature lookup used by an earlier cell.
signatureGroupNames = {'mean_1': 'MMR', 'mean_10': 'POLE', 'mean_11': 'TMZ',
                       'mean_14': 'POLE', 'mean_7': 'UV', 'mean_MMR': 'MMR',
                       'mean_SMOKING': 'SMOKING', 'mean_APOBEC': 'APOBEC'}
# Labels that are already group names are kept as they are, so running this
# cell a second time does not turn every grouped label into 'Other'.
alreadyGroupedLabels = set(signatureGroupNames.values())
df['dominantSignature'] = df['dominantSignature'].apply(
    lambda x: signatureGroupNames[x] if x in signatureGroupNames
    else (x if x in alreadyGroupedLabels else 'Other'))

In [None]:
# Keep the cancer types of interest as they are and lump every other value
# (including missing ones) into 'Other'.
cancerTypesToFocusOn = {
    'Non-Small Cell Lung Cancer',
    'Colorectal Cancer',
    'Prostate Cancer',
    'Glioma',
    'Endometrial Cancer',
    'Esophagogastric Cancer',
}
isFocusCancerType = df['cancerType'].isin(cancerTypesToFocusOn)
df['cancerTypeAdj'] = df['cancerType'].where(isFocusCancerType, 'Other')

In [None]:
# Write the table as a tab-separated file without the index column.
observedVsExpectedPath = '/Users/friedman/Desktop/WORK/dataForLocalPlotting/observedVsExpected.tsv'
df.to_csv(observedVsExpectedPath, sep='\t', index=False)


In [5]:
####################
# Reload the observed-vs-expected table from disk; the cells below work on
# this reloaded copy.
####################
observedVsExpectedPath = '/Users/friedman/Desktop/WORK/dataForLocalPlotting/observedVsExpected.tsv'
df = pd.read_table(observedVsExpectedPath)

In [6]:
# Sample -> dominant signature for the hypermutated cases found in the
# classification directory.
hypermutationStatusDir = '/Users/friedman/Desktop/hypermutationStatusIds/'
dominantSignatureDict = summarize_dominant_signatures_of_cases(
    mutClassificationDir=hypermutationStatusDir)

In [9]:
# Replace the 'dominantSignature' column with the labels from
# dominantSignatureDict (None for samples not in it), then write the table
# back to the same file it was loaded from.
df['dominantSignature'] = df['Tumor_Sample_Barcode'].map(
    lambda tsb: dominantSignatureDict.get(tsb))

observedVsExpectedPath = '/Users/friedman/Desktop/WORK/dataForLocalPlotting/observedVsExpected.tsv'
df.to_csv(observedVsExpectedPath, sep='\t', index=False)


In [10]:
# Tally how many samples carry each dominant-signature label (None included).
Counter(df['dominantSignature'].tolist())

Counter({None: 669,
         'mean_1': 125,
         'mean_10': 58,
         'mean_11': 52,
         'mean_12': 5,
         'mean_14': 13,
         'mean_16': 2,
         'mean_18': 6,
         'mean_22': 2,
         'mean_23': 1,
         'mean_3': 3,
         'mean_30': 1,
         'mean_5': 2,
         'mean_7': 41,
         'mean_8': 3,
         'mean_9': 3,
         'mean_APOBEC': 46,
         'mean_MMR': 498,
         'mean_SMOKING': 28})