In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
from collections import Counter
pathPrefix = '/Users/friedman/Desktop/mnt'

sys.path.append(pathPrefix + '/ifs/work/taylorlab/friedman/myUtils')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import clonality_analysis_util

from numpy import array, linspace
from sklearn.neighbors.kde import KernelDensity
from scipy.signal import argrelextrema

In [None]:
allImpactMuts = analysis_utils.load_in_df_with_progress(filePath = pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/all_impact_mutations_annotated_cohort.maf', nLinesFile = 275000)

In [3]:
endometrialDf = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/subsettedMafs/Endometrial_HypermutantCaseMuts_MAF_ANNO.maf')
colorectalDf = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/subsettedMafs/Colorectal_HypermutantCaseMuts_MAF_ANNO.maf')
gliomaDf = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/subsettedMafs/Glioma_HypermutantCaseMuts_MAF_ANNO.maf')


  interactivity=interactivity, compiler=compiler, result=result)


**Subclone analysis**
We manually identify cases with obvious subclones by going through cbio portal and marking cases with obvious bimodal distributions
We then do a clustering of the VAFs to identify the clonal and subclonal mutations

In [4]:
#NOTE THIS METHOD ONLY PROVIDES REASONABLE RESULTS ON CASES ALREADY PRECURATED TO HAVE A SUBCLONE

from sklearn.cluster import MeanShift
def assign_variants_to_clonal_cluster(vafs, ids):
    
    #mark which clusters returned by the clustering are clonal by iterating over clusters by mean vaf 
    #and returning the clusters once we have at least minClonalMuts mutations accumulated
    def assign_clonal_subclonal_clusters(clusterDf, minClonalMuts = 10):
        l = []
        for cluster in set(df['cluster']):
            clusterDf = df[df['cluster'] == cluster]
            l.append((np.nanmean(clusterDf['vaf']), clusterDf.shape[0], cluster))
        runningMutSum = 0
        clonalClusters = []
        for meanVaf, nMut, cluster in sorted(l, reverse=True):
            clonalClusters.append(cluster)
            runningMutSum += nMut
            if runningMutSum >= minClonalMuts:
                return clonalClusters
    
    a = np.array(vafs).reshape(-1, 1)
    clustering = MeanShift().fit(a)
    prediction = clustering.predict(a)
    
    #We make a dataframe 
    listOfDicts = []
    la = list(a)
    lp = list(prediction)
    for i in range(0, len(list(a))):
        listOfDicts.append({
            'vaf': la[i], 'cluster': lp[i], 'varUuid': ids[i]
        })
    df = pd.DataFrame(listOfDicts)
    
    minCMut = max(.1*df.shape[0], 10) #at least 10% of mutation in every case are called clonal
    clonalClusters = assign_clonal_subclonal_clusters(df, minClonalMuts = minCMut)
    df['clonal'] = df['cluster'].apply(lambda x: True if x in clonalClusters else False)
    return dict(zip(df['varUuid'], df['clonal']))

In [5]:
endometrialDf['varUuid'] = endometrialDf['Hugo_Symbol'] + '_' + endometrialDf['HGVSp_Short']
endometrialDf['varCaseUuid'] = endometrialDf['Tumor_Sample_Barcode'] + '_' + endometrialDf['varUuid']
endometrialDf = endometrialDf[endometrialDf['varUuid'].notnull()]
endometrialDf['isBalanced'] = endometrialDf.apply(lambda row: True if row['tcn'] - row['lcn'] == row['lcn'] else False, axis=1)
endometrialDfBalanced = endometrialDf[endometrialDf['isBalanced'] == True]
#TODO filter out MVISH mutations as well

In [6]:
colorectalDf['varUuid'] = colorectalDf['Hugo_Symbol'] + '_' + colorectalDf['HGVSp_Short']
colorectalDf['varCaseUuid'] = colorectalDf['Tumor_Sample_Barcode'] + '_' + colorectalDf['varUuid']
colorectalDf = colorectalDf[colorectalDf['varUuid'].notnull()]
colorectalDf['isBalanced'] = colorectalDf.apply(lambda row: True if row['tcn'] - row['lcn'] == row['lcn'] else False, axis=1)
colorectalDfBalanced = colorectalDf[colorectalDf['isBalanced'] == True]

In [7]:
f = open(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/endometrialHypermutatedWithSubclones.txt')
lines = f.readlines()
endoSubcloneCases = set([])
for line in lines:
    endoSubcloneCases.add(line.strip('\n'))


In [None]:
#DO the same thing for colon

In [8]:
f = open(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/colorectalHypermutatedWithSubclones.txt')
lines = f.readlines()
coloSubcloneCases = set([])
for line in lines:
    coloSubcloneCases.add(line.strip('\n'))

In [9]:
def add_clonal_calls_to_maf(maf, cases):
    maf['isClonal'] = None
    cntr = 0
    for case in cases:
        cntr += 1
        if cntr%5 == 0: 
            print cntr
        caseMaf = maf[maf['Tumor_Sample_Barcode'] == case]
        if caseMaf.shape[0] > 0:
            caseIds = list(caseMaf['varUuid'])
            caseVafs = list(caseMaf['t_var_freq'])
            isClonalDict = assign_variants_to_clonal_cluster(caseVafs, caseIds)
            maf['isClonal'] = maf.apply(lambda row: 
                                        isClonalDict[row['varUuid']] if row['Tumor_Sample_Barcode'] == case and row['varUuid'] in isClonalDict 
                                        else row['isClonal'], axis=1)
    return maf


In [10]:
#TODO signatures of each clone
def assess_tumor_suppressor_characteristics_of_mutations(analyzeMaf, geneSizeInfo):
    tumorSuppressors = set(['ERRFI1', 'ASXL2', 'PMAIP1', 'ACTG1', 'SUFU', 'FBXO11', 'MEN1', 'FAM58A', 'B2M', 'RB1', 'DUSP22', 'SESN1', 'GPS2', 'RAD51D', 'SMG1', 'CDC73', 'MAP3K1', 'SMARCB1', 'INPP4B', 'PARK2', 'SMAD4', 'CBFB', 'CDH1', 'PPP6C', 'SETDB1', 'SETDB2', 'NF2', 'CDKN2B', 'CDKN2C', 'CDKN2A', 'DDX3X', 'PIK3R1', 'BARD1', 'PDS5B', 'KLF4', 'SPRED1', 'VHL', 'SMAD2', 'PMS1', 'PMS2', 'SETD2', 'GATA3', 'TBL1XR1', 'MUTYH', 'SOCS1', 'FAM175A', 'ROBO1', 'ARID1B', 'ARID1A', 'TCF7L2', 'STK11', 'FOXA1', 'PTEN', 'FAT1', 'FAS', 'CYLD', 'MAX', 'SH2D1A', 'APC', 'NTHL1', 'CTCF', 'KDM5C', 'KMT2C', 'ZFHX3', 'FOXP1', 'PIGA', 'CDKN1B', 'CDKN1A', 'FUBP1', 'MSH2', 'ID3', 'TNFRSF14', 'TRAF3', 'EP400', 'BRIP1', 'ARID4A', 'ARID4B', 'XRCC2', 'DAXX', 'SDHAF2', 'ASXL1', 'AMER1', 'RASA1', 'EGR1', 'MST1', 'SOX17', 'RUNX1', 'PIK3R3', 'NCOR1', 'NF1', 'JAK1', 'PTPRD', 'CHEK2', 'CHEK1', 'SMC1A', 'TMEM127', 'STAG1', 'RAD51', 'TCF3', 'STAG2', 'ARID2', 'RAD50', 'RNF43', 'PARP1', 'BLM', 'CUX1', 'RECQL', 'RAD21', 'PTPN2', 'PTPN1', 'SLX4', 'INHA', 'PAX5', 'IRF1', 'TP53', 'HLA-A', 'IRF8', 'CBL', 'TOP1', 'SHQ1', 'PRDM1', 'NSD1', 'ATXN2', 'CREBBP', 'HDAC4', 'SESN2', 'PPP2R1A', 'EPHA7', 'ATM', 'EPHA3', 'POT1', 'SMAD3', 'MOB3B', 'TBX3', 'POLE', 'ATR', 'FANCD2', 'FH', 'BCORL1', 'SOX9', 'IKZF3', 'TSC1', 'TP63', 'MRE11A', 'SDHC', 'BTG1', 'POLD1', 'CIITA', 'SMC3', 'SAMHD1', 'RTEL1', 'ECT2L', 'PIK3R2', 'CRBN', 'FANCC', 'NBN', 'FANCA', 'HLA-B', 'RECQL4', 'DUSP4', 'ERCC2', 'FBXW7', 'TGFBR2', 'TGFBR1', 'MSH3', 'RBM15', 'TET1', 'TET3', 'SESN3', 'MGA', 'LTB', 'FOXL2', 'SH2B3', 'BCOR', 'HIST1H1D', 'ATRX', 'EP300', 'RAD51C', 'RAD51B', 'HIST1H1B', 'TNFAIP3', 'DICER1', 'ARID5B', 'LATS2', 'FOXO1', 'KEAP1', 'EZH2', 'SP140', 'NKX3-1', 'PBRM1', 'PALB2', 'CIC', 'BRCA1', 'DTX1', 'FLCN', 'SPEN', 'CD58', 'ERCC3', 'ERCC4', 'MSH6', 'BCL11B', 'BMPR1A', 'ERF', 'BRCA2', 'NOTCH2', 'EED', 'MITF', 'ELF3', 'SMARCA4', 'BBC3', 'ANKRD11', 'CEBPA', 'BCL2L11', 'AXIN2', 'AXIN1', 'CDK12', 'ESCO2', 'MLH1', 'SDHB', 'MED12', 'HNF1A', 'RYBP', 'ATP6V1B2', 'DNMT3B', 'KMT2B', 'KMT2A', 'DNMT3A', 'NFKBIA', 'TRAF5', 'KMT2D', 'SPOP', 'RBM10', 'P2RY8', 'TP53BP1', 'TSC2', 'KDM6A', 'EPCAM', 'PHOX2B', 'NPM1', 'BCL10', 'LATS1', 'HOXB13', 'ARID3A', 'PTPRT', 'PTPRS', 'INPPL1', 'NOTCH4', 'TET2', 'NOTCH1', 'CASP8', 'NOTCH3', 'GRIN2A', 'MAP2K4', 'WT1', 'BACH2', 'SDHA', 'BAP1', 'PTCH1', 'SDHD'])
    
    #MAKE THE TUMOR suppressors specific to the panel in question
    im3TumorSuppressors = tumorSuppressors & set(geneSizeInfo.keys()) & set(['ABL1', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BAP1', 'BARD1', 'BBC3', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CARD11', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79B', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSF1R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EP300', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERG', 'ESR1', 'ETV1', 'ETV6', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXP1', 'FUBP1', 'GATA1', 'GATA2', 'GATA3', 'GNA11', 'GNAQ', 'GNAS', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3B', 'HNF1A', 'HRAS', 'ICOSLG', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INPP4A', 'INPP4B', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAPK1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MITF', 'MLH1', 'MLL', 'MLL2', 'MLL3', 'MPL', 'MRE11A', 'MSH2', 'MSH6', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOR1', 'NF1', 'NF2', 'NFE2L2', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTRK1', 'NTRK2', 'NTRK3', 'PAK1', 'PAK7', 'PALB2', 'PARK2', 'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLE', 'PPP2R1A', 'PRDM1', 'PRKAR1A', 'PTCH1', 'PTEN', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAC1', 'RAD50', 'RAD51', 'RAD51B', 'RAD51C', 'RAD51D', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RUNX1', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SETD2', 'SF3B1', 'SH2D1A', 'SHQ1', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SOCS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SRC', 'STAG2', 'STK11', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TBX3', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP63', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'VHL', 'VTCN1', 'WT1', 'XIAP', 'XPO1', 'YAP1', 'YES1'])
    im5TumorSuppressors = tumorSuppressors & set(geneSizeInfo.keys()) & set(['ABL1', 'ACVR1', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'ANKRD11', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BAP1', 'BARD1', 'BBC3', 'BCL10', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BIRC3', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CALR', 'CARD11', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79A', 'CD79B', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CEBPA', 'CENPA', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSF1R', 'CSF3R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'CXCR4', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNAJB1', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EIF4A2', 'EIF4E', 'EP300', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHA7', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERG', 'ERRFI1', 'ESR1', 'ETV1', 'ETV6', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXO1', 'FOXP1', 'FUBP1', 'FYN', 'GATA1', 'GATA2', 'GATA3', 'GLI1', 'GNA11', 'GNAQ', 'GNAS', 'GPS2', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3A', 'H3F3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3A', 'HIST1H3B', 'HIST1H3C', 'HIST1H3D', 'HIST1H3E', 'HIST1H3F', 'HIST1H3G', 'HIST1H3H', 'HIST1H3I', 'HIST1H3J', 'HIST2H3C', 'HIST2H3D', 'HIST3H3', 'HLA-A', 'HNF1A', 'HOXB13', 'HRAS', 'ICOSLG', 'ID3', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INHA', 'INHBA', 'INPP4A', 'INPP4B', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'MALT1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAP3K14', 'MAPK1', 'MAPK3', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MGA', 'MITF', 'MLH1', 'MLL', 'MLL2', 'MLL3', 'MPL', 'MRE11A', 'MSH2', 'MSH6', 'MST1', 'MST1R', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOA3', 'NCOR1', 'NEGR1', 'NF1', 'NF2', 'NFE2L2', 'NFKBIA', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTRK1', 'NTRK2', 'NTRK3', 'NUP93', 'PAK1', 'PAK7', 'PALB2', 'PARK2', 'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PGR', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLCG2', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLD1', 'POLE', 'PPM1D', 'PPP2R1A', 'PPP6C', 'PRDM1', 'PRKAR1A', 'PTCH1', 'PTEN', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAB35', 'RAC1', 'RAD21', 'RAD50', 'RAD51', 'RAD51B', 'RAD51C', 'RAD51D', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHEB', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RUNX1', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SETD2', 'SF3B1', 'SH2B3', 'SH2D1A', 'SHQ1', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SOCS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SRC', 'SRSF2', 'STAG2', 'STAT3', 'STAT5A', 'STAT5B', 'STK11', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TBX3', 'TCEB1', 'TCF3', 'TCF7L2', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP63', 'TRAF2', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'VEGFA', 'VHL', 'VTCN1', 'WT1', 'XIAP', 'XPO1', 'XRCC2', 'YAP1', 'YES1', 'ZFHX3', 'ZRSR2'])
    im6TumorSuppressors = tumorSuppressors & set(geneSizeInfo.keys()) & set(['ABL1', 'ACVR1', 'AGO2', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'AMER1', 'ANKRD11', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BABAM1', 'BAP1', 'BARD1', 'BBC3', 'BCL10', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BIRC3', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CALR', 'CARD11', 'CARM1', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79A', 'CD79B', 'CDC42', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CEBPA', 'CENPA', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSDE1', 'CSF1R', 'CSF3R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'CXCR4', 'CYLD', 'CYSLTR2', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNAJB1', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'DROSHA', 'DUSP4', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EIF4A2', 'EIF4E', 'ELF3', 'EP300', 'EPAS1', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHA7', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERF', 'ERG', 'ERRFI1', 'ESR1', 'ETV1', 'ETV6', 'EZH1', 'EZH2', 'FAM175A', 'FAM46C', 'FAM58A', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXO1', 'FOXP1', 'FUBP1', 'FYN', 'GATA1', 'GATA2', 'GATA3', 'GLI1', 'GNA11', 'GNAQ', 'GNAS', 'GPS2', 'GREM1', 'GRIN2A', 'GSK3B', 'GTF2I', 'H3F3A', 'H3F3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3A', 'HIST1H3B', 'HIST1H3C', 'HIST1H3D', 'HIST1H3E', 'HIST1H3F', 'HIST1H3G', 'HIST1H3H', 'HIST1H3I', 'HIST1H3J', 'HIST2H3C', 'HIST2H3D', 'HIST3H3', 'HLA-A', 'HLA-B', 'HNF1A', 'HOXB13', 'HRAS', 'ICOSLG', 'ID3', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INHA', 'INHBA', 'INPP4A', 'INPP4B', 'INPPL1', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KNSTRN', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'LYN', 'MALT1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAP3K14', 'MAPK1', 'MAPK3', 'MAPKAP1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MGA', 'MITF', 'MLH1', 'MLL', 'MLL2', 'MLL3', 'MLL4', 'MPL', 'MRE11A', 'MSH2', 'MSH3', 'MSH6', 'MSI1', 'MSI2', 'MST1', 'MST1R', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOA3', 'NCOR1', 'NEGR1', 'NF1', 'NF2', 'NFE2L2', 'NFKBIA', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTHL1', 'NTRK1', 'NTRK2', 'NTRK3', 'NUF2', 'NUP93', 'PAK1', 'PAK7', 'PALB2', 'PARK2', 'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDCD1LG2', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PGR', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLCG2', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLD1', 'POLE', 'PPARG', 'PPM1D', 'PPP2R1A', 'PPP4R2', 'PPP6C', 'PRDM1', 'PRDM14', 'PREX2', 'PRKAR1A', 'PRKCI', 'PRKD1', 'PTCH1', 'PTEN', 'PTP4A1', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAB35', 'RAC1', 'RAC2', 'RAD21', 'RAD50', 'RAD51', 'RAD51B', 'RAD51C', 'RAD51D', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHEB', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RRAGC', 'RRAS', 'RRAS2', 'RTEL1', 'RUNX1', 'RXRA', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SESN1', 'SESN2', 'SESN3', 'SETD2', 'SETD8', 'SF3B1', 'SH2B3', 'SH2D1A', 'SHOC2', 'SHQ1', 'SLX4', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SMYD3', 'SOCS1', 'SOS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SPRED1', 'SRC', 'SRSF2', 'STAG2', 'STAT3', 'STAT5A', 'STAT5B', 'STK11', 'STK19', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TAP1', 'TAP2', 'TBX3', 'TCEB1', 'TCF3', 'TCF7L2', 'TEK', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP53BP1', 'TP63', 'TRAF2', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'UPF1', 'VEGFA', 'VHL', 'VTCN1', 'WHSC1', 'WHSC1L1', 'WT1', 'WWTR1', 'XIAP', 'XPO1', 'XRCC2', 'YAP1', 'YES1', 'ZFHX3', 'ZRSR2']) 

    listOfDicts = []
    for case in set(analyzeMaf['Tumor_Sample_Barcode']):
        caseMaf = analyzeMaf[analyzeMaf['Tumor_Sample_Barcode'] == case]
        caseTumorSuppressors = None
        if 'IM3' in case: caseTumorSuppressors = im3TumorSuppressors
        elif 'IM5' in case: caseTumorSuppressors = im5TumorSuppressors
        elif 'IM6' in case: caseTumorSuppressors = im6TumorSuppressors
        
        #count n hotspots, n oncogenic
        allClonalMuts = caseMaf[caseMaf['isClonal'] == True]
        allSubclonalMuts = caseMaf[caseMaf['isClonal'] == False]
        clonalHotspotMuts = allClonalMuts[allClonalMuts['is-a-hotspot'] == 'Y']
        subclonalHotspotMuts = allSubclonalMuts[allSubclonalMuts['is-a-hotspot'] == 'Y']
        clonalOncMuts = allClonalMuts[(allClonalMuts['oncogenic'].notnull())]
        subclonalOncMuts = allSubclonalMuts[allSubclonalMuts['oncogenic'].notnull()]
        subclonalHotspotGenes = ';'.join(list(subclonalHotspotMuts['Hugo_Symbol']))
        
        #we subset a maf to be only oncogenuc tumor suppressors from here on out only after counting hotspots etc
        caseMafOnc = caseMaf[caseMaf['oncogenic'].notnull()]
        tsMaf = caseMafOnc[caseMafOnc['Hugo_Symbol'].isin(caseTumorSuppressors)]
        clonalOncMuts = tsMaf[tsMaf['isClonal'] == True]
        subclonalOncMuts = tsMaf[tsMaf['isClonal'] == False]
        
        #SECOND HITS to tumor suppressors
        isSingleOncMutatedDict = dict(clonalOncMuts['Hugo_Symbol'].value_counts() < 2)
        genesSingleMutatedClonal = set([i[0] for i in isSingleOncMutatedDict.items() if i[1] == True])
        genesNotSingleMutatedClonal = caseTumorSuppressors - genesSingleMutatedClonal
        genesSubclonallyMutated = set(subclonalOncMuts['Hugo_Symbol'])
        subclonalSecondHits = genesSingleMutatedClonal & genesSubclonallyMutated
        subclonalNotSecondHits = genesSubclonallyMutated - subclonalSecondHits
        
        #Double mutated de novo
        subclonalNmutDict = dict(subclonalOncMuts['Hugo_Symbol'].value_counts() > 1)
        subclonalDoubleMuts = [key for key, value in subclonalNmutDict.items() if value]
        
        #Susceptibility of bases to second hits
        nBasesSusceptibleToSecondHit = sum([geneSizeInfo[i] for i in genesSingleMutatedClonal])
        nBasesNotSusceptibleToSecondHit = sum([geneSizeInfo[i] for i in genesNotSingleMutatedClonal])
        nSecondHits = len(subclonalSecondHits)
        nNotSecondHits = len(subclonalNotSecondHits)
        
        secondHitRate = None
        notSecondHitRate = None
        if nBasesSusceptibleToSecondHit > 0:
            secondHitRate = 1.0*nSecondHits/nBasesSusceptibleToSecondHit
            notSecondHitRate = 1.0*nNotSecondHits/nBasesNotSusceptibleToSecondHit
        
        #WE append two lines to the dictionary:
        #1 for clonal data and one for subclonal data
        
        #CLONAL
        listOfDicts.append({'clonalStatus': 'clonal', 'Tumor_Sample_Barcode': case,
                            'nmut': allClonalMuts.shape[0], 
                            'nHotspots': clonalHotspotMuts.shape[0], 
                            'nDriverMuts': clonalOncMuts.shape[0]
                           })
                           
        #SUBCLONAL
        listOfDicts.append({'clonalStatus': 'subclonal', 'Tumor_Sample_Barcode': case,
                            'nmut': allSubclonalMuts.shape[0],
                            'nHotspots': subclonalHotspotMuts.shape[0],
                            'nDriverMuts': subclonalOncMuts.shape[0],
                            #LATE DRIVERS: hotspots and de-novo TS biallelic inactivation
                            'nDoubleMutations': len(subclonalDoubleMuts), 'doubleMutationNames': ';'.join(subclonalDoubleMuts),
                            'hotspotGenes': subclonalHotspotGenes,
                            #TUMOR SUPPRESSOR SECOND HIT INFO
                            'secondHitGenes': ';'.join(list(subclonalSecondHits)),
                            'secondHitRate': secondHitRate,
                            'notSecondHitRate': notSecondHitRate,
                            'nSecondHits': nSecondHits, 'nNotSecondHits': nNotSecondHits,
                            'nBasesSusceptibleToSecondHit': nBasesSusceptibleToSecondHit, 'nBasesNotSusceptibleToSecondHit': nBasesNotSusceptibleToSecondHit
                           })
        
    df = pd.DataFrame(listOfDicts)
    return df
        
        
    

In [11]:
mutSimulationSummary = pd.read_table('/Users/friedman/Desktop/workOffline/mutSimulationInfoIncludingHotspots.tsv')
geneSizeDict = dict(zip(mutSimulationSummary['Hugo_Symbol'], mutSimulationSummary['nPossibleMuts']))

In [12]:
analyzeMafEndo = add_clonal_calls_to_maf(endometrialDfBalanced, endoSubcloneCases)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


5
10
15
20
25
30
35
40
45
50
55
60
65


In [47]:
analyzeMafEndo = analyzeMafEndo[(analyzeMafEndo['isClonal'].notnull()) & #only take mutations for which we identified a clonal and subclonal population
                 (analyzeMafEndo['Consequence'] != 'splice_region_variant,intron_variant')] #ignore this type of mutation cause it is always filtered out

summaryDfEndo = assess_tumor_suppressor_characteristics_of_mutations(analyzeMafEndo, geneSizeDict)
summaryDfEndo['displayName'] = summaryDfEndo['Tumor_Sample_Barcode'] + '_' + summaryDfEndo['clonalStatus']


In [None]:
analyzeMafColo = add_clonal_calls_to_maf(colorectalDfBalanced, coloSubcloneCases)
analyzeMafColo = analyzeMafColo[#(analyzeMafColo['oncogenic'].notnull()) & #only take oncogenic mutations
                 (analyzeMafColo['isClonal'].notnull()) & #only take mutations for which we identified a clonal and subclonal population
                 (analyzeMafColo['Consequence'] != 'splice_region_variant,intron_variant')] #ignore this type of mutation cause it is always filtered out

summaryDfColo = assess_tumor_suppressor_characteristics_of_mutations(analyzeMafColo, geneSizeDict)


In [48]:
summaryDfEndo.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/subclonalSecondHitsEndo.tsv', index=False, sep='\t')

In [None]:
summaryDfColo.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/subclonalSecondHitsColo.tsv', index=False, sep='\t')


In [None]:
impactSigs = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/signatures_from_unfiltered_maf.txt')


In [19]:
print analyzeMafEndo[(analyzeMafEndo['Tumor_Sample_Barcode'] == 'P-0032589-T01-IM6') & (analyzeMafEndo['is-a-hotspot'] == 'Y')][['Hugo_Symbol', 'HGVSp_Short']]

      Hugo_Symbol HGVSp_Short
43544        PTEN     p.R173C
44235      NFE2L2      p.Q26H
44245       CASP8     p.R292Q
44329     SMARCB1     p.R377C
44387      PIK3CB     p.R321Q
44394      PIK3CA      p.R88Q
44395      PIK3CA     p.E365K
44397      PIK3CA    p.Y1021C
44473       FBXW7     p.R689W


In [15]:
print analyzeMafEndo['Tumor_Sample_Barcode']

1        P-0021220-T01-IM6
4        P-0021220-T01-IM6
6        P-0021220-T01-IM6
7        P-0021220-T01-IM6
10       P-0021220-T01-IM6
18       P-0021220-T01-IM6
27       P-0021220-T01-IM6
29       P-0021220-T01-IM6
31       P-0021220-T01-IM6
36       P-0021220-T01-IM6
37       P-0021220-T01-IM6
46       P-0021220-T01-IM6
55       P-0021220-T01-IM6
67       P-0021220-T01-IM6
70       P-0021220-T01-IM6
78       P-0021220-T01-IM6
83       P-0021220-T01-IM6
91       P-0021220-T01-IM6
97       P-0021220-T01-IM6
102      P-0021220-T01-IM6
103      P-0021220-T01-IM6
111      P-0021220-T01-IM6
119      P-0021220-T01-IM6
121      P-0021220-T01-IM6
122      P-0021220-T01-IM6
123      P-0021220-T01-IM6
125      P-0021220-T01-IM6
126      P-0021220-T01-IM6
130      P-0021220-T01-IM6
131      P-0021220-T01-IM6
               ...        
72959    P-0019093-T01-IM6
72964    P-0019093-T01-IM6
72966    P-0019093-T01-IM6
72984    P-0019093-T01-IM6
72990    P-0019093-T01-IM6
72992    P-0019093-T01-IM6
7