In [33]:
import pandas as pd
import os
import sys

sys.path.append('/Users/friedman/Desktop/mnt/juno/work/taylorlab/friedman/myUtils')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import mutation_modeling_util
import get_gene_and_cohort_list_utils
import numpy as np
import scipy.stats as stats
import math

pathPrefix = '/Users/friedman/Desktop/mnt'
from collections import Counter

In [65]:
def summarize_cna_stats_for_cancer_type(pcawgInfo, cancerType):
    timingData = pcawgData[pcawgData['histology_abbreviation'] == cancerType]
    infoDict = {}
    chromosomes = [str(i) for i in range(1,22)] + ['X', 'Y']
    for chromosome in chromosomes:
        infoDict[chromosome] = np.nanmean(timingData[timingData['chr'] == chromosome]['time'])
    return infoDict

In [86]:
def summarize_differences_in_cnas_between_hyper_and_not_hyper(cnasHyper, cnasNormal, summaryDict):
    listOfDicts = []
    chromosomes = [str(i) for i in range(1,22)] + ['X', 'Y']
    lossStates = set(['CNLOH', 'CNLOH & GAIN', 'HETLOSS'])
    nNormalCases = len(set(cnasNormal['Tumor_Sample_Barcode']))
    nHyperCases = len(set(cnasHyper['Tumor_Sample_Barcode']))
    
    for chromosome in chromosomes:
        pcawgTiming = summaryDict[chromosome]
        normalAllCna = cnasNormal[(cnasNormal['cn_state'] != 'DIPLOID') & (cnasNormal['chromosome'] == chromosome)]
        hyperAllCna = cnasHyper[(cnasHyper['cn_state'] != 'DIPLOID') & (cnasHyper['chromosome'] == chromosome)]
        listOfDicts.append({'Chromosome': chromosome, 'timingScore': pcawgTiming,
                            'nCNA_normal': normalAllCna.shape[0], 'nCNA_hyper': hyperAllCna.shape[0],
                            'fracCNA_normal': (1.0*normalAllCna.shape[0])/nNormalCases, 'fracCNA_hyper': (1.0*hyperAllCna.shape[0])/nHyperCases
                           })
    return pd.DataFrame(listOfDicts)

In [3]:
armLevelDf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/feb_1_cohort_compilation.arm_level.txt')

In [20]:
qcDf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/feb_1_cohort_compilation.cohort.txt')

In [46]:
pcawgData = pd.read_csv(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/pcawgPaperCNATiming.csv', header=0)

In [80]:
endometrialHyperIds = get_gene_and_cohort_list_utils.get_ids_by_hypermutant_status(hypermutantIdDir= pathPrefix + '/juno/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Endometrial Cancer', hypermutantStatus = 'Hypermutated')
colorectalHyperIds = get_gene_and_cohort_list_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/juno/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Colorectal Cancer', hypermutantStatus = 'Hypermutated')
endometrialNormalIds = get_gene_and_cohort_list_utils.get_ids_by_hypermutant_status(hypermutantIdDir= pathPrefix + '/juno/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Endometrial Cancer', hypermutantStatus = 'Normal')
colorectalNormalIds = get_gene_and_cohort_list_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/juno/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Colorectal Cancer', hypermutantStatus = 'Normal')

armLevelDf['Tumor_Sample_Barcode'] = armLevelDf['sample'].apply(lambda x: x[:17])
armLevelDf['chromosome'] = armLevelDf['arm'].apply(lambda x: x.strip('p').strip('q'))
qcDf['Tumor_Sample_Barcode'] = qcDf['tumor_sample_id'].apply(lambda x: x[:17])
passIds = set(qcDf[qcDf['facets_suite_qc'] == True]['Tumor_Sample_Barcode'])
endometrialHyperIds = endometrialHyperIds & passIds
colorectalHyperIds = colorectalHyperIds & passIds
endometrialNormalIds = endometrialNormalIds & passIds
colorectalNormalIds = colorectalNormalIds & passIds

armLevelEndoHyper = armLevelDf[armLevelDf['Tumor_Sample_Barcode'].isin(endometrialHyperIds)]
armLevelColoHyper = armLevelDf[armLevelDf['Tumor_Sample_Barcode'].isin(colorectalHyperIds)]
armLevelEndoNormal = armLevelDf[armLevelDf['Tumor_Sample_Barcode'].isin(endometrialNormalIds)]
armLevelColoNormal = armLevelDf[armLevelDf['Tumor_Sample_Barcode'].isin(colorectalNormalIds)]


In [89]:
endoDict = summarize_cna_stats_for_cancer_type(pcawgData, 'Uterus-AdenoCA')
coloDict = summarize_cna_stats_for_cancer_type(pcawgData, 'ColoRect-AdenoCA')

  


In [90]:
endometrialDf = summarize_differences_in_cnas_between_hyper_and_not_hyper(armLevelEndoHyper, armLevelEndoNormal, endoDict)
colorectalDf = summarize_differences_in_cnas_between_hyper_and_not_hyper(armLevelColoHyper, armLevelColoNormal, coloDict)


In [109]:
endometrialDf.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/endoTiming.tsv', index=False, sep='\t')
colorectalDf.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/coloTiming.tsv', index=False, sep='\t')


In [92]:
allImpactMutsMaf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/data_mutations_extended_annotated_sigContext_nov19_2019.maf')

  interactivity=interactivity, compiler=compiler, result=result)


In [97]:
allImpactMutsMaf['geneChr'] = allImpactMutsMaf.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['Chromosome']), axis=1)

In [None]:
#
#####
###########
###################
##########################
################################
##########################
###################
##########
###
#
#MUTATION ATTIBUTION AND CNAs

In [309]:
def summarize_mutation_and_cna_occurences(maf, armCnaSummary, typeCol = 'geneAndCause'):
    
    #MAKE sure we only look at CNAs for cases of interst
    armCnaSummary = armCnaSummary[armCnaSummary['Tumor_Sample_Barcode'].isin(set(maf['Tumor_Sample_Barcode']))]
    
    #make dicts ahead of time to speed up stuff
    allIds = set(maf['Tumor_Sample_Barcode']) #this shou
    if len(allIds) != len(set(armCnaSummary['Tumor_Sample_Barcode'])):
        print 'error lengths arent the same ', len(allIds), len(set(armCnaSummary['Tumor_Sample_Barcode']))
    
    listOfDicts = []
    for gene in set(maf['Hugo_Symbol']):
        geneMaf = maf[maf['Hugo_Symbol'] == gene]
        y1y2 = geneMaf[(geneMaf['hypermutationInduced'] == 'hyperInduced') & (geneMaf['armState'] != 'DIPLOID')].shape[0]
        y1n2 = geneMaf[(geneMaf['hypermutationInduced'] == 'hyperInduced') & (geneMaf['armState'] == 'DIPLOID')].shape[0]
        n1y2 = geneMaf[(geneMaf['hypermutationInduced'] == 'notHyperAttributable') & (geneMaf['armState'] != 'DIPLOID')].shape[0]
        n1n2 = geneMaf[(geneMaf['hypermutationInduced'] == 'notHyperAttributable') & (geneMaf['armState'] == 'DIPLOID')].shape[0]
        
        table = [[y1y2, y1n2], [n1y2, n1n2]]
        oddsratio, pvalue = stats.fisher_exact(table)
        
        listOfDicts.append({'gene': gene, 
                            'oddsratio': oddsratio, 'pvalue': pvalue,
                            'contingencyTable': str(y1y2) + '_' + str(y1n2) + '\n' + str(n1y2) + '_' + str(n1n2)})

    return pd.DataFrame(listOfDicts)
    
    """geneDict = {}
    for geneClass in set(maf[typeCol]):
        geneClassYesIds = set(maf[maf[typeCol] == geneClass]['Tumor_Sample_Barcode'])
        geneDict[geneClass + '_yes'] = geneClassYesIds
        geneDict[geneClass + '_no'] = allIds - geneClassYesIds
    
    armCnaDict = {}
    for chrArm in set(armCnaSummary['arm']):
        cnaArmYesIds = set(armCnaSummary[(armCnaSummary['cn_state'] != 'DIPLOID')
                & (armCnaSummary['arm'] == chrArm)]['Tumor_Sample_Barcode'])
        armCnaDict[chrArm + '_yes'] = cnaArmYesIds
        armCnaDict[chrArm + '_no'] = allIds - cnaArmYesIds
    
    print 'doing fisher tests'
    listOfDicts = []
    cntr = 0
    """
        
    """for geneClass in set(maf[typeCol]):
        if cntr%50 == 0: print cntr, len(set(maf[typeCol]))
        cntr += 1
        for chrArm in set(armCnaSummary['arm']):
            y1y2 = len(geneDict[geneClass + '_yes'] & armCnaDict[chrArm + '_yes'])
            y1n2 = len(geneDict[geneClass + '_yes'] & armCnaDict[chrArm + '_no'])
            n1y2 = len(geneDict[geneClass + '_no'] & armCnaDict[chrArm + '_yes'])
            n1n2 = len(geneDict[geneClass + '_no'] & armCnaDict[chrArm + '_no'])
            
            table = [[y1y2, y1n2], [n1y2, n1n2]]
            oddsratio, pvalue = stats.fisher_exact(table)
            listOfDicts.append({'geneClass': geneClass, 'chrArm': chrArm,
                                'oddsratio': oddsratio, 'pvalue': pvalue,
                                'gene': geneClass.split('_')[0],
                               'contingencyTable': str(y1y2) + '_' + str(y1n2) + '\n' + str(n1y2) + '_' + str(n1n2)})
    """
    
    
    
    
    

In [118]:
mafWithAttribution = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/mafWithMutationAttribution.tsv')


  interactivity=interactivity, compiler=compiler, result=result)


In [119]:
reload(analysis_utils)
chrArmDict = analysis_utils.map_impact_genes_to_chromosome_arm(armFilePath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/IMPACTv6_gene_cytoband.txt')

In [121]:
armLevelDf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/feb_1_cohort_compilation.arm_level.txt')

In [123]:
qcDf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/feb_1_cohort_compilation.cohort.txt')
armLevelDf['Tumor_Sample_Barcode'] = armLevelDf['sample'].apply(lambda x: x[:17])
qcDf['Tumor_Sample_Barcode'] = qcDf['tumor_sample_id'].apply(lambda x: x[:17])
passIds = set(qcDf[qcDf['facets_suite_qc'] == True]['Tumor_Sample_Barcode'])

In [124]:
armLevelDf = armLevelDf[armLevelDf['Tumor_Sample_Barcode'].isin(passIds)]

In [129]:
combinedIds = set(armLevelDf['Tumor_Sample_Barcode']) & set(mafWithAttribution['Tumor_Sample_Barcode'])
armLevelDfRed = armLevelDf[armLevelDf['Tumor_Sample_Barcode'].isin(combinedIds)]
mafWithAttribution = mafWithAttribution[mafWithAttribution['Tumor_Sample_Barcode'].isin(combinedIds)]

In [131]:
mafWithAttribution['chrArm'] = mafWithAttribution['Hugo_Symbol'].apply(lambda x:
    chrArmDict[x] if x in chrArmDict else None)
cancerTypeDict = analysis_utils.get_cancer_type_information(cancerTypeDfPath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/cancerTypeInfo_asOfNov192019.txt', mode='pid')
mafWithAttribution['pid'] = mafWithAttribution['Tumor_Sample_Barcode'].apply(lambda x: x[:9])
mafWithAttribution['cancerType'] = mafWithAttribution['pid'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

In [306]:
#df = summarize_mutation_and_cna_occurences(mafWithAttribution[mafWithAttribution['cancerType'] == 'Endometrial Cancer'],
#        armLevelDfRed, typeCol = 'geneAndCause')


TypeError: summarize_mutation_and_cna_occurences() got an unexpected keyword argument 'mode'

In [185]:
armLevelDfRed['armCase'] = armLevelDfRed['Tumor_Sample_Barcode'] + '_' + armLevelDfRed['arm']
armDict = dict(zip(armLevelDfRed['armCase'], armLevelDfRed['cn_state']))
mafWithAttribution['armCase'] = mafWithAttribution['Tumor_Sample_Barcode'] + '_' + mafWithAttribution['chrArm']
mafWithAttribution['armState'] = mafWithAttribution['armCase'].apply(lambda x: 
            armDict[x] if x in armDict else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [286]:
reducedMaf = mafWithAttribution[['hypermutationInduced', 'mutAttribution',
    'chrArm', 'cancerType', 'Tumor_Sample_Barcode', 'armState', 'Hugo_Symbol', 'HGVSp_Short']]

In [325]:
df = summarize_mutation_and_cna_occurences(reducedMaf[reducedMaf['cancerType'] == 'Endometrial Cancer'],
        armLevelDfRed, typeCol = 'geneAndCause')

In [None]:
#
#####
#############
######################
##############################
#######################
#############
######
#

In [261]:
alexMatrix = mafWithAttribution[mafWithAttribution['oncogenic'].notnull()]
alexMatrix = alexMatrix[['hypermutationInduced', 'mutAttribution',
    'chrArm', 'cancerType', 'Tumor_Sample_Barcode', 'armState', 'Hugo_Symbol', 'HGVSp_Short']]

In [262]:
genes = set(['ABL1', 'ACVR1', 'AGO2', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'ANKRD11', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BABAM1', 'BAP1', 'BARD1', 'BBC3', 'BCL10', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BIRC3', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CALR', 'CARD11', 'CARM1', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79A', 'CD79B', 'CDC42', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CEBPA', 'CENPA', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSDE1', 'CSF1R', 'CSF3R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'CXCR4', 'CYLD', 'CYSLTR2', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNAJB1', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'DROSHA', 'DUSP4', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EIF4A2', 'EIF4E', 'ELF3', 'EP300', 'EPAS1', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHA7', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERF', 'ERG', 'ERRFI1', 'ESR1', 'ETV1', 'ETV6', 'EZH1', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FAM58A', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXO1', 'FOXP1', 'FUBP1', 'FYN', 'GATA1', 'GATA2', 'GATA3', 'GLI1', 'GNA11', 'GNAQ', 'GNAS', 'GPS2', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3A', 'H3F3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3A', 'HIST1H3B', 'HIST1H3C', 'HIST1H3D', 'HIST1H3E', 'HIST1H3F', 'HIST1H3G', 'HIST1H3H', 'HIST1H3I', 'HIST1H3J', 'HIST2H3C', 'HIST2H3D', 'HIST3H3', 'HLA-A', 'HLA-B', 'HNF1A', 'HOXB13', 'HRAS', 'ICOSLG', 'ID3', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INHA', 'INHBA', 'INPP4A', 'INPP4B', 'INPPL1', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KMT2B', 'KMT5A', 'KNSTRN', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'LYN', 'MALT1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAP3K14', 'MAPK1', 'MAPK3', 'MAPKAP1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MGA', 'MITF', 'MLH1', 'KMT2A', 'KMT2B', 'KMT2C', 'MPL', 'MRE11A', 'MSH2', 'MSH3', 'MSH6', 'MSI1', 'MSI2', 'MST1', 'MST1R', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOA3', 'NCOR1', 'NEGR1', 'NF1', 'NF2', 'NFE2L2', 'NFKBIA', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTHL1', 'NTRK1', 'NTRK2', 'NTRK3', 'NUF2', 'NUP93', 'PAK1', 'PAK7', 'PALB2', 'PARK2', 'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDCD1LG2', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PGR', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLCG2', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLD1', 'POLE', 'PPARG', 'PPM1D', 'PPP2R1A', 'PPP4R2', 'PPP6C', 'PRDM1', 'PRDM14', 'PREX2', 'PRKAR1A', 'PRKCI', 'PRKD1', 'PTCH1', 'PTEN', 'PTP4A1', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAB35', 'RAC1', 'RAC2', 'RAD21', 'RAD50', 'RAD51', 'RAD51C', 'RAD51L1', 'RAD51L3', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHEB', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RRAGC', 'RRAS', 'RRAS2', 'RTEL1', 'RUNX1', 'RXRA', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SESN1', 'SESN2', 'SESN3', 'SETD2', 'SF3B1', 'SH2B3', 'SH2D1A', 'SHOC2', 'SHQ1', 'SLX4', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SMYD3', 'SOCS1', 'SOS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SPRED1', 'SRC', 'SRSF2', 'STAG2', 'STAT3', 'STAT5A', 'STAT5B', 'STK11', 'STK19', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TAP1', 'TAP2', 'TBX3', 'TCEB1', 'TCF3', 'TCF7L2', 'TEK', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP53BP1', 'TP63', 'TRAF2', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'UPF1', 'VEGFA', 'VHL', 'VTCN1', 'WHSC1', 'WHSC1L1', 'WT1', 'WWTR1', 'XIAP', 'XPO1', 'XRCC2', 'YAP1', 'YES1', 'ZFHX3', 'ZRSR2'])
listOfDicts = []
cntr = 0
alexMatrix['geneCase'] = alexMatrix.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['Tumor_Sample_Barcode']), axis=1)
caseGeneSet = set(alexMatrix['geneCase'])
for gene in genes:
    cntr += 1
    if cntr %100 == 0: print cntr
    for case in set(alexMatrix['Tumor_Sample_Barcode']):
        if gene + '_' + case not in caseGeneSet:
            listOfDicts.append({'Tumor_Sample_Barcode': case, 'Hugo_Symbol':gene, 
                                'hypermutationInduced': 'WILD-TYPE'})
nonMutatedGenes = pd.DataFrame(listOfDicts)
        
        

100
200
300
400


In [263]:
combinedDf = pd.concat([alexMatrix, nonMutatedGenes])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [264]:
#RECREATE THE WHEEL FOR THE COMBINED STUFF
combinedDf['chrArm'] = combinedDf['Hugo_Symbol'].apply(lambda x:
    chrArmDict[x] if x in chrArmDict else None)
#cancerTypeDict = analysis_utils.get_cancer_type_information(cancerTypeDfPath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/cancerTypeInfo_asOfNov192019.txt', mode='pid')
combinedDf['pid'] = combinedDf['Tumor_Sample_Barcode'].apply(lambda x: x[:9])
combinedDf['cancerType'] = combinedDf['pid'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

In [265]:
combinedDf['armCase'] = combinedDf['Tumor_Sample_Barcode'] + '_' + combinedDf['chrArm']
combinedDf['armState'] = combinedDf['armCase'].apply(lambda x: 
            armDict[x] if x in armDict else None)

combinedDf = combinedDf[combinedDf['armState'].notnull()]
combinedDf['armCNAStatus'] = combinedDf['armState'].apply(lambda x: 'DIPLOID' if x == 'DIPLOID' else 'CNA')

In [268]:
combinedDfZ = combinedDf[['Hugo_Symbol', 'Tumor_Sample_Barcode', 'armState'
                        , 'chrArm', 'cancerType', 'hypermutationInduced',
                        'mutAttribution', 'armCNAStatus', 'HGVSp_Short']]
combinedDfZ['caseGene'] = combinedDfZ['Tumor_Sample_Barcode'] + '_' + combinedDfZ['Hugo_Symbol']
mutCountsDict = dict(combinedDfZ['caseGene'].value_counts())
combinedDfZ['mutationCount'] = combinedDfZ.apply(lambda row: 0 if row['hypermutationInduced'] == 'WILD-TYPE'
            else mutCountsDict[row['caseGene']], axis=1)
combinedDfZ['isMultiplet'] = combinedDfZ['mutationCount'].apply(lambda x: True if x > 1 else False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [220]:
combinedDfZ.to_csv(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/cnaAndMutTable.tsv', index=False, sep='\t')

In [271]:
combinedDfZ.to_csv(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/cnaAndMutTable_v2.tsv', index=False, sep='\t')

In [251]:


eDf = combinedDfZ[combinedDfZ['cancerType'] == 'Endometrial Cancer']

#print 'not diploid ', Counter(eDf[(eDf['Hugo_Symbol'] == 'PTEN') & (eDf['armState'] != 'DIPLOID')]['hypermutationInduced'])
#print 'diploid', Counter(eDf[(eDf['Hugo_Symbol'] == 'PTEN') & (eDf['armState'] == 'DIPLOID')]['hypermutationInduced'])

Counter(eDf['armState']).most_common(10)
lossStates = set(['HETLOSS', 'CNLOH', 'LOSS AFTER', 'CNLOH & GAIN', 'LOSS BEFORE'])
print 'not diploid ', Counter(eDf[(eDf['Hugo_Symbol'] == 'PTEN') & (eDf['armState'].isin(lossStates))]['hypermutationInduced'])
print 'diploid', Counter(eDf[(eDf['Hugo_Symbol'] == 'PTEN') & (eDf['armState'] == 'DIPLOID')]['hypermutationInduced'])


not diploid  Counter({'notHyperAttributable': 19, 'hyperInduced': 13, 'WILD-TYPE': 5})
diploid Counter({'hyperInduced': 83, 'notHyperAttributable': 72, 'WILD-TYPE': 17, 'unclear': 1})


In [None]:
#
######
##############
######################
##################################
#########################################
###################################
#####################
##############
#######
#

#CNA TIMING ANALYSIS

In [191]:
#does fisher tests to see whether there are some genes that preferentially have onc mutations before gains
def do_onc_and_cna_timing_analysis(maf):
    listOfDicts = []
    cntr = 0
    for gene in set(maf['Hugo_Symbol']):
        
        chrArm = None
        if gene in armDict:
            chrArm = armDict[gene]
        
        if cntr%100==0: print cntr,
        cntr += 1
        geneMaf = maf[maf['Hugo_Symbol'] == gene]
        y1y2 = geneMaf[(geneMaf['oncogenic'].notnull()) & (geneMaf['mutTiming'] == 'beforeGain')].shape[0]
        y1n2 = geneMaf[(geneMaf['oncogenic'].notnull()) & (geneMaf['mutTiming'] != 'beforeGain')].shape[0]
        n1y2 = geneMaf[(geneMaf['oncogenic'].isnull()) & (geneMaf['mutTiming'] == 'beforeGain')].shape[0]
        n1n2 = geneMaf[(geneMaf['oncogenic'].isnull()) & (geneMaf['mutTiming'] != 'beforeGain')].shape[0]
        
        table = [[y1y2, y1n2], [n1y2, n1n2]]
        oddsratio, pvalue = stats.fisher_exact(table)
        
        listOfDicts.append({'gene': gene, 
                            'oddsratio': oddsratio, 'pvalue': pvalue,  'chrArm': chrArm,
                            'contingencyTable': str(y1y2) + '_' + str(y1n2) + '\n' + str(n1y2) + '_' + str(n1n2)})

    return pd.DataFrame(listOfDicts)


def do_onc_and_cna_timing_analysis_arm_mode(maf, armDict):
    listOfDicts = []
    cntr = 0
    for gene in set(maf['Hugo_Symbol']):
        if cntr%100==0: print cntr,
        cntr += 1
        if gene in armDict:
            chrArm = armDict[gene]
            armMaf = maf[maf['chrArm'] == chrArm]

            y1y2 = armMaf[((armMaf['oncogenic'].notnull()) & (armMaf['Hugo_Symbol'] == gene)) & (armMaf['mutTiming'] == 'beforeGain')].shape[0]
            y1n2 = armMaf[((armMaf['oncogenic'].notnull()) & (armMaf['Hugo_Symbol'] == gene))  & (armMaf['mutTiming'] != 'beforeGain')].shape[0]
            n1y2 = armMaf[((armMaf['oncogenic'].isnull()) | (armMaf['Hugo_Symbol'] != gene)) & (armMaf['mutTiming'] == 'beforeGain')].shape[0]
            n1n2 = armMaf[((armMaf['oncogenic'].isnull()) | (armMaf['Hugo_Symbol'] != gene)) & (armMaf['mutTiming'] != 'beforeGain')].shape[0]

            table = [[y1y2, y1n2], [n1y2, n1n2]]
            oddsratio, pvalue = stats.fisher_exact(table)

            listOfDicts.append({'gene': gene, 
                                'oddsratio': oddsratio, 'pvalue': pvalue, 'chrArm': chrArm,
                                'contingencyTable': str(y1y2) + '_' + str(y1n2) + '\n' + str(n1y2) + '_' + str(n1n2)})

    return pd.DataFrame(listOfDicts)
  
        

In [166]:
def summarize_gain_info_by_case(maf):
    for case in set(maf['Tumor_Sample_Barcode']):
        caseMafUnbalanced = maf[(maf['Tumor_Sample_Barcode'] == case) & (maf['mcn'] != lcn)]
        print caseMafUnbalanced[caseMafUnbalanced['mcn']].shape

In [3]:
chrArmDict = analysis_utils.map_impact_genes_to_chromosome_arm(armFilePath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/IMPACTv6_gene_cytoband.txt')
armLevelDf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/feb_1_cohort_compilation.arm_level.txt')
qcDf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/feb_1_cohort_compilation.cohort.txt')


In [None]:
clonalityAnnotatedMaf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/filteredMaf_Nov19_2019_hypermutantOnly_withCNCFAnnotation.maf')

In [61]:
armLevelDf['Tumor_Sample_Barcode'] = armLevelDf['sample'].apply(lambda x: x[:17])
armLevelDf['armCase'] = armLevelDf['Tumor_Sample_Barcode'] + '_' + armLevelDf['arm']
armDict = dict(zip(armLevelDf['armCase'], armLevelDf['cn_state']))
clonalityAnnotatedMaf['armCase'] = clonalityAnnotatedMaf['Tumor_Sample_Barcode'] + '_' + clonalityAnnotatedMaf['chrArm']
clonalityAnnotatedMaf['armState'] = clonalityAnnotatedMaf['armCase'].apply(lambda x: 
            armDict[x] if x in armDict else None)

In [62]:
clonalityAnnotatedMaf['mcn'] = clonalityAnnotatedMaf['tcn'] - clonalityAnnotatedMaf['lcn']

In [63]:
clonalityAnnotatedMaf['chrArm'] = clonalityAnnotatedMaf['Hugo_Symbol'].apply(lambda x:
            chrArmDict[x] if x in chrArmDict else None)

In [183]:
clonalityAnnotatedMaf['mutTiming'] = clonalityAnnotatedMaf.apply(lambda row:
                            'beforeGain' if row['mcn'] > 1 and row['expected_alt_copies'] > 1
                            else 'afterGain' if row['mcn'] > 1 and row['expected_alt_copies'] == 1
                            else 'noGain', axis=1)

In [192]:
analyzeMaf = clonalityAnnotatedMaf[clonalityAnnotatedMaf['mutTiming'] != 'noGain']

In [193]:
listOfDfs1 = []
listOfDfs2 = []

topHyperCancerTypes = ['Endometrial Cancer', 'Colorectal Cancer', 'Glioma', 'Bladder Cancer',
                      'Esophagogastric Cancer', 'Prostate Cancer', 'Bladder Cancer']
for cancerType in topHyperCancerTypes:
    hyperIds = get_gene_and_cohort_list_utils.get_ids_by_hypermutant_status(hypermutantIdDir= pathPrefix + '/juno/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType=cancerType, hypermutantStatus = 'Hypermutated')
    ctHyperMaf = analyzeMaf[analyzeMaf['Tumor_Sample_Barcode'].isin(hyperIds)]
    
    ctDf1 = do_onc_and_cna_timing_analysis_arm_mode(ctHyperMaf, chrArmDict)
    ctDf1['cancerType'] = cancerType
    listOfDfs1.append(ctDf1)
    
    ctDf2 = do_onc_and_cna_timing_analysis(ctHyperMaf)
    ctDf2['cancerType'] = cancerType
    listOfDfs2.append(ctDf2)
    
df1 = pd.concat(listOfDfs1)    
df2 = pd.concat(listOfDfs2)

0 100 200 0 100 200 0 100 200 300 0 100 200 300 0 100 200 300 0 100 200 300 0 100 0 100 0 0 0 0 0 100 0 100


In [194]:
df1.to_csv('/Users/friedman/Desktop/WORK/geneCNATiming_gainTiming.tsv', index=False, sep='\t')
df2.to_csv('/Users/friedman/Desktop/WORK/geneCNATiming_gainAndOncogenic.tsv', index=False, sep='\t')

In [195]:
#
##
######

mafWithAttribution = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/mafWithMutationAttribution.tsv')


  interactivity=interactivity, compiler=compiler, result=result)


In [201]:
mafWithAttribution['mutIdentifier'] = mafWithAttribution.apply(lambda row: str(row['Tumor_Sample_Barcode']) + 
 '_'  + str(row['Start_Position']) + ':' + str(row['Reference_Allele']) + '->' + str(row['Tumor_Seq_Allele2']), axis=1)

clonalityAnnotatedMaf['mutIdentifier'] = clonalityAnnotatedMaf.apply(lambda row: str(row['Tumor_Sample_Barcode']) + 
 '_'  + str(row['Start_Position']) + ':' + str(row['Reference_Allele']) + '->' + str(row['Tumor_Seq_Allele2']), axis=1)


In [203]:
attributionDict = dict(zip(mafWithAttribution['mutIdentifier'], mafWithAttribution['hypermutationInduced']))
clonalityAnnotatedMaf['attribution'] = clonalityAnnotatedMaf['mutIdentifier'].apply(lambda x: attributionDict[x] if x in attributionDict else None)

In [221]:
analyzeMaf = clonalityAnnotatedMaf[(clonalityAnnotatedMaf['mutTiming'] != 'noGain') & (clonalityAnnotatedMaf['attribution'] != 'unclear')]
analyzeMaf['isGained'] = analyzeMaf['mutTiming'].apply(lambda x: 1 if x == 'beforeGain' else 0)
oncogenicAnnotations = ['Likely Oncogenic', 'Oncogenic', 'Predicted Oncogenic']
analyzeMaf['analysisGroup'] = analyzeMaf.apply(lambda row:
        'hyperPassenger' if row['attribution'] == 'hyperInduced' and row['oncogenic'] not in oncogenicAnnotations
        else 'hyperDriver' if row['attribution'] == 'hyperInduced' and row['oncogenic'] in oncogenicAnnotations 
        else 'notHyperPassenger' if row['attribution'] != 'hyperInduced' and row['oncogenic'] not in oncogenicAnnotations
        else 'notHyperDriver' if row['attribution'] != 'hyperInduced' and row['oncogenic'] in oncogenicAnnotations 
        else None, axis=1)                   
                                               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [222]:
analyzeMaf = analyzeMaf[['Tumor_Sample_Barcode', 'isGained', 'Hugo_Symbol', 'analysisGroup', 'oncogenic']]

In [223]:
cancerTypeDict = analysis_utils.get_cancer_type_information(cancerTypeDfPath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/cancerTypeInfo_asOfNov192019.txt', mode='pid')
analyzeMaf['pid'] = analyzeMaf['Tumor_Sample_Barcode'].apply(lambda x: x[:9])
analyzeMaf['cancerType'] = analyzeMaf['pid'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

In [224]:
cancerTypesToPlot = set(['Colorectal Cancer', 'Endometrial Cancer', 'Glioma', 'Prostate Cancer',
                         'Esophagogastric Cancer', 'Bladder Cancer', 'Small Bowel Cancer'])
analyzeMaf = analyzeMaf[analyzeMaf['cancerType'].isin(cancerTypesToPlot)]

In [225]:
analyzeMaf.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/gainsByMutType.tsv', index=False, sep='\t')

In [265]:
for case in set(analyzeMaf['Tumor_Sample_Barcode']):
    caseMaf = analyzeMaf[analyzeMaf['Tumor_Sample_Barcode'] == case]
    print 'frac muts before gain', round(1.0*(caseMaf[caseMaf['isGained'] == 1].shape[0])/caseMaf.shape[0], 2), 'nmut', caseMaf.shape[0], case
                                         

frac muts before gain 0.0 nmut 2 P-0009152-T01-IM5
frac muts before gain 0.0 nmut 2 P-0025195-T01-IM6
frac muts before gain 0.08 nmut 12 P-0003292-T01-IM5
frac muts before gain 0.67 nmut 3 P-0020064-T01-IM6
frac muts before gain 0.03 nmut 30 P-0002695-T01-IM3
frac muts before gain 1.0 nmut 1 P-0029398-T01-IM6
frac muts before gain 0.0 nmut 6 P-0034915-T01-IM6
frac muts before gain 0.17 nmut 6 P-0005285-T01-IM5
frac muts before gain 0.5 nmut 2 P-0021692-T01-IM6
frac muts before gain 0.6 nmut 5 P-0001494-T01-IM3
frac muts before gain 0.69 nmut 13 P-0017986-T02-IM6
frac muts before gain 0.25 nmut 4 P-0013112-T02-IM6
frac muts before gain 0.5 nmut 2 P-0026221-T01-IM6
frac muts before gain 0.78 nmut 9 P-0023250-T01-IM6
frac muts before gain 1.0 nmut 2 P-0025240-T01-IM6
frac muts before gain 0.0 nmut 3 P-0036369-T01-IM6
frac muts before gain 0.5 nmut 2 P-0012848-T01-IM5
frac muts before gain 1.0 nmut 2 P-0017300-T01-IM6
frac muts before gain 0.17 nmut 23 P-0028474-T01-IM6
frac muts before ga

frac muts before gain 0.0 nmut 7 P-0028014-T01-IM6
frac muts before gain 1.0 nmut 3 P-0025669-T01-IM6
frac muts before gain 1.0 nmut 1 P-0014495-T01-IM6
frac muts before gain 0.38 nmut 66 P-0015699-T01-IM6
frac muts before gain 0.67 nmut 3 P-0022010-T01-IM6
frac muts before gain 0.2 nmut 15 P-0019475-T01-IM6
frac muts before gain 1.0 nmut 1 P-0005842-T01-IM5
frac muts before gain 1.0 nmut 2 P-0018350-T01-IM6
frac muts before gain 0.37 nmut 30 P-0006960-T01-IM5
frac muts before gain 0.43 nmut 7 P-0019846-T01-IM6
frac muts before gain 0.67 nmut 3 P-0031089-T01-IM6
frac muts before gain 0.12 nmut 33 P-0002186-T01-IM3
frac muts before gain 0.5 nmut 14 P-0012115-T01-IM5
frac muts before gain 0.17 nmut 6 P-0026118-T01-IM6
frac muts before gain 0.0 nmut 1 P-0001882-T03-IM6
frac muts before gain 0.31 nmut 13 P-0027335-T01-IM6
frac muts before gain 0.33 nmut 3 P-0001685-T02-IM3
frac muts before gain 0.4 nmut 15 P-0035005-T01-IM6
frac muts before gain 1.0 nmut 1 P-0035180-T01-IM6
frac muts befor

In [268]:
print analyzeMaf[analyzeMaf['Tumor_Sample_Barcode'] == 'P-0030372-T01-IM6'][['isGained', 'Hugo_Symbol']]

      isGained Hugo_Symbol
7624         1       KDM5A
7625         1       KDM5A
7626         1       CCND2
7627         1      CDKN1B
7628         0     PIK3C2G
7629         1     PIK3C2G
7630         1        KRAS
7631         0       ARID2
7632         1       ARID2
7633         1       ARID2
7634         0       KMT2D
7635         1       KMT2D
7636         0       KMT2D
7637         1       KMT2D
7638         1       KMT2D
7639         1       KMT2D
7640         1     SMARCD1
7641         1     SMARCD1
7642         1        GLI1
7643         1        IGF1
7644         0        IGF1
7645         1       SH2B3
7646         1      PTPN11
7647         1        TBX3
7648         1        TBX3
7649         0       HNF1A
7650         1       HNF1A
7651         1        POLE
7652         1        POLE
7653         1        POLE
...        ...         ...
7946         1      NOTCH4
7947         1        TAP2
7948         0        TAP2
7949         1        TAP1
7950         1        TAP1
7

In [None]:
#
######
##############
######################
##################################
#########################################
###################################
#####################
##############
#######
#

In [238]:
#armLevelDf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/feb_1_cohort_compilation.arm_level.t
fracAlteredDict = {}
cntr = 0
allCases = set(armLevelDf['Tumor_Sample_Barcode'])
nonDiploidCases = armLevelDf[armLevelDf['cn_state'] != 'DIPLOID']
nonDiploidSegmentCounts = dict(nonDiploidCases['Tumor_Sample_Barcode'].value_counts())
for case in allCases:
    if case in nonDiploidSegmentCounts:
        nNonDiploidSegments = nonDiploidSegmentCounts[case]
        fracAlteredDict[case] = nNonDiploidSegments/46.0


In [242]:
reload(get_gene_and_cohort_list_utils)
tmbDict = get_gene_and_cohort_list_utils.get_all_tmb_info(tmbFilePath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/mutations_TMB_and_MSI_stats.txt')


In [254]:
#allHypermutatorIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=pathPrefix + '/juno/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds')    
listOfDicts = []
for caseId in allHypermutatorIds:
#    print caseId
    if caseId[:9] in cancerTypeDict and caseId in tmbDict and caseId in fracAlteredDict:
        listOfDicts.append({'Tumor_Sample_Barcode': caseId, 'TMB': tmbDict[caseId],
                           'cancerType': cancerTypeDict[caseId[:9]], 'fracAltered': fracAlteredDict[caseId]})
df = pd.DataFrame(listOfDicts)

In [258]:
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/tmbAndCNA.tsv', index=False, sep='\t')