In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re

from collections import Counter

pathPrefix = '/Users/friedman/Desktop/mnt'
sys.path.append(pathPrefix + '/ifs/work/taylorlab/friedman/myUtils')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import mutation_modeling_util  
import signature_attribution_util
import clonality_analysis_util

In [131]:
def assign_hotspot_freq_dict(df):
    d = {}
    for index, row in df.iterrows():
        refAminoAcid = row['ref']
        gene = row['Hugo_Symbol']
        position = row['Amino_Acid_Position']
        for entry in row['Var_AA'].split('|'):
            fullAltName = ''
            altAminoAcid, count = entry.split(':')
            fullAltName = gene + ':' + refAminoAcid + position + altAminoAcid
            d[fullAltName] = float(count)/47000
    return d

#def create_hotspot_ranking_df(hotspotD):
#    listOfDicts = []
#    for key, value in hotspotD.items():
#        gene, allele = key.split(':')
#        listOfDicts.append({
#                'gene': gene,
#                'allele': allele,
#                'hotspot': key,
#                'freq': value
#        })
#    return pd.DataFrame(listOfDicts)

#returns a dictionary mapping a hotspot to its pancan rank order 
def rank_hotspots(hotspotIncidenceD):
    d = {}
    sortedHotspots = sorted(zip(hotspotIncidenceD.keys(), hotspotIncidenceD.values()), key= lambda x: x[1], reverse=True)
    i = 0
    for allele, freq in sortedHotspots:
        d[allele] = i
        i += 1
    return d

#given a df and hotspots, for the 3 hypermutator cancer types assigns hotspots percentiles
def assign_percentiles_to_df(df):
    for ct in ['Endometrial_Cancer', 'Colorectal_Cancer', 'Glioma']:
        ctRelated = df[df[ct + '_RelatedHotspot'] == True]
        ctNotRelated = df[df[ct + '_RelatedHotspot'] == False]
        percentilesRelated = dict(zip(ctRelated['hotspot'], ctRelated['freq'].rank(pct=True)))
        percentilesNotRelated = dict(zip(ctNotRelated['hotspot'], ctNotRelated['freq'].rank(pct=True)))
        df[ct + '_related_percentile'] = df.apply(lambda row:
                                               None if row[ct + '_RelatedHotspot'] == False
                                               else percentilesRelated[row['hotspot']], axis=1)
        df[ct + '_not_related_percentile'] = df.apply(lambda row:
                                               None if row[ct + '_RelatedHotspot'] == True
                                               else percentilesNotRelated[row['hotspot']], axis=1)
    return df 


In [3]:
hotspotsDf = pd.read_table(pathPrefix + '/home/gavrilae/snp_output_final_pancan.txt')


In [4]:
hotspotIncidenceD = assign_hotspot_freq_dict(hotspotsDf)

In [132]:
hotspotRankingD = rank_hotspots(hotspotIncidenceD)

In [8]:
df = create_hotspot_ranking_df(hotspotIncidenceD)

In [None]:
filteredMafDf = analysis_utils.load_in_df_with_progress(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/all_impact_mutations_annotated_cohort.maf', 275171)
genesImplicatedInCancerTypes= maf_analysis_utils.create_dictionary_mapping_genes_to_cancer_types_with_implication(filteredMafDf, pathPrefix=pathPrefix, cancerTypes=['Endometrial Cancer', 'Colorectal Cancer', 'Glioma'], t=0.04)
hypermutationInitiatingGenes = set(['MSH6', 'MLH1', 'MSH2', 'PMS2', 'POLE'])
relatedGenesDict = {}
for cancerType, genes in genesImplicatedInCancerTypes.items():
    if cancerType == 'Colorectal Cancer' or 'Endometrial Cancer':
        genes = genes | hypermutationInitiatingGenes
        relatedGenesDict[cancerType] = genes
    else:
        relatedGenesDict[cancerType] = genes

In [232]:
#manual adjustments
relatedGenesDict['Glioma'] = relatedGenesDict['Glioma'] | set(['FUBP1', 'BRAF', 'CDKN2A']) - set(['POLE'])
relatedGenesDict['Colorectal Cancer'] = relatedGenesDict['Colorectal Cancer'] | set(['CTNNB1', 'AKT1']) 
relatedGenesDict['Endometrial Cancer'] = relatedGenesDict['Endometrial Cancer'] | set(['ERBB2', 'ERBB3']) 

In [None]:
#Run stuff on filtered maf df
filteredMafDf['pid'] = filteredMafDf['Tumor_Sample_Barcode'].apply(lambda x: x[:9])
ctDict = analysis_utils.get_cancer_type_information(cancerTypeDfPath = pathPrefix + '/ifs/work/taylorlab/friedman/mskImpactAsOfMarch2019/dmp/mskimpact/data_clinical_sample.txt', mode='pid')
filteredMafDf['cancer_type'] = filteredMafDf['pid'].apply(lambda x: ctDict[x] if x in ctDict else None)

In [272]:
def assign_hotspot_mutation_utilization_curves(hotspotMafDf, hotspotRankingD, cancerTypeName):
    
    def normalize_hotspot_rankings(df):
        l = [] #create a list of tuples of percentiles and hotspots
        n = 0
        df = df.drop_duplicates(subset=['hotspotName'])
        sortedHotspotRanks = sorted(zip(df['hotspotName'], df['hotspotRank']), key= lambda x: x[1])
        nHotspotsUnderConsideration = len(sortedHotspotRanks)
        cntr = 0
        for name, rank in sortedHotspotRanks:
            l.append((name, 1.0*cntr/nHotspotsUnderConsideration))
            cntr += 1
        return l
    
    relatedHotspotMaf = hotspotMafDf[hotspotMafDf['isCtypeCancerGene'] == True]
    unrelatedHotspotMaf = hotspotMafDf[hotspotMafDf['isCtypeCancerGene'] == False]
    
    relatedHotspotMaf['hotspotRank'] = relatedHotspotMaf['hotspotName'].apply(lambda x: hotspotRankingD[x] if x in hotspotRankingD else None)
    unrelatedHotspotMaf['hotspotRank'] = unrelatedHotspotMaf['hotspotName'].apply(lambda x: hotspotRankingD[x] if x in hotspotRankingD else None)
    #FILTER out hotspots that we do not have a ranking for
    relatedHotspotMaf = relatedHotspotMaf[relatedHotspotMaf['hotspotRank'].notnull()]
    unrelatedHotspotMaf = unrelatedHotspotMaf[unrelatedHotspotMaf['hotspotRank'].notnull()]
    
    nRelatedHotspots = relatedHotspotMaf.shape[0]
    nUnrelatedHotspots = unrelatedHotspotMaf.shape[0]
    
    relatedHotspotRankings = normalize_hotspot_rankings(relatedHotspotMaf)
    unrelatedHotspotRankings = normalize_hotspot_rankings(unrelatedHotspotMaf)
    
    listOfDicts = [{'hotspot':'NA_origin_for_path_Related', 'class': 'related', 'val':0, 'percentile': 0},
                  {'hotspot':'NA_origin_for_path_Unrelated', 'class': 'unrelated', 'val':0, 'percentile': 0}] #what we return
    runningSum = 0
    for hotspot, percentile in relatedHotspotRankings:
        frac = 1.0*relatedHotspotMaf[relatedHotspotMaf['hotspotName'] == hotspot].shape[0]/nRelatedHotspots
        runningSum += frac
        listOfDicts.append({
         'hotspot': hotspot,
         'class': 'related',
         'val': runningSum,
         'percentile': percentile
        })
        
    runningSum = 0
    for hotspot, percentile in unrelatedHotspotRankings:
        nTimesHotspotOccurs = unrelatedHotspotMaf[unrelatedHotspotMaf['hotspotName'] == hotspot].shape[0]
        frac = 1.0*nTimesHotspotOccurs/nUnrelatedHotspots
        runningSum += frac
        listOfDicts.append({
         'hotspot': hotspot,
         'class': 'unrelated',
        'val': runningSum,
        'percentile': percentile
        })
      
    return pd.DataFrame(listOfDicts)
    

In [234]:
def prepare_cancer_type_maf_for_analysis(maf, cType):
    cancerTypeHypermutatorIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir= pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType=cType, hypermutantStatus = 'Hypermutated')
    cancerTypeHypermutatorDf = filteredMafDf[filteredMafDf['Tumor_Sample_Barcode'].isin(cancerTypeHypermutatorIds)]
    cancerTypeHypermutatorHotspots = cancerTypeHypermutatorDf[cancerTypeHypermutatorDf['is-a-hotspot'] == 'Y']
    cancerTypeHypermutatorHotspots['hotspotName'] = cancerTypeHypermutatorHotspots.apply(lambda row:
                                                                                     row['Hugo_Symbol'] + ':' + row['HGVSp_Short'].strip('p.'), axis=1)
    cancerTypeHypermutatorHotspots['isCtypeCancerGene'] = cancerTypeHypermutatorHotspots['Hugo_Symbol'].apply(lambda x: True if x in relatedGenesDict[cType] else False)
    return cancerTypeHypermutatorHotspots

In [273]:
endometrialHypermutatorHotspots = prepare_cancer_type_maf_for_analysis(filteredMafDf, 'Endometrial Cancer')
colorectalHypermutatorHotspots = prepare_cancer_type_maf_for_analysis(filteredMafDf, 'Colorectal Cancer')
gliomaHypermutatorHotspots = prepare_cancer_type_maf_for_analysis(filteredMafDf, 'Glioma')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [274]:
dfColo = assign_hotspot_mutation_utilization_curves(colorectalHypermutatorHotspots, hotspotRankingD, 'Colorectal_Cancer')
dfEndo = assign_hotspot_mutation_utilization_curves(endometrialHypermutatorHotspots, hotspotRankingD, 'Endometrial_Cancer')
dfGlio = assign_hotspot_mutation_utilization_curves(gliomaHypermutatorHotspots, hotspotRankingD, 'Glioma')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [278]:
dfEndo.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/EndoHotspotUtilizationCurve.tsv', index=False, sep='\t')
dfColo.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/ColoHotspotUtilizationCurve.tsv', index=False, sep='\t')
dfGlio.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/GlioHotspotUtilizationCurve.tsv', index=False, sep='\t')

**Do analyses of Gene mutation frequency/recurrence**<br/><br/><br/>


TODO THIS LOWER PART HAS GOTTEN VERY BIG CODEWISE, MAYBE MOVE IT???

In [2]:
allImpactMuts = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/all_impact_mutations_annotated_cohort.maf')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
allImpactMuts = maf_analysis_utils.fix_mll_genes(allImpactMuts)
impact368genes = set(['ABL1', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BAP1', 'BARD1', 'BBC3', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CARD11', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79B', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSF1R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EP300', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERG', 'ESR1', 'ETV1', 'ETV6', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXP1', 'FUBP1', 'GATA1', 'GATA2', 'GATA3', 'GNA11', 'GNAQ', 'GNAS', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3B', 'HNF1A', 'HRAS', 'ICOSLG', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INPP4A', 'INPP4B', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAPK1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MITF', 'MLH1', 'MLL', 'MLL2', 'MLL3', 'MPL', 'MRE11A', 'MSH2', 'MSH6', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOR1', 'NF1', 'NF2', 'NFE2L2', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTRK1', 'NTRK2', 'NTRK3', 'PAK1', 'PAK7', 'PALB2', 'PARK2', 'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLE', 'PPP2R1A', 'PRDM1', 'PRKAR1A', 'PTCH1', 'PTEN', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAC1', 'RAD50', 'RAD51', 'RAD51B', 'RAD51C', 'RAD51D', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RUNX1', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SETD2', 'SF3B1', 'SH2D1A', 'SHQ1', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SOCS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SRC', 'STAG2', 'STK11', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TBX3', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP63', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'VHL', 'VTCN1', 'WT1', 'XIAP', 'XPO1', 'YAP1', 'YES1'])
#ONLY do analysis on impact 368 genes
allImpactMuts = allImpactMuts[allImpactMuts['Hugo_Symbol'].isin(impact368genes)]


In [4]:
endometrialNormalIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Endometrial Cancer', hypermutantStatus = 'Normal')
colorectalNormalIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Colorectal Cancer', hypermutantStatus = 'Normal')
gliomaNormalIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Glioma', hypermutantStatus = 'Normal')

endoNormalDf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(endometrialNormalIds)]
colorectalNormalDf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(colorectalNormalIds)]
gliomaNormalDf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(gliomaNormalIds)]

endoNormalRanking = maf_analysis_utils.enumerate_gene_mut_ranking_for_cohort(endoNormalDf)
colorectalNormalRanking = maf_analysis_utils.enumerate_gene_mut_ranking_for_cohort(colorectalNormalDf)
gliomaNormalRanking = maf_analysis_utils.enumerate_gene_mut_ranking_for_cohort(gliomaNormalDf)

In [12]:
xSort = sorted(list(zip(gliomaNormalRanking.values(), gliomaNormalRanking.keys())))
for i in xSort:
    print i[1]

TERT
TP53
IDH1
PTEN
ATRX
EGFR
NF1
CIC
PIK3CA
PIK3R1
RB1
PTPN11
NOTCH1
BRAF
SETD2
CDKN2A
FUBP1
STAG2
PDGFRA
ARID1A
BCOR
IDH2
ARID2
FGFR1
KRAS
ARID1B
DNMT3A
ATM
FBXW7
SMARCA4
MED12
SMARCB1
CDKN2C
APC
PIK3CB
BRCA2
CBL
PBRM1
TSC2
TSC1
FAT1
NOTCH2
NSD1
CREBBP
TET2
ASXL1
MAX
FGFR3
MTOR
CDC73
CHEK2
BLM
ERCC2
SUFU
NRAS
EIF1AX
CDH1
MYCN
FLT3
NBN
EP300
NOTCH3
PTPRD
PTCH1
PMAIP1
ARID5B
CDKN1B
MAP3K1
SOX9
PARK2
BRCA1
MAP2K1
SPOP
AKT1
MLH1
HNF1A
SYK
NCOR1
FANCC
KDM6A
HRAS
DIS3
NF2
MEN1
KIT
CDKN1A
DAXX
JAK3
JAK1
CDK4
EPHB1
PARP1
DDR2
PALB2
SHQ1
TBX3
FH
RBM10
ERCC3
RAD51D
RAD51C
CUL3
FLCN
INPP4B
EPCAM
CASP8
GATA1
PMS1
PMS2
KDM5C
FOXP1
EED
SPEN
PIK3R2
GRIN2A
AR
EPHA3
BRIP1
TP63
FANCA
TGFBR2
TGFBR1
ASXL2
DICER1
KEAP1
RAD50
CDK8
EZH2
HIST1H3B
NOTCH4
SDHD
TAP1
ERRFI1
STK19
CRLF2
CRKL
STK11
PPP4R2
B2M
TAP2
AXL
GPS2
DNMT1
PIK3CD
PIK3CG
FGFR2
INHBA
JAK2
HIST1H1C
H3F3B
ERBB2
ERBB3
IGF1R
KMT5A
H3F3A
ERBB4
RAD51L3
WHSC1L1
NUP93
PIK3C2G
NKX3-1
SMARCD1
BRD4
TEK
MDM2
PIK3C3
GATA2
GATA3
ZRSR2
MYD88
CCND1
CCND3
CCN

In [5]:
endometrialHyperIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Endometrial Cancer', hypermutantStatus = 'Hypermutated')
colorectalHyperIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Colorectal Cancer', hypermutantStatus = 'Hypermutated')
gliomaHyperIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Glioma', hypermutantStatus = 'Hypermutated')

endoHyperDf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(endometrialHyperIds)]
colorectalHyperDf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(colorectalHyperIds)]
gliomaHyperDf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(gliomaHyperIds)]


In [18]:
#this generates data that would show us what mutation acquisition would look like under different assumptions
#MAYBE THSI IS NO LONGER NECESSARY:(
def assign_synthetic_mutation_curves(rankingDict, realData, maxNmut = 40):
    
    #tells us what data would like if we just progressively acquired oncogenic mutations in order of recurrence
    def get_synthetic_data_muts_in_order(rDict, nmut):
        listOfDicts = [] #what we are going to build into the dataframe
        #FIRST do the always pick the next most common mutation
        listOfTuples = [(k, v) for k, v in rankingDict.items()] 
        sortedL = sorted(listOfTuples, key = lambda x: x[1])
        cntr = 1
        for gene, val in sortedL[:maxNmut]:
            listOfDicts.append({'ranking': val, 'mutNumber': cntr, 'class': 'mutsInOrder'})
            cntr += 1
        return pd.DataFrame(listOfDicts)
    
    #makes a curve given the assumption we have k mutations accruing at the normal rate then the rest up to the kth mutation at random
    def get_synthetic_data_k_observed_then_random_n(k, n, realData):
        className = k + '_real_muts_then_random'
        listOfDicts = []
        for i in range(1, k + 1):
            observedMean = np.nanmean(realData[realData['mutNumber'] == i]['ranking'])
            listOfDicts.append({'ranking': observedMean, 'mutNumber': i, 'class': className})
        
            
    
    dfMutsInOrder = get_synthetic_data_muts_in_order(rankingDict, maxNmut)
    
    get_synthetic_data_k_observed_then_random_n(5, maxNmut, realData)
    
    return dfMutsInOrder


def asses_per_case_mut_ranking(maf, rankingDict, dominantSigInfo, mode='hotspot'):
    listOfDicts = []
    cases = set(maf['Tumor_Sample_Barcode'])
    
    print 'calculating mut rankings from real data'
    iterCntr = 0 
    nmuts = []
    
    for case in cases:
        
        if iterCntr%50 == 0:
            print 'on case number ', iterCntr, ' out of ', len(cases)
        iterCntr+=1
        
        dominantSig = None
        if case in dominantSigInfo: 
            dominantSig = dominantSigInfo[case]
        caseMaf = maf[maf['Tumor_Sample_Barcode'] == case]
        nmutCase = caseMaf.shape[0]
        nmuts.append(nmutCase)
        
        if mode == 'hotspot':
            caseMafCondition = caseMaf[(caseMaf['is-a-hotspot'] == 'Y')]
        elif mode == 'oncogenic':
            caseMafCondition = caseMaf[(caseMaf['oncogenic'].notnull())]
        else: #else do this for literally every mutation
            caseMafCondition = caseMaf[(caseMaf['Tumor_Sample_Barcode'] == case)]
        
        nConditionMuts = caseMafCondition.shape[0]
        caseMafCondition['ranking'] = caseMafCondition['Hugo_Symbol'].apply(lambda x: rankingDict[x] if x in rankingDict else None)
        caseMafCondition = caseMafCondition[caseMafCondition['ranking'].notnull()]
        rankingList = zip(caseMafCondition['ranking'], caseMafCondition['Hugo_Symbol'])
        sortedList = sorted(rankingList)
        vals = caseMafCondition['ranking']
        
        cntr = 1.0
        for ranking, gene in sortedList:
            mutPercentile = cntr/len(sortedList)
            listOfDicts.append({'Tumor_Sample_Barcode': case, 'ranking':ranking, 'class': 'realData', 'caseNmut': nmutCase, 'nConditionMuts': nConditionMuts,
                                'Hugo_Symbol':gene, 'mutNumber': cntr, 'dominantSig': dominantSig, 'mutPercentile': mutPercentile})
            cntr += 1
    
    df = pd.DataFrame(listOfDicts)
    cohortMedianNmut = np.nanmedian(nmuts)
    df['nmutCategory'] = df['caseNmut'].apply(lambda x: '>median' if x > cohortMedianNmut else '<median')
    return df
    #dfRealData = pd.DataFrame(listOfDicts)
    #dfSyntheticData = assign_synthetic_mutation_curves(rankingDict, dfRealData)
    #return pd.concat([dfRealData, dfSyntheticData])
        

In [6]:
def regularize_df_by_possible_muts(possibleGeneMutSummaryDict, rankingDict):
    
    nPossibleOncogenic = sum(possibleGeneMutSummaryDict.values())
    listOfTuples = [(k, v) for k, v in rankingDict.items()] 
    sortedL = sorted(listOfTuples, key = lambda x: x[1])
    
    #we create a dataframe
    listOfDicts = []
    runningSum = 0
    for gene, rank in sortedL:
        if gene in possibleGeneMutSummaryDict.keys():
            runningSum += possibleGeneMutSummaryDict[gene]
            listOfDicts.append({'Hugo_Symbol': gene, 'runningSum': runningSum})
    
    df = pd.DataFrame(listOfDicts)
    df['percentile'] = df['runningSum'].apply(lambda x: (1.0*x)/max(df['runningSum']))
    return dict(zip(df['Hugo_Symbol'], df['percentile']))

#TDOO tomorrow: write this function and other goodies--should give us a way to quantify median gene
#we also want to restrict everything to the impact 368 panel

In [7]:
def rank_muts_by_size(possibleMutInfo):
    
    d = {}
    
    nPossibleMuts = sum(possibleMutInfo['nPossibleMuts'])
    l = zip(possibleMutInfo['nPossibleMuts'], possibleMutInfo['Hugo_Symbol'])
    sortedList = sorted(l, reverse=True)
    
    runningSum = 0
    for nmut, gene in sortedList:
        runningSum += nmut
        frac = (1.0*runningSum)/nPossibleMuts
        val = 1 - frac
        d[gene] = val
    
    return d

In [21]:
impactSigs = pd.read_table(pathPrefix + '/ifs/res/taylorlab/impact_sigs/mixedpact_data_mutations_unfiltered.sigs.tab.txt')

In [22]:
impactSigs = mutationSigUtils.merge_signature_columns(impactSigs, mode='Stratton', drop=True, smokingMerge=False, confidence=True, mean=True, prefix='mean_')
impactSigs['dominantSig'] = impactSigs.apply(lambda row: mutationSigUtils.get_dominant_signature(row.to_dict(), cols=None, prefix='mean', notEnoughMuts= True), axis=1)
impactSigs['dominantSig'] = impactSigs['dominantSig'].apply(lambda x: 'mean_MMR' if x == 'mean_1' else x) #fix MMR signatures


In [23]:
dominantSignatureDict = dict(zip(impactSigs['Tumor_Sample_Barcode'], impactSigs['dominantSig']))

<br/><br/><br/><br/><br/>run area

In [8]:
mutSimulationSummary = pd.read_table('/Users/friedman/Desktop/workOffline/mutSimulationInfoIncludingHotspots.tsv')
#only use impact 368 genes
impact368genes = set(['ABL1', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BAP1', 'BARD1', 'BBC3', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CARD11', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79B', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSF1R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EP300', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERG', 'ESR1', 'ETV1', 'ETV6', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXP1', 'FUBP1', 'GATA1', 'GATA2', 'GATA3', 'GNA11', 'GNAQ', 'GNAS', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3B', 'HNF1A', 'HRAS', 'ICOSLG', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INPP4A', 'INPP4B', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAPK1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MITF', 'MLH1', 'MLL', 'MLL2', 'MLL3', 'MPL', 'MRE11A', 'MSH2', 'MSH6', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOR1', 'NF1', 'NF2', 'NFE2L2', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTRK1', 'NTRK2', 'NTRK3', 'PAK1', 'PAK7', 'PALB2', 'PARK2', 'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLE', 'PPP2R1A', 'PRDM1', 'PRKAR1A', 'PTCH1', 'PTEN', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAC1', 'RAD50', 'RAD51', 'RAD51B', 'RAD51C', 'RAD51D', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RUNX1', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SETD2', 'SF3B1', 'SH2D1A', 'SHQ1', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SOCS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SRC', 'STAG2', 'STK11', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TBX3', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP63', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'VHL', 'VTCN1', 'WT1', 'XIAP', 'XPO1', 'YAP1', 'YES1'])
mutSimulationSummary368 = mutSimulationSummary[mutSimulationSummary['Hugo_Symbol'].isin(impact368genes)]


In [9]:
mode = 'all'
mutationSizeDict = rank_muts_by_size(mutSimulationSummary368)
endometrialRanking = mutationSizeDict
colorectalRanking = mutationSizeDict
gliomaRanking = mutationSizeDict


In [10]:
mode = 'oncogenic'
endometrialRanking = regularize_df_by_possible_muts(oncPossibleDict, endoNormalRanking)
colorectalRanking = regularize_df_by_possible_muts(oncPossibleDict, colorectalNormalRanking)
gliomaRanking = regularize_df_by_possible_muts(oncPossibleDict, gliomaNormalRanking)


NameError: name 'oncPossibleDict' is not defined

In [16]:
hotspotPossibleDict = dict(zip(mutSimulationSummary368['Hugo_Symbol'], mutSimulationSummary368['nHotspot']))
mode = 'hotspot'
endometrialRanking = regularize_df_by_possible_muts(hotspotPossibleDict, endoNormalRanking)
colorectalRanking = regularize_df_by_possible_muts(hotspotPossibleDict, colorectalNormalRanking)
gliomaRanking = regularize_df_by_possible_muts(hotspotPossibleDict, gliomaNormalRanking)


In [24]:
endoOccurenceDf = asses_per_case_mut_ranking(endoHyperDf, endometrialRanking, dominantSignatureDict, mode=mode)
endoOccurenceDf['cancer_type'] = 'Endometrial Cancer'

coloOccurenceDf = asses_per_case_mut_ranking(colorectalHyperDf, colorectalRanking, dominantSignatureDict, mode=mode)
coloOccurenceDf['cancer_type'] = 'Colorectal Cancer'

gliomaOccurenceDf = asses_per_case_mut_ranking(gliomaHyperDf, gliomaRanking, dominantSignatureDict, mode=mode)
gliomaOccurenceDf['cancer_type'] = 'Glioma'

calculating mut rankings from real data
on case number  0  out of  237


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


on case number  50  out of  237
on case number  100  out of  237
on case number  150  out of  237
on case number  200  out of  237
calculating mut rankings from real data
on case number  0  out of  261
on case number  50  out of  261
on case number  100  out of  261
on case number  150  out of  261
on case number  200  out of  261
on case number  250  out of  261
calculating mut rankings from real data
on case number  0  out of  53
on case number  50  out of  53


In [25]:
combinedDf = pd.concat([endoOccurenceDf, coloOccurenceDf, gliomaOccurenceDf])

In [277]:
#endoOccurenceDf.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/plotHotspotOccurrenceFreq.tsv', index=False, sep='\t')
combinedDf.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/plotOncogenicOccurrenceFreq.tsv', index=False, sep='\t')

In [266]:
combinedDf.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/plotVUSOccurrenceFreq.tsv', index=False, sep='\t')

In [26]:
combinedDf.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/plotHOTSPOTOccurrenceFreq.tsv', index=False, sep='\t')

In [29]:
endometrialRanking

{'ABL1': 0.9543668782109399,
 'AKT1': 0.29359323058325776,
 'AKT2': 0.9184043517679057,
 'AKT3': 0.8739800543970988,
 'ALK': 0.8980054397098821,
 'ALOX12B': 0.8813841039589,
 'APC': 0.319431852523421,
 'AR': 0.95164702326987,
 'ARAF': 0.7491689332124509,
 'ARID1A': 0.20761559383499548,
 'ARID1B': 0.4339679661529163,
 'ARID2': 0.7518887881535207,
 'ARID5B': 0.4876095497129042,
 'ASXL1': 0.528105167724388,
 'ASXL2': 0.9434874584466606,
 'ATM': 0.3046237533998187,
 'ATR': 0.5661831368993654,
 'ATRX': 0.5305228165608945,
 'AURKA': 0.9756724085826534,
 'AURKB': 0.9756724085826534,
 'AXIN1': 0.7704744635841644,
 'AXIN2': 0.9756724085826534,
 'AXL': 0.7762163795708673,
 'B2M': 0.7744031429434874,
 'BAP1': 0.7712299788455727,
 'BARD1': 0.7491689332124509,
 'BBC3': 0.9749168933212451,
 'BCL2': 0.9176488365064974,
 'BCL2L1': 0.9176488365064974,
 'BCL2L11': 0.7695678452704745,
 'BCL6': 0.7473556965850711,
 'BCOR': 0.2707766696887277,
 'BLM': 0.7307343608340888,
 'BMPR1A': 0.7518887881535207,
 'BR