In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re

from collections import Counter

pathPrefix = '/Users/friedman/Desktop/mnt'
sys.path.append(pathPrefix + '/ifs/work/taylorlab/friedman/myUtils')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import mutation_modeling_util  
import signature_attribution_util
import clonality_analysis_util

In [131]:
def assign_hotspot_freq_dict(df, cohortSize=47000):
    """Map every hotspot allele in ``df`` to its count divided by ``cohortSize``.

    Each row of ``df`` must provide the columns 'ref', 'Hugo_Symbol',
    'Amino_Acid_Position' and 'Var_AA'. 'Var_AA' is a '|'-separated list of
    'altAminoAcid:count' entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Hotspot table, one row per (gene, position).
    cohortSize : number, optional
        Denominator applied to every count. Defaults to 47000, the value that
        was previously hard-coded (presumably the number of cases in the
        pancan cohort -- TODO confirm).

    Returns
    -------
    dict
        Keys look like 'GENE:<ref><position><alt>' (e.g. 'BRAF:V600E');
        values are ``float(count) / cohortSize``.
    """
    d = {}
    for index, row in df.iterrows():
        refAminoAcid = row['ref']
        gene = row['Hugo_Symbol']
        position = row['Amino_Acid_Position']
        # pandas may parse the position column as numeric; a float such as
        # 600.0 must not leak a trailing '.0' into the allele name.
        if isinstance(position, float) and position.is_integer():
            position = int(position)
        # str() so that integer positions can be concatenated with the
        # amino-acid letters instead of raising TypeError.
        position = str(position)
        for entry in row['Var_AA'].split('|'):
            altAminoAcid, count = entry.split(':')
            fullAltName = gene + ':' + refAminoAcid + position + altAminoAcid
            d[fullAltName] = float(count)/cohortSize
    return d

def create_hotspot_ranking_df(hotspotD):
    """Turn a {'GENE:allele': freq} dictionary into a one-row-per-hotspot DataFrame.

    This definition is live code (not a comment) because a later cell calls
    ``create_hotspot_ranking_df(hotspotIncidenceD)``; without it the notebook
    raises NameError on a fresh kernel.

    Parameters
    ----------
    hotspotD : dict
        Keys of the form 'GENE:allele', values are frequencies.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene', 'allele', 'hotspot' (the original key) and 'freq'.
        An empty input yields an empty frame with those four columns.
    """
    listOfDicts = []
    for key, value in hotspotD.items():
        # split on the first ':' only, so the allele part is kept whole
        gene, allele = key.split(':', 1)
        listOfDicts.append({
                'gene': gene,
                'allele': allele,
                'hotspot': key,
                'freq': value
        })
    # explicit column list: stable order across pandas versions and a
    # well-formed (empty) frame when hotspotD has no entries
    return pd.DataFrame(listOfDicts, columns=['gene', 'allele', 'hotspot', 'freq'])

#returns a dictionary mapping a hotspot to its pancan rank order 
def rank_hotspots(hotspotIncidenceD):
    """Return {hotspot: rank}, where rank 0 is the hotspot with the highest value.

    Hotspots are ordered by their value in ``hotspotIncidenceD`` from largest
    to smallest and numbered 0, 1, 2, ... in that order.
    """
    byDescendingFreq = sorted(hotspotIncidenceD.items(), key=lambda pair: pair[1], reverse=True)
    return dict((hotspot, rank) for rank, (hotspot, _freq) in enumerate(byDescendingFreq))

#given a df and hotspots, for the 3 hypermutator cancer types assigns hotspots percentiles
def assign_percentiles_to_df(df):
    """Add per-cancer-type frequency percentile columns to ``df`` and return it.

    For each of 'Endometrial_Cancer', 'Colorectal_Cancer' and 'Glioma', the
    rows are split on the boolean column '<ct>_RelatedHotspot'. Within each
    half, 'freq' is converted to a percentile rank (``rank(pct=True)``) keyed
    by the 'hotspot' column. Two columns are then written onto ``df``:

    * '<ct>_related_percentile'     -- percentile among related rows, None
      for rows flagged False;
    * '<ct>_not_related_percentile' -- percentile among not-related rows,
      None for rows flagged True.

    ``df`` is modified in place; the same object is returned.
    """
    for cancerType in ('Endometrial_Cancer', 'Colorectal_Cancer', 'Glioma'):
        flagColumn = cancerType + '_RelatedHotspot'

        def percentile_lookup(flagValue):
            # hotspot -> percentile of 'freq' among rows whose flag equals flagValue
            subset = df[df[flagColumn] == flagValue]
            return dict(zip(subset['hotspot'], subset['freq'].rank(pct=True)))

        relatedLookup = percentile_lookup(True)
        notRelatedLookup = percentile_lookup(False)

        def related_percentile(row):
            if row[flagColumn] == False:
                return None
            return relatedLookup[row['hotspot']]

        def not_related_percentile(row):
            if row[flagColumn] == True:
                return None
            return notRelatedLookup[row['hotspot']]

        df[cancerType + '_related_percentile'] = df.apply(related_percentile, axis=1)
        df[cancerType + '_not_related_percentile'] = df.apply(not_related_percentile, axis=1)
    return df


In [3]:
# Read the pancan hotspot table that feeds assign_hotspot_freq_dict.
hotspotsPath = pathPrefix + '/home/gavrilae/snp_output_final_pancan.txt'
hotspotsDf = pd.read_table(hotspotsPath)


In [4]:
# Map each 'GENE:<ref><position><alt>' hotspot name to its count scaled by the cohort constant.
hotspotIncidenceD = assign_hotspot_freq_dict(hotspotsDf)

In [132]:
# Map each hotspot name to its 0-based rank, most frequent hotspot first.
hotspotRankingD = rank_hotspots(hotspotIncidenceD)

In [8]:
# NOTE(review): this cell needs create_hotspot_ranking_df to exist as live code
# (not as a commented-out block) in the definitions cell; otherwise it raises
# NameError on a fresh kernel.
df = create_hotspot_ranking_df(hotspotIncidenceD)

In [None]:
# Load the annotated cohort MAF (the second argument is presumably the expected
# line count used for progress reporting -- TODO confirm against analysis_utils).
filteredMafDf = analysis_utils.load_in_df_with_progress(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/all_impact_mutations_annotated_cohort.maf', 275171)
# cancer type -> genes implicated in that cancer type (t is presumably an
# implication threshold -- TODO confirm against maf_analysis_utils).
genesImplicatedInCancerTypes= maf_analysis_utils.create_dictionary_mapping_genes_to_cancer_types_with_implication(filteredMafDf, pathPrefix=pathPrefix, cancerTypes=['Endometrial Cancer', 'Colorectal Cancer', 'Glioma'], t=0.04)
hypermutationInitiatingGenes = set(['MSH6', 'MLH1', 'MSH2', 'PMS2', 'POLE'])
# Only these cancer types have the hypermutation-initiating genes added to their related genes.
cancerTypesWithInitiatingGenes = ('Colorectal Cancer', 'Endometrial Cancer')
relatedGenesDict = {}
for cancerType, genes in genesImplicatedInCancerTypes.items():
    # Membership test. The previous form
    # `cancerType == 'Colorectal Cancer' or 'Endometrial Cancer'` was always
    # truthy (a non-empty string literal), so every cancer type, Glioma
    # included, received the initiating genes and the else branch never ran.
    if cancerType in cancerTypesWithInitiatingGenes:
        relatedGenesDict[cancerType] = genes | hypermutationInitiatingGenes
    else:
        relatedGenesDict[cancerType] = genes

In [232]:
#manual adjustments
# Parenthesised on purpose: '-' binds tighter than '|', so the unparenthesised
# form only subtracted POLE from the three added genes and left POLE in the
# Glioma set. Here the union is taken first and POLE is then removed from it.
relatedGenesDict['Glioma'] = (relatedGenesDict['Glioma'] | set(['FUBP1', 'BRAF', 'CDKN2A'])) - set(['POLE'])
relatedGenesDict['Colorectal Cancer'] = relatedGenesDict['Colorectal Cancer'] | set(['CTNNB1', 'AKT1']) 
relatedGenesDict['Endometrial Cancer'] = relatedGenesDict['Endometrial Cancer'] | set(['ERBB2', 'ERBB3']) 

In [None]:
#Annotate the filtered maf df with a patient id and a cancer type per mutation
clinicalSamplePath = pathPrefix + '/ifs/work/taylorlab/friedman/mskImpactAsOfMarch2019/dmp/mskimpact/data_clinical_sample.txt'
# pid = first nine characters of the tumor sample barcode
filteredMafDf['pid'] = filteredMafDf['Tumor_Sample_Barcode'].apply(lambda barcode: barcode[:9])
ctDict = analysis_utils.get_cancer_type_information(cancerTypeDfPath = clinicalSamplePath, mode='pid')
# pids missing from ctDict get None
filteredMafDf['cancer_type'] = filteredMafDf['pid'].apply(lambda pid: None if pid not in ctDict else ctDict[pid])

In [272]:
def assign_hotspot_mutation_utilization_curves(hotspotMafDf, hotspotRankingD, cancerTypeName):
    """Build cumulative hotspot-usage curves for related and unrelated genes.

    The mutations in ``hotspotMafDf`` are split on 'isCtypeCancerGene'
    (True -> 'related', False -> 'unrelated'). Mutations whose 'hotspotName'
    has no entry in ``hotspotRankingD`` are dropped. Within each class the
    distinct hotspots are ordered by rank (lowest rank first) and, for each
    one, the running fraction of that class's mutations is recorded.

    Parameters
    ----------
    hotspotMafDf : pandas.DataFrame
        Needs the columns 'hotspotName' and 'isCtypeCancerGene'. It is not
        modified.
    hotspotRankingD : dict
        hotspot name -> rank.
    cancerTypeName : str
        Accepted for interface compatibility; not used in the body.

    Returns
    -------
    pandas.DataFrame
        Columns 'hotspot', 'class', 'val' and 'percentile'. The first two rows
        are the (0, 0) origin points for the related and unrelated curves,
        followed by the related rows and then the unrelated rows.
        'percentile' is i/n for the i-th (0-based) of n distinct ranked
        hotspots in the class; 'val' is the cumulative fraction of mutations.
    """

    def keep_ranked_hotspots(maf):
        # Work on a copy so that adding 'hotspotRank' never writes into a
        # slice of the caller's frame (this was the source of the
        # SettingWithCopyWarning).
        maf = maf.copy()
        maf['hotspotRank'] = maf['hotspotName'].apply(lambda x: hotspotRankingD[x] if x in hotspotRankingD else None)
        #FILTER out hotspots that we do not have a ranking for
        return maf[maf['hotspotRank'].notnull()]

    def normalize_hotspot_rankings(df):
        # list of (hotspot, i/n) tuples over the distinct hotspots, ordered by rank
        df = df.drop_duplicates(subset=['hotspotName'])
        sortedHotspotRanks = sorted(zip(df['hotspotName'], df['hotspotRank']), key= lambda x: x[1])
        nHotspotsUnderConsideration = len(sortedHotspotRanks)
        return [(name, 1.0*cntr/nHotspotsUnderConsideration)
                for cntr, (name, rank) in enumerate(sortedHotspotRanks)]

    def cumulative_rows(maf, className):
        # One output row per distinct hotspot with the running fraction of
        # this class's mutations. Occurrences are counted once up front
        # rather than by re-filtering the frame for every hotspot.
        nMutations = maf.shape[0]
        occurrences = Counter(maf['hotspotName'])
        rows = []
        runningSum = 0
        for hotspot, percentile in normalize_hotspot_rankings(maf):
            runningSum += 1.0*occurrences[hotspot]/nMutations
            rows.append({
             'hotspot': hotspot,
             'class': className,
             'val': runningSum,
             'percentile': percentile
            })
        return rows

    relatedHotspotMaf = keep_ranked_hotspots(hotspotMafDf[hotspotMafDf['isCtypeCancerGene'] == True])
    unrelatedHotspotMaf = keep_ranked_hotspots(hotspotMafDf[hotspotMafDf['isCtypeCancerGene'] == False])

    # both curves start at the origin
    listOfDicts = [{'hotspot':'NA_origin_for_path_Related', 'class': 'related', 'val':0, 'percentile': 0},
                  {'hotspot':'NA_origin_for_path_Unrelated', 'class': 'unrelated', 'val':0, 'percentile': 0}] #what we return
    listOfDicts.extend(cumulative_rows(relatedHotspotMaf, 'related'))
    listOfDicts.extend(cumulative_rows(unrelatedHotspotMaf, 'unrelated'))

    return pd.DataFrame(listOfDicts)
    

In [234]:
def prepare_cancer_type_maf_for_analysis(maf, cType):
    """Return the hotspot mutations of ``cType`` hypermutated samples, annotated.

    Rows of ``maf`` are kept when their 'Tumor_Sample_Barcode' is among the
    ids returned by ``analysis_utils.get_ids_by_hypermutant_status`` for
    ``cType`` with status 'Hypermutated' and their 'is-a-hotspot' value is 'Y'.
    Two columns are added to the returned copy:

    * 'hotspotName'       -- 'Hugo_Symbol' + ':' + 'HGVSp_Short' without its
      leading 'p.';
    * 'isCtypeCancerGene' -- whether 'Hugo_Symbol' is in
      ``relatedGenesDict[cType]``.

    Parameters
    ----------
    maf : pandas.DataFrame
        Mutation table to filter. It is not modified.
    cType : str
        Cancer type name; must be a key of the module-level ``relatedGenesDict``.

    Reads the module-level names ``pathPrefix`` and ``relatedGenesDict``.
    """
    cancerTypeHypermutatorIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir= pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType=cType, hypermutantStatus = 'Hypermutated')
    # Filter the frame that was passed in. Previously the `maf` argument was
    # ignored and the global filteredMafDf was read instead.
    cancerTypeHypermutatorDf = maf[maf['Tumor_Sample_Barcode'].isin(cancerTypeHypermutatorIds)]
    # .copy() so the column assignments below do not target a slice of `maf`
    cancerTypeHypermutatorHotspots = cancerTypeHypermutatorDf[cancerTypeHypermutatorDf['is-a-hotspot'] == 'Y'].copy()
    # Drop only a leading 'p.'. str.strip('p.') removes any run of 'p'/'.'
    # characters from BOTH ends, which is not the same thing.
    proteinChange = cancerTypeHypermutatorHotspots['HGVSp_Short'].apply(
        lambda hgvsp: hgvsp[2:] if hgvsp.startswith('p.') else hgvsp)
    # Column-wise concatenation also works when no rows survive the filters.
    cancerTypeHypermutatorHotspots['hotspotName'] = cancerTypeHypermutatorHotspots['Hugo_Symbol'] + ':' + proteinChange
    cancerTypeHypermutatorHotspots['isCtypeCancerGene'] = cancerTypeHypermutatorHotspots['Hugo_Symbol'].isin(relatedGenesDict[cType])
    return cancerTypeHypermutatorHotspots

In [273]:
# One annotated hotspot frame per hypermutator cancer type, in the order
# Endometrial, Colorectal, Glioma.
endometrialHypermutatorHotspots, colorectalHypermutatorHotspots, gliomaHypermutatorHotspots = (
    prepare_cancer_type_maf_for_analysis(filteredMafDf, cancerTypeLabel)
    for cancerTypeLabel in ('Endometrial Cancer', 'Colorectal Cancer', 'Glioma')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [274]:
# Utilization curves for each cancer type, computed in the order
# Colorectal, Endometrial, Glioma.
dfColo, dfEndo, dfGlio = (
    assign_hotspot_mutation_utilization_curves(hotspotMaf, hotspotRankingD, cancerTypeLabel)
    for hotspotMaf, cancerTypeLabel in (
        (colorectalHypermutatorHotspots, 'Colorectal_Cancer'),
        (endometrialHypermutatorHotspots, 'Endometrial_Cancer'),
        (gliomaHypermutatorHotspots, 'Glioma'),
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [278]:
# Write each utilization curve as a tab-separated file for local plotting.
for curveDf, outPath in (
    (dfEndo, '/Users/friedman/Desktop/WORK/dataForLocalPlotting/EndoHotspotUtilizationCurve.tsv'),
    (dfColo, '/Users/friedman/Desktop/WORK/dataForLocalPlotting/ColoHotspotUtilizationCurve.tsv'),
    (dfGlio, '/Users/friedman/Desktop/WORK/dataForLocalPlotting/GlioHotspotUtilizationCurve.tsv'),
):
    curveDf.to_csv(outPath, sep='\t', index=False)

**WORKSPACE**


In [268]:
#a util function to sanity check this data
def check_impactful_muts(df, threshold=.02):
    """Print every hotspot whose step in the cumulative curve exceeds ``threshold``.

    Rows are visited in order of increasing 'percentile'. For each row the
    difference between its 'val' and the previous row's 'val' (starting from
    0) is computed; when that difference is greater than ``threshold`` the
    hotspot name and the difference are printed on one line, separated by a
    space.

    Parameters
    ----------
    df : mapping of column name -> sequence (e.g. a pandas.DataFrame)
        Must provide 'val', 'percentile' and 'hotspot'.
    threshold : float, optional
        Minimum step size to report. Defaults to .02.

    Returns None; the result is the printed output.
    """
    prev = 0
    sortedPercentiles = sorted(zip(df['val'], df['percentile'], df['hotspot']), key = lambda x: x[1])
    for val, percentile, allele in sortedPercentiles:
        dif = val - prev
        if dif > threshold:
            # Single pre-formatted argument: prints "allele dif" under both the
            # Python 2 print statement and the Python 3 print function.
            print('%s %s' % (allele, dif))
        prev = val

In [269]:
# Sanity check: list the hotspots in the endometrial 'unrelated' curve that
# each account for a step larger than 0.02 of the cumulative fraction.
check_impactful_muts(dfEndo[dfEndo['class'] == 'unrelated'])


XPO1:E571K 0.0202702702703
ATM:R337C 0.0202702702703
MAX:R60Q 0.0236486486486
NFE2L2:E82D 0.027027027027
PIK3R2:G373R 0.0236486486486
CASP8:R127Q 0.0371621621622
FUBP1:R430C 0.0202702702703
ACVR1:R206H 0.027027027027
XPO1:R749Q 0.0236486486486
CCND1:P287S 0.027027027027
