In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import math
import re
import sys
from collections import Counter

sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import configuration_util
filePathDict = configuration_util.get_all_files_path_dict()
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import clonality_analysis_util
import get_gene_and_cohort_list_utils

In [121]:
def enumerate_related_genes(maf, thresh):
    nCases = len(set(maf['Tumor_Sample_Barcode']))
    oncMaf = maf[maf['oncogenic'].notnull()]
    oncMaf['caseGene'] = oncMaf['Tumor_Sample_Barcode'] + '_' + oncMaf['Hugo_Symbol']
    geneCounter = Counter(oncMaf.drop_duplicates(subset=['caseGene'])['Hugo_Symbol'])
    
    return [gene for gene, count in geneCounter.items() if (1.0*count)/nCases > thresh]

def get_related_genes_by_cancer_type(maf, thresh = 1.0/30.0):
    cancerTypes = set(maf['cancerType'])
    d = {}
    for ct in cancerTypes:
        ctMaf = maf[maf['cancerType'] == ct]
        normalMaf = ctMaf[ctMaf['hypermutationStatus'] == 'normal']
        relatedGenes = enumerate_related_genes(normalMaf, thresh)
        d[ct] = relatedGenes
    return d

def identify_parallel_evolution(maf, trunkId, branchId, pid):
    oncoMaf = maf[maf['oncogenic'].notnull()]
    otherBranchesMaf = oncoMaf[(~oncoMaf['adjLabel'].isin([trunkId, branchId])) &
                    (oncoMaf['pid'] == pid)]
    branchMaf = oncoMaf[oncoMaf['adjLabel'] == branchId]
    if len(set(otherBranchesMaf['Hugo_Symbol']) & set(branchMaf['Hugo_Symbol'])) > 0:
        return True
    else:
        return False
    
#todo this could return something non-boolean
def identify_tsg_second_hit(maf, trunkId, branchId):
    oncoMaf = maf[maf['oncogenic'].notnull()]
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    oncoTsgMaf = oncoMaf[oncoMaf['Hugo_Symbol'].isin(tsgs)]
    trunkOncoTsg = oncoTsgMaf[oncoTsgMaf['adjLabel'] == trunkId].drop_duplicates(subset=['varUuid']) #NOTE all mutations are at least duplicated in the truncal maf, fix it
    branchOncoTsg = oncoTsgMaf[oncoTsgMaf['adjLabel'] == branchId]
    
    oneHitTrunkTsgs = [gene for gene, count in dict(trunkOncoTsg['Hugo_Symbol'].value_counts()).items() if count == 1]
    secondHitTsgs = set(branchOncoTsg['Hugo_Symbol']) & set(oneHitTrunkTsgs)
    if len(secondHitTsgs) > 0:
        return True
    else:
        return False
    
def identify_denovo_biallelic_inactivation(maf, trunkId, branchId):
    oncoMaf = maf[maf['oncogenic'].notnull()]
    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    oncoTsgMaf = oncoMaf[oncoMaf['Hugo_Symbol'].isin(tsgs)]
    trunkOncoTsg = oncoTsgMaf[oncoTsgMaf['adjLabel'] == trunkId].drop_duplicates(subset=['varUuid']) #NOTE all mutations are at least duplicated in the truncal maf, fix it
    branchOncoTsg = oncoTsgMaf[oncoTsgMaf['adjLabel'] == branchId]
    
    doubleHitBranchTsgs = [gene for gene, count in dict(branchOncoTsg['Hugo_Symbol'].value_counts()).items() if count > 1]
    denovoBranchBiallelic = set(doubleHitBranchTsgs) - set(trunkOncoTsg['Hugo_Symbol'])
    if len(denovoBranchBiallelic) > 0:
        return True
    else:
        return False

def summarize_mut_branch_info(maf, relatedGenesDict):
    listOfDicts = []
    for pid in set(maf['pid']):
        patientMaf = maf[maf['pid'] == pid]
        trunkLabel = pid + '_trunk'
        branchNumber = 1
        for branch in set(patientMaf['adjLabel']) - set([trunkLabel]):
            trunkMaf = mutsLabeledByBranch[mutsLabeledByBranch['adjLabel'] == trunkLabel]
            branchMaf = mutsLabeledByBranch[mutsLabeledByBranch['Tumor_Sample_Barcode'] == branch]
            
            cancerType = branchMaf['cancerType'].iloc[0]
            relatedGenes = relatedGenesDict[cancerType]
            
            #Number of related and unrelated drivers
            nRelatedDriverTrunk = trunkMaf[(trunkMaf['oncogenic'].notnull()) & (trunkMaf['Hugo_Symbol'].isin(relatedGenes))].shape[0]
            nUnrelatedDriverTrunk = trunkMaf[(trunkMaf['oncogenic'].notnull()) & (~trunkMaf['Hugo_Symbol'].isin(relatedGenes))].shape[0]    
            nRelatedDriverBranch = branchMaf[(branchMaf['oncogenic'].notnull()) & (branchMaf['Hugo_Symbol'].isin(relatedGenes))].shape[0]
            nUnrelatedDriverBranch = branchMaf[(branchMaf['oncogenic'].notnull()) & (~branchMaf['Hugo_Symbol'].isin(relatedGenes))].shape[0]    
            
            convergentEvolution = identify_parallel_evolution(mutsLabeledByBranch, trunkLabel, branch, pid)
            tsgSecondHit = identify_tsg_second_hit(mutsLabeledByBranch, trunkLabel, branch)
            denovoBiallelicInactivation = identify_denovo_biallelic_inactivation(mutsLabeledByBranch, trunkLabel, branch)
            
            listOfDicts.append({'cancerType': cancerType, 'branchNumber': branchNumber,
                               'trunkId': trunkLabel, 'nRelatedTrunk': nRelatedDriverTrunk, 'nUnrelatedTrunk': nUnrelatedDriverTrunk,
                                'nRelatedBranch': nRelatedDriverBranch, 'nUnrelatedBranch': nUnrelatedDriverBranch,
                                'branchId': branch, 'convergentEvolution': convergentEvolution,
                                'secondHitTSG': tsgSecondHit, 'denovoBiallelicInactivation': denovoBiallelicInactivation
                               })
            
            branchNumber += 1
            
    df = pd.DataFrame(listOfDicts)
    
    cancerTypeOrdering = {'Bladder Cancer': '1', 'Colorectal Cancer': '2', 'Endometrial Cancer': '3',
                         'Esophagogastric Cancer': '4', 'Glioma': '5', 'Prostate Cancer': '6'}
    
    df['orderingVal'] = df.apply(lambda row: str(cancerTypeOrdering[row['cancerType']]) +
                re.sub("[^0-9]", '', str(row['trunkId'])) + str(row['branchNumber']), axis=1)
    return df
            
            
            

In [38]:
mutsLabeledByBranch = pd.read_table('/Users/friedman/Desktop/offlineFilesForVirus/hypermutantMultipleSamplesLabeledByBranch.tsv')
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
cancerTypeDict = dict(get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO']))
allImpactMutsMaf['cancerType'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

mutsLabeledByBranch['barcodeOld'] = mutsLabeledByBranch['adjLabel'].apply(lambda x: x.split('_')[0])
mutsLabeledByBranch['cancerType'] = mutsLabeledByBranch['barcodeOld'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
allImpactMutsMaf['hypermutationStatus'] = allImpactMutsMaf['Tumor_Sample_Barcode'].apply(lambda x:
    'hypermutated' if x in hypermutantIds else 'normal' if x in normalIds else 'Intermediate')


  """Entry point for launching an IPython kernel.
  


In [24]:
relatedGenesDict = get_related_genes_by_cancer_type(allImpactMutsMaf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [122]:
df = summarize_mut_branch_info(mutsLabeledByBranch, relatedGenesDict)

In [124]:
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/multipleSampleSummary.tsv', index=False, sep='\t')

In [125]:
df

Unnamed: 0,branchId,branchNumber,cancerType,convergentEvolution,denovoBiallelicInactivation,nRelatedBranch,nRelatedTrunk,nUnrelatedBranch,nUnrelatedTrunk,secondHitTSG,trunkId,orderingVal
0,P-0026297-T02-IM6_branch,1,Endometrial Cancer,False,False,0,24,7,70,True,P-0026297_trunk,300262971
1,P-0026297-T01-IM6_branch,2,Endometrial Cancer,False,False,1,24,1,70,False,P-0026297_trunk,300262972
2,P-0018106-T02-IM6_branch,1,Endometrial Cancer,False,False,0,8,8,20,False,P-0018106_trunk,300181061
3,P-0018106-T01-IM6_branch,2,Endometrial Cancer,False,False,0,8,2,20,True,P-0018106_trunk,300181062
4,P-0008682-T02-IM6_branch,1,Prostate Cancer,False,True,1,6,7,10,True,P-0008682_trunk,600086821
5,P-0008682-T01-IM5_branch,2,Prostate Cancer,False,False,0,6,2,10,False,P-0008682_trunk,600086822
6,P-0019199-T01-IM6_branch,1,Prostate Cancer,True,False,1,2,1,8,False,P-0019199_trunk,600191991
7,P-0019199-T02-IM6_branch,2,Prostate Cancer,True,False,4,2,2,8,False,P-0019199_trunk,600191992
8,P-0040250-T01-IM6_branch,1,Colorectal Cancer,True,False,0,8,3,24,True,P-0040250_trunk,200402501
9,P-0040250-T02-IM6_branch,2,Colorectal Cancer,True,False,0,8,5,24,False,P-0040250_trunk,200402502
