In [None]:
import os
import subprocess
import tempfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Directory that input/output paths are joined onto via os.path.join;
# the empty string makes those paths relative to the working directory.
data_dir = ''

In [None]:
# Load the posterior estimates of the disease-specific mean differences
# and standard deviations, stored under the 'results' key of the HDF file.
pth = os.path.join(data_dir, 'post-stan-v8-results-2019-01-20.hd5')
res = pd.read_hdf(pth, 'results')

In [None]:
# Load the tab-separated PDX clinical table and keep only the rows whose
# 'RNA Part of PPTC' column equals 'yes'.
# NOTE(review): this path is relative to the working directory and is not
# joined onto data_dir like the other inputs -- confirm that is intended.
# An additional filter on 'Histology' (isin(hists)) used to be applied
# here and is currently disabled.
pth = '2018-12-28-pdx-clinical-final-for-paper.txt'

pdx_clinical = (
    pd.read_csv(pth, sep='\t')
    .loc[lambda clinical: clinical['RNA Part of PPTC'] == 'yes']
)

In [None]:
# Flag every detailed histology for which at least one column of the
# clinical table has two or fewer non-null entries in that group.
_low_num = (pdx_clinical.groupby('Histology-Detailed').count() <= 2).any(axis=1)

# Names of the flagged histologies. Series.iteritems() was removed in
# pandas 2.0; Series.items() yields the same (label, value) pairs and is
# available in older releases as well.
hist_drops = [hist for hist, is_low in _low_num.items() if is_low]

In [None]:
# Run the fastgsea-no-min.R script once per disease on a ranked gene list
# built from the posterior effects, and collect the resulting tables in
# `fgseas`, keyed by disease.
fgseas = {}
for disease, _res in res.groupby('disease'):
    # Build the ranking without mutating the groupby slice: when a gene
    # symbol appears more than once keep only the row with the largest
    # absolute effect, then order genes by signed effect, largest first.
    ranked = (
        _res.assign(**{'abs-effect': _res['effect'].abs()})
        .sort_values('abs-effect', ascending=False)
        .drop_duplicates('hugo', keep='first')
        .sort_values('effect', ascending=False)
    )

    # Use a private scratch directory per disease instead of fixed names
    # in /tmp: concurrent runs cannot clobber each other's files, a stale
    # result from a previous disease can never be read back, and the
    # files are removed even when Rscript or the read below fails.
    with tempfile.TemporaryDirectory(prefix='fgsea-analysis-') as tmp_dir:
        rnk_pth = os.path.join(tmp_dir, 'fgsea-analysis-stan-res.rnk')
        fgsea_pth = os.path.join(tmp_dir, 'fgsea-analysis-stan-res.fgsea')

        # Two-column, tab-separated, header-less rank file: gene, effect.
        ranked[['hugo', 'effect']].to_csv(rnk_pth,
                                          header=False,
                                          sep='\t',
                                          index=False)

        # Argument order matches the original invocation: script, gene
        # set file, rank file, then the file read back as the result.
        # The script and .gmt paths are relative to the working directory.
        cmd = [
            'Rscript',
            'fastgsea-no-min.R',
            'filtered-tiger-tissgdb-gene-sets.gmt',
            rnk_pth,
            fgsea_pth,
        ]

        # check_call raises CalledProcessError on a non-zero exit status.
        subprocess.check_call(cmd)

        fgseas[disease] = pd.read_csv(fgsea_pth)

In [None]:
# Collect the union, over all diseases, of the significantly enriched
# tissue gene sets. A gene set counts as significant when its adjusted
# p-value is below `alpha` and its normalized enrichment score (NES) is
# greater than 2.

alpha = 0.01
sig_gss = set()
for disease, fgsea in fgseas.items():
    mask = (fgsea['padj'] < alpha) & (fgsea['NES'] > 2.0)
    sig_gss |= set(fgsea.loc[mask, 'pathway'].values)

sig_gss

In [None]:
# Build a disease x gene-set matrix of normalized enrichment scores for
# every gene set in `sig_gss`. A cell holds the NES when that disease's
# adjusted p-value for the gene set is below `alpha` (regardless of the
# sign or size of the NES) and 0.0 otherwise.

# Sort the gene sets so the column order is reproducible across kernels;
# iteration order of a set of strings is not.
gene_sets = sorted(sig_gss)

# Start from an all-zero float frame so no object-to-numeric conversion
# is needed before plotting.
gs_df = pd.DataFrame(0.0,
                     index=list(fgseas.keys()),
                     columns=gene_sets)

for disease, fgsea in fgseas.items():
    for gs in gene_sets:
        # Look the gene set up once and validate the match.
        hits = fgsea.loc[fgsea['pathway'] == gs, ['NES', 'padj']]

        # Raise an error if a disease is missing a gene set enrichment score
        if hits.shape[0] == 0:
            raise ValueError(
                'No fgsea result for gene set {!r} in disease {!r}'
                .format(gs, disease))

        # Raise an error if a disease has more than one enrichment score
        if hits.shape[0] > 1:
            raise ValueError(
                'Found {} fgsea results for gene set {!r} in disease {!r}; '
                'expected exactly one'.format(hits.shape[0], gs, disease))

        # Raise an error if a disease has a NaN gene set enrichment score
        if hits['NES'].isnull().any():
            raise ValueError(
                'NES is NaN for gene set {!r} in disease {!r}'
                .format(gs, disease))

        nes = hits['NES'].item()
        pvalue = hits['padj'].item()

        # If an enrichment is statistically significant report it,
        # otherwise set the enrichment to zero.
        gs_df.loc[disease, gs] = nes if pvalue < alpha else 0.0

In [None]:
# Draw a Ward-clustered heatmap of the enrichment matrix (gene sets as
# rows, diseases as columns) and save the current figure as an SVG.
pth = os.path.join(data_dir, 'fgsea-analysis-stan-v8-tiger-2019-01-20.svg')

# Nine-colour diverging palette between hues 240 and 10.
cmap = sns.diverging_palette(h_neg=240, h_pos=10, sep=100, n=9)

# Colour scale is fixed to [-4, 4].
sns.clustermap(
    gs_df.transpose(),
    method='ward',
    cmap=cmap,
    vmin=-4.0,
    vmax=4.0,
    linewidths=1.0,
    figsize=(10, 10),
)

plt.savefig(pth, format='svg', bbox_inches='tight')