In [1]:
import os
import numpy as np
import pandas as pd
import pickle

In [2]:
# Directory (relative path) that the cells below join file names onto
# when reading inputs and writing the final results.
data_dir = "../../data"

```cat stan-data-v8.output | grep -m 1 lp__ | tr ',' '\n' | cat -n | grep -P "d\." | cut -f 1 > stan-data-v8-columns```

```cat stan-data-v8.output | grep -v "#" | cut -d ',' -f <start>-<stop> > stan-data-v8.disease_effect```

In [3]:
# The complete stan output is too large to load, so read a
# pre-processed file that keeps only the columns starting with d\.
pth = os.path.join(
    data_dir,
    'stan-data-v8-2019-07-30.disease.hd5',
)
df = pd.read_hdf(pth, key='pdx')

In [4]:
# Load the pickled lookup that maps stan indices back to the
# original gene ids etc.
# NOTE: pickle.load can execute arbitrary code; only use it on files
# from a trusted source.
pth = os.path.join(
    data_dir,
    'stan-data-v8-map-2019-07-30.pkl',
)
with open(pth, mode='rb') as map_file:
    stan_map = pickle.load(map_file)

In [5]:
# Remove columns that are not disease specific.
# Disease-effect columns are named d.<disease index>.<gene index>.
# The pattern is a raw string so `\.` and `\d` reach the regex engine
# instead of being treated as (invalid) string escapes, and it is
# anchored with `+` quantifiers so only complete names match: grab_data
# later splits each name on '.' into exactly three parts and converts
# the last two to int.
deffects = df.filter(regex=r"^d\.\d+\.\d+$")

# Delete the original df to save memory
del df

In [6]:
# Summarise the posterior distribution of every effect column by its
# mean and standard deviation (axis=0: one value per column).
means, stds = deffects.mean(axis=0), deffects.std(axis=0)

In [7]:
# Map Ensembl IDs to Hugo IDs.
# Each non-empty line of the conversion file is "<hugo>\t<ensembl>".
# The path is built from data_dir so it stays consistent with the other
# cells if the data directory is moved.
pth = os.path.join(data_dir, 'EnsGeneID_Hugo_Observed_Conversions.txt')
ens_to_hugo = {}
with open(pth) as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. at the end of the file) has nothing to
            # unpack and would otherwise raise ValueError below.
            continue
        h, e = line.split('\t')
        ens_to_hugo[e] = h

In [8]:
# Display the loaded mapping to inspect its contents.
stan_map

{'tissue': {1: 'b-cell ALL',
  2: 't-cell ALL',
  3: 'soft sarcoma',
  4: 'brain',
  5: 'bone',
  6: 'neuroblastoma',
  7: 'renal',
  8: 'carcinoma',
  9: 'hepatoblastoma'},
 'disease': {1: 'BCP-ALL',
  2: 'MLL-ALL',
  3: 'Ph+-ALL',
  4: 'T-ALL',
  5: 'Ph-likeALL',
  6: 'ASPS',
  7: 'CNS embryonal NOS',
  8: 'ATRT',
  9: 'Ewing Sarcoma',
  10: 'Neuroblastoma',
  11: 'ETP-ALL',
  12: 'Extracranial Rhabdoid',
  13: 'ETMR',
  14: 'DIPG',
  15: 'Glioblastoma',
  16: 'Ependymoma',
  17: 'Ependymoblastoma',
  18: 'CNS EFT-CIC',
  19: 'Astrocytoma',
  20: 'CNS germinoma',
  21: 'Medulloblastoma',
  22: 'Fusion- RMS',
  23: 'Wilms',
  24: 'Fusion+ RMS',
  25: 'Small Cell Carcinoma',
  26: 'Colon Carcinoma',
  27: 'Clear Cell Sarcoma',
  28: 'Hepatoblastoma',
  29: 'Osteosarcoma'},
 'gene': {1: 'ENSG00000019582.14',
  2: 'ENSG00000107447.7',
  3: 'ENSG00000204287.13',
  4: 'ENSG00000281204.1',
  5: 'ENSG00000111348.8',
  6: 'ENSG00000167244.17',
  7: 'ENSG00000169442.8',
  8: 'ENSG00000164692.1

In [9]:
# Convenience function for organizing data
def grab_data(effect, stan_map, means, stds, hugo_map):
    """Collect the summary values for a single disease-effect column.

    Parameters
    ----------
    effect : str
        Column name made of three '.'-separated parts; the second and
        third parts are the integer disease and gene indices.
    stan_map : mapping
        Must contain 'disease' and 'gene' sub-mappings keyed by int index.
    means, stds : mapping
        Per-effect mean and standard deviation, indexed by ``effect``.
    hugo_map : mapping
        Maps the gene id found in ``stan_map['gene']`` to its Hugo symbol.

    Returns
    -------
    tuple
        ``(disease, gene, mean, std, hugo)``

    Raises
    ------
    ValueError
        If ``effect`` does not split into exactly three parts or an index
        part is not an integer.
    KeyError
        If an index, the effect name, or the gene id is missing from the
        corresponding mapping.
    """
    _, disease_idx, gene_idx = effect.split('.')
    disease = stan_map['disease'][int(disease_idx)]
    gene = stan_map['gene'][int(gene_idx)]
    # Use the mapping that was passed in rather than the notebook-level
    # ens_to_hugo global, so the function works with any caller-supplied map.
    hugo = hugo_map[gene]
    return disease, gene, means[effect], stds[effect], hugo

In [10]:
from multiprocessing import Pool

def helper(args):
    """Unpack one argument tuple and forward it to grab_data.

    The pool's map functions pass a single argument per task, so each
    task's arguments travel as one tuple.
    """
    return grab_data(*args)

# One task per effect column: (effect name, lookups and summaries).
_input = [(x, stan_map, means, stds, ens_to_hugo,) for x in means.index.values]

# Use the pool as a context manager so the worker processes are shut
# down when the work is done instead of lingering for the life of the
# kernel. Leaving the `with` block terminates the pool, so the results
# must be collected inside it.
with Pool(8) as pool:
    outs = pool.map_async(helper, _input)
    rows = outs.get()

res = pd.DataFrame(columns = ['disease', 'gene', 'effect', 'error', 'hugo'],
                   data=rows)

In [11]:
# Display the assembled results table.
res

Unnamed: 0,disease,gene,effect,error,hugo
0,BCP-ALL,ENSG00000019582.14,8.102033,0.089688,CD74
1,MLL-ALL,ENSG00000019582.14,7.628470,0.163798,CD74
2,Ph+-ALL,ENSG00000019582.14,8.201045,0.161292,CD74
3,T-ALL,ENSG00000019582.14,-0.993488,0.152176,CD74
4,Ph-likeALL,ENSG00000019582.14,7.920443,0.096277,CD74
5,ASPS,ENSG00000019582.14,-1.151003,0.217790,CD74
6,CNS embryonal NOS,ENSG00000019582.14,-0.966888,0.341261,CD74
7,ATRT,ENSG00000019582.14,-2.202800,0.170736,CD74
8,Ewing Sarcoma,ENSG00000019582.14,-3.049180,0.084718,CD74
9,Neuroblastoma,ENSG00000019582.14,-2.540554,0.073662,CD74


In [12]:
# Write the results table to an HDF5 file in data_dir under the
# 'results' key.
pth = os.path.join(
    data_dir,
    'post-stan-v8-results-2019-07-30.hd5',
)
res.to_hdf(pth, key='results')