In [None]:
import os
import numpy as np
import pandas as pd
import pickle

In [None]:
# Directory holding the input and output files used below. With the empty
# string, os.path.join returns the bare file name, so every path resolves
# relative to the current working directory.
data_dir = ""

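List the column numbers of the Stan output header fields that match `d\.` (the disease-effect parameters):

```cat stan-data-v8.output | grep -m 1 lp__ | tr ',' '\n' | cat -n | grep -P "d\." | cut -f 1 > stan-data-v8-columns```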

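Drop the lines containing `#` and keep only those columns, replacing `<start>-<stop>` with the range of column numbers listed in `stan-data-v8-columns`:

```cat stan-data-v8.output | grep -v "#" | cut -d ',' -f <start>-<stop> > stan-data-v8.disease_effect```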

In [None]:
# Load in a processed stan output file where only columns that
# start with d\. are included. The data file is too large otherwise.
# The table is stored in the HDF5 file under the key 'data'.
pth = os.path.join(data_dir, 'stan-data-v8-2019-01-20.disease_effect.hd5')
df = pd.read_hdf(pth, key='data')

In [None]:
# Read in the mapping to the original gene ids etc.
# Later cells index it as stan_map['disease'][i] and stan_map['gene'][i].
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a file from a trusted source.
pth = os.path.join(data_dir, 'stan-data-v8-map-2019-01-20.pkl')
with open(pth, 'rb') as f:
    stan_map = pickle.load(f)

In [None]:
# Remove columns that are not disease specific: keep only the columns whose
# name matches the pattern below. The pattern is a raw string so that "\."
# and "\d" reach the regex engine verbatim instead of being parsed as
# (invalid) string escape sequences.
deffects = df.filter(regex=r"d\.\d*\.\d*")

# Delete the original df to save memory
del df

In [None]:
# Summarise the posterior distribution held in each column of `deffects`:
# reduce over the rows to get one mean and one standard deviation per column.
means = deffects.mean(axis="index")
stds = deffects.std(axis="index")

In [None]:
# Map Ensembl IDs to Hugo IDs
# Each non-empty line of the conversion file is split into two tab-separated
# fields: the Hugo ID first, then the Ensembl ID used as the dictionary key.
pth = os.path.join(data_dir, 'EnsGeneID_Hugo_Observed_Conversions.txt')
ens_to_hugo = {}
with open(pth) as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. an empty last line) carry no mapping; skip
            # them instead of failing on the two-value unpacking below.
            continue
        h, e = line.split('\t')
        ens_to_hugo[e] = h

In [None]:
# Display the loaded mapping to inspect its contents.
stan_map

In [None]:
# Convenience function for organizing data
def grab_data(effect, stan_map, means, stds, hugo_map):
    """Turn one effect column name into a labelled result row.

    Parameters
    ----------
    effect : str
        Column name made of three dot-separated fields. The second and
        third fields are integer indices into ``stan_map['disease']`` and
        ``stan_map['gene']`` respectively; the first field is ignored.
    stan_map : mapping
        Must provide ``'disease'`` and ``'gene'`` entries that can be
        indexed by integer.
    means, stds : mapping or pandas.Series
        Per-effect summary values, looked up with ``effect`` as the key.
    hugo_map : mapping or None
        Maps a gene ID (as stored in ``stan_map['gene']``) to its Hugo ID.
        May be ``None`` to skip the translation.

    Returns
    -------
    tuple
        ``(disease, gene, mean, std, hugo)``. ``hugo`` is the mapped Hugo
        ID, or the gene ID itself when ``hugo_map`` is ``None`` or has no
        entry for the gene.

    Raises
    ------
    ValueError
        If ``effect`` does not split into exactly three fields, or an
        index field is not an integer.
    """
    _, disease_idx, gene_idx = effect.split('.')
    disease = stan_map['disease'][int(disease_idx)]
    gene = stan_map['gene'][int(gene_idx)]
    # Previously the mapping argument was ignored and the gene ID was
    # returned twice. Fall back to the gene ID so unmapped genes still
    # produce a row instead of raising KeyError.
    if hugo_map is None:
        hugo = gene
    else:
        hugo = hugo_map.get(gene, gene)
    return disease, gene, means[effect], stds[effect], hugo

In [None]:
from multiprocessing import Pool

def helper(args):
    """Unpack one argument tuple and forward it to ``grab_data``.

    The pool's map functions hand each worker call a single object, so the
    five ``grab_data`` arguments travel together as one tuple.
    """
    return grab_data(*args)

# One argument tuple per effect column name.
_input = [(x, stan_map, means, stds, ens_to_hugo,) for x in means.index.values]

# Use the pool as a context manager so the 8 worker processes are shut down
# when the block exits; the pool was previously never closed. The results
# are collected inside the block, before the workers are terminated.
with Pool(8) as pool:
    outs = pool.map_async(helper, _input)
    rows = outs.get()

res = pd.DataFrame(columns=['disease', 'gene', 'effect', 'error', 'hugo'],
                   data=rows)

In [None]:
# Display the assembled results table
# (columns: disease, gene, effect, error, hugo).
res

In [None]:
# Write the results table to an HDF5 file under the key 'results'.
pth = os.path.join(data_dir, 'post-stan-v8-results-2019-01-20.hd5')
res.to_hdf(pth, key='results')