In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from scipy import stats

import settings as conf
from utils import is_number, chunker

# Load S-PrediXcan results

## From Rapid GWAS project

In [3]:
from results.spredixcan import PhenoResults

In [4]:
# List one results directory per Rapid GWAS phenotype.
# The base directory and the wildcard are passed to os.path.join as separate
# components, so the path separator is not hard-coded by string concatenation.
_path = os.path.join(conf.SPREDIXCAN_RESULTS_DIR['RapidGWASProject'], '*')
display(_path)
# keep directories only; plain files matching the pattern are ignored
all_spredixcan_results_dirs = [f for f in glob(_path) if os.path.isdir(f)]
display(len(all_spredixcan_results_dirs))
# sanity check against the expected phenotype count from the settings module
assert len(all_spredixcan_results_dirs) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject'], len(all_spredixcan_results_dirs)

'/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/results/spredixcan/rapid_gwas_project/*'

4049

In [5]:
# Wrap every Rapid GWAS results directory in a PhenoResults object
all_spredixcan_phenotypes = list(map(PhenoResults, all_spredixcan_results_dirs))

display(len(all_spredixcan_phenotypes))
# one PhenoResults per expected phenotype
assert len(all_spredixcan_phenotypes) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

4049

## From GTEx GWAS manuscript

In [6]:
# List one results directory per GTEx GWAS phenotype.
# The base directory and the wildcard are passed to os.path.join as separate
# components, so the path separator is not hard-coded by string concatenation.
_path = os.path.join(conf.SPREDIXCAN_RESULTS_DIR['GTEX_GWAS'], '*')
display(_path)
# keep directories only; plain files matching the pattern are ignored
all_extra_results_dirs = [f for f in glob(_path) if os.path.isdir(f)]
display(len(all_extra_results_dirs))
# sanity check against the expected phenotype count from the settings module
assert len(all_extra_results_dirs) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS'], len(all_extra_results_dirs)

'/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/results/spredixcan/gtex_gwas/*'

42

In [7]:
# Peek at the first five GTEx GWAS result directories found above
all_extra_results_dirs[:5]

['/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/results/spredixcan/gtex_gwas/GIANT_HEIGHT',
 '/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/results/spredixcan/gtex_gwas/CNCR_Insomnia_all',
 '/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/results/spredixcan/gtex_gwas/SSGAC_Education_Years_Pooled',
 '/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/results/spredixcan/gtex_gwas/Astle_et_al_2016_White_blood_cell_count',
 '/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/results/spredixcan/gtex_gwas/IGAP_Alzheimer']

In [8]:
# File name pattern of the GTEx GWAS result files; the named groups capture the
# phenotype code and the tissue. A raw string is used so that `\.` reaches the
# regex engine as an escaped dot instead of being an invalid string escape.
_file_pattern = r'spredixcan_igwas_gtexmashrv8_(?P<code>[^/]+)__PM__(?P<tissue>.+)\.csv$'
all_extra_phenotypes = [PhenoResults(p, _file_pattern) for p in all_extra_results_dirs]
all_extra_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_extra_phenotypes])

display(len(all_extra_phenotypes))
# These are S-PrediXcan results, so they are validated against the S-PrediXcan
# expected count (same key as the directory check for this source).
assert len(all_extra_phenotypes) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS'], len(all_extra_phenotypes)

42

# S-PrediXcan: direction of effect

## Effect direction: consensus

In [9]:
from results.spredixcan import PhenoResults

### Compute results

In [10]:
from tqdm import tqdm

In [11]:
def _get_combined_results(phenos):
    return {
        pheno.pheno_info.get_plain_name():
            pheno.get_consensus_effect_direction()
        for pheno in phenos
    }

In [12]:
def _run_all(phenotype_chunks, n_jobs=conf.N_JOBS_HIGH):
    """
    Compute the combined results of all phenotype chunks in parallel.

    Each chunk is submitted to a process pool as one `_get_combined_results`
    task; the per-chunk dicts are merged as the tasks finish, with a tqdm
    progress bar counting completed tasks.

    Args:
        phenotype_chunks: iterable of chunks (each chunk is an iterable of
            phenotype objects accepted by `_get_combined_results`).
        n_jobs: maximum number of worker processes.

    Returns:
        A single dict with the entries of every chunk's result. Any exception
        raised inside a worker is re-raised here by `Future.result()`.
    """
    merged = {}

    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        pending = []
        for chunk in phenotype_chunks:
            pending.append(pool.submit(_get_combined_results, chunk))

        # iterate in completion order so the progress bar advances as soon as
        # any task is done, regardless of submission order
        for finished in tqdm(as_completed(pending), total=len(pending)):
            merged.update(finished.result())

    return merged

In [13]:
# Split all phenotypes (Rapid GWAS + GTEx GWAS) into chunks; each chunk becomes
# one task for the process pool. `chunker` is project-local: the recorded run
# shows 410 tasks for 4091 phenotypes, consistent with chunks of up to 10 items.
# Smaller variant for quick debugging runs (disabled):
# phenotype_chunks = chunker(all_spredixcan_phenotypes[:10] + all_extra_phenotypes[:10], 4)
phenotype_chunks = chunker(all_spredixcan_phenotypes + all_extra_phenotypes, 10)

In [14]:
# Long-running step: the recorded run below needed about 8h45m for 410 chunks.
all_results = _run_all(phenotype_chunks, n_jobs=conf.N_JOBS_HIGH)

  0%|          | 0/410 [00:00<?, ?it/s]

  0%|          | 1/410 [04:55<33:37:10, 295.92s/it]

  0%|          | 2/410 [05:12<24:02:24, 212.12s/it]

  1%|          | 3/410 [05:14<16:52:15, 149.23s/it]

  1%|          | 4/410 [05:19<11:55:58, 105.81s/it]

  1%|          | 5/410 [10:05<17:59:55, 159.99s/it]

  1%|▏         | 6/410 [10:24<13:11:57, 117.62s/it]

  2%|▏         | 7/410 [10:27<9:18:21, 83.13s/it]  

  2%|▏         | 8/410 [10:28<6:32:43, 58.62s/it]

  2%|▏         | 9/410 [15:20<14:19:20, 128.58s/it]

  2%|▏         | 10/410 [15:32<10:24:34, 93.69s/it]

  3%|▎         | 11/410 [15:41<7:33:54, 68.26s/it] 

  3%|▎         | 12/410 [15:43<5:20:30, 48.32s/it]

  3%|▎         | 13/410 [20:26<13:06:04, 118.80s/it]

  3%|▎         | 14/410 [20:40<9:35:12, 87.15s/it]  

  4%|▎         | 15/410 [20:49<7:00:58, 63.95s/it]

  4%|▍         | 16/410 [20:54<5:03:57, 46.29s/it]

  4%|▍         | 17/410 [25:39<12:51:00, 117.71s/it]

  4%|▍         | 18/410 [25:50<9:20:46, 85.83s/it]  

  5%|▍         | 19/410 [25:58<6:45:45, 62.26s/it]

  5%|▍         | 20/410 [26:04<4:55:16, 45.43s/it]

  5%|▌         | 21/410 [30:50<12:42:54, 117.67s/it]

  5%|▌         | 22/410 [31:06<9:23:52, 87.20s/it]  

  6%|▌         | 23/410 [31:07<6:34:38, 61.19s/it]

  6%|▌         | 24/410 [31:12<4:46:15, 44.50s/it]

  6%|▌         | 25/410 [36:00<12:33:52, 117.49s/it]

  6%|▋         | 26/410 [36:18<9:21:06, 87.67s/it]  

  7%|▋         | 27/410 [36:21<6:38:00, 62.35s/it]

  7%|▋         | 28/410 [36:23<4:41:09, 44.16s/it]

  7%|▋         | 29/410 [41:09<12:20:38, 116.64s/it]

  7%|▋         | 30/410 [41:27<9:10:54, 86.99s/it]  

  8%|▊         | 31/410 [41:32<6:34:33, 62.46s/it]

  8%|▊         | 32/410 [41:33<4:38:35, 44.22s/it]

  8%|▊         | 33/410 [46:19<12:13:19, 116.71s/it]

  8%|▊         | 34/410 [46:35<9:02:23, 86.55s/it]  

  9%|▊         | 35/410 [46:41<6:28:13, 62.12s/it]

  9%|▉         | 36/410 [46:47<4:42:19, 45.29s/it]

  9%|▉         | 37/410 [51:32<12:09:20, 117.32s/it]

  9%|▉         | 38/410 [51:47<8:57:41, 86.72s/it]  

 10%|▉         | 39/410 [51:51<6:22:28, 61.86s/it]

 10%|▉         | 40/410 [51:59<4:41:22, 45.63s/it]

 10%|█         | 41/410 [56:40<11:55:36, 116.36s/it]

 10%|█         | 42/410 [56:58<8:51:18, 86.63s/it]  

 10%|█         | 43/410 [57:01<6:17:26, 61.71s/it]

 11%|█         | 44/410 [57:08<4:35:47, 45.21s/it]

 11%|█         | 45/410 [1:01:49<11:46:16, 116.10s/it]

 11%|█         | 46/410 [1:02:08<8:46:10, 86.73s/it]  

 11%|█▏        | 47/410 [1:02:10<6:12:14, 61.53s/it]

 12%|█▏        | 48/410 [1:02:17<4:31:52, 45.06s/it]

 12%|█▏        | 49/410 [1:07:02<11:43:45, 116.97s/it]

 12%|█▏        | 50/410 [1:07:14<8:33:38, 85.61s/it]  

 12%|█▏        | 51/410 [1:07:16<6:02:33, 60.60s/it]

 13%|█▎        | 52/410 [1:07:31<4:38:34, 46.69s/it]

 13%|█▎        | 53/410 [1:12:12<11:37:12, 117.18s/it]

 13%|█▎        | 54/410 [1:12:20<8:20:47, 84.40s/it]  

 13%|█▎        | 55/410 [1:12:26<5:59:53, 60.83s/it]

 14%|█▎        | 56/410 [1:12:40<4:36:36, 46.88s/it]

 14%|█▍        | 57/410 [1:17:25<11:36:24, 118.37s/it]

 14%|█▍        | 58/410 [1:17:32<8:17:07, 84.74s/it]  

 14%|█▍        | 59/410 [1:17:35<5:52:02, 60.18s/it]

 15%|█▍        | 60/410 [1:17:55<4:41:22, 48.24s/it]

 15%|█▍        | 61/410 [1:22:36<11:27:11, 118.14s/it]

 15%|█▌        | 62/410 [1:22:38<8:03:09, 83.30s/it]  

 15%|█▌        | 63/410 [1:23:00<6:14:45, 64.80s/it]

 16%|█▌        | 64/410 [1:23:07<4:33:50, 47.49s/it]

 16%|█▌        | 65/410 [1:27:41<11:04:10, 115.51s/it]

 16%|█▌        | 66/410 [1:27:47<7:53:34, 82.60s/it]  

 16%|█▋        | 67/410 [1:28:09<6:07:22, 64.26s/it]

 17%|█▋        | 68/410 [1:28:15<4:28:09, 47.05s/it]

 17%|█▋        | 69/410 [1:32:48<10:52:24, 114.79s/it]

 17%|█▋        | 70/410 [1:32:57<7:49:23, 82.83s/it]  

 17%|█▋        | 71/410 [1:33:19<6:05:53, 64.76s/it]

 18%|█▊        | 72/410 [1:33:25<4:26:08, 47.24s/it]

 18%|█▊        | 73/410 [1:37:55<10:40:34, 114.05s/it]

 18%|█▊        | 74/410 [1:38:06<7:44:13, 82.90s/it]  

 18%|█▊        | 75/410 [1:38:34<6:11:28, 66.53s/it]

 19%|█▊        | 76/410 [1:38:38<4:26:50, 47.94s/it]

 19%|█▉        | 77/410 [1:43:03<10:27:04, 112.99s/it]

 19%|█▉        | 78/410 [1:43:15<7:36:54, 82.57s/it]  

 19%|█▉        | 79/410 [1:43:43<6:04:44, 66.12s/it]

 20%|█▉        | 80/410 [1:43:50<4:26:36, 48.47s/it]

 20%|█▉        | 81/410 [1:48:09<10:12:20, 111.67s/it]

 20%|██        | 82/410 [1:48:23<7:31:02, 82.51s/it]  

 20%|██        | 83/410 [1:48:52<6:01:17, 66.29s/it]

 20%|██        | 84/410 [1:49:00<4:24:34, 48.70s/it]

 21%|██        | 85/410 [1:53:15<9:59:24, 110.66s/it]

 21%|██        | 86/410 [1:53:28<7:19:56, 81.47s/it] 

 21%|██        | 87/410 [1:54:01<6:00:37, 66.99s/it]

 21%|██▏       | 88/410 [1:54:07<4:20:49, 48.60s/it]

 22%|██▏       | 89/410 [1:58:35<10:12:44, 114.53s/it]

 22%|██▏       | 90/410 [1:58:40<7:14:25, 81.46s/it]  

 22%|██▏       | 91/410 [1:59:19<6:05:44, 68.79s/it]

 22%|██▏       | 92/410 [1:59:22<4:19:18, 48.93s/it]

 23%|██▎       | 93/410 [2:03:42<9:53:23, 112.31s/it]

 23%|██▎       | 94/410 [2:03:49<7:05:24, 80.77s/it] 

 23%|██▎       | 95/410 [2:04:28<5:59:07, 68.41s/it]

 23%|██▎       | 96/410 [2:04:31<4:13:58, 48.53s/it]

 24%|██▎       | 97/410 [2:08:49<9:41:47, 111.53s/it]

 24%|██▍       | 98/410 [2:09:03<7:07:09, 82.14s/it] 

 24%|██▍       | 99/410 [2:09:37<5:51:32, 67.82s/it]

 24%|██▍       | 100/410 [2:09:39<4:07:25, 47.89s/it]

 25%|██▍       | 101/410 [2:13:55<9:29:20, 110.55s/it]

 25%|██▍       | 102/410 [2:14:10<6:59:14, 81.67s/it] 

 25%|██▌       | 103/410 [2:14:43<5:43:52, 67.21s/it]

 25%|██▌       | 104/410 [2:14:47<4:05:54, 48.22s/it]

 26%|██▌       | 105/410 [2:19:02<9:20:24, 110.25s/it]

 26%|██▌       | 106/410 [2:19:18<6:54:52, 81.88s/it] 

 26%|██▌       | 107/410 [2:19:53<5:43:28, 68.01s/it]

 26%|██▋       | 108/410 [2:19:54<4:00:04, 47.70s/it]

 27%|██▋       | 109/410 [2:24:10<9:13:48, 110.40s/it]

 27%|██▋       | 110/410 [2:24:27<6:51:26, 82.29s/it] 

 27%|██▋       | 111/410 [2:25:00<5:36:41, 67.56s/it]

 27%|██▋       | 112/410 [2:25:00<3:55:12, 47.36s/it]

 28%|██▊       | 113/410 [2:29:18<9:06:03, 110.31s/it]

 28%|██▊       | 114/410 [2:29:34<6:45:39, 82.23s/it] 

 28%|██▊       | 115/410 [2:30:09<5:34:41, 68.07s/it]

 28%|██▊       | 116/410 [2:30:10<3:54:46, 47.91s/it]

 29%|██▊       | 117/410 [2:34:25<8:56:59, 109.97s/it]

 29%|██▉       | 118/410 [2:34:39<6:35:28, 81.26s/it] 

 29%|██▉       | 119/410 [2:35:19<5:34:05, 68.88s/it]

 29%|██▉       | 120/410 [2:35:20<3:54:49, 48.59s/it]

 30%|██▉       | 121/410 [2:39:31<8:45:41, 109.14s/it]

 30%|██▉       | 122/410 [2:39:44<6:25:29, 80.31s/it] 

 30%|███       | 123/410 [2:40:27<5:30:10, 69.03s/it]

 30%|███       | 124/410 [2:40:27<3:51:00, 48.46s/it]

 30%|███       | 125/410 [2:44:38<8:38:57, 109.26s/it]

 31%|███       | 126/410 [2:44:51<6:20:46, 80.44s/it] 

 31%|███       | 127/410 [2:45:37<5:30:37, 70.10s/it]

 31%|███       | 128/410 [2:45:41<3:55:17, 50.06s/it]

 31%|███▏      | 129/410 [2:49:44<8:26:21, 108.12s/it]

 32%|███▏      | 130/410 [2:49:59<6:13:49, 80.10s/it] 

 32%|███▏      | 131/410 [2:50:53<5:35:50, 72.23s/it]

 32%|███▏      | 133/410 [2:54:52<6:38:58, 86.42s/it]

 33%|███▎      | 134/410 [2:55:05<4:56:50, 64.53s/it]

 33%|███▎      | 135/410 [2:56:02<4:44:48, 62.14s/it]

 33%|███▎      | 136/410 [2:56:08<3:26:39, 45.25s/it]

 33%|███▎      | 137/410 [2:59:57<7:37:32, 100.56s/it]

 34%|███▎      | 138/410 [3:00:13<5:40:41, 75.15s/it] 

 34%|███▍      | 139/410 [3:01:08<5:11:44, 69.02s/it]

 34%|███▍      | 140/410 [3:01:15<3:47:19, 50.52s/it]

 34%|███▍      | 141/410 [3:05:06<7:48:17, 104.45s/it]

 35%|███▍      | 142/410 [3:05:17<5:41:58, 76.56s/it] 

 35%|███▍      | 143/410 [3:06:12<5:11:59, 70.11s/it]

 35%|███▌      | 144/410 [3:06:26<3:56:34, 53.36s/it]

 35%|███▌      | 145/410 [3:10:11<7:42:44, 104.77s/it]

 36%|███▌      | 146/410 [3:10:22<5:37:23, 76.68s/it] 

 36%|███▌      | 147/410 [3:11:21<5:12:03, 71.19s/it]

 36%|███▌      | 148/410 [3:11:31<3:51:34, 53.03s/it]

 36%|███▋      | 149/410 [3:15:16<7:34:44, 104.54s/it]

 37%|███▋      | 150/410 [3:15:34<5:40:52, 78.66s/it] 

 37%|███▋      | 151/410 [3:16:30<5:09:14, 71.64s/it]

 37%|███▋      | 152/410 [3:16:39<3:48:01, 53.03s/it]

 37%|███▋      | 153/410 [3:20:27<7:31:12, 105.34s/it]

 38%|███▊      | 154/410 [3:20:44<5:36:37, 78.90s/it] 

 38%|███▊      | 155/410 [3:21:36<5:01:24, 70.92s/it]

 38%|███▊      | 156/410 [3:21:47<3:44:20, 52.99s/it]

 38%|███▊      | 157/410 [3:25:29<7:16:44, 103.58s/it]

 39%|███▊      | 158/410 [3:25:50<5:30:31, 78.70s/it] 

 39%|███▉      | 159/410 [3:26:42<4:56:18, 70.83s/it]

 39%|███▉      | 160/410 [3:26:53<3:40:47, 52.99s/it]

 39%|███▉      | 161/410 [3:30:33<7:07:55, 103.11s/it]

 40%|███▉      | 162/410 [3:30:56<5:26:06, 78.90s/it] 

 40%|███▉      | 163/410 [3:31:47<4:51:04, 70.71s/it]

 40%|████      | 164/410 [3:32:02<3:40:41, 53.83s/it]

 40%|████      | 165/410 [3:35:39<7:00:15, 102.92s/it]

 40%|████      | 166/410 [3:36:02<5:20:05, 78.71s/it] 

 41%|████      | 167/410 [3:36:55<4:47:48, 71.07s/it]

 41%|████      | 168/410 [3:37:06<3:34:09, 53.10s/it]

 41%|████      | 169/410 [3:40:44<6:52:10, 102.62s/it]

 41%|████▏     | 170/410 [3:41:13<5:21:32, 80.39s/it] 

 42%|████▏     | 171/410 [3:42:02<4:43:38, 71.21s/it]

 42%|████▏     | 172/410 [3:42:17<3:34:56, 54.19s/it]

 42%|████▏     | 173/410 [3:45:52<6:44:44, 102.47s/it]

 42%|████▏     | 174/410 [3:46:17<5:11:44, 79.26s/it] 

 43%|████▎     | 175/410 [3:47:10<4:39:04, 71.26s/it]

 43%|████▎     | 176/410 [3:47:21<3:28:15, 53.40s/it]

 43%|████▎     | 177/410 [3:51:01<6:40:32, 103.14s/it]

 43%|████▎     | 178/410 [3:51:21<5:02:26, 78.22s/it] 

 44%|████▎     | 179/410 [3:52:19<4:37:59, 72.21s/it]

 44%|████▍     | 180/410 [3:52:29<3:24:59, 53.47s/it]

 44%|████▍     | 181/410 [3:56:07<6:32:29, 102.84s/it]

 44%|████▍     | 182/410 [3:56:28<4:57:18, 78.24s/it] 

 45%|████▍     | 183/410 [3:57:28<4:35:22, 72.79s/it]

 45%|████▍     | 184/410 [3:57:34<3:19:38, 53.00s/it]

 45%|████▌     | 185/410 [4:01:12<6:23:56, 102.39s/it]

 45%|████▌     | 186/410 [4:01:38<4:56:23, 79.39s/it] 

 46%|████▌     | 187/410 [4:02:34<4:28:52, 72.35s/it]

 46%|████▌     | 188/410 [4:02:40<3:13:56, 52.42s/it]

 46%|████▌     | 189/410 [4:06:22<6:20:24, 103.28s/it]

 46%|████▋     | 190/410 [4:06:45<4:50:51, 79.32s/it] 

 47%|████▋     | 191/410 [4:07:38<4:20:56, 71.49s/it]

 47%|████▋     | 192/410 [4:07:48<3:12:21, 52.94s/it]

 47%|████▋     | 193/410 [4:11:27<6:12:11, 102.91s/it]

 47%|████▋     | 194/410 [4:11:48<4:42:03, 78.35s/it] 

 48%|████▊     | 195/410 [4:12:43<4:14:41, 71.08s/it]

 48%|████▊     | 196/410 [4:12:51<3:06:59, 52.43s/it]

 48%|████▊     | 197/410 [4:16:36<6:09:37, 104.12s/it]

 48%|████▊     | 198/410 [4:16:52<4:34:48, 77.77s/it] 

 49%|████▊     | 199/410 [4:17:49<4:10:48, 71.32s/it]

 49%|████▉     | 200/410 [4:17:57<3:03:02, 52.30s/it]

 49%|████▉     | 201/410 [4:21:38<5:58:28, 102.91s/it]

 49%|████▉     | 202/410 [4:21:56<4:28:23, 77.42s/it] 

 50%|████▉     | 203/410 [4:22:54<4:07:35, 71.77s/it]

 50%|████▉     | 204/410 [4:23:02<3:00:30, 52.58s/it]

 50%|█████     | 205/410 [4:26:49<5:58:45, 105.00s/it]

 50%|█████     | 206/410 [4:27:01<4:22:01, 77.06s/it] 

 50%|█████     | 207/410 [4:28:00<4:02:14, 71.60s/it]

 51%|█████     | 208/410 [4:28:06<2:54:25, 51.81s/it]

 51%|█████     | 209/410 [4:31:58<5:54:43, 105.89s/it]

 51%|█████     | 210/410 [4:32:05<4:14:14, 76.27s/it] 

 51%|█████▏    | 211/410 [4:33:07<3:58:30, 71.91s/it]

 52%|█████▏    | 212/410 [4:33:13<2:52:52, 52.38s/it]

 52%|█████▏    | 213/410 [4:37:06<5:49:24, 106.42s/it]

 52%|█████▏    | 214/410 [4:37:10<4:06:48, 75.55s/it] 

 52%|█████▏    | 215/410 [4:38:12<3:53:03, 71.71s/it]

 53%|█████▎    | 216/410 [4:38:18<2:47:43, 51.87s/it]

 53%|█████▎    | 217/410 [4:42:12<5:42:22, 106.44s/it]

 53%|█████▎    | 218/410 [4:42:13<3:59:57, 74.99s/it] 

 53%|█████▎    | 219/410 [4:43:18<3:49:20, 72.04s/it]

 54%|█████▎    | 220/410 [4:43:30<2:50:18, 53.78s/it]

 54%|█████▍    | 221/410 [4:47:20<5:36:15, 106.75s/it]

 54%|█████▍    | 222/410 [4:47:20<3:54:20, 74.79s/it] 

 54%|█████▍    | 223/410 [4:48:27<3:45:20, 72.30s/it]

 55%|█████▍    | 224/410 [4:48:35<2:44:58, 53.22s/it]

 55%|█████▍    | 225/410 [4:52:26<5:28:08, 106.42s/it]

 55%|█████▌    | 226/410 [4:52:26<3:49:01, 74.68s/it] 

 55%|█████▌    | 227/410 [4:53:33<3:40:35, 72.33s/it]

 56%|█████▌    | 228/410 [4:53:42<2:41:29, 53.24s/it]

 56%|█████▌    | 229/410 [4:57:31<5:19:12, 105.82s/it]

 56%|█████▌    | 230/410 [4:57:32<3:43:37, 74.54s/it] 

 56%|█████▋    | 231/410 [4:58:41<3:37:37, 72.95s/it]

 57%|█████▋    | 232/410 [4:58:48<2:37:14, 53.01s/it]

 57%|█████▋    | 233/410 [5:02:33<5:08:55, 104.72s/it]

 57%|█████▋    | 234/410 [5:02:41<3:41:55, 75.66s/it] 

 57%|█████▋    | 235/410 [5:03:51<3:36:04, 74.08s/it]

 58%|█████▊    | 236/410 [5:03:53<2:32:08, 52.46s/it]

 58%|█████▊    | 237/410 [5:07:43<5:04:16, 105.53s/it]

 58%|█████▊    | 238/410 [5:07:47<3:35:15, 75.09s/it] 

 58%|█████▊    | 239/410 [5:08:55<3:27:46, 72.90s/it]

 59%|█████▊    | 240/410 [5:08:57<2:26:52, 51.84s/it]

 59%|█████▉    | 241/410 [5:12:53<5:01:08, 106.91s/it]

 59%|█████▉    | 242/410 [5:12:55<3:31:41, 75.60s/it] 

 59%|█████▉    | 243/410 [5:14:00<3:21:39, 72.45s/it]

 60%|█████▉    | 244/410 [5:14:01<2:20:42, 50.86s/it]

 60%|█████▉    | 245/410 [5:17:57<4:52:53, 106.51s/it]

 60%|██████    | 246/410 [5:17:59<3:25:20, 75.13s/it] 

 60%|██████    | 247/410 [5:19:03<3:15:14, 71.87s/it]

 60%|██████    | 248/410 [5:19:05<2:17:30, 50.93s/it]

 61%|██████    | 249/410 [5:23:00<4:44:49, 106.15s/it]

 61%|██████    | 250/410 [5:23:03<3:20:31, 75.19s/it] 

 61%|██████    | 251/410 [5:24:07<3:10:18, 71.81s/it]

 61%|██████▏   | 252/410 [5:24:10<2:14:38, 51.13s/it]

 62%|██████▏   | 253/410 [5:28:06<4:38:48, 106.55s/it]

 62%|██████▏   | 254/410 [5:28:08<3:15:27, 75.18s/it] 

 62%|██████▏   | 255/410 [5:29:13<3:06:26, 72.17s/it]

 62%|██████▏   | 256/410 [5:29:15<2:11:12, 51.12s/it]

 63%|██████▎   | 257/410 [5:33:13<4:33:05, 107.10s/it]

 63%|██████▎   | 258/410 [5:33:16<3:12:14, 75.88s/it] 

 63%|██████▎   | 259/410 [5:34:20<3:01:41, 72.19s/it]

 63%|██████▎   | 260/410 [5:34:27<2:11:52, 52.75s/it]

 64%|██████▎   | 261/410 [5:38:17<4:22:55, 105.88s/it]

 64%|██████▍   | 262/410 [5:38:18<3:03:28, 74.38s/it] 

 64%|██████▍   | 263/410 [5:39:26<2:57:29, 72.45s/it]

 64%|██████▍   | 264/410 [5:39:31<2:07:16, 52.30s/it]

 65%|██████▍   | 265/410 [5:43:19<4:14:01, 105.11s/it]

 65%|██████▍   | 266/410 [5:43:21<2:57:40, 74.03s/it] 

 65%|██████▌   | 267/410 [5:44:31<2:53:46, 72.92s/it]

 65%|██████▌   | 268/410 [5:44:36<2:04:39, 52.67s/it]

 66%|██████▌   | 269/410 [5:48:23<4:06:15, 104.79s/it]

 66%|██████▌   | 270/410 [5:48:29<2:55:28, 75.20s/it] 

 66%|██████▌   | 271/410 [5:49:40<2:51:27, 74.01s/it]

 66%|██████▋   | 272/410 [5:49:43<2:00:53, 52.56s/it]

 67%|██████▋   | 273/410 [5:53:25<3:56:20, 103.51s/it]

 67%|██████▋   | 274/410 [5:53:30<2:47:25, 73.87s/it] 

 67%|██████▋   | 275/410 [5:54:46<2:47:29, 74.44s/it]

 67%|██████▋   | 276/410 [5:54:48<1:58:10, 52.91s/it]

 68%|██████▊   | 277/410 [5:58:28<3:48:27, 103.06s/it]

 68%|██████▊   | 278/410 [5:58:37<2:44:17, 74.68s/it] 

 68%|██████▊   | 279/410 [5:59:50<2:41:55, 74.16s/it]

 68%|██████▊   | 280/410 [5:59:54<1:55:14, 53.19s/it]

 69%|██████▊   | 281/410 [6:03:34<3:41:44, 103.13s/it]

 69%|██████▉   | 282/410 [6:03:42<2:39:16, 74.66s/it] 

 69%|██████▉   | 283/410 [6:04:54<2:36:23, 73.89s/it]

 69%|██████▉   | 284/410 [6:04:56<1:50:04, 52.41s/it]

 70%|██████▉   | 285/410 [6:08:35<3:32:58, 102.22s/it]

 70%|██████▉   | 286/410 [6:08:45<2:34:17, 74.66s/it] 

 70%|███████   | 287/410 [6:09:57<2:31:33, 73.93s/it]

 70%|███████   | 288/410 [6:10:02<1:47:49, 53.03s/it]

 70%|███████   | 289/410 [6:13:41<3:27:24, 102.85s/it]

 71%|███████   | 290/410 [6:13:50<2:29:30, 74.75s/it] 

 71%|███████   | 291/410 [6:15:06<2:28:55, 75.09s/it]

 71%|███████   | 292/410 [6:15:06<1:43:36, 52.68s/it]

 71%|███████▏  | 293/410 [6:18:41<3:17:28, 101.27s/it]

 72%|███████▏  | 294/410 [6:18:53<2:24:21, 74.67s/it] 

 72%|███████▏  | 295/410 [6:20:11<2:24:55, 75.61s/it]

 72%|███████▏  | 296/410 [6:20:14<1:41:50, 53.60s/it]

 72%|███████▏  | 297/410 [6:23:45<3:10:18, 101.05s/it]

 73%|███████▎  | 298/410 [6:23:57<2:18:48, 74.37s/it] 

 73%|███████▎  | 299/410 [6:25:18<2:20:50, 76.13s/it]

 73%|███████▎  | 300/410 [6:25:22<1:39:57, 54.52s/it]

 73%|███████▎  | 301/410 [6:28:49<3:02:26, 100.43s/it]

 74%|███████▎  | 302/410 [6:28:59<2:11:52, 73.26s/it] 

 74%|███████▍  | 303/410 [6:30:22<2:15:53, 76.20s/it]

 74%|███████▍  | 304/410 [6:30:25<1:35:51, 54.26s/it]

 74%|███████▍  | 305/410 [6:33:54<2:56:04, 100.61s/it]

 75%|███████▍  | 306/410 [6:34:03<2:06:57, 73.25s/it] 

 75%|███████▍  | 307/410 [6:35:27<2:10:50, 76.22s/it]

 75%|███████▌  | 308/410 [6:35:31<1:32:49, 54.60s/it]

 75%|███████▌  | 309/410 [6:38:56<2:48:04, 99.85s/it]

 76%|███████▌  | 310/410 [6:39:13<2:04:52, 74.92s/it]

 76%|███████▌  | 311/410 [6:40:32<2:05:40, 76.17s/it]

 76%|███████▌  | 312/410 [6:40:41<1:31:20, 55.92s/it]

 76%|███████▋  | 313/410 [6:44:02<2:40:44, 99.43s/it]

 77%|███████▋  | 314/410 [6:44:23<2:01:25, 75.89s/it]

 77%|███████▋  | 315/410 [6:45:36<1:58:51, 75.07s/it]

 77%|███████▋  | 316/410 [6:45:49<1:28:35, 56.55s/it]

 77%|███████▋  | 317/410 [6:49:04<2:31:58, 98.05s/it]

 78%|███████▊  | 318/410 [6:49:31<1:57:31, 76.65s/it]

 78%|███████▊  | 319/410 [6:50:40<1:52:43, 74.32s/it]

 78%|███████▊  | 320/410 [6:50:53<1:24:13, 56.15s/it]

 78%|███████▊  | 321/410 [6:54:09<2:25:32, 98.12s/it]

 79%|███████▊  | 322/410 [6:54:35<1:52:02, 76.39s/it]

 79%|███████▉  | 323/410 [6:55:44<1:47:24, 74.07s/it]

 79%|███████▉  | 324/410 [6:56:02<1:22:21, 57.46s/it]

 79%|███████▉  | 325/410 [6:59:14<2:18:18, 97.63s/it]

 80%|███████▉  | 326/410 [6:59:44<1:48:19, 77.37s/it]

 80%|███████▉  | 327/410 [7:00:47<1:41:05, 73.08s/it]

 80%|████████  | 328/410 [7:01:06<1:17:54, 57.01s/it]

 80%|████████  | 329/410 [7:04:19<2:11:52, 97.68s/it]

 80%|████████  | 330/410 [7:04:47<1:42:22, 76.78s/it]

 81%|████████  | 331/410 [7:05:50<1:35:43, 72.70s/it]

 81%|████████  | 332/410 [7:06:12<1:14:34, 57.36s/it]

 81%|████████  | 333/410 [7:09:23<2:05:18, 97.64s/it]

 81%|████████▏ | 334/410 [7:09:54<1:38:02, 77.41s/it]

 82%|████████▏ | 335/410 [7:10:57<1:31:24, 73.13s/it]

 82%|████████▏ | 336/410 [7:11:15<1:09:50, 56.63s/it]

 82%|████████▏ | 337/410 [7:14:26<1:57:50, 96.86s/it]

 82%|████████▏ | 338/410 [7:15:03<1:34:56, 79.12s/it]

 83%|████████▎ | 339/410 [7:16:00<1:25:37, 72.36s/it]

 83%|████████▎ | 340/410 [7:16:24<1:07:34, 57.92s/it]

 83%|████████▎ | 341/410 [7:19:27<1:49:38, 95.34s/it]

 83%|████████▎ | 342/410 [7:20:07<1:29:07, 78.64s/it]

 84%|████████▎ | 343/410 [7:21:02<1:20:04, 71.71s/it]

 84%|████████▍ | 344/410 [7:21:31<1:04:53, 58.99s/it]

 84%|████████▍ | 345/410 [7:24:33<1:43:38, 95.67s/it]

 84%|████████▍ | 346/410 [7:25:11<1:23:45, 78.52s/it]

 85%|████████▍ | 347/410 [7:26:11<1:16:43, 73.07s/it]

 85%|████████▍ | 348/410 [7:26:39<1:01:18, 59.33s/it]

 85%|████████▌ | 349/410 [7:29:35<1:36:07, 94.55s/it]

 85%|████████▌ | 350/410 [7:30:16<1:18:17, 78.29s/it]

 86%|████████▌ | 351/410 [7:31:24<1:13:53, 75.15s/it]

 86%|████████▌ | 352/410 [7:31:44<56:38, 58.60s/it]  

 86%|████████▌ | 353/410 [7:34:36<1:28:11, 92.83s/it]

 86%|████████▋ | 354/410 [7:35:19<1:12:41, 77.89s/it]

 87%|████████▋ | 355/410 [7:36:33<1:10:15, 76.64s/it]

 87%|████████▋ | 356/410 [7:36:46<51:39, 57.40s/it]  

 87%|████████▋ | 357/410 [7:39:40<1:21:42, 92.49s/it]

 87%|████████▋ | 358/410 [7:40:21<1:06:54, 77.20s/it]

 88%|████████▊ | 359/410 [7:41:45<1:07:10, 79.03s/it]

 88%|████████▊ | 360/410 [7:41:48<46:56, 56.34s/it]  

 88%|████████▊ | 361/410 [7:44:46<1:15:49, 92.84s/it]

 88%|████████▊ | 362/410 [7:45:29<1:02:16, 77.84s/it]

 89%|████████▊ | 363/410 [7:46:50<1:01:42, 78.78s/it]

 89%|████████▉ | 364/410 [7:46:53<42:56, 56.02s/it]  

 89%|████████▉ | 365/410 [7:49:48<1:08:49, 91.77s/it]

 89%|████████▉ | 366/410 [7:50:31<56:37, 77.21s/it]  

 90%|████████▉ | 367/410 [7:51:53<56:23, 78.68s/it]

 90%|████████▉ | 368/410 [7:51:55<38:51, 55.51s/it]

 90%|█████████ | 369/410 [7:54:52<1:02:57, 92.12s/it]

 90%|█████████ | 370/410 [7:55:38<52:02, 78.06s/it]  

 90%|█████████ | 371/410 [7:56:57<50:54, 78.32s/it]

 91%|█████████ | 372/410 [7:57:01<35:37, 56.25s/it]

 91%|█████████ | 373/410 [7:59:54<56:13, 91.18s/it]

 91%|█████████ | 374/410 [8:00:40<46:38, 77.74s/it]

 91%|█████████▏| 375/410 [8:01:59<45:26, 77.89s/it]

 92%|█████████▏| 376/410 [8:02:11<32:58, 58.18s/it]

 92%|█████████▏| 377/410 [8:05:00<50:14, 91.36s/it]

 92%|█████████▏| 378/410 [8:05:43<41:03, 76.98s/it]

 92%|█████████▏| 379/410 [8:07:04<40:24, 78.21s/it]

 93%|█████████▎| 380/410 [8:07:12<28:32, 57.08s/it]

 93%|█████████▎| 381/410 [8:10:03<44:06, 91.25s/it]

 93%|█████████▎| 382/410 [8:10:46<35:48, 76.75s/it]

 93%|█████████▎| 383/410 [8:12:07<35:04, 77.94s/it]

 94%|█████████▎| 384/410 [8:12:18<25:04, 57.86s/it]

 94%|█████████▍| 385/410 [8:15:04<37:43, 90.53s/it]

 94%|█████████▍| 386/410 [8:15:47<30:29, 76.24s/it]

 94%|█████████▍| 387/410 [8:17:08<29:46, 77.67s/it]

 95%|█████████▍| 388/410 [8:17:23<21:31, 58.72s/it]

 95%|█████████▍| 389/410 [8:20:10<32:00, 91.43s/it]

 95%|█████████▌| 390/410 [8:20:47<24:59, 74.98s/it]

 95%|█████████▌| 391/410 [8:22:12<24:39, 77.86s/it]

 96%|█████████▌| 392/410 [8:22:24<17:27, 58.19s/it]

 96%|█████████▌| 393/410 [8:25:06<25:20, 89.46s/it]

 96%|█████████▌| 394/410 [8:25:47<19:54, 74.68s/it]

 96%|█████████▋| 395/410 [8:27:10<19:17, 77.16s/it]

 97%|█████████▋| 396/410 [8:27:22<13:26, 57.61s/it]

 97%|█████████▋| 397/410 [8:30:03<19:14, 88.82s/it]

 97%|█████████▋| 398/410 [8:30:43<14:48, 74.01s/it]

 97%|█████████▋| 399/410 [8:32:08<14:12, 77.49s/it]

 98%|█████████▊| 400/410 [8:32:21<09:40, 58.03s/it]

 98%|█████████▊| 401/410 [8:34:59<13:13, 88.21s/it]

 98%|█████████▊| 402/410 [8:35:36<09:41, 72.73s/it]

 98%|█████████▊| 403/410 [8:37:04<09:01, 77.32s/it]

 99%|█████████▊| 404/410 [8:37:25<06:01, 60.27s/it]

 99%|█████████▉| 405/410 [8:40:05<07:31, 90.25s/it]

 99%|█████████▉| 406/410 [8:40:41<04:56, 74.08s/it]

 99%|█████████▉| 407/410 [8:41:14<03:04, 61.59s/it]

100%|█████████▉| 408/410 [8:42:16<02:03, 61.80s/it]

100%|█████████▉| 409/410 [8:42:32<00:48, 48.23s/it]

100%|██████████| 410/410 [8:44:45<00:00, 73.52s/it]

100%|██████████| 410/410 [8:44:45<00:00, 76.79s/it]




In [15]:
# Number of entries collected (one per phenotype plain name)
len(all_results)

4091

### Create DataFrame

In [16]:
# Expected number of phenotypes: the S-PrediXcan expected counts of the two
# sources loaded in this notebook (Rapid GWAS project + GTEx GWAS manuscript).
# The builtin sum is enough here; no numpy array is involved.
_n_expected_phenos = sum(
    conf.SPREDIXCAN_EXPECTED_PHENOTYPES[source]
    for source in ('RapidGWASProject', 'GTEX_GWAS')
)
display(_n_expected_phenos)
# every phenotype must have contributed exactly one entry to all_results
assert len(all_results) == _n_expected_phenos, len(all_results)

4091

In [17]:
# Build the gene x phenotype table of effect directions.
# The category dtype is for efficiency in storage/loading.
spredixcan_genes_effect_directions = pd.DataFrame(data=all_results, dtype='category')
spredixcan_genes_effect_directions.index.name = 'gene_name'

# each gene must appear exactly once in the index
assert spredixcan_genes_effect_directions.index.is_unique

display(
    spredixcan_genes_effect_directions.shape,
    spredixcan_genes_effect_directions.head(),
)

(22518, 4091)

Unnamed: 0_level_0,R60-Diagnoses_main_ICD10_R60_Oedema_not_elsewhere_classified,ULCEROTH-Other_ulcerative_colitis,20002_1161-Noncancer_illness_code_selfreported_gall_bladder_disease,22601_31152695-Job_coding_quality_assurance_techniciancoordinator,20003_1140860696-Treatmentmedication_code_lisinopril,D50-Diagnoses_main_ICD10_D50_Iron_deficiency_anaemia,1160-Sleep_duration,20002_1457-Noncancer_illness_code_selfreported_duodenal_ulcer,22660_106-Gap_coding_Unable_to_work_due_to_sickness_or_disability,22601_35333266-Job_coding_insurance_underwriter_insurance_inspector_insurance_account_handler,...,IBD.EUR.Ulcerative_Colitis,Astle_et_al_2016_Sum_basophil_neutrophil_counts,Astle_et_al_2016_Reticulocyte_count,RA_OKADA_TRANS_ETHNIC,GEFOS_Forearm,Jones_et_al_2016_SleepDuration,EAGLE_Eczema,IBD.EUR.Crohns_Disease,IMMUNOBASE_Systemic_lupus_erythematosus_hg19,Astle_et_al_2016_Myeloid_white_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000457,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000460,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000938,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000971,,,,,,,,,,,...,,,,,1.0,,,,,


In [18]:
# NaNs are kept on purpose for this method, so the removal of genes with no
# results (next line) stays disabled.
#spredixcan_genes_effect_directions = spredixcan_genes_effect_directions.dropna(axis=0, how='all')

In [19]:
# count the missing entries of every phenotype column, then add them up
_nan_per_phenotype = spredixcan_genes_effect_directions.isna().sum()
_nan_per_phenotype.sum()

91634228

In [20]:
# Distinct non-missing values found anywhere in the frame, as floats
_tmp = (
    pd.Series(spredixcan_genes_effect_directions.values.flatten())
    .dropna()
    .astype(float)
    .unique()
)
display(_tmp)

# exactly the three codes 0, 1 and -1 must be present
_observed_codes = set(_tmp)
assert _observed_codes == {0, 1, -1}, _observed_codes

array([-1.,  1.,  0.])

In [21]:
# Read the shape once and reuse it for both the display and the check
_results_shape = spredixcan_genes_effect_directions.shape
display(f'Results shape: {_results_shape}')

assert _results_shape == (22518, _n_expected_phenos), _results_shape

'Results shape: (22518, 4091)'

## Testing

In [22]:
# Disabled scratch snippet: every line below is commented out, so nothing in this cell runs.

# _tmp.loc['CNCR_Insomnia_all'].sort_values(ascending=False).head()

# _tmp = all_spredixcan_phenotypes[2]

# _tmp.pheno_info.get_plain_name()

# data_dict = {t:_tmp.get_tissue_data(t, 'zscore', index_col='gene_simple') for t in _tmp.tissues}

# data = pd.DataFrame(data_dict)

# data.max(axis=1).sort_values()

In [23]:
# Genes (rows) and phenotypes (columns) picked for a visual spot check
_genes_to_check = [
    'ENSG00000137185',
    'ENSG00000000457',
    'ENSG00000095464',
    'ENSG00000228397',
    'ENSG00000279325',
    'ENSG00000000419',
]
_phenos_to_check = [
    '6157_3-Why_stopped_smoking_Health_precaution',
    'I50-Diagnoses_main_ICD10_I50_Heart_failure',
    'CNCR_Insomnia_all',
]

spredixcan_genes_effect_directions.loc[_genes_to_check, _phenos_to_check]

Unnamed: 0_level_0,6157_3-Why_stopped_smoking_Health_precaution,I50-Diagnoses_main_ICD10_I50_Heart_failure,CNCR_Insomnia_all
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000137185,-1.0,,
ENSG00000000457,,,
ENSG00000095464,,1.0,
ENSG00000228397,,,1.0
ENSG00000279325,,,-1.0
ENSG00000000419,,,


In [24]:
# Short aliases so that every check fits on one readable line
_directions = spredixcan_genes_effect_directions
_pheno_smoking = '6157_3-Why_stopped_smoking_Health_precaution'
_pheno_heart_failure = 'I50-Diagnoses_main_ICD10_I50_Heart_failure'
_pheno_insomnia = 'CNCR_Insomnia_all'

# 6157_3: one gene with a direction, two genes with no value
assert _directions.loc['ENSG00000137185', _pheno_smoking] == -1.0
assert pd.isnull(_directions.loc['ENSG00000000457', _pheno_smoking])
assert pd.isnull(_directions.loc['ENSG00000095464', _pheno_smoking])

# I50
assert _directions.loc['ENSG00000095464', _pheno_heart_failure] == 1.0

# CNCR_Insomnia_all: one missing value, one positive and one negative direction
assert pd.isnull(_directions.loc['ENSG00000137185', _pheno_insomnia])
assert _directions.loc['ENSG00000228397', _pheno_insomnia] == 1.0
assert _directions.loc['ENSG00000279325', _pheno_insomnia] == -1.0

The cells below were used to write the assertions above: for each gene, check whether the first and last rows of the sorted output (i.e., the minimum and maximum values) agree with the sign asserted above.

In [25]:
# Result directories handed to the %%bash cells that follow
_spredixcan_dirs = conf.SPREDIXCAN_RESULTS_DIR
rapid_gwas_dir = _spredixcan_dirs['RapidGWASProject']
gtex_gwas_dir = _spredixcan_dirs['GTEX_GWAS']

In [26]:
%%bash -s "$rapid_gwas_dir"
# $1 is the directory passed through -s on the line above.
# Quote it (paths may contain spaces) and stop if it cannot be entered, so the
# pipelines below never run against the *.csv files of some other directory.
cd "$1/6157_3" || exit 1
# every matching row of this gene across all *.csv files, sorted numerically on column 3
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000137185"' ::: *.csv | sort -k3 -g
echo ""
# for the next two genes, sed keeps only the first and last sorted rows
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000457"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000095464"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'

ENSG00000137185.11  ZSCAN9               -5.83616067550634
ENSG00000137185.11  ZSCAN9               -5.792628572580804
ENSG00000137185.11  ZSCAN9               -5.681816314702161
ENSG00000137185.11  ZSCAN9               -5.412943710075788
ENSG00000137185.11  ZSCAN9               -5.352010726928711
ENSG00000137185.11  ZSCAN9               -5.279010199840854
ENSG00000137185.11  ZSCAN9               -5.108492106415525
ENSG00000137185.11  ZSCAN9               -5.079929626396137
ENSG00000137185.11  ZSCAN9               -5.063558049159138
ENSG00000137185.11  ZSCAN9               -5.043577015552067
ENSG00000137185.11  ZSCAN9               -5.003944335955496
ENSG00000137185.11  ZSCAN9               -5.001958915404419
ENSG00000137185.11  ZSCAN9               -4.975072786720918
ENSG00000137185.11  ZSCAN9               -4.896579024471048
ENSG00000137185.11  ZSCAN9               -4.890546437827829
ENSG00000137185.11  ZSCAN9               -4.820902875527851
ENSG00000137185.11  ZSCAN9               

In [27]:
%%bash -s "$rapid_gwas_dir"
# $1 is the directory passed through -s on the line above.
# Quote it and stop if it cannot be entered, so the pipeline below never runs
# against the *.csv files of some other directory.
cd "$1/I50" || exit 1
# every matching row of this gene across all *.csv files, sorted numerically on column 3
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000095464"' ::: *.csv | sort -k3 -g

ENSG00000095464.9   PDE6C                NA
ENSG00000095464.9   PDE6C                -2.987583875656128
ENSG00000095464.9   PDE6C                -1.706084537916532
ENSG00000095464.9   PDE6C                -1.6929545931506047
ENSG00000095464.9   PDE6C                -1.6694166874361254
ENSG00000095464.9   PDE6C                -1.6645126380746706
ENSG00000095464.9   PDE6C                -1.658251762390137
ENSG00000095464.9   PDE6C                -1.6582517623901367
ENSG00000095464.9   PDE6C                -1.6582517623901367
ENSG00000095464.9   PDE6C                -1.6582517623901367
ENSG00000095464.9   PDE6C                -1.652789255603333
ENSG00000095464.9   PDE6C                -1.6400961218813983
ENSG00000095464.9   PDE6C                -1.6347826064398305
ENSG00000095464.9   PDE6C                -1.6275348613259335
ENSG00000095464.9   PDE6C                -1.6171702146530154
ENSG00000095464.9   PDE6C                -1.6171702146530154
ENSG00000095464.9   PDE6C                -1.6

In [28]:
%%bash -s "$gtex_gwas_dir"
# $1 is the directory passed through -s on the line above.
# Quote it and stop if it cannot be entered, so the pipelines below never run
# against the *.csv files of some other directory.
cd "$1/CNCR_Insomnia_all" || exit 1
# The first/last-row sed filter is left commented out, so every sorted row is listed.
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000137185"' ::: *.csv | sort -k3 -g # | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000228397"' ::: *.csv | sort -k3 -g # | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000279325"' ::: *.csv | sort -k3 -g # | sed -e 1b -e '$!d'

ENSG00000137185.11  ZSCAN9               -2.0075128138942824
ENSG00000137185.11  ZSCAN9               -1.9716453552246094
ENSG00000137185.11  ZSCAN9               -1.9451855273280618
ENSG00000137185.11  ZSCAN9               -1.8545247091875776
ENSG00000137185.11  ZSCAN9               -1.7974615020480318
ENSG00000137185.11  ZSCAN9               -1.6697390108898214
ENSG00000137185.11  ZSCAN9               -1.637959840329144
ENSG00000137185.11  ZSCAN9               -1.5990879264599478
ENSG00000137185.11  ZSCAN9               -1.5792834503640862
ENSG00000137185.11  ZSCAN9               -1.5003131550674709
ENSG00000137185.11  ZSCAN9               -1.4885983833529386
ENSG00000137185.11  ZSCAN9               -1.4863373695166933
ENSG00000137185.11  ZSCAN9               -1.4656920469620405
ENSG00000137185.11  ZSCAN9               -1.4643227059342168
ENSG00000137185.11  ZSCAN9               -1.4559472334714858
ENSG00000137185.11  ZSCAN9               -1.4441600680748996
ENSG00000137185.11  ZSCAN

### Save

In [29]:
# final look at the dimensions (rows, columns) of the frame about to be saved
spredixcan_genes_effect_directions.shape

(22518, 4091)

In [30]:
# preview the first rows of the frame about to be saved
spredixcan_genes_effect_directions.head()

Unnamed: 0_level_0,R60-Diagnoses_main_ICD10_R60_Oedema_not_elsewhere_classified,ULCEROTH-Other_ulcerative_colitis,20002_1161-Noncancer_illness_code_selfreported_gall_bladder_disease,22601_31152695-Job_coding_quality_assurance_techniciancoordinator,20003_1140860696-Treatmentmedication_code_lisinopril,D50-Diagnoses_main_ICD10_D50_Iron_deficiency_anaemia,1160-Sleep_duration,20002_1457-Noncancer_illness_code_selfreported_duodenal_ulcer,22660_106-Gap_coding_Unable_to_work_due_to_sickness_or_disability,22601_35333266-Job_coding_insurance_underwriter_insurance_inspector_insurance_account_handler,...,IBD.EUR.Ulcerative_Colitis,Astle_et_al_2016_Sum_basophil_neutrophil_counts,Astle_et_al_2016_Reticulocyte_count,RA_OKADA_TRANS_ETHNIC,GEFOS_Forearm,Jones_et_al_2016_SleepDuration,EAGLE_Eczema,IBD.EUR.Crohns_Disease,IMMUNOBASE_Systemic_lupus_erythematosus_hg19,Astle_et_al_2016_Myeloid_white_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000457,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000460,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000938,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000971,,,,,,,,,,,...,,,,,1.0,,,,,


In [31]:
# Save
# Make sure the output directory exists before anything is written into it:
# the pickle is saved right after this cell, so creating the directory only
# ahead of the later HDF5 export would be too late when it is missing.
os.makedirs(conf.GENE_ASSOC_DIR, exist_ok=True)

# Destination of the pickled frame (plain literal: there is nothing to interpolate)
spredixcan_genes_effect_directions_filename = os.path.join(
    conf.GENE_ASSOC_DIR,
    'spredixcan-mashr-effect_direction-consensus.pkl.xz',
)
display(spredixcan_genes_effect_directions_filename)

'/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/gene_assoc/spredixcan-mashr-effect_direction-consensus.pkl.xz'

In [32]:
# Write the frame to disk; no compression argument is given, so pandas infers
# it from the filename, which ends in '.xz'.
spredixcan_genes_effect_directions.to_pickle(spredixcan_genes_effect_directions_filename)

### Save in HDF5 format for webapp

In [33]:
# Read the frame back from the pickle written above; the HDF5 export in the
# following cells works on this reloaded copy.
spredixcan_genes_effect_directions = pd.read_pickle(spredixcan_genes_effect_directions_filename)

In [34]:
# dimensions (rows, columns) of the frame as reloaded from the pickle
spredixcan_genes_effect_directions.shape

(22518, 4091)

In [35]:
from utils import simplify_string_for_hdf5

In [36]:
# create the output directory if needed (exist_ok=True: no error when it is already there)
os.makedirs(conf.GENE_ASSOC_DIR, exist_ok=True)

In [37]:
# Path of the HDF5 file, placed inside the gene-association output directory
_hdf5_basename = 'spredixcan-mashr-effect_direction-consensus.h5'
OUTPUT_HDF5_FILE = os.path.join(conf.GENE_ASSOC_DIR, _hdf5_basename)
display(OUTPUT_HDF5_FILE)

'/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/gene_assoc/spredixcan-mashr-effect_direction-consensus.h5'

In [38]:
# One HDF5 node per phenotype column: the node key is the simplified column
# name and the stored values are that column cast to float.
with pd.HDFStore(OUTPUT_HDF5_FILE, mode='w', complevel=1) as hdf_store:
    for pheno_name in spredixcan_genes_effect_directions.columns:
        hdf_key = simplify_string_for_hdf5(pheno_name)
        pheno_directions = spredixcan_genes_effect_directions[pheno_name].astype(float)
        hdf_store[hdf_key] = pheno_directions

In [39]:
# testing: read the HDF5 file back and spot-check a few gene/phenotype pairs.
# Every phenotype goes through the same sequence of checks, so the expected
# values live in one table and a single loop does the checking.

# expected length of every stored per-phenotype series
_hdf5_expected_n_genes = 22518

# phenotype -> {gene: expected direction}; None means the entry must be missing (NaN)
_hdf5_expected_directions = {
    '6157_3-Why_stopped_smoking_Health_precaution': {
        'ENSG00000137185': -1.0,
        'ENSG00000000457': None,
        'ENSG00000095464': None,
    },
    'I50-Diagnoses_main_ICD10_I50_Heart_failure': {
        'ENSG00000095464': 1.0,
    },
    'CNCR_Insomnia_all': {
        'ENSG00000137185': None,
        'ENSG00000228397': 1.0,
        'ENSG00000279325': -1.0,
    },
}

with pd.HDFStore(OUTPUT_HDF5_FILE, mode='r') as store:
    store_keys = list(store.keys())
    # one stored node is expected per column of the frame; fewer keys would
    # indicate that some columns ended up under the same HDF5 key
    assert len(store_keys) == spredixcan_genes_effect_directions.shape[1], len(store_keys)
    display(store_keys[:5])

    for _pheno, _expected_by_gene in _hdf5_expected_directions.items():
        clean_col = simplify_string_for_hdf5(_pheno)
        data = store[clean_col]
        assert data.shape == (_hdf5_expected_n_genes,), data.shape

        for _gene, _direction in _expected_by_gene.items():
            _value = data.loc[_gene]
            # the message names the failing pair, so a failure needs no re-run to locate
            if _direction is None:
                assert pd.isnull(_value), (_pheno, _gene, _value)
            else:
                assert _value == _direction, (_pheno, _gene, _value)

['/c100001_raw_Food_weight',
 '/c100002_raw_Energy',
 '/c100003_raw_Protein',
 '/c100004_raw_Fat',
 '/c100005_raw_Carbohydrate']