# Load Pfizer paper Tx data
Load and explore the Pfizer Tx data from the recount2 database

In [19]:
import os
from pathlib import Path
import pandas as pd
import torch
from utilities import hdf_keys

In [5]:
DATA_PATH = Path("/data/pfizer_tx")

Start with the classification problems, training data only.

In [6]:
all_train_files = [f for f in os.listdir(DATA_PATH/"tasks_all_clr") if 'train'in f]

In [8]:
# only the time series data have a /censor_labels key
all_classification_files = [f for f in all_train_files if '/censor_labels' not in hdf_keys(DATA_PATH/"tasks_all_clr"/f)]

If you get the following error:
```bash
ImportError HDFStore requires PyTables No module named tables
```
then ```pip install --upgrade tables```

In [9]:
all_classification_files

['all_clr_train_LUAD_stage.h5',
 'all_clr_train_GSE65832.h5',
 'all_clr_train_CESC_grade.h5',
 'all_clr_train_COAD_stage.h5',
 'all_clr_train_SKCM_stage.h5',
 'all_clr_train_UCEC_stage.h5',
 'all_clr_train_PAAD_grade.h5',
 'all_clr_train_UCEC_grade.h5',
 'all_clr_train_GSE66207.h5',
 'all_clr_train_LIHC_grade.h5',
 'all_clr_train_KIRC_grade.h5',
 'all_clr_train_LIHC_stage.h5',
 'all_clr_train_STAD_grade.h5',
 'all_clr_train_STAD_stage.h5',
 'all_clr_train_KIRC_stage.h5',
 'all_clr_train_LGG_grade.h5',
 'all_clr_train_THCA_stage.h5']

In [10]:
test_DATA_PATH = DATA_PATH/"tasks_all_clr"/all_classification_files[0]
keys = hdf_keys(test_DATA_PATH)
print(keys)
test_data = {key : pd.read_hdf(DATA_PATH/"tasks_all_clr/all_clr_train_CESC_OS.h5", key = key) for key in keys}

['/expression', '/labels']


In [11]:
test_data['/expression'].shape

(304, 57992)

In [12]:
test_data['/expression'].head()

Unnamed: 0_level_0,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,...,ENSG00000283690,ENSG00000283691,ENSG00000283692,ENSG00000283693,ENSG00000283694,ENSG00000283695,ENSG00000283696,ENSG00000283697,ENSG00000283698,ENSG00000283699
recount_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7826AE44-B03A-4528-9CB5-7421E592AE36,7.273254,-1.611177,7.671172,5.144625,5.185823,5.244084,6.181699,6.734401,5.739017,5.816485,...,-7.115629,1.414801,-7.115629,-7.115629,2.053952,-7.115629,2.813278,3.283886,-7.115629,-7.115629
0DC2197B-2FEA-48EB-953E-8E44E1933757,7.131861,-7.616766,8.530928,5.724083,6.249423,6.307812,5.995008,8.14552,6.313583,6.388412,...,-7.616766,-7.616766,-3.840906,-7.616766,-7.616766,-7.616766,4.019502,4.344022,-0.568187,-7.616766
87ABB098-8C1E-4677-8C31-48B71EFE41C7,6.169603,-0.467976,7.849034,5.23984,5.358871,4.764959,3.23025,7.450602,5.209594,6.550502,...,-6.946502,1.69934,-2.435826,-6.946502,2.351666,-6.946502,4.606699,3.196548,-6.946502,-6.946502
EC0154E7-7A27-4535-83AE-6B379E99A93C,6.276552,-6.554893,7.069809,5.844858,5.788549,5.371618,5.892911,7.027828,6.333499,6.096078,...,-6.554893,3.389995,-6.554893,-6.554893,1.539687,3.017501,5.335064,3.978409,-0.32364,-6.554893
78F79407-D49F-4FA2-B09F-0665C502827C,5.976701,-0.718647,7.741823,5.134561,5.467024,4.976366,6.277791,6.645078,5.786737,5.958712,...,-6.736251,1.955487,-6.736251,0.370077,-6.736251,-6.736251,4.326674,3.529664,-6.736251,-6.736251


Let's see if we have enough memory for the unsupervised data

In [14]:
!ls -la --block-size=GiB /data/pfizer_tx/
# we'll need > 9 GiB of memory to load it all. p2.xlarge has 61 GiB

total 9GiB
drwxrwxr-x 4 ubuntu ubuntu 1GiB Apr 28 14:18 .
drwxrwxrwx 5 root   root   1GiB May  1 10:11 ..
-rw-rw-r-- 1 ubuntu ubuntu 1GiB Apr 26 09:30 CC-BY-4.0-license.txt
-rw-rw-r-- 1 ubuntu ubuntu 1GiB Apr 26 09:30 README.md
-rw-rw-r-- 1 ubuntu ubuntu 1GiB Mar  8  2019 tasks_README.md
drwxr-xr-x 2 ubuntu ubuntu 1GiB Mar  7  2019 tasks_all_clr
drwxrwxr-x 3 ubuntu ubuntu 1GiB Apr 28 14:18 torch-templates
-rw-rw-r-- 1 ubuntu ubuntu 1GiB Mar  8  2019 unsupervised_README.md
-rw-rw-r-- 1 ubuntu ubuntu 9GiB Apr 28 14:13 unsupervised_all_clr_train.h5
-rw-rw-r-- 1 ubuntu ubuntu 1GiB Apr 28 14:12 unsupervised_all_clr_train.hd5


In [17]:
unsupervised_DATA_PATH = DATA_PATH/"unsupervised_all_clr_train.h5"
keys = hdf_keys(unsupervised_DATA_PATH)

In [18]:
keys

['/metadata',
 '/expression/chunk_0',
 '/expression/chunk_1',
 '/expression/chunk_10',
 '/expression/chunk_11',
 '/expression/chunk_12',
 '/expression/chunk_13',
 '/expression/chunk_14',
 '/expression/chunk_15',
 '/expression/chunk_16',
 '/expression/chunk_17',
 '/expression/chunk_18',
 '/expression/chunk_19',
 '/expression/chunk_2',
 '/expression/chunk_20',
 '/expression/chunk_21',
 '/expression/chunk_22',
 '/expression/chunk_23',
 '/expression/chunk_24',
 '/expression/chunk_25',
 '/expression/chunk_26',
 '/expression/chunk_27',
 '/expression/chunk_28',
 '/expression/chunk_29',
 '/expression/chunk_3',
 '/expression/chunk_30',
 '/expression/chunk_31',
 '/expression/chunk_32',
 '/expression/chunk_33',
 '/expression/chunk_34',
 '/expression/chunk_35',
 '/expression/chunk_36',
 '/expression/chunk_37',
 '/expression/chunk_38',
 '/expression/chunk_39',
 '/expression/chunk_4',
 '/expression/chunk_40',
 '/expression/chunk_41',
 '/expression/chunk_42',
 '/expression/chunk_43',
 '/expression/ch