# Generate training/holdout data using the packaged data generation script and inspect the results

In [3]:
import pandas as pd
import MOA_L1000.dataprep as dataprep
import numpy as np
import os
from cmapPy.pandasGEXpress import parse, write_gctx
from pandas.testing import assert_index_equal
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [5]:

# Directory holding the raw input files. Set the DEEPL1000_RAW_DIR environment
# variable to run this notebook on another machine without editing the cell;
# the default is the original location.
RAW_DATA_DIR = os.environ.get("DEEPL1000_RAW_DIR", "/home/navid/data/DeepL1000/raw")

# Number of top cell lines to keep when selecting perturbations.
N_TOP_CELLS = 7

# Signature metadata table (tab-separated).
filename = os.path.join(RAW_DATA_DIR, "siginfo.txt")
siginfo = pd.read_csv(filename, sep="\t")

# Perturbation metadata table with MOA annotations (tab-separated).
filename = os.path.join(RAW_DATA_DIR, "CORE_AB_pert_info_with_moa.txt")
pertinfo = pd.read_csv(filename, sep="\t")

# Expression matrix in GCTX format, parsed with cmapPy.
filename = os.path.join(RAW_DATA_DIR, "modzs_lm_n127751x978.gctx")
gct = parse.parse(filename)

# Select perturbations that have signatures in the top cell lines at the
# selected doses, then build the train/holdout features and targets.
# Both calls print a progress summary (see the cell output).
sig, perts, annots = dataprep.find_perts_with_sigs_in_top_n_cell_lines(
    siginfo,
    pertinfo,
    selected_doses=dataprep.SELECTED_DOSES,
    n_top_cells=N_TOP_CELLS,
)
X_train, X_holdout, targets_train, targets_holdout = dataprep.generate_training_and_holdout_data(
    annots, sig, gct
)

cell_id
A375    1542.0
HA1E    1542.0
HELA    1544.0
YAPC    1547.0
PC3     1549.0
MCF7    1552.0
HT29    1554.0
dtype: float64
Index(['A375', 'HA1E', 'HELA', 'YAPC', 'PC3', 'MCF7', 'HT29'], dtype='object', name='cell_id')
cell_id
HELA    1548.0
PC3     1551.0
HA1E    1552.0
YAPC    1552.0
A375    1553.0
HT29    1554.0
MCF7    1554.0
dtype: float64
Index(['HELA', 'PC3', 'HA1E', 'YAPC', 'A375', 'HT29', 'MCF7'], dtype='object', name='cell_id')
cell_id
HELA    1552.0
A375    1557.0
HA1E    1557.0
PC3     1557.0
HT29    1558.0
MCF7    1558.0
YAPC    1558.0
dtype: float64
Index(['HELA', 'A375', 'HA1E', 'PC3', 'HT29', 'MCF7', 'YAPC'], dtype='object', name='cell_id')
Total number of perts found for the selected doses and cell lines: 1,555
Total number of perts with available labels: 1,418




Selecting moas that have at least 4 compounds in train and 1 compounds in holdout
Number of MOAs selected: 100
Shape of targets_train: (3367, 100)
Shape of targets_holdout: (846, 100)


In [6]:
# Features and targets must describe the same number of samples in each split.
assert X_train.shape[0] == targets_train.shape[0]
assert X_holdout.shape[0] == targets_holdout.shape[0]
# Both splits must expose exactly the same target columns.
assert_index_equal(targets_train.columns, targets_holdout.columns)

# The second axis of both feature arrays must hold 978 entries.
assert (X_train.shape[1], X_holdout.shape[1]) == (978, 978)

In [7]:
# Row-wise sum of the target columns for each training row, then a tally of
# how often each row total occurs.
targets_train.sum(axis=1).value_counts()


1.0    2004
0.0    1061
2.0     229
3.0      52
5.0      12
4.0       9
dtype: int64

In [8]:
# One shape per line: feature arrays first, then the target frames.
print(
    X_train.shape,
    X_holdout.shape,
    targets_train.shape,
    targets_holdout.shape,
    sep="\n",
)

# Preview the first rows of the training targets.
targets_train.head()

(3367, 978, 7)
(846, 978, 7)
(3367, 100)
(846, 100)


Unnamed: 0_level_0,Unnamed: 1_level_0,PLK INHIBITOR,CCK RECEPTOR ANTAGONIST,ANDROGEN RECEPTOR AGONIST,AROMATASE INHIBITOR,IMMUNOSUPPRESSANT,PARP INHIBITOR,INTEGRIN INHIBITOR,MET INHIBITOR,CHELATING AGENT,THROMBOXANE RECEPTOR ANTAGONIST,...,VEGFR INHIBITOR,ADRENERGIC RECEPTOR AGONIST,CALCIUM CHANNEL BLOCKER,HISTAMINE RECEPTOR ANTAGONIST,DNA INHIBITOR,PHOSPHODIESTERASE INHIBITOR,DOPAMINE RECEPTOR ANTAGONIST,CYCLOOXYGENASE INHIBITOR,ADRENERGIC RECEPTOR ANTAGONIST,SEROTONIN RECEPTOR ANTAGONIST
pert_id,pert_idose,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BRD-A00147595,1.11 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00147595,10 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00147595,3.33 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00218260,1.11 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00218260,10 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Run the packaged data generation script for version 1
Run `dataprep.make_data_v1()`; its printed summary matches the output of the manual steps above.

In [9]:
# Run the packaged version-1 data generation entry point. The log it prints
# (below) is identical to the one produced by the manual steps earlier.
# NOTE(review): presumably this also persists the generated splits that
# dataload.load_data_v1() reads later — confirm in MOA_L1000.dataprep.
dataprep.make_data_v1()

cell_id
A375    1542.0
HA1E    1542.0
HELA    1544.0
YAPC    1547.0
PC3     1549.0
MCF7    1552.0
HT29    1554.0
dtype: float64
Index(['A375', 'HA1E', 'HELA', 'YAPC', 'PC3', 'MCF7', 'HT29'], dtype='object', name='cell_id')
cell_id
HELA    1548.0
PC3     1551.0
HA1E    1552.0
YAPC    1552.0
A375    1553.0
HT29    1554.0
MCF7    1554.0
dtype: float64
Index(['HELA', 'PC3', 'HA1E', 'YAPC', 'A375', 'HT29', 'MCF7'], dtype='object', name='cell_id')
cell_id
HELA    1552.0
A375    1557.0
HA1E    1557.0
PC3     1557.0
HT29    1558.0
MCF7    1558.0
YAPC    1558.0
dtype: float64
Index(['HELA', 'A375', 'HA1E', 'PC3', 'HT29', 'MCF7', 'YAPC'], dtype='object', name='cell_id')
Total number of perts found for the selected doses and cell lines: 1,555
Total number of perts with available labels: 1,418




Selecting moas that have at least 4 compounds in train and 1 compounds in holdout
Number of MOAs selected: 100
Shape of targets_train: (3367, 100)
Shape of targets_holdout: (846, 100)


## Load the generated training data 

In [12]:
import MOA_L1000.dataprep as dataprep
import MOA_L1000.dataload as dataload

# Load the version-1 dataset through the packaged loader. This rebinds
# X_train and X_holdout, which were already defined by the generation cell.
# NOTE(review): w_train is presumably a set of training weights — confirm
# against MOA_L1000.dataload.
X_train, y_train, w_train, X_holdout, y_holdout = dataload.load_data_v1()
# Display the shape of the loaded training features.
X_train.shape

(3367, 978, 7)

In [16]:
# Preview the first 20 rows of the loaded training targets.
y_train.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,PLK INHIBITOR,CCK RECEPTOR ANTAGONIST,ANDROGEN RECEPTOR AGONIST,AROMATASE INHIBITOR,IMMUNOSUPPRESSANT,PARP INHIBITOR,INTEGRIN INHIBITOR,MET INHIBITOR,CHELATING AGENT,THROMBOXANE RECEPTOR ANTAGONIST,...,VEGFR INHIBITOR,ADRENERGIC RECEPTOR AGONIST,CALCIUM CHANNEL BLOCKER,HISTAMINE RECEPTOR ANTAGONIST,DNA INHIBITOR,PHOSPHODIESTERASE INHIBITOR,DOPAMINE RECEPTOR ANTAGONIST,CYCLOOXYGENASE INHIBITOR,ADRENERGIC RECEPTOR ANTAGONIST,SEROTONIN RECEPTOR ANTAGONIST
pert_id,pert_idose,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BRD-A00147595,1.11 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00147595,10 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00147595,3.33 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00218260,1.11 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00218260,10 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00218260,3.33 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00376169,1.11 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00376169,10 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00376169,3.33 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BRD-A00546892,1.11 uM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Design a transformer that, for each sample (in each cell line), removes a median-like center across all features (the midhinge: the mean of the 25th and 75th percentiles)

In [36]:
class TransformerRemoveMed:
    '''
    Center each sample on its own midhinge, taken along axis 1.

    The midhinge is the mean of the 25th and 75th percentiles; it is used
    here as a robust, median-like estimate of the center. For the
    (samples, features, cell lines) arrays used in this notebook, axis 1 is
    the feature axis, so every (sample, cell line) profile is centered
    independently of all others.

    The transformer is stateless: nothing is learned in ``fit`` and no
    scaling is applied, only centering.
    '''

    def __init__(self):
        pass

    def fit(self, X):
        '''
        No-op kept for API symmetry with stateful transformers.

        Returns ``self`` so that ``fit(X).transform(X)`` can be chained.
        '''
        return self

    def transform(self, X):
        '''
        Subtract the per-sample midhinge from ``X``.

        Parameters
        ----------
        X : array-like with at least 2 dimensions
            The quartiles are computed along axis 1.

        Returns
        -------
        numpy.ndarray
            Array of the same shape as ``X`` with the midhinge along axis 1
            removed.
        '''
        # Compute both quartiles in one pass. keepdims=True keeps axis 1 as a
        # length-1 axis so the center broadcasts against X directly.
        q25, q75 = np.quantile(X, [0.25, 0.75], axis=1, keepdims=True)
        center = (q25 + q75) / 2
        return np.subtract(X, center)

    def fit_transform(self, X):
        '''Equivalent to ``fit(X)`` followed by ``transform(X)``.'''
        return self.fit(X).transform(X)

## Transformer that quantile-transforms each feature across all samples, separately for each cell line

In [48]:
from sklearn.preprocessing import QuantileTransformer
class TransformerQuantileTransform:
    '''
    Quantile-transform a 3-D array slice by slice along its last axis.

    One independent ``QuantileTransformer`` (normal output distribution) is
    kept per index of the last axis. Slice ``X[:, :, i]`` is fitted and
    transformed by transformer ``i``. In this notebook the last axis indexes
    cell lines, so each cell line gets its own mapping.

    Parameters
    ----------
    n_cell_lines : int, default 7
        Expected size of the last axis of ``X``; one transformer is created
        per slice.
    n_quantiles : int, default 100
        Forwarded to every ``QuantileTransformer``.
    random_state : int, RandomState instance or None, default None
        Forwarded to every ``QuantileTransformer`` for reproducibility.

    Attributes
    ----------
    quantile_transformers : list
        The per-slice transformers, in last-axis order.
    '''

    def __init__(self, n_cell_lines=7, n_quantiles=100, random_state=None):
        self.n_cell_lines = n_cell_lines
        self.quantile_transformers = [
            QuantileTransformer(
                n_quantiles=n_quantiles,
                output_distribution="normal",
                random_state=random_state,
            )
            for _ in range(n_cell_lines)
        ]

    def _check_input(self, X):
        '''Raise ValueError unless X is 3-D with ``n_cell_lines`` slices.'''
        shape = np.shape(X)
        # Fail loudly on a mismatch. Previously extra slices were silently
        # dropped and missing ones surfaced as an opaque IndexError.
        if len(shape) != 3 or shape[2] != self.n_cell_lines:
            raise ValueError(
                "Expected a 3-D array with last axis of size "
                f"{self.n_cell_lines}, got shape {shape}"
            )

    def fit(self, X):
        '''
        Fit transformer ``i`` on ``X[:, :, i]`` for every slice.

        Returns ``self`` so that ``fit(X).transform(X)`` can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 3-D or its last axis differs from ``n_cell_lines``.
        '''
        self._check_input(X)
        for i, transformer in enumerate(self.quantile_transformers):
            transformer.fit(X[:, :, i])
        return self

    def transform(self, X):
        '''
        Apply each fitted transformer to its slice and restack the results.

        Returns
        -------
        numpy.ndarray
            Array of the same shape as ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not 3-D or its last axis differs from ``n_cell_lines``.
        '''
        self._check_input(X)
        transformed_slices = [
            transformer.transform(X[:, :, i])
            for i, transformer in enumerate(self.quantile_transformers)
        ]
        # Restack along the last axis so the output layout matches the input.
        return np.stack(transformed_slices, axis=2)

    def fit_transform(self, X):
        '''Equivalent to ``fit(X)`` followed by ``transform(X)``.'''
        return self.fit(X).transform(X)




In [50]:
# Build the per-cell-line quantile transformer, then fit it on the training
# features and transform them in the same call.
tr = TransformerQuantileTransform()
XX = tr.fit_transform(X_train)