## Transfer learning from CCLE to MMRF (V6)

Last Modified: 05/17/2021

Written By: Sumi Thakur and Qingyang Xu

- Pretrain on CCLE cell line data to predict IC50

- Transfer to patient RNA-seq data to predict PFS

References

- Download patient genomic data (e.g. `MMRF_CoMMpass_IA15a_CNA_Exome_PerGene_LargestSegment.txt`)

https://research.themmrf.org/

- Download DevMap cell line data (e.g. `CCLE_expression.csv`)

https://depmap.org/portal/download/

In [1]:
#from google.colab import drive
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
#!pip install -q -U keras-tuner
from kerastuner import HyperModel
from keras import models, layers,regularizers
from tensorflow.keras.optimizers import Adam
from kerastuner.tuners.bayesian import BayesianOptimization
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
import pickle

In [2]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import neural_network
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn import metrics as skmetrics
from sklearn.decomposition import PCA

In [3]:
import MMRF_utils_V2

In [None]:
import sklearn 
print(tf.__version__, sklearn.__version__)

## 1. Pretrain on CCLE IC50 data

In [4]:
def normalize_features(train, test):
    
    print('Normalizing input features...')
    nsample, nfeature = train.shape
    
    assert nfeature == test.shape[1]

    for i in range(nfeature):
        mu = np.mean(train[:,i])
        sigma = np.std(train[:,i])
        train[:,i] -= mu
        test[:,i] -= mu
        
        if sigma > 0: # some genes may have zero variance
            train[:,i] /= sigma
            test[:,i] /= sigma
    
    return train, test

In [5]:
def preprocess_CCLE(ccle_exp, sample_info, sanger_dose_response, mapping):
    
    print('Preprocessing CCLE data...')
    
    ccle_exp = ccle_exp.rename(columns={'Unnamed: 0':'DepMap_ID'})
    cols_to_keep = ['DepMap_ID'] + list(pd.unique(mapping.HGNC_ID))
    ccle_chosen = ccle_exp[cols_to_keep]

    sanger_dose_response_filt = sanger_dose_response[(sanger_dose_response.DRUG_NAME.str.contains("BORTEZOMIB"))
                                                | (sanger_dose_response.DRUG_NAME.str.contains("LENALIDOMIDE"))
                                                ]

    sanger_dose_response_filt = sanger_dose_response_filt.rename(columns={'ARXSPAN_ID':'DepMap_ID'})
    
    merged_df = sanger_dose_response_filt.merge(ccle_chosen, on= 'DepMap_ID')[list(ccle_chosen.columns)+['DRUG_NAME','IC50_PUBLISHED']]
    merged_df['log(IC_50)'] = merged_df.IC50_PUBLISHED.apply(np.log10)
    
    merged_df_bort = merged_df[merged_df.DRUG_NAME == 'BORTEZOMIB'].drop_duplicates()
    merged_df_lenal = merged_df[merged_df.DRUG_NAME == 'LENALIDOMIDE'].drop_duplicates()
    bort_labels = merged_df_bort['log(IC_50)']
    lenal_labels = merged_df_lenal['log(IC_50)']
    bort_data = merged_df_bort.drop(columns = ['DepMap_ID','IC50_PUBLISHED','DRUG_NAME','log(IC_50)'])
    lenal_data = merged_df_lenal.drop(columns = ['DepMap_ID','IC50_PUBLISHED','DRUG_NAME','log(IC_50)'])
    
    return bort_data, bort_labels, lenal_data, lenal_labels

In [6]:
def split_train_test_CCLE(X, y, normalize=True, nPCA=0):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)
    
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    
    X_test = np.array(X_test)
    y_test = np.array(y_test)
    
    X_train, X_test = normalize_features(X_train, X_test)
    
    pca = None
    # try running PCA on raw RNAseq data
    if nPCA > 0:
        print('Running PCA-%d on gene expressions...'%nPCA)
        pca = PCA(n_components=nPCA)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
    
    assert X_train.shape[0] == y_train.shape[0]
    
    assert X_test.shape[0] == y_test.shape[0]
    
    print('Training set shape:')
    print(X_train.shape)
    
    print('Test set shape:')
    print(X_test.shape)
    
    return X_train, y_train, X_test, y_test, pca

In [7]:
def linear_regression_IC50(X_train, y_train, X_test, y_test):
    print('Running linear regression to predict IC50...')
    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    R2 = r2_score(y_test, y_pred)
    print('R2: %0.2f'%R2)

    rho, p = pearsonr(y_test, y_pred)
    print('Correlation: %0.2f'%rho)
    print('p-value: %0.5f'%p)

In [8]:
class RegressionHyperModel(HyperModel):
    def __init__(self, input_shape, enc_layer_size=5):
        self.input_shape = input_shape
        self.enc_layer_size = enc_layer_size
    def build(self, hp):
        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Dense(
                units=hp.Int('units', 40, 100, step=5, default=80),
                activation=hp.Choice(
                    'dense_activation',
                    values=['relu', 'tanh', 'sigmoid'],
                    default='relu'),
                name = 'dense1',
                input_shape = self.input_shape
            )
        )
        
        model.add(
            tf.keras.layers.Dropout(
                hp.Float(
                    'dropout',
                    min_value=0.0,
                    max_value=0.6,
                    default=0.3,
                    step=0.1),
                name='dropout1'
            )
        )
        
        model.add(
            tf.keras.layers.Dense(
                units=hp.Int('units', 10, 50, step=5, default=40),
                activation=hp.Choice(
                    'dense_activation',
                    values=['relu', 'tanh', 'sigmoid'],
                    default='relu'),
                name='dense2'
            )
        )
        
        model.add(
            tf.keras.layers.Dropout(
                hp.Float(
                    'dropout',
                    min_value=0.0,
                    max_value=0.6,
                    default=0.3,
                    step=0.1),
                name='dropout2'
            )
        )
        
        model.add(tf.keras.layers.Dense(self.enc_layer_size, name='encoder'))
        
        model.add(tf.keras.layers.Dense(1))
        
        model.compile(
            optimizer='Adam',loss='mse',metrics=['mse']
        )
        
        return model

In [9]:
def pretrain_MLP_model(X_train, y_train, X_test, y_test, params):
    
    print('Training MLP model...')

    enc_layer_size = params['enc_layer']
    
    input_shape = (X_train.shape[1],)
    print(input_shape)
    hypermodel = RegressionHyperModel(input_shape, enc_layer_size)    
    
    ### using bayesian hyperparamter optimization
    tuner_bo = BayesianOptimization(
                hypermodel,
                objective='mse',
                max_trials=10,
                seed=42,
                executions_per_trial=2
            )
    
    ### set epochs
    tuner_bo.search(X_train, y_train, epochs=10, validation_split=0.2, verbose=1)
    best_model = tuner_bo.get_best_models(num_models=1)[0]
    best_model.get_config()
  
    return best_model

In [10]:
def plot_R2_correlation(R2_his, rho_his):

    fig, ax1 = plt.subplots()

    color = 'tab:red'
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel(r'$R^2$', color=color)
    ax1.plot(R2_his, color=color)
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

    color = 'tab:blue'
    ax2.set_ylabel(r'$\rho$', color=color)  # we already handled the x-label with ax1
    ax2.plot(rho_his, color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.title(r'$R^2$ and $\rho$ during training')
    plt.show()

In [11]:
def plot_losses(train_loss,val_loss):

    plt.yscale('log')
    plt.plot(train_loss[1:], label='Train Loss')
    plt.plot(val_loss[1:], label='Valid Loss')
    plt.legend()
    plt.title('Train vs. Valid Losses during Training')
    plt.xlabel('Epoch')
    plt.show()

## 2. Transfer to MMRF patient data

In [12]:
def preprocess_MMRF_genomic(genomic_fn, CCLE_cols, mapping):

    print('\nPreprocess patient genomic data in MMRF...')
    genomic_df = MMRF_utils_V2.preprocess_genomic_data(genomic_fn, nPCA=0)

    mapping_dict = {}

    # create mapping dict from Ensembl to HGNC
    for i in range(mapping.shape[0]):
        eid = mapping['Ensembl_ID'].iloc[i]
        hid = mapping['HGNC_ID'].iloc[i]
        mapping_dict[eid] = hid
        
    # average over columns with the same Ensembl ID
    gene_df = genomic_df[mapping['Ensembl_ID']]
    pids = genomic_df['pids']

    HGNC_cols = [mapping_dict[eid] for eid in gene_df.columns]
    gene_df.columns = HGNC_cols
    # average over columns with the same HGNC name
    gene_df = gene_df.groupby(by=gene_df.columns, axis=1).mean()
    
    gene_df = gene_df[CCLE_cols]
    
    # normalize each gene
    for col in gene_df.columns:
        mu = gene_df[col].mean()
        sigma = gene_df[col].std()
        gene_df[col] -= mu
        if sigma>0: gene_df[col] /= sigma
    
    return gene_df, pids

## 3. Comparing AUC of three approaches

In [13]:
# no grid search
def predict_PFS(clf, model_name, train_X, train_y, test_X, test_y, training=True):
    
    if training:
        print('Fitting model %s...'%model_name)
        clf.fit(train_X, train_y)

    # predict PoS in test set
    print('Predicting on test set...')
    y_pred = clf.predict_proba(test_X)[:,1]

    auc = skmetrics.roc_auc_score(test_y, y_pred)
    f1 = skmetrics.f1_score(test_y, y_pred.round())
    pres = skmetrics.precision_score(test_y, y_pred.round())
    rec = skmetrics.recall_score(test_y, y_pred.round())
    acc = skmetrics.accuracy_score(test_y, y_pred.round())

    print('Accuracy: %0.2f'%acc)
    print('Precision: %0.2f'%pres)
    print('Recall: %0.2f'%rec)
    print('F1 score: %0.2f'%f1)
    print('AUC: %0.2f'%auc)
    print('')
    
    return clf

In [14]:
def predict_PFS_grid_search(clf, model_name, train_X, train_y, test_X, test_y):
    
    print('Commencing grid search for %s'%model_name)

    grid = {"max_depth": [10,20,50],
            "n_estimators":[1000,5000]}
        
    clf_search = RandomizedSearchCV(clf, param_distributions=grid, cv=5, random_state=50, scoring='roc_auc')
    clf_search.fit(train_X, train_y)

    print('Random search complete, fitting model %s...'%model_name)
    best_model =  clf_search.best_estimator_
    best_model.fit(train_X, train_y)

    # predict PoS in test set
    print('Predicting on test set...')
    y_pred = best_model.predict_proba(test_X)[:,1]

    auc = skmetrics.roc_auc_score(test_y, y_pred)
    f1 = skmetrics.f1_score(test_y, y_pred.round())
    pres = skmetrics.precision_score(test_y, y_pred.round())
    rec = skmetrics.recall_score(test_y, y_pred.round())
    acc = skmetrics.accuracy_score(test_y, y_pred.round())

    print('Accuracy: %0.2f'%acc)
    print('Precision: %0.2f'%pres)
    print('Recall: %0.2f'%rec)
    print('F1 score: %0.2f'%f1)
    print('AUC: %0.2f'%auc)
    print('')
    
    return best_model

### 3.1 Only use patient data

In [15]:
def stratify_patients(data, drug='All', option='', mmrf_asct=None):
    
    if drug != 'All':
        data = data[data[drug+'1']==1]
        
    if option == 'ASCT':
        data = data.merge(mmrf_asct[mmrf_asct['ASCT']==1], left_on='pids', right_on='pids')
        data.pop('ASCT')
    
    if option == 'NON-ASCT':
        data = data.merge(mmrf_asct[mmrf_asct['ASCT']==0], left_on='pids', right_on='pids')
        data.pop('ASCT')

    if 'age' in option:
        option = option.split('_')[1]
        if '+' in option:
            age = int(option[:-1])
            print('Age above %d'%age)            
            data = data[data['age']>= age]
            
        if '-' in option:
            age = int(option[:-1])
            print('Age below %d'%age)            
            data = data[data['age']< age]
        
        
    return data

In [16]:
def predict_pfs_patient_only(train_df,valid_df,test_df, clf, model_name='',
                             train_drug='All', test_drug='All', option='', mmrf_asct=None, grid_search=False, training=True):

    print('Predicting PFS using patient clinical data only...')
    
    assert train_drug in ['Bor','Len','All'] and test_drug in ['Bor','Len','All']
    
    print('Predicting patients with (%s, %s) as first line treatment...'%(train_drug,test_drug))
    
    # no genomic data
    train = pd.concat([train_df,valid_df])
    test = test_df
    
    train = stratify_patients(train, drug=train_drug, option=option, mmrf_asct=mmrf_asct)
    test = stratify_patients(test, drug=test_drug, option=option, mmrf_asct=mmrf_asct)
        
    print(train.shape)    
    print(test.shape)
    
    pfs_thresh = 12
    train_y = np.array(train.pop('pfs'))
    train_y = np.array([int(pfs>pfs_thresh) for pfs in train_y])
    p1 = train.pop('pids')
    train_X = np.array(train)

    test_y = np.array(test.pop('pfs'))
    test_y = np.array([int(pfs>pfs_thresh) for pfs in test_y])
    p0 = test.pop('pids')
    test_X = np.array(test)
    
    if grid_search:
        clf_fit = predict_PFS_grid_search(clf, model_name, train_X, train_y, test_X, test_y)
    else:
        clf_fit = predict_PFS(clf, model_name, train_X, train_y, test_X, test_y, training)
        
    return clf_fit

### 3.2 PCA on raw RNA

In [17]:
def predict_pfs_patient_PCA(genomic_fn, nPCA, train_df,valid_df, test_df, clf, model_name='',
                            train_drug='All', test_drug='Bor', option='', mmrf_asct=None, grid_search=False, training=True):
    
    print('Predicting PFS using patient clinical data and PCA-%d on RNAseq...'%nPCA)
    
    assert nPCA > 0    
    assert train_drug in ['Bor','Len','All'] and test_drug in ['Bor','Len','All']
    
    print('Predicting patients with (%s, %s) as first line treatment...'%(train_drug,test_drug))
    
    print('\nPreprocess patient genomic data in MMRF...')
    
    genomic_df = MMRF_utils_V2.preprocess_genomic_data(genomic_fn, nPCA=nPCA)
    print(genomic_df.shape)
    
    print('\nMerging patient data with genomic data...')
    train_patient_genomic = train_df.merge(genomic_df, left_on='pids', right_on='pids')
    print(train_patient_genomic.shape)

    valid_patient_genomic = valid_df.merge(genomic_df, left_on='pids', right_on='pids')
    print(valid_patient_genomic.shape)

    test_patient_genomic = test_df.merge(genomic_df, left_on='pids', right_on='pids')
    print(test_patient_genomic.shape)
    
    train = pd.concat([train_patient_genomic,valid_patient_genomic])
    train = train
    test = test_patient_genomic
    
    train = stratify_patients(train, drug=train_drug, option=option, mmrf_asct=mmrf_asct)
    test = stratify_patients(test, drug=test_drug, option=option, mmrf_asct=mmrf_asct)

    print(train.shape)
    print(test.shape)

    pfs_thresh = 12
    train_y = np.array(train.pop('pfs'))
    train_y = np.array([int(pfs>pfs_thresh) for pfs in train_y])
    p1 = train.pop('pids')
    train_X = np.array(train)

    test_y = np.array(test.pop('pfs'))
    test_y = np.array([int(pfs>pfs_thresh) for pfs in test_y])
    p0 = test.pop('pids')
    test_X = np.array(test)
    
    if grid_search:
        clf_fit = predict_PFS_grid_search(clf, model_name, train_X, train_y, test_X, test_y)
    else:
        clf_fit = predict_PFS(clf, model_name, train_X, train_y, test_X, test_y, training)

    return clf_fit

### 3.3 Transfer from CCLE

In [18]:
def predict_pfs_patient_CCLE(enc_gene_df, train_df,valid_df, test_df, clf, model_name='',
                            train_drug='All', test_drug='Bor', option='', mmrf_asct=None, grid_search=False, training=True):
    
    print('\nPredicing PFS with transfer model from CCLE...')
    
    print('\nMerging patient data with genomic data...')
    train_patient_genomic = train_df.merge(enc_gene_df, left_on='pids', right_on='pids')
    print(train_patient_genomic.shape)

    valid_patient_genomic = valid_df.merge(enc_gene_df, left_on='pids', right_on='pids')
    print(valid_patient_genomic.shape)

    test_patient_genomic = test_df.merge(enc_gene_df, left_on='pids', right_on='pids')
    print(test_patient_genomic.shape)
    
    train = pd.concat([train_patient_genomic,valid_patient_genomic])
    test = test_patient_genomic
       
    train = stratify_patients(train, drug=train_drug, option=option, mmrf_asct=mmrf_asct)
    test = stratify_patients(test, drug=test_drug, option=option, mmrf_asct=mmrf_asct)
    
    print(train.shape)
    print(test.shape)
    
    pfs_thresh = 12
    train_y = np.array(train.pop('pfs'))
    train_y = np.array([int(pfs>pfs_thresh) for pfs in train_y])
    p1 = train.pop('pids')
    train_X = np.array(train)

    test_y = np.array(test.pop('pfs'))
    test_y = np.array([int(pfs>pfs_thresh) for pfs in test_y])
    p0 = test.pop('pids')
    test_X = np.array(test)
    
    if grid_search:
        clf_fit = predict_PFS_grid_search(clf, model_name, train_X, train_y, test_X, test_y)
    else:
        clf_fit = predict_PFS(clf, model_name, train_X, train_y, test_X, test_y, training)

    return clf_fit

### 3.4 Auto-encoder for RNA-seq

In [19]:
def predict_pfs_patient_AE(ae_dict, ae_type, train_df, valid_df, test_df, clf, model_name='',
                             train_drug='All', test_drug='Bor', option='', mmrf_asct=None, grid_search=False, training=True):
        
    print('\nPredicing PFS with auto-encoded RNAseq...')

    print('Using auto-encoder %s'%ae_type)
    
    train_ae = ae_dict['/mmrf_train_%s'%ae_type]
    valid_ae = ae_dict['/mmrf_valid_%s'%ae_type]
    test_ae = ae_dict['/mmrf_test_%s'%ae_type]

    train_ae_df = train_df.merge(train_ae, left_on='pids', right_on='pids')
    print(train_ae_df.shape)

    valid_ae_df = valid_df.merge(valid_ae, left_on='pids', right_on='pids')
    print(valid_ae_df.shape)

    test_ae_df = test_df.merge(test_ae, left_on='pids', right_on='pids')
    print(test_ae_df.shape)
    
    train = pd.concat([train_ae_df,valid_ae_df])
    test = test_ae_df

    train = stratify_patients(train, drug=train_drug, option=option, mmrf_asct=mmrf_asct)
    test = stratify_patients(test, drug=test_drug, option=option, mmrf_asct=mmrf_asct)
    
    print(train.shape)
    print(test.shape)
    
    pfs_thresh = 12
    train_y = np.array(train.pop('pfs'))
    train_y = np.array([int(pfs>pfs_thresh) for pfs in train_y])
    p1 = train.pop('pids')
    train_X = np.array(train)

    test_y = np.array(test.pop('pfs'))
    test_y = np.array([int(pfs>pfs_thresh) for pfs in test_y])
    p0 = test.pop('pids')
    test_X = np.array(test)
    
    if grid_search:
        clf_fit = predict_PFS_grid_search(clf, model_name, train_X, train_y, test_X, test_y)
    else:
        clf_fit = predict_PFS(clf, model_name, train_X, train_y, test_X, test_y, training)

    return clf_fit

## 3.5 PCA + CCLE

In [20]:
def predict_pfs_patient_PCA_CCLE(enc_gene_df, genomic_fn, nPCA, train_df, valid_df, test_df, clf, model_name='',
                            train_drug='All', test_drug='Bor', option='', mmrf_asct=None, grid_search=False, training=True):
    
    print('Predicting PFS using patient clinical data and PCA-%d on RNAseq...'%nPCA)
    
    assert nPCA > 0    
    assert train_drug in ['Bor','Len','All'] and test_drug in ['Bor','Len','All']
    
    print('Predicting patients with (%s, %s) as first line treatment...'%(train_drug,test_drug))
    
    print('\nPreprocess patient genomic data in MMRF...')
    
    genomic_df = MMRF_utils_V2.preprocess_genomic_data(genomic_fn, nPCA=nPCA)
    print(genomic_df.shape)
    
    print('\nMerging patient data with genomic data...')
    train_df = train_df.merge(genomic_df, left_on='pids', right_on='pids')
    train_df = train_df.merge(enc_gene_df, left_on='pids', right_on='pids')
    print(train_df.shape)
    
    valid_df = valid_df.merge(genomic_df, left_on='pids', right_on='pids')
    valid_df = valid_df.merge(enc_gene_df, left_on='pids', right_on='pids')
    print(valid_df.shape)
    
    test_df = test_df.merge(genomic_df, left_on='pids', right_on='pids')
    test_df = test_df.merge(enc_gene_df, left_on='pids', right_on='pids')
    print(test_df.shape)
    
    train = pd.concat([train_df,valid_df])
    test = test_df
    
    train = stratify_patients(train, drug=train_drug, option=option, mmrf_asct=mmrf_asct)
    test = stratify_patients(test, drug=test_drug, option=option, mmrf_asct=mmrf_asct)

    print(train.shape)
    print(test.shape)

    pfs_thresh = 12
    train_y = np.array(train.pop('pfs'))
    train_y = np.array([int(pfs>pfs_thresh) for pfs in train_y])
    p1 = train.pop('pids')
    train_X = np.array(train)

    test_y = np.array(test.pop('pfs'))
    test_y = np.array([int(pfs>pfs_thresh) for pfs in test_y])
    p0 = test.pop('pids')
    test_X = np.array(test)
    
    if grid_search:
        clf_fit = predict_PFS_grid_search(clf, model_name, train_X, train_y, test_X, test_y)
    else:
        clf_fit = predict_PFS(clf, model_name, train_X, train_y, test_X, test_y, training)

    return clf_fit

## 4. Run experiments

In [21]:
print('Testing transfer learning pipeline from CCLE to MMRF...')
    
print('Reading data files...')
    
data_dir = './data/' # Enter the path to your folder here
    
# have the following files in your data folder
ind = 1
mmrf_filename = data_dir+'cleaned_mm%d_2mos_pfs_ind.pkl'%(ind)
    
ccle_exp = pd.read_csv(data_dir+"CCLE_expression.csv")
sample_info = pd.read_csv(data_dir+"sample_info.csv")
sanger_dose_response = pd.read_csv(data_dir+"sanger-dose-response.csv")
mapping = pd.read_csv(data_dir+'Ensembl_HGNC_map_042421.csv')
   
genomic_fn = data_dir+'MMRF_CoMMpass_IA15a_E74GTF_Salmon_Gene_TPM.txt'

mmrf_asct = pd.read_csv('./data/MMRF_ASCT.csv')
mmrf_asct = mmrf_asct[['pids','ASCT']]

Testing transfer learning pipeline from CCLE to MMRF...
Reading data files...


In [22]:
print('\nPart I. Pretraining on CCLE IC50 data')
    
bort_data, bort_labels, lenal_data, lenal_labels = preprocess_CCLE(ccle_exp, sample_info, sanger_dose_response, mapping)

X_train, y_train, X_test, y_test, pca = split_train_test_CCLE(bort_data, bort_labels, normalize=True, nPCA=100)

#linear_regression_IC50(X_train, y_train, X_test, y_test) # linear regression baseline


Part I. Pretraining on CCLE IC50 data
Preprocessing CCLE data...
Normalizing input features...
Running PCA-100 on gene expressions...
Training set shape:
(560, 100)
Test set shape:
(241, 100)


In [23]:
# pretrain a model with encoding layer size 5
MLP_params = {'niter':30, 'nepoch':10,'batch_size':32,'hidden_layers':[50,20,10],'lr':1e-3, 'enc_layer':5}
pretrain_model = pretrain_MLP_model(X_train, y_train, X_test, y_test, MLP_params)

Trial 10 Complete [00h 00m 05s]
mse: 0.45137323439121246

Best mse So Far: 0.11537745967507362
Total elapsed time: 00h 01m 00s
INFO:tensorflow:Oracle triggered exit


In [24]:
pretrain_model.save("./pretrain_model_051721")
pretrain_model.get_config()

INFO:tensorflow:Assets written to: ./pretrain_model_051721\assets


{'name': 'sequential',
 'layers': [{'class_name': 'InputLayer',
   'config': {'batch_input_shape': (None, 100),
    'dtype': 'float32',
    'sparse': False,
    'ragged': False,
    'name': 'dense1_input'}},
  {'class_name': 'Dense',
   'config': {'name': 'dense1',
    'trainable': True,
    'batch_input_shape': (None, 100),
    'dtype': 'float32',
    'units': 100,
    'activation': 'sigmoid',
    'use_bias': True,
    'kernel_initializer': {'class_name': 'GlorotUniform',
     'config': {'seed': None}},
    'bias_initializer': {'class_name': 'Zeros', 'config': {}},
    'kernel_regularizer': None,
    'bias_regularizer': None,
    'activity_regularizer': None,
    'kernel_constraint': None,
    'bias_constraint': None}},
  {'class_name': 'Dropout',
   'config': {'name': 'dropout1',
    'trainable': True,
    'dtype': 'float32',
    'rate': 0.0,
    'noise_shape': None,
    'seed': None}},
  {'class_name': 'Dense',
   'config': {'name': 'dense2',
    'trainable': True,
    'dtype': 'flo

In [25]:
# pretrain a second model with encoding layer size 10
MLP_params = {'niter':30, 'nepoch':10,'batch_size':32,'hidden_layers':[50,20,10],'lr':1e-3, 'enc_layer':10}
pretrain_model2 = pretrain_MLP_model(X_train, y_train, X_test, y_test, MLP_params)

Trial 10 Complete [00h 00m 05s]
mse: 37.830315589904785

Best mse So Far: 0.13267181813716888
Total elapsed time: 00h 01m 06s
INFO:tensorflow:Oracle triggered exit


In [26]:
pretrain_model2.save("./pretrain_model2_051721")
pretrain_model2.get_config()

INFO:tensorflow:Assets written to: ./pretrain_model2_051721\assets


{'name': 'sequential',
 'layers': [{'class_name': 'InputLayer',
   'config': {'batch_input_shape': (None, 100),
    'dtype': 'float32',
    'sparse': False,
    'ragged': False,
    'name': 'dense1_input'}},
  {'class_name': 'Dense',
   'config': {'name': 'dense1',
    'trainable': True,
    'batch_input_shape': (None, 100),
    'dtype': 'float32',
    'units': 100,
    'activation': 'sigmoid',
    'use_bias': True,
    'kernel_initializer': {'class_name': 'GlorotUniform',
     'config': {'seed': None}},
    'bias_initializer': {'class_name': 'Zeros', 'config': {}},
    'kernel_regularizer': None,
    'bias_regularizer': None,
    'activity_regularizer': None,
    'kernel_constraint': None,
    'bias_constraint': None}},
  {'class_name': 'Dropout',
   'config': {'name': 'dropout1',
    'trainable': True,
    'dtype': 'float32',
    'rate': 0.0,
    'noise_shape': None,
    'seed': None}},
  {'class_name': 'Dense',
   'config': {'name': 'dense2',
    'trainable': True,
    'dtype': 'flo

In [27]:
print('\nPart II. Preprocess MMRF genomic data')

# run PCA on MMRF genomic data and use IC50 model in Part I to encode
gene_df, pids = preprocess_MMRF_genomic(genomic_fn, bort_data.columns, mapping)
pca_gene = pca.transform(gene_df)


Part II. Preprocess MMRF genomic data

Preprocess patient genomic data in MMRF...
Reading ./data/MMRF_CoMMpass_IA15a_E74GTF_Salmon_Gene_TPM.txt
Raw data shape:
(57997, 922)
0 out of 57997 genes have missing data 
Using raw genomic data...
Created new dataframe...


In [28]:
# encoded genes for the first CCLE IC50 model
layer_name = 'encoder'
encoder = tf.keras.Model(inputs=pretrain_model.input,
                         outputs=pretrain_model.get_layer(layer_name).output)
enc_gene = encoder.predict(pca_gene)
    
enc_gene_df = pd.DataFrame(data=enc_gene, columns=['Enc%d'%(i+1) for i in range(enc_gene.shape[1])])
enc_gene_df['pids'] = pids

In [29]:
enc_gene.shape

(774, 5)

In [30]:
# encoded genes for the second CCLE IC50 model
layer_name = 'encoder'
encoder2 = tf.keras.Model(inputs=pretrain_model2.input,
                         outputs=pretrain_model2.get_layer(layer_name).output)
enc_gene2 = encoder2.predict(pca_gene)

enc_gene_df2 = pd.DataFrame(data=enc_gene2, columns=['Enc%d'%(i+1) for i in range(enc_gene2.shape[1])])
enc_gene_df2['pids'] = pids

enc_gene2.shape

(774, 10)

In [31]:
#data_dir = './data/'
#ind = 1
#mmrf_filename = data_dir+'cleaned_mm%d_2mos_pfs_ind.pkl'%(ind)

print('\nPart III. Predict PFS on MMRF patient data')
train, test, valid = MMRF_utils_V2.get_train_test_valid(mmrf_filename, ind, show_features=True)

print('\nPreprocess patient data in MMRF')
# include the first n clinical visit data for each patient (default 0)
train_df = MMRF_utils_V2.preprocess_patient_data(train, num_clin_visits=1)  
print(train_df.shape)

valid_df = MMRF_utils_V2.preprocess_patient_data(valid, num_clin_visits=1)  
print(valid_df.shape)

test_df = MMRF_utils_V2.preprocess_patient_data(test, num_clin_visits=1)  
print(test_df.shape)

print(train_df.columns)


Part III. Predict PFS on MMRF patient data
loading from: ./data/cleaned_mm1_2mos_pfs_ind.pkl
pids
(494,)
x
(494, 33, 16)
m
(494, 33, 16)
feature_names_x
(16,)
['cbc_abs_neut' 'chem_albumin' 'chem_bun' 'chem_calcium' 'chem_creatinine'
 'chem_glucose' 'cbc_hemoglobin' 'serum_kappa' 'serum_m_protein'
 'cbc_platelet' 'chem_totprot' 'cbc_wbc' 'serum_iga' 'serum_igg'
 'serum_igm' 'serum_lambda']
ys_seq
(494, 1)
ce
(494, 1)
feature_names_y
(1,)
['progression free survival (all)']
b
(494, 16)
feature_names
(16,)
Index(['iss', 'age', 'gender', 'ecog', 'serum_beta2_microglobulin', 'PC1',
       'PC2', 'PC3', 'PC4', 'PC5', 'heavy_chain', 'igg_type', 'iga_type',
       'igm_type', 'kappa_type', 'lambda_type'],
      dtype='object')
a
(494, 33, 9)
m_a
(494, 33, 6)
feature_names_a
(9,)
['local_clock' 'Bor' 'Car' 'Cyc' 'Dex' 'Len' 'line1' 'line2' 'line3plus']

Preprocess patient data in MMRF
Keep first 1 clinical visits.
(461, 27)
Keep first 1 clinical visits.
(161, 27)
Keep first 1 clinical visits.

### 4.1 Only use patient data

In [32]:
print('\nComparing prediction accuracy of different models... \n')
    
# Model 1: patient data only
print('\n Part I \n')
clf1 = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)    
clf1 = predict_pfs_patient_only(train_df,valid_df,test_df, clf1, model_name='RF',
                             train_drug='Bor', test_drug='Bor', option='', mmrf_asct=None, grid_search=True, training=True)


Comparing prediction accuracy of different models... 


 Part I 

Predicting PFS using patient clinical data only...
Predicting patients with (Bor, Bor) as first line treatment...
(526, 27)
(170, 27)
Commencing grid search for RF




Random search complete, fitting model RF...
Predicting on test set...
Accuracy: 0.59
Precision: 0.60
Recall: 0.86
F1 score: 0.71
AUC: 0.58



In [33]:
print('Predicting patients with ASCT')
_ = predict_pfs_patient_only(train_df,valid_df,test_df, clf1, model_name='RF',
                             train_drug='Bor', test_drug='Bor', option='ASCT', mmrf_asct=mmrf_asct, training=False)

print('Predicting patients without ASCT')
_ = predict_pfs_patient_only(train_df,valid_df,test_df, clf1, model_name='RF',
                             train_drug='Bor', test_drug='Bor', option='NON-ASCT', mmrf_asct=mmrf_asct, training=False)

Predicting patients with ASCT
Predicting PFS using patient clinical data only...
Predicting patients with (Bor, Bor) as first line treatment...
(328, 27)
(96, 27)
Predicting on test set...
Accuracy: 0.67
Precision: 0.67
Recall: 0.93
F1 score: 0.78
AUC: 0.54

Predicting patients without ASCT
Predicting PFS using patient clinical data only...
Predicting patients with (Bor, Bor) as first line treatment...
(198, 27)
(74, 27)
Predicting on test set...
Accuracy: 0.50
Precision: 0.49
Recall: 0.75
F1 score: 0.59
AUC: 0.57



In [34]:
print('Best model for Part I:')
clf1.get_params()

Best model for Part I:


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 5000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

### 4.2 Use patient data and PCA on raw RNA-seq

In [35]:
# Model 2: patient data with PCA on MMRF RNAseq
print('\n Part II \n')

nPCA = 50
clf2 = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
clf2 = predict_pfs_patient_PCA(genomic_fn, nPCA, train_df, valid_df, test_df, clf2, model_name='RF',
                            train_drug='Bor', test_drug='Bor', option='', mmrf_asct=None, grid_search=True, training=True)


 Part II 

Predicting PFS using patient clinical data and PCA-50 on RNAseq...
Predicting patients with (Bor, Bor) as first line treatment...

Preprocess patient genomic data in MMRF...
Reading ./data/MMRF_CoMMpass_IA15a_E74GTF_Salmon_Gene_TPM.txt
Raw data shape:
(57997, 922)
0 out of 57997 genes have missing data 
Running PCA with 50 principal components...
Finished PCA!
Created new dataframe...
(774, 51)

Merging patient data with genomic data...
(313, 77)
(97, 77)
(142, 77)
(345, 77)
(112, 77)
Commencing grid search for RF




Random search complete, fitting model RF...
Predicting on test set...
Accuracy: 0.60
Precision: 0.58
Recall: 0.95
F1 score: 0.72
AUC: 0.66



In [36]:
_ = predict_pfs_patient_PCA(genomic_fn, nPCA, train_df, valid_df, test_df, clf2, model_name='RF',
                            train_drug='Bor', test_drug='Bor', option='ASCT', mmrf_asct=mmrf_asct, grid_search=False, training=False)

Predicting PFS using patient clinical data and PCA-50 on RNAseq...
Predicting patients with (Bor, Bor) as first line treatment...

Preprocess patient genomic data in MMRF...
Reading ./data/MMRF_CoMMpass_IA15a_E74GTF_Salmon_Gene_TPM.txt
Raw data shape:
(57997, 922)
0 out of 57997 genes have missing data 
Running PCA with 50 principal components...
Finished PCA!
Created new dataframe...
(774, 51)

Merging patient data with genomic data...
(313, 77)
(97, 77)
(142, 77)
(218, 77)
(58, 77)
Predicting on test set...
Accuracy: 0.67
Precision: 0.68
Recall: 0.95
F1 score: 0.79
AUC: 0.66



In [37]:
_ = predict_pfs_patient_PCA(genomic_fn, nPCA, train_df, valid_df, test_df, clf2, model_name='RF',
                            train_drug='Bor', test_drug='Bor', option='NON-ASCT', mmrf_asct=mmrf_asct, grid_search=False, training=False)

Predicting PFS using patient clinical data and PCA-50 on RNAseq...
Predicting patients with (Bor, Bor) as first line treatment...

Preprocess patient genomic data in MMRF...
Reading ./data/MMRF_CoMMpass_IA15a_E74GTF_Salmon_Gene_TPM.txt
Raw data shape:
(57997, 922)
0 out of 57997 genes have missing data 
Running PCA with 50 principal components...
Finished PCA!
Created new dataframe...
(774, 51)

Merging patient data with genomic data...
(313, 77)
(97, 77)
(142, 77)
(127, 77)
(54, 77)
Predicting on test set...
Accuracy: 0.52
Precision: 0.48
Recall: 0.96
F1 score: 0.64
AUC: 0.66



In [38]:
print('Best model for Part II:')

clf2.get_params()

Best model for Part II:


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 20,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

### 4.3 Transfer from CCLE IC50 Predictions

In [39]:
print('\n Part III \n')
print('First IC50 model\n')
    
clf3 = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
clf3 = predict_pfs_patient_CCLE(enc_gene_df, train_df, valid_df, test_df, clf3, model_name='RF',
                            train_drug='Bor', test_drug='Bor', option='', mmrf_asct=None, grid_search=True, training=True)


 Part III 

First IC50 model


Predicing PFS with transfer model from CCLE...

Merging patient data with genomic data...
(313, 32)
(97, 32)
(142, 32)
(345, 32)
(112, 32)
Commencing grid search for RF




Random search complete, fitting model RF...
Predicting on test set...
Accuracy: 0.61
Precision: 0.60
Recall: 0.87
F1 score: 0.71
AUC: 0.65



In [40]:
_ = predict_pfs_patient_CCLE(enc_gene_df, train_df, valid_df, test_df, clf3, model_name='RF',
                            train_drug='Bor', test_drug='Bor', option='ASCT', mmrf_asct=mmrf_asct, grid_search=False, training=False)

_ = predict_pfs_patient_CCLE(enc_gene_df, train_df, valid_df, test_df, clf3, model_name='RF',
                            train_drug='Bor', test_drug='Bor', option='NON-ASCT', mmrf_asct=mmrf_asct, grid_search=False, training=False)


Predicing PFS with transfer model from CCLE...

Merging patient data with genomic data...
(313, 32)
(97, 32)
(142, 32)
(218, 32)
(58, 32)
Predicting on test set...
Accuracy: 0.67
Precision: 0.69
Recall: 0.89
F1 score: 0.78
AUC: 0.67


Predicing PFS with transfer model from CCLE...

Merging patient data with genomic data...
(313, 32)
(97, 32)
(142, 32)
(127, 32)
(54, 32)
Predicting on test set...
Accuracy: 0.54
Precision: 0.49
Recall: 0.83
F1 score: 0.62
AUC: 0.58



In [43]:
clf3.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 5000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [41]:
print('\n Part III \n')
print('Second IC50 model\n')
clf3_2 = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
clf3_2 = predict_pfs_patient_CCLE(enc_gene_df2, train_df, valid_df, test_df, clf3_2, model_name='RF',
                            train_drug='Bor', test_drug='Bor', option='', mmrf_asct=None, grid_search=True, training=True)


 Part III 

Second IC50 model


Predicing PFS with transfer model from CCLE...

Merging patient data with genomic data...
(313, 37)
(97, 37)
(142, 37)
(345, 37)
(112, 37)
Commencing grid search for RF




Random search complete, fitting model RF...
Predicting on test set...
Accuracy: 0.59
Precision: 0.59
Recall: 0.85
F1 score: 0.70
AUC: 0.63



In [42]:
_ = predict_pfs_patient_CCLE(enc_gene_df2, train_df, valid_df, test_df, clf3_2, model_name='RF',
                            train_drug='Bor', test_drug='Bor', option='ASCT', mmrf_asct=mmrf_asct, grid_search=False, training=False)

_ = predict_pfs_patient_CCLE(enc_gene_df2, train_df, valid_df, test_df, clf3_2, model_name='RF',
                            train_drug='Bor', test_drug='Bor', option='NON-ASCT', mmrf_asct=mmrf_asct, grid_search=False, training=False)


Predicing PFS with transfer model from CCLE...

Merging patient data with genomic data...
(313, 37)
(97, 37)
(142, 37)
(218, 37)
(58, 37)
Predicting on test set...
Accuracy: 0.64
Precision: 0.68
Recall: 0.84
F1 score: 0.75
AUC: 0.58


Predicing PFS with transfer model from CCLE...

Merging patient data with genomic data...
(313, 37)
(97, 37)
(142, 37)
(127, 37)
(54, 37)
Predicting on test set...
Accuracy: 0.54
Precision: 0.49
Recall: 0.88
F1 score: 0.63
AUC: 0.65



In [44]:
clf3_2.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 5000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

### 4.4 Transfer from auto-encoded RNA-seq

In [45]:
# Model 4: Auto-encoder for CCLE/MMRF
print('\nPart IV. Predict PFS with auto-encoded RNA-seq')

ae_fn = data_dir + 'normalized_embeddings_64.pkl' # auto-encoder of RNA
ae_dict = pickle.load(open(ae_fn, "rb" ))
ae_type = 'combinedae' # other types: ['mmrfae', 'ccleae', 'combinedae']

clf4 = RandomForestClassifier(n_estimators=1000, max_depth=50, random_state=0)

clf4 = predict_pfs_patient_AE(ae_dict, ae_type, train_df, valid_df, test_df, clf4, model_name='RF',
                        train_drug='Bor', test_drug='Bor', option='', mmrf_asct=None, grid_search=True, training=True)


Part IV. Predict PFS with auto-encoded RNA-seq

Predicing PFS with auto-encoded RNAseq...
Using auto-encoder combinedae
(313, 91)
(97, 91)
(142, 91)
(345, 91)
(112, 91)
Commencing grid search for RF




Random search complete, fitting model RF...
Predicting on test set...
Accuracy: 0.57
Precision: 0.57
Recall: 0.87
F1 score: 0.69
AUC: 0.62



In [46]:
_ = predict_pfs_patient_AE(ae_dict, ae_type, train_df, valid_df, test_df, clf4, model_name='RF',
                        train_drug='Bor', test_drug='Bor', option='ASCT', mmrf_asct=mmrf_asct, grid_search=False, training=False)


Predicing PFS with auto-encoded RNAseq...
Using auto-encoder combinedae
(313, 91)
(97, 91)
(142, 91)
(218, 91)
(58, 91)
Predicting on test set...
Accuracy: 0.66
Precision: 0.69
Recall: 0.87
F1 score: 0.77
AUC: 0.63



In [47]:
_ = predict_pfs_patient_AE(ae_dict, ae_type, train_df, valid_df, test_df, clf4, model_name='RF',
                        train_drug='Bor', test_drug='Bor', option='NON-ASCT', mmrf_asct=mmrf_asct, grid_search=False, training=False)


Predicing PFS with auto-encoded RNAseq...
Using auto-encoder combinedae
(313, 91)
(97, 91)
(142, 91)
(127, 91)
(54, 91)
Predicting on test set...
Accuracy: 0.48
Precision: 0.46
Recall: 0.88
F1 score: 0.60
AUC: 0.60



In [48]:
print('Best model for Part IV (Latent 64):')

clf4.get_params()

Best model for Part IV (Latent 64):


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 20,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 5000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

In [49]:
# Model 4: Auto-encoder for CCLE/MMRF
# Try the AE with latent dimension 128
print('\n Part IV \n')

ae_fn = data_dir + 'normalized_embeddings_128.pkl' # auto-encoder of RNA
ae_dict = pickle.load(open(ae_fn, "rb" ))
ae_type = 'combinedae' # other types: ['mmrfae', 'ccleae', 'combinedae']

clf4_2 = RandomForestClassifier(n_estimators=1000, max_depth=50, random_state=0)

clf4_2 = predict_pfs_patient_AE(ae_dict, ae_type, train_df, valid_df, test_df, clf4_2, model_name='RF',
                        train_drug='Bor', test_drug='Bor', option='', mmrf_asct=None, grid_search=True, training=True)


 Part IV 


Predicing PFS with auto-encoded RNAseq...
Using auto-encoder combinedae
(313, 155)
(97, 155)
(142, 155)
(345, 155)
(112, 155)
Commencing grid search for RF




Random search complete, fitting model RF...
Predicting on test set...
Accuracy: 0.59
Precision: 0.59
Recall: 0.87
F1 score: 0.70
AUC: 0.65



In [50]:
_ = predict_pfs_patient_AE(ae_dict, ae_type, train_df, valid_df, test_df, clf4_2, model_name='RF',
                        train_drug='Bor', test_drug='Bor', option='ASCT', mmrf_asct=mmrf_asct, grid_search=False, training=False)


Predicing PFS with auto-encoded RNAseq...
Using auto-encoder combinedae
(313, 155)
(97, 155)
(142, 155)
(218, 155)
(58, 155)
Predicting on test set...
Accuracy: 0.64
Precision: 0.68
Recall: 0.84
F1 score: 0.75
AUC: 0.66



In [51]:
_ = predict_pfs_patient_AE(ae_dict, ae_type, train_df, valid_df, test_df, clf4_2, model_name='RF',
                        train_drug='Bor', test_drug='Bor', option='NON-ASCT', mmrf_asct=mmrf_asct, grid_search=False, training=False)


Predicing PFS with auto-encoded RNAseq...
Using auto-encoder combinedae
(313, 155)
(97, 155)
(142, 155)
(127, 155)
(54, 155)
Predicting on test set...
Accuracy: 0.54
Precision: 0.49
Recall: 0.92
F1 score: 0.64
AUC: 0.63



In [52]:
print('Best model for Part IV (Latent Dim 128):')

clf4_2.get_params()

Best model for Part IV (Latent Dim 128):


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 5000,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

### Main function

In [None]:
#if __name__ == '__main__':
def main():
    
    print('Testing transfer learning pipeline from CCLE to MMRF...')
    
    print('Reading data files...')
    
    data_dir = './data/' # Enter the path to your folder here
    
    # have the following files in your data folder
    ind = 1
    mmrf_filename = data_dir+'cleaned_mm%d_2mos_pfs_ind.pkl'%(ind)
    
    mmrf_asct = pd.read_csv('./data/MMRF_ASCT.csv')
    mmrf_asct = mmrf_asct[['pids','ASCT']]
    
    ccle_exp = pd.read_csv(data_dir+"CCLE_expression.csv")
    sample_info = pd.read_csv(data_dir+"sample_info.csv")
    sanger_dose_response = pd.read_csv(data_dir+"sanger-dose-response.csv")
    mapping = pd.read_csv(data_dir+'Ensembl_HGNC_map_042421.csv')
   
    ae_fn = data_dir + 'normalized_embeddings_64.pkl' # auto-encoder of RNA
    genomic_fn = data_dir+'MMRF_CoMMpass_IA15a_E74GTF_Salmon_Gene_TPM.txt'
    
    
    print('\nPart I. Pretraining on CCLE IC50 data')
    
    bort_data, bort_labels, lenal_data, lenal_labels = preprocess_CCLE(ccle_exp, sample_info, sanger_dose_response, mapping)

    X_train, y_train, X_test, y_test, pca = split_train_test_CCLE(bort_data, bort_labels, normalize=True, nPCA=100)

    #linear_regression_IC50(X_train, y_train, X_test, y_test) # linear regression baseline
    
    MLP_params = {'niter':30, 'nepoch':10,'batch_size':32,'hidden_layers':[50,20,10],'lr':1e-3}
    pretrain_model = pretrain_MLP_model(X_train, y_train, X_test, y_test, MLP_params)
    
    print('')
    
    ############################################################################
    

    print('\nPart II. Preprocess MMRF genomic data')

    # run PCA on MMRF genomic data and use IC50 model in Part I to encode
    gene_df, pids = preprocess_MMRF_genomic(genomic_fn, bort_data.columns, mapping)
    pca_gene = pca.transform(gene_df)
    
    layer_name = 'encoder'
    encoder = tf.keras.Model(inputs=pretrain_model.input,
                             outputs=pretrain_model.get_layer(layer_name).output)
    enc_gene = encoder(pca_gene).numpy()
    
    enc_gene_df = pd.DataFrame(data=enc_gene, columns=['Enc%d'%(i+1) for i in range(enc_gene.shape[1])])
    enc_gene_df['pids'] = pids
    print('')
    
    ############################################################################
   
    print('\nPart III. Predict PFS on MMRF patient data')
    train, test, valid = MMRF_utils_V2.get_train_test_valid(mmrf_filename, ind, show_features=True)

    print('\nPreprocess patient data in MMRF')
    # include the first n clinical visit data for each patient (default 0)
    train_df = MMRF_utils_V2.preprocess_patient_data(train, num_clin_visits=1)  
    print(train_df.shape)

    valid_df = MMRF_utils_V2.preprocess_patient_data(valid, num_clin_visits=1)  
    print(valid_df.shape)

    test_df = MMRF_utils_V2.preprocess_patient_data(test, num_clin_visits=1)  
    print(test_df.shape)

    ############################################################################
    
    print('\nPart IV. Predict PFS with auto-encoded RNA-seq')
    
    ae_dict = pickle.load(open(ae_fn, "rb" ))
    ae_type = 'combinedae' # other types: ['mmrfae', 'ccleae', 'combinedae']
    
    #############################################################################

    print('\nComparing prediction accuracy of different models... \n')
    
    # Model 1: patient data only
    print('\n Part I \n')
    clf1 = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)    
    clf1 = predict_pfs_patient_only(train_df,valid_df,test_df, clf1, model_name='RF',
                             train_drug='Bor', test_drug='Bor', option='', mmrf_asct=None, grid_search=False)

    print('Predicting patients with ASCT')
    clf2 = predict_pfs_patient_only(train_df,valid_df,test_df, clf1, model_name='RF',
                             train_drug='Bor', test_drug='Bor', option='ASCT', mmrf_asct=mmrf_asct)
    
    print('Predicting patients without ASCT')
    clf3 = predict_pfs_patient_only(train_df,valid_df,test_df, clf1, model_name='RF',
                             train_drug='Bor', test_drug='Bor', option='NON-ASCT', mmrf_asct=mmrf_asct)
    
    
    # Model 2: patient data with PCA on MMRF RNAseq
    print('\n Part II \n')
    clf2 = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
    clf2 = predict_pfs_patient_PCA(genomic_fn, nPCA, train_df, valid_df, test_df, clf2, model_name='',
                            train_drug='All', test_drug='Bor', option='', mmrf_asct=None, grid_search=False)
    
    
    # Model 3: Transfer from CCLE IC50 to MMRF
    print('\n Part III \n')
    #clf3 = RandomForestClassifier(n_estimators=5000, max_depth=50, random_state=0)
    #clf3 = predict_pfs_patient_CCLE(enc_gene_df, train_df, valid_df, test_df, clf3, model_name='RF',drug='Bor')
    
    # Model 4: Auto-encoder for CCLE/MMRF
    print('\n Part IV \n')
    clf4 = RandomForestClassifier(n_estimators=1000, max_depth=50, random_state=0)
    clf4 = predict_pfs_patient_AE(ae_dict, ae_type, train_df, valid_df, test_df, clf4, model_name='RF',drug='Bor')
    
    print('Work complete!')