## Transfer learning from CCLE to MMRF (V2)

Last Modified: 05/12/2021

Written By: Sumi Thakur and Qingyang Xu

- Pretrain on CCLE cell line data to predict IC50

- Transfer to patient RNA-seq data to predict PFS

References

- Download patient genomic data (e.g. `MMRF_CoMMpass_IA15a_CNA_Exome_PerGene_LargestSegment.txt`; the pipeline below reads the RNA-seq file `MMRF_CoMMpass_IA15a_E74GTF_Salmon_Gene_TPM.txt`)

https://research.themmrf.org/

- Download DepMap cell line data (e.g. `CCLE_expression.csv`)

https://depmap.org/portal/download/

In [8]:
#from google.colab import drive
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
#!pip install -q -U keras-tuner
from kerastuner import HyperModel
from keras import models, layers,regularizers
from tensorflow.keras.optimizers import Adam
from kerastuner.tuners.bayesian import BayesianOptimization
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
import pickle

In [9]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import neural_network

from sklearn import metrics as skmetrics
from sklearn.decomposition import PCA

In [10]:
import MMRF_utils

## 1. Pretrain on CCLE IC50 data

In [11]:
def normalize_features(train, test):
    """Standardize each feature column using training-set statistics.

    Every column is shifted by the training-set mean and, when the
    training-set standard deviation (population form, ``ddof=0``) is
    positive, divided by it. Columns with zero variance in ``train`` are only
    centered, which avoids a division by zero.

    The inputs are not modified: new float arrays are returned. Integer-typed
    inputs are therefore accepted (an in-place ``-=`` with a float mean would
    raise a casting error on them).

    Parameters
    ----------
    train : array-like, two-dimensional (samples x features)
        Data from which the mean and standard deviation are estimated.
    test : array-like, two-dimensional (samples x features)
        Data transformed with the statistics of ``train``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_normalized, test_normalized)``.

    Raises
    ------
    ValueError
        If ``train`` is not two-dimensional.
    AssertionError
        If ``train`` and ``test`` have a different number of columns.
    """
    print('Normalizing input features...')
    train = np.asarray(train)
    test = np.asarray(test)
    _, nfeature = train.shape

    assert nfeature == test.shape[1]

    mu = train.mean(axis=0)
    sigma = train.std(axis=0)
    # some genes may have zero variance: dividing those columns by 1 leaves
    # them centered but otherwise untouched
    scale = np.where(sigma > 0, sigma, 1.0)

    return (train - mu) / scale, (test - mu) / scale

In [12]:
def preprocess_CCLE(ccle_exp, sample_info, sanger_dose_response, mapping):
    
    print('Preprocessing CCLE data...')
    
    ccle_exp = ccle_exp.rename(columns={'Unnamed: 0':'DepMap_ID'})
    cols_to_keep = ['DepMap_ID'] + list(pd.unique(mapping.HGNC_ID))
    ccle_chosen = ccle_exp[cols_to_keep]

    sanger_dose_response_filt = sanger_dose_response[(sanger_dose_response.DRUG_NAME.str.contains("BORTEZOMIB"))
                                                | (sanger_dose_response.DRUG_NAME.str.contains("LENALIDOMIDE"))
                                                ]

    sanger_dose_response_filt = sanger_dose_response_filt.rename(columns={'ARXSPAN_ID':'DepMap_ID'})
    
    merged_df = sanger_dose_response_filt.merge(ccle_chosen, on= 'DepMap_ID')[list(ccle_chosen.columns)+['DRUG_NAME','IC50_PUBLISHED']]
    merged_df['log(IC_50)'] = merged_df.IC50_PUBLISHED.apply(np.log10)
    
    merged_df_bort = merged_df[merged_df.DRUG_NAME == 'BORTEZOMIB'].drop_duplicates()
    merged_df_lenal = merged_df[merged_df.DRUG_NAME == 'LENALIDOMIDE'].drop_duplicates()
    bort_labels = merged_df_bort['log(IC_50)']
    lenal_labels = merged_df_lenal['log(IC_50)']
    bort_data = merged_df_bort.drop(columns = ['DepMap_ID','IC50_PUBLISHED','DRUG_NAME','log(IC_50)'])
    lenal_data = merged_df_lenal.drop(columns = ['DepMap_ID','IC50_PUBLISHED','DRUG_NAME','log(IC_50)'])
    
    return bort_data, bort_labels, lenal_data, lenal_labels

In [13]:
def split_train_test_CCLE(X, y, normalize=True, nPCA=0):
    """Split CCLE data 70/30 and optionally standardize and PCA-reduce the features.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix (samples x features).
    y : array-like or pandas.Series
        Regression targets, one per sample.
    normalize : bool, default True
        When true, features are standardized with ``normalize_features``
        using training-set statistics. When false the raw values are kept
        (previously this flag was ignored and normalization always ran).
    nPCA : int, default 0
        Number of principal components to keep; ``0`` disables PCA. The PCA
        is fitted on the training split only and then applied to the test
        split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, pca)`` as numpy arrays, plus the
        fitted ``PCA`` object or ``None`` when ``nPCA`` is not positive.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

    # Features are cast to float so standardization also works when the
    # incoming table holds integer values.
    X_train = np.array(X_train, dtype=float)
    X_test = np.array(X_test, dtype=float)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    if normalize:
        X_train, X_test = normalize_features(X_train, X_test)

    pca = None
    # try running PCA on raw RNAseq data
    if nPCA > 0:
        print('Running PCA-%d on gene expressions...'%nPCA)
        pca = PCA(n_components=nPCA)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)

    assert X_train.shape[0] == y_train.shape[0]
    assert X_test.shape[0] == y_test.shape[0]

    print('Training set shape:')
    print(X_train.shape)

    print('Test set shape:')
    print(X_test.shape)

    return X_train, y_train, X_test, y_test, pca

In [14]:
def linear_regression_IC50(X_train, y_train, X_test, y_test):
    """Fit a linear-regression baseline and print its test-set metrics.

    The model is fitted on ``(X_train, y_train)``; the R2 score, the Pearson
    correlation and its p-value between ``y_test`` and the predictions on
    ``X_test`` are printed. Nothing is returned.
    """
    print('Running linear regression to predict IC50...')
    baseline = LinearRegression()
    baseline.fit(X_train, y_train)
    predictions = baseline.predict(X_test)

    print('R2: %0.2f'%r2_score(y_test, predictions))

    correlation, p_value = pearsonr(y_test, predictions)
    for template, value in (('Correlation: %0.2f', correlation),
                            ('p-value: %0.5f', p_value)):
        print(template % value)

In [15]:
class RegressionHyperModel(HyperModel):
    """Keras Tuner hypermodel for the IC50 regression MLP.

    Architecture: Dense -> Dropout -> Dense -> Dropout -> Dense(5, named
    ``'encoder'``) -> Dense(1), compiled with Adam and an MSE loss/metric.

    Tuned hyperparameters:

    - ``units_1`` (40..100, step 10): width of the first hidden layer.
    - ``units_2`` (10..50, step 10): width of the second hidden layer.
    - ``dense_activation``: activation shared by both hidden layers.
    - ``dropout`` (0.0..0.6, step 0.1): rate shared by both dropout layers.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the first Dense layer.
    """

    def __init__(self, input_shape):
        # Initialise the HyperModel base class before adding our own state.
        super().__init__()
        self.input_shape = input_shape

    def build(self, hp):
        """Create and compile the model for the hyperparameter set ``hp``."""
        # Both hidden layers previously asked for a hyperparameter called
        # 'units'; Keras Tuner resolves a repeated name to the value that is
        # already registered, so the second layer's 10..50 range was never
        # used. Distinct names give each layer its own search range.
        units_first = hp.Int('units_1', min_value=40, max_value=100, step=10, default=80)
        units_second = hp.Int('units_2', min_value=10, max_value=50, step=10, default=40)

        # Activation and dropout rate use one hyperparameter each and are
        # applied to both hidden blocks (same ranges were declared twice
        # under the same names before, which has the same effect).
        activation = hp.Choice(
            'dense_activation',
            values=['relu', 'tanh', 'sigmoid'],
            default='relu')
        dropout_rate = hp.Float(
            'dropout',
            min_value=0.0,
            max_value=0.6,
            default=0.3,
            step=0.1)

        model = tf.keras.Sequential()
        model.add(
            tf.keras.layers.Dense(
                units=units_first,
                activation=activation,
                input_shape=self.input_shape
            )
        )
        model.add(tf.keras.layers.Dropout(dropout_rate))

        model.add(
            tf.keras.layers.Dense(
                units=units_second,
                activation=activation
            )
        )
        model.add(tf.keras.layers.Dropout(dropout_rate))

        # 5-unit layer whose output is looked up by name by downstream code.
        model.add(tf.keras.layers.Dense(5, name='encoder'))

        model.add(tf.keras.layers.Dense(1))

        model.compile(
            optimizer='Adam',loss='mse',metrics=['mse']
        )

        return model

In [16]:
def pretrain_MLP_model(X_train, y_train, X_test, y_test, params):
    """Tune and return the IC50 regression MLP with Bayesian optimization.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and targets; 20% is held out by Keras as a
        validation split during the search.
    X_test, y_test : array-like
        Accepted for interface compatibility; not used here.
    params : dict
        Accepted for interface compatibility; the search settings below are
        currently fixed and do not read this dictionary.

    Returns
    -------
    The best model found by the tuner.
    """
    print('Training MLP model...')

    input_shape = (X_train.shape[1],)
    print(input_shape)
    hypermodel = RegressionHyperModel(input_shape)

    ### using bayesian hyperparamter optimization
    tuner_bo = BayesianOptimization(
                hypermodel,
                # Rank trials on the held-out validation split rather than on
                # the training error, which would favour over-fitted models.
                objective='val_loss',
                max_trials=2,
                seed=42,
                executions_per_trial=2,
                # Start every search from scratch: results cached on disk by
                # an earlier run (possibly with another input dimension)
                # would otherwise be reloaded.
                overwrite=True
            )

    ### set epochs
    tuner_bo.search(X_train, y_train, epochs=3, validation_split=0.2, verbose=1)
    best_model = tuner_bo.get_best_models(num_models=1)[0]

    return best_model

In [17]:
def plot_R2_correlation(R2_his, rho_his):
    """Plot the R^2 and correlation histories on twin y-axes over a shared epoch axis.

    ``R2_his`` is drawn in red against the left axis and ``rho_his`` in blue
    against the right axis; the figure is shown immediately.
    """
    fig, left_axis = plt.subplots()
    left_axis.set_xlabel('Epoch')

    def _draw_series(axis, values, label, colour):
        # label, curve and tick labels all share the series colour
        axis.set_ylabel(label, color=colour)
        axis.plot(values, color=colour)
        axis.tick_params(axis='y', labelcolor=colour)

    _draw_series(left_axis, R2_his, r'$R^2$', 'tab:red')

    # second axes sharing the same x-axis (the x-label is already set above)
    right_axis = left_axis.twinx()
    _draw_series(right_axis, rho_his, r'$\rho$', 'tab:blue')

    # without this the right-hand y-label is slightly clipped
    fig.tight_layout()
    plt.title(r'$R^2$ and $\rho$ during training')
    plt.show()

In [18]:
def plot_losses(train_loss,val_loss):
    """Plot training and validation loss histories on a log-scaled y-axis.

    The first entry of each history is skipped. The remaining values are
    plotted against their original position in the history (1, 2, ...), so
    the x-axis matches the index of the epoch each value came from instead of
    being shifted by one.

    Parameters
    ----------
    train_loss, val_loss : sequence of float
        Per-epoch loss values.
    """
    # A dedicated figure/axes avoids drawing into whatever figure pyplot
    # currently considers active.
    fig, ax = plt.subplots()
    ax.set_yscale('log')
    ax.plot(range(1, len(train_loss)), train_loss[1:], label='Train Loss')
    ax.plot(range(1, len(val_loss)), val_loss[1:], label='Valid Loss')
    ax.legend()
    ax.set_title('Train vs. Valid Losses during Training')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    plt.show()

## 2. Transfer to MMRF patient data

In [None]:
def preprocess_MMRF_genomic(genomic_fn, CCLE_cols, mapping):
    """Load MMRF gene data and express it in the CCLE gene columns.

    Steps: load the genomic table through
    ``MMRF_utils.preprocess_genomic_data`` (with ``nPCA=0``), select the
    Ensembl columns listed in ``mapping``, rename them to HGNC names, average
    columns that share an HGNC name, keep ``CCLE_cols`` in that order, and
    standardize every gene column.

    Parameters
    ----------
    genomic_fn : str
        Path handed to ``MMRF_utils.preprocess_genomic_data``.
    CCLE_cols : sequence of str
        HGNC gene names (and their order) expected by the CCLE model.
    mapping : pandas.DataFrame
        Table with ``Ensembl_ID`` and ``HGNC_ID`` columns.

    Returns
    -------
    tuple
        ``(gene_df, pids)``: the standardized gene table and the ``'pids'``
        column of the loaded genomic table.

    Notes
    -----
    The standard deviation used here is the pandas default (sample form,
    ``ddof=1``); columns whose standard deviation is not positive are only
    centered.
    """
    print('\nPreprocess patient genomic data in MMRF...')
    genomic_df = MMRF_utils.preprocess_genomic_data(genomic_fn, nPCA=0)

    # Ensembl -> HGNC lookup; for a repeated Ensembl ID the last row wins.
    mapping_dict = dict(zip(mapping['Ensembl_ID'], mapping['HGNC_ID']))

    pids = genomic_df['pids']

    # Work on an explicit copy so the renaming below never touches (or warns
    # about) the loaded frame.
    gene_df = genomic_df[mapping['Ensembl_ID']].copy()
    gene_df.columns = [mapping_dict[eid] for eid in gene_df.columns]

    # Average over columns with the same HGNC name. Grouping the transposed
    # frame by its index is equivalent to the column-wise groupby
    # (``axis=1``), which recent pandas versions deprecate.
    gene_df = gene_df.T.groupby(level=0).mean().T

    gene_df = gene_df[CCLE_cols]

    # Normalize each gene in one vectorized step.
    mu = gene_df.mean()
    sigma = gene_df.std()
    scale = sigma.where(sigma > 0, 1.0)  # leave constant genes un-scaled
    gene_df = (gene_df - mu) / scale

    return gene_df, pids

## 3. Comparing AUC of four approaches

In [None]:
def predict_PFS(clf, model_name, train_X, train_y, test_X, test_y):
    """Fit ``clf`` on the training data and print its test-set classification metrics.

    The positive-class probability (second column of ``predict_proba``) is
    used for the AUC; its rounded value is used as the hard label for
    accuracy, precision, recall and F1. The fitted classifier is returned.
    """
    print('Fitting model %s...'%model_name)
    clf.fit(train_X, train_y)

    # score the held-out patients
    print('Predicting on test set...')
    positive_proba = clf.predict_proba(test_X)[:,1]
    hard_labels = positive_proba.round()

    auc = skmetrics.roc_auc_score(test_y, positive_proba)
    f1 = skmetrics.f1_score(test_y, hard_labels)
    pres = skmetrics.precision_score(test_y, hard_labels)
    rec = skmetrics.recall_score(test_y, hard_labels)
    acc = skmetrics.accuracy_score(test_y, hard_labels)

    report = (
        ('Accuracy: %0.2f', acc),
        ('Precision: %0.2f', pres),
        ('Recall: %0.2f', rec),
        ('F1 score: %0.2f', f1),
        ('AUC: %0.2f', auc),
    )
    for template, value in report:
        print(template % value)
    print('')

    return clf

### 3.1 Only use patient data

In [None]:
def predict_pfs_patient_only(train_df,valid_df,test_df, clf, model_name='',drug='Bor'):
    """Predict PFS > 12 from the patient tables alone (no genomic features).

    The training and validation tables are pooled for fitting. Only rows
    whose ``<drug>1`` column equals 1 are used, the ``'pfs'`` column is
    binarized at 12, and ``'pfs'`` / ``'pids'`` are excluded from the
    features. The classifier fitted by ``predict_PFS`` is returned.
    """
    print('Predicting PFS using patient clinical data only...')

    assert drug in ('Bor', 'Len')

    print('Predicting patients with %s as first line treatment...'%drug)

    first_line_col = drug+'1'
    pfs_thresh = 12

    # no genomic data: pool train and validation, then keep the drug cohort
    pooled = pd.concat([train_df,valid_df])
    train = pooled[pooled[first_line_col]==1]
    print(train.shape)

    test = test_df[test_df[first_line_col]==1]
    print(test.shape)

    def _features_and_labels(cohort):
        # 1 when PFS exceeds the threshold, 0 otherwise
        labels = (np.array(cohort['pfs']) > pfs_thresh).astype(int)
        features = np.array(cohort.drop(columns=['pfs', 'pids']))
        return features, labels

    train_X, train_y = _features_and_labels(train)
    test_X, test_y = _features_and_labels(test)

    return predict_PFS(clf, model_name, train_X, train_y, test_X, test_y)

### 3.2 PCA on raw RNA

In [None]:
def predict_pfs_patient_PCA(genomic_fn, train_df,valid_df,test_df, clf, model_name='',drug='Bor',nPCA=20):
    """Predict PFS > 12 from patient tables joined with PCA-reduced genomic data.

    The genomic table is produced by
    ``MMRF_utils.preprocess_genomic_data(genomic_fn, nPCA=nPCA)`` and joined
    to each patient table on ``'pids'``. Training and validation rows are
    pooled, only rows whose ``<drug>1`` column equals 1 are used, and the
    ``'pfs'`` column is binarized at 12. The classifier fitted by
    ``predict_PFS`` is returned.
    """
    print('Predicting PFS using patient clinical data and PCA on RNAseq...')

    assert drug in ('Bor', 'Len')
    assert nPCA > 0

    print('Predicting patients with %s as first line treatment...'%drug)

    print('\nPreprocess patient genomic data in MMRF...')

    genomic_df = MMRF_utils.preprocess_genomic_data(genomic_fn, nPCA=nPCA)
    print(genomic_df.shape)

    print('\nMerging patient data with genomic data...')
    joined = []
    for split_df in (train_df, valid_df, test_df):
        with_genomic = split_df.merge(genomic_df, left_on='pids', right_on='pids')
        print(with_genomic.shape)
        joined.append(with_genomic)
    train_patient_genomic, valid_patient_genomic, test_patient_genomic = joined

    first_line_col = drug+'1'
    pfs_thresh = 12

    pooled = pd.concat([train_patient_genomic,valid_patient_genomic])
    train = pooled[pooled[first_line_col]==1]
    print(train.shape)

    test = test_patient_genomic[test_patient_genomic[first_line_col]==1]
    print(test.shape)

    def _features_and_labels(cohort):
        # 1 when PFS exceeds the threshold, 0 otherwise
        labels = (np.array(cohort['pfs']) > pfs_thresh).astype(int)
        features = np.array(cohort.drop(columns=['pfs', 'pids']))
        return features, labels

    train_X, train_y = _features_and_labels(train)
    test_X, test_y = _features_and_labels(test)

    return predict_PFS(clf, model_name, train_X, train_y, test_X, test_y)

### 3.3 Transfer from CCLE

In [None]:
def predict_pfs_patient_CCLE(enc_gene_df, train_df,valid_df,test_df, clf, model_name='',drug='Bor'):
    """Predict PFS > 12 from patient tables joined with CCLE-encoded gene features.

    Parameters
    ----------
    enc_gene_df : pandas.DataFrame
        Encoded gene features with a ``'pids'`` column used as the join key.
    train_df, valid_df, test_df : pandas.DataFrame
        Patient tables containing ``'pids'``, ``'pfs'`` and a ``<drug>1``
        indicator column. They are not modified.
    clf : classifier
        Estimator passed to ``predict_PFS``.
    model_name : str
        Label printed while fitting.
    drug : {'Bor', 'Len'}
        Only rows whose ``<drug>1`` column equals 1 are used.

    Returns
    -------
    The classifier fitted by ``predict_PFS``.

    Raises
    ------
    AssertionError
        If ``drug`` is not ``'Bor'`` or ``'Len'``.
    """
    print('\nPredicing PFS with transfer model from CCLE...')

    # Same up-front check as the other predict_pfs_* helpers, so an
    # unsupported drug fails here rather than after the merges below.
    assert drug == 'Bor' or drug == 'Len'

    print('\nMerging patient data with genomic data...')
    train_patient_genomic = train_df.merge(enc_gene_df, left_on='pids', right_on='pids')
    print(train_patient_genomic.shape)

    valid_patient_genomic = valid_df.merge(enc_gene_df, left_on='pids', right_on='pids')
    print(valid_patient_genomic.shape)

    test_patient_genomic = test_df.merge(enc_gene_df, left_on='pids', right_on='pids')
    print(test_patient_genomic.shape)

    # Train on the pooled train+validation rows of the selected drug cohort.
    train = pd.concat([train_patient_genomic,valid_patient_genomic])
    train = train[train[drug+'1']==1]
    print(train.shape)

    test = test_patient_genomic[test_patient_genomic[drug+'1']==1]
    print(test.shape)

    pfs_thresh = 12
    non_feature_cols = ['pfs', 'pids']

    # Binary label: 1 when PFS exceeds the threshold, 0 otherwise.
    train_y = (np.array(train['pfs']) > pfs_thresh).astype(int)
    train_X = np.array(train.drop(columns=non_feature_cols))

    test_y = (np.array(test['pfs']) > pfs_thresh).astype(int)
    test_X = np.array(test.drop(columns=non_feature_cols))

    clf = predict_PFS(clf, model_name, train_X, train_y, test_X, test_y)
    return clf

### 3.4 Auto-encoder for RNA-seq

In [None]:
def predict_pfs_patient_AE(ae_dict, ae_type, train_df, valid_df, test_df, clf, model_name='',drug='Bor'):
    """Predict PFS > 12 from patient tables joined with auto-encoder embeddings.

    Parameters
    ----------
    ae_dict : mapping
        Must provide the keys ``'/mmrf_train_<ae_type>'``,
        ``'/mmrf_valid_<ae_type>'`` and ``'/mmrf_test_<ae_type>'``; each value
        is merged with the matching patient table on ``'pids'``.
    ae_type : str
        Suffix selecting which embeddings to use.
    train_df, valid_df, test_df : pandas.DataFrame
        Patient tables containing ``'pids'``, ``'pfs'`` and a ``<drug>1``
        indicator column. They are not modified.
    clf : classifier
        Estimator passed to ``predict_PFS``.
    model_name : str
        Label printed while fitting.
    drug : {'Bor', 'Len'}
        Only rows whose ``<drug>1`` column equals 1 are used.

    Returns
    -------
    The classifier fitted by ``predict_PFS``.

    Raises
    ------
    AssertionError
        If ``drug`` is not ``'Bor'`` or ``'Len'``.
    KeyError
        If one of the expected embedding keys is missing from ``ae_dict``.
    """
    print('\nPredicing PFS with auto-encoded RNAseq...')

    print('Using auto-encoder %s'%ae_type)

    # Same up-front check as the other predict_pfs_* helpers, so an
    # unsupported drug fails here rather than after the merges below.
    assert drug == 'Bor' or drug == 'Len'

    train_ae = ae_dict['/mmrf_train_%s'%ae_type]
    valid_ae = ae_dict['/mmrf_valid_%s'%ae_type]
    test_ae = ae_dict['/mmrf_test_%s'%ae_type]

    train_ae_df = train_df.merge(train_ae, left_on='pids', right_on='pids')
    print(train_ae_df.shape)

    valid_ae_df = valid_df.merge(valid_ae, left_on='pids', right_on='pids')
    print(valid_ae_df.shape)

    test_ae_df = test_df.merge(test_ae, left_on='pids', right_on='pids')
    print(test_ae_df.shape)

    # Train on the pooled train+validation rows of the selected drug cohort.
    train = pd.concat([train_ae_df,valid_ae_df])
    train = train[train[drug+'1']==1]
    print(train.shape)

    test = test_ae_df[test_ae_df[drug+'1']==1]
    print(test.shape)

    pfs_thresh = 12
    non_feature_cols = ['pfs', 'pids']

    # Binary label: 1 when PFS exceeds the threshold, 0 otherwise.
    train_y = (np.array(train['pfs']) > pfs_thresh).astype(int)
    train_X = np.array(train.drop(columns=non_feature_cols))

    test_y = (np.array(test['pfs']) > pfs_thresh).astype(int)
    test_X = np.array(test.drop(columns=non_feature_cols))

    clf = predict_PFS(clf, model_name, train_X, train_y, test_X, test_y)

    return clf

### Main function

In [None]:
#if __name__ == '__main__':
def main():
    """Run the full CCLE -> MMRF transfer-learning comparison.

    Steps:

    1. Read the CCLE / Sanger / mapping tables from ``./data/`` and pretrain
       an MLP on bortezomib IC50 (after PCA-100 on the gene expressions).
    2. Map the MMRF gene table onto the CCLE gene columns, project it with
       the same PCA and encode it with the ``'encoder'`` layer of the
       pretrained model.
    3. Load and preprocess the MMRF patient tables.
    4. Load the pickled auto-encoder embeddings.
    5. Fit four random forests (patient data only, PCA on RNA-seq, CCLE
       transfer, auto-encoder) and print their test-set metrics.

    All input files are expected in ``./data/``; nothing is returned.
    """
    print('Testing transfer learning pipeline from CCLE to MMRF...')

    print('Reading data files...')

    data_dir = './data/' # Enter the path to your folder here

    # have the following files in your data folder
    ccle_exp = pd.read_csv(data_dir+"CCLE_expression.csv")
    sample_info = pd.read_csv(data_dir+"sample_info.csv")
    sanger_dose_response = pd.read_csv(data_dir+"sanger-dose-response.csv")
    mapping = pd.read_csv(data_dir+'Ensembl_HGNC_map_042421.csv')
    genomic_fn = data_dir+'MMRF_CoMMpass_IA15a_E74GTF_Salmon_Gene_TPM.txt'

    ae_fn = data_dir + 'normalized_embeddings.pkl' # auto-encoder of RNA

    ind = 1
    mmrf_filename = data_dir+'cleaned_mm%d_2mos_pfs_ind.pkl'%(ind)

    print('\nPart I. Pretraining on CCLE IC50 data')

    bort_data, bort_labels, lenal_data, lenal_labels = preprocess_CCLE(ccle_exp, sample_info, sanger_dose_response, mapping)

    X_train, y_train, X_test, y_test, pca = split_train_test_CCLE(bort_data, bort_labels, normalize=True, nPCA=100)

    #linear_regression_IC50(X_train, y_train, X_test, y_test) # linear regression baseline

    MLP_params = {'niter':30, 'nepoch':10,'batch_size':32,'hidden_layers':[50,20,10],'lr':1e-3}
    pretrain_model = pretrain_MLP_model(X_train, y_train, X_test, y_test, MLP_params)

    print('')

    ############################################################################

    print('\nPart II. Preprocess MMRF genomic data')

    # run PCA on MMRF genomic data and use IC50 model in Part I to encode
    gene_df, pids = preprocess_MMRF_genomic(genomic_fn, bort_data.columns, mapping)
    pca_gene = pca.transform(gene_df)

    layer_name = 'encoder'
    encoder = tf.keras.Model(inputs=pretrain_model.input,
                             outputs=pretrain_model.get_layer(layer_name).output)
    enc_gene = encoder(pca_gene).numpy()

    enc_gene_df = pd.DataFrame(data=enc_gene, columns=['Enc%d'%(i+1) for i in range(enc_gene.shape[1])])
    # Assign the patient IDs by position. `pids` keeps the index of the
    # genomic table while enc_gene_df has a fresh 0..n-1 index; assigning the
    # Series directly would align on index labels and could misplace IDs or
    # fill them with NaN.
    enc_gene_df['pids'] = np.asarray(pids)
    print('')

    ############################################################################

    print('\nPart III. Predict PFS on MMRF patient data')
    train, test, valid = MMRF_utils.get_train_test_valid(mmrf_filename, ind, show_features=True)

    print('\nPreprocess patient data in MMRF')
    # include the first n clinical visit data for each patient (default 0)
    train_df = MMRF_utils.preprocess_patient_data(train, num_clin_visits=3)
    print(train_df.shape)

    valid_df = MMRF_utils.preprocess_patient_data(valid, num_clin_visits=3)
    print(valid_df.shape)

    test_df = MMRF_utils.preprocess_patient_data(test, num_clin_visits=3)
    print(test_df.shape)

    ############################################################################

    print('\nPart IV. Predict PFS with auto-encoded RNA-seq')

    # NOTE(review): unpickling can execute arbitrary code; only load
    # embedding files from a trusted source.
    # The context manager closes the file handle once loading is done.
    with open(ae_fn, "rb") as ae_file:
        ae_dict = pickle.load(ae_file)
    ae_type = 'mmrfae' # other types: ['mmrfae', 'ccleae', 'combinedae']

    #############################################################################

    print('\nComparing prediction accuracy of different models... \n')

    # Model 1: patient data only
    print('Part I \n')
    clf1 = RandomForestClassifier(n_estimators=5000, max_depth=50, random_state=0)
    clf1 = predict_pfs_patient_only(train_df, valid_df, test_df, clf1, model_name='RF', drug='Bor')

    # Model 2: patient data with PCA on MMRF RNAseq
    print('Part II \n')
    clf2 = RandomForestClassifier(n_estimators=5000, max_depth=50, random_state=0)
    clf2 = predict_pfs_patient_PCA(genomic_fn, train_df, valid_df, test_df, clf2, model_name='RF',drug='Bor',nPCA=50)

    # Model 3: Transfer from CCLE IC50 to MMRF
    print('Part III \n')
    clf3 = RandomForestClassifier(n_estimators=5000, max_depth=50, random_state=0)
    clf3 = predict_pfs_patient_CCLE(enc_gene_df, train_df, valid_df, test_df, clf3, model_name='RF',drug='Bor')

    # Model 4: Auto-encoder for CCLE/MMRF
    print('Part IV \n')
    clf4 = RandomForestClassifier(n_estimators=5000, max_depth=50, random_state=0)
    clf4 = predict_pfs_patient_AE(ae_dict, ae_type, train_df, valid_df, test_df, clf4, model_name='RF',drug='Bor')

    print('Work complete!')

In [None]:
# Run the whole pipeline defined in main() (data loading, pretraining,
# encoding and the model comparison).
main()