## Download Data from Google-Drive

In [None]:
'''
File Name : Data
File Link : https://drive.google.com/file/d/1rWS8Jj19ZOzdXFR4jUjmHIx_fTRyYzOw/view?usp=share_link
File Id : '1rWS8Jj19ZOzdXFR4jUjmHIx_fTRyYzOw'

'''
!gdown --id 1rWS8Jj19ZOzdXFR4jUjmHIx_fTRyYzOw
!unzip ccle_ctrpv2_gdse.zip

## Import necessary libraries

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from genetic_selection import GeneticSelectionCV
from sklearn.metrics import mean_squared_error , mean_absolute_error
from time import perf_counter, sleep
import math
import scipy.stats as stats
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

## Read train and test data

In [13]:
# x_train
ccle_ctrpv2 = pd.read_csv("../../data/ccle_ctrpv2_rnaseq_tpm.csv")
# x_test
ccle_ctrpv2_aac = pd.read_csv("../../data/ccle_ctrpv2_aac.csv")
# y_train
gdse_rnaseq = pd.read_csv("../../data/gdse_rnaseq_tpm.csv")
# y_test
gdse_aac = pd.read_csv("../../data/gdse_aac.csv")


## Cleaning the dataframes

In [14]:
drug_names = gdse_aac.columns.values.tolist()

ccle_ctrpv2 = ccle_ctrpv2.rename(columns={"Unnamed: 0": "sample"})
gdse_rnaseq = gdse_rnaseq.rename(columns={"Unnamed: 0": "sample"})
ccle_ctrpv2_aac = ccle_ctrpv2_aac.rename(columns={"Unnamed: 0": "sample"})
gdse_aac = gdse_aac.rename(columns={"Unnamed: 0": "sample"})

In [15]:
drug_names = drug_names[1:]

['Vorinostat',
 'Vincristine',
 'Venetoclax',
 'Trametinib',
 'Tozasertib',
 'Topotecan',
 'Teniposide',
 'Sorafenib',
 'Sirolimus',
 'Ruxolitinib',
 'Pictilisib',
 'Pevonedistat',
 'Paclitaxel',
 'Oxaliplatin',
 'Olaparib',
 'Obatoclax',
 'Nilotinib',
 'Navitoclax',
 'Molibresib',
 'Linsitinib',
 'Lapatinib',
 'Ibrutinib',
 'Gemcitabine',
 'Gefitinib',
 'Foretinib',
 'Erlotinib',
 'Entinostat',
 'Dinaciclib',
 'Dasatinib',
 'Daporinad',
 'Dabrafenib',
 'Cytarabine',
 'Crizotinib',
 'Axitinib',
 'Alpelisib',
 'Alisertib',
 'Afatinib',
 'Adavosertib']

In [16]:
def remove_nan(data):
    data = data.dropna()
    data.reset_index(drop = True)
    return data

# Feature selection method

    RFE (Recursive Feature Elimination) 
    

In [17]:
from sklearn.feature_selection import RFE

# Training procedure

*For each drug in the drug list:*
   1. Split the data in to train and test sets, and then remove the Nan values.
   2. Merging X_train with ytrain, and X_test with  y_test because the samples in each are different.
   3. Normalize the training X. </br>
   
   *Running the following 6 times:*
   
       4. Applying 4-fold Cross Valisation
       5. Applying the relevant features selection technique, and extract the selected features.
       6. Train the model based on the CV data and extracted features
       7. Saving the best model performed in CV and the best features
       
   8. Predicting the test set using the best model from the previous section
   9. Calculating the correlation score between the predicted drug_respons and the actual drug_response

In [18]:
pearson_corr = []
kendalltau_corr = []
spearmanr_corr = []
MSE_metric = []
RMSE_metric = []
MAE_metric = []
pearson_corr_train = []
kendalltau_corr_train = []
spearmanr_corr_train = []
MSE_metric_train = []
RMSE_metric_train = []
MAE_metric_train = []
Sscaler = StandardScaler()
start = perf_counter()

for idx, drug in enumerate(drug_names):
    
    selected_features = []


    '''
        train test split
    '''
    y_train = ccle_ctrpv2_aac[['sample', drug]]
    y_test  = gdse_aac[['sample', drug]]

    y_train = remove_nan(y_train)
    y_test  = remove_nan(y_test)

    ccle_ctrpv2 = remove_nan(ccle_ctrpv2)
    gdse_rnaseq = remove_nan(gdse_rnaseq)


    '''

    Merge
    '''

    gdse_rnaseq_merged_df = pd.merge(gdse_rnaseq, y_test, on='sample', how='inner')
    ccle_ctrpv2_merged_df = pd.merge(ccle_ctrpv2, y_train, on='sample', how='inner')

    X_train = ccle_ctrpv2_merged_df.iloc[:, 1: len(gdse_rnaseq.columns)]
    X_test  = gdse_rnaseq_merged_df.iloc[:, 1: len(gdse_rnaseq.columns)]

    Y_train = ccle_ctrpv2_merged_df.iloc[:, len(gdse_rnaseq.columns):]
    Y_test  = gdse_rnaseq_merged_df.iloc[:, len(gdse_rnaseq.columns):]

    X_train_norm = Sscaler.fit_transform(X_train)
    X_train_normalized = pd.DataFrame(X_train_norm, columns=X_train.columns)

    X_test_norm = Sscaler.fit_transform(X_test)
    X_test_normalized = pd.DataFrame(X_test_norm, columns=X_test.columns)

    X = np.asarray(X_train_normalized)
    X_test_array = np.asarray(X_test_normalized)
    Y = np.asarray(Y_train).ravel()
    Y_test_array = np.asarray(Y_test).ravel()
    

    estimators = RandomForestRegressor(max_depth= 2 , n_estimators=20)

    # Define the number of folds for cross-validation
    num_folds = 4

    # Create a K-fold cross-validation object
    kfold = KFold(n_splits=num_folds)
      
    estimators.fit(X, Y)
    
    # Perform cross-validation
    cv_scores = cross_val_score(estimators, X, Y , cv=kfold, scoring='neg_mean_squared_error' , n_jobs=-1)
    print("Cross validation done!")
    # Calculate the mean squared error (MSE) scores
    mse_scores = -cv_scores
    print("Mean Squared Error:", np.min(mse_scores))
    # Get the best model based on cross-validation
    mymin = np.min(mse_scores)
    best_model_index = [i for i, x in enumerate(mse_scores) if x == mymin]
    #best_model_index = np.argmax(mse_scores)
    best_model = estimators
    # Retrieve the best model based on the index
    best_model = best_model.estimators_[best_model_index.pop()]



    n_features_to_select = 100
    rfe = RFE(best_model, n_features_to_select=n_features_to_select , step=0.3)
    
    
    X_split, X_val, Y_split, y_val = train_test_split(X_train_normalized, Y_train, test_size=0.25, random_state=1)
    rfe.fit(X_split, Y_split)

    predictions_train = rfe.predict(X_val)

    mse_train = mean_squared_error(np.asarray(y_val).ravel(), predictions_train)
    rmse_train = math.sqrt(mse_train)
    mae_train = mean_absolute_error(np.asarray(y_val).ravel(), predictions_train)

    # Calculate Correlation for the validation data
    spearmanr_correlation_train, _ = stats.spearmanr(np.asarray(y_val).ravel(), predictions_train)
    kendalltau_correlation_train, _ = stats.kendalltau(np.asarray(y_val).ravel(), predictions_train)
    pearson_correlation_train, _ = pearsonr(np.asarray(y_val).ravel(), predictions_train)


    pearson_corr_train.append(pearson_correlation_train)
    kendalltau_corr_train.append(kendalltau_correlation_train)
    spearmanr_corr_train.append(spearmanr_correlation_train)
    MSE_metric_train.append(mse_train)
    RMSE_metric_train.append(rmse_train)
    MAE_metric_train.append(mae_train)
    
    #Predict X test
    predictions = rfe.predict(X_test_normalized)
    
    #Calculating Metrics for the Testing Phase
    mse = mean_squared_error(Y_test_array, predictions)
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(Y_test_array, predictions)
    
    #Calculating Correlation for the Testing Phase
    
    spearmanr_correlation, _ = stats.spearmanr(Y_test_array.flatten(),  predictions.flatten())
    kendalltau_correlation, _ = stats.kendalltau(Y_test_array.flatten(),  predictions.flatten())
    pearson_correlation, _ = pearsonr(Y_test_array.flatten(),  predictions.flatten())
    
    pearson_corr.append(pearson_correlation)
    kendalltau_corr.append(kendalltau_correlation)
    spearmanr_corr.append(spearmanr_correlation)
    MSE_metric.append(mse)
    RMSE_metric.append(rmse)
    MAE_metric.append(mae)
    
end = perf_counter()    
    
    
    
    

Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.004167758569848071
Fitting RFE Done!
Correlation: 0.6367456174937132




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.03023333720557492
Fitting RFE Done!
Correlation: 0.5656149459773099




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.01792279959059231
Fitting RFE Done!
Correlation: 0.5338874980036769




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.03171618939225612
Fitting RFE Done!
Correlation: 0.4243761108954366




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.010756524227547505
Fitting RFE Done!
Correlation: -0.007273776106652016




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.015549787893811015
Fitting RFE Done!
Correlation: 0.5047329910550429




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.011755007964878435
Fitting RFE Done!
Correlation: 0.5245423764562861




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.006632565829477649
Fitting RFE Done!
Correlation: 0.5024982575573034




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.020531215612677546
Fitting RFE Done!
Correlation: 0.2765094970083412




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.0029177284753949483
Fitting RFE Done!
Correlation: 3.401715826630708e-06




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.011649325644501683
Fitting RFE Done!
Correlation: 0.17610896716765506




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.012986147994754905
Fitting RFE Done!
Correlation: 0.4274594790010495




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.04088823252227611
Fitting RFE Done!
Correlation: 0.1593545395287831




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.004436808692986983
Fitting RFE Done!
Correlation: 0.39872488877722245




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.003630145290977979
Fitting RFE Done!
Correlation: 0.15677137417870435




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.006722202938363961
Fitting RFE Done!
Correlation: 0.19693782775012147




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.006649169866276528
Fitting RFE Done!
Correlation: 0.8118677198397779




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.01680839585382137
Fitting RFE Done!
Correlation: 0.4643469453003418




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.006993254015507814
Fitting RFE Done!
Correlation: 0.3140392712618668




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.00406652328741841
Fitting RFE Done!
Correlation: 0.376013255595013




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.0067695154658568134
Fitting RFE Done!
Correlation: 0.3594286382113676




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.01055264477737824
Fitting RFE Done!
Correlation: 0.33350103895199257




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.029963445324797465
Fitting RFE Done!
Correlation: 0.5965090486266256




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.006544490223399214
Fitting RFE Done!
Correlation: 0.375107793167892




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.011836941545387539
Fitting RFE Done!
Correlation: 0.40183426296384217




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.005529004533299379
Fitting RFE Done!
Correlation: 0.37020033328302476




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.006626299837738547
Fitting RFE Done!
Correlation: 0.48111090072422147




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.025486691371408306
Fitting RFE Done!
Correlation: 0.1847935166361982




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.025371710843563107
Fitting RFE Done!
Correlation: 0.42883248410835806




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.026845621672413923
Fitting RFE Done!
Correlation: 0.34931042667327805




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.00815498201666645
Fitting RFE Done!
Correlation: 0.1915627637416434




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.014795395532114494
Fitting RFE Done!
Correlation: 0.5205541191434404




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.006164900620423114
Fitting RFE Done!
Correlation: 0.2358011308901964




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.0074648160123889605
Fitting RFE Done!
Correlation: 0.11783758644598266




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.008039504861252275
Fitting RFE Done!
Correlation: 0.08432285836151568




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.017037753324464493
Fitting RFE Done!
Correlation: 0.4492046546389545




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.012550179095631634
Fitting RFE Done!
Correlation: 0.46166818545708255




Data is prepare!
Fitting the model done!
Cross validation done!
Mean Squared Error: 0.008716082880943915
Fitting RFE Done!
Correlation: 0.5480768577655094




In [19]:
execution_time = end - start

df = pd.DataFrame({"Drugs": drug_names, "pearsonCor": pearson_corr , "spearmanCor": spearmanr_corr , "kendallCor": kendalltau_corr , 
                  "RMSE": RMSE_metric ,  "MSE": MSE_metric , "MAE": MAE_metric , "Time": execution_time})
df.to_csv('RF-results/Result_RFE_RandomForrest_test.csv', index=False)

df = pd.DataFrame({"Drugs": drug_names, "pearsonCor": pearson_corr_train , "spearmanCor": spearmanr_corr_train , "kendallCor": kendalltau_corr_train , 
                  "RMSE": RMSE_metric_train ,  "MSE": MSE_metric_train , "MAE": MAE_metric_train , "Time": execution_time})
df.to_csv('RF-results/Result_RFE_RandomForrest_train.csv', index=False)