## Download Data from Google-Drive

In [None]:
'''
File Name : Data
File Link : https://drive.google.com/file/d/1rWS8Jj19ZOzdXFR4jUjmHIx_fTRyYzOw/view?usp=share_link
File Id : '1rWS8Jj19ZOzdXFR4jUjmHIx_fTRyYzOw'

'''
# Download the archive from Google Drive using the file id listed above.
# NOTE(review): newer gdown releases reportedly deprecate the `--id` flag and
# accept the id as a plain positional argument - confirm against the installed version.
!gdown --id 1rWS8Jj19ZOzdXFR4jUjmHIx_fTRyYzOw
# Extract the archive into the current working directory.
# NOTE(review): the loading cell further down reads the CSVs from "../../data/";
# confirm the extracted files actually end up there (or move them) before running it.
!unzip ccle_ctrpv2_gdse.zip

## Import necessary libraries

In [1]:
import math
import os
import warnings
from time import perf_counter, sleep

import numpy as np
import pandas as pd
import scipy.stats as stats
from genetic_selection import GeneticSelectionCV
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error , mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

## Read train and test data

In [3]:
# X_train: expression features of the training cohort (CCLE/CTRPv2 file);
# merged with the training responses on the sample column in the training loop.
ccle_ctrpv2 = pd.read_csv("../../data/ccle_ctrpv2_rnaseq_tpm.csv")
# y_train: per-drug response table of the training cohort
# (the training loop takes its `y_train` columns from this frame).
ccle_ctrpv2_aac = pd.read_csv("../../data/ccle_ctrpv2_aac.csv")
# X_test: expression features of the test cohort (GDSE file);
# merged with the test responses on the sample column in the training loop.
gdse_rnaseq = pd.read_csv("../../data/gdse_rnaseq_tpm.csv")
# y_test: per-drug response table of the test cohort; its column headers also
# provide the list of drug names.
gdse_aac = pd.read_csv("../../data/gdse_aac.csv")


## Cleaning the dataframes

In [4]:
# Snapshot the headers of the test response table. At this point the list
# still starts with the sample-identifier column; a later cell strips it.
drug_names = gdse_aac.columns.tolist()

# Every table stores the sample identifier in a header-less first column that
# pandas loads as "Unnamed: 0"; give it the common name "sample" so the tables
# can be merged on it.
ccle_ctrpv2, gdse_rnaseq, ccle_ctrpv2_aac, gdse_aac = (
    frame.rename(columns={"Unnamed: 0": "sample"})
    for frame in (ccle_ctrpv2, gdse_rnaseq, ccle_ctrpv2_aac, gdse_aac)
)

In [5]:
# Drug names are all headers of the test response table except the first
# (sample-identifier) column. Deriving the list from the frame instead of
# slicing `drug_names` in place keeps this cell idempotent: re-running it no
# longer silently drops one more drug each time.
drug_names = gdse_aac.columns.tolist()[1:]

In [6]:
def remove_nan(data):
    """Return a copy of ``data`` without the rows that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the complete rows, with the index renumbered
        from 0 so it is contiguous again after the drop.
    """
    # `reset_index` returns a new frame rather than modifying in place, so its
    # result has to be the value that is returned.
    return data.dropna().reset_index(drop=True)

# Feature selection method

    MRMR (Minimum-Redundancy-Maximum-Relevance) 
    
**Install**:     `!pip install mrmr_selection`

In [7]:
import mrmr
from mrmr import mrmr_regression

# Training procedure

*For each drug in the drug list:*
   1. Split the data into train and test sets, and then remove the NaN values.
   2. Merge X_train with y_train, and X_test with y_test, on the sample ID, because each table covers a different set of samples.
   3. Normalize the training X. <br>
   
   *Running the following 6 times:*
   
       4. Applying 4-fold Cross Validation
       5. Applying the relevant features selection technique, and extract the selected features.
       6. Train the model based on the CV data and extracted features
       7. Saving the best model performed in CV and the best features
       
   8. Predicting the test set using the best model from the previous section
   9. Calculating the correlation score between the predicted drug_response and the actual drug_response

In [11]:
# ---------------------------------------------------------------------------
# Per-drug training loop.
#
# For every drug: build train/test matrices, standardise them, select genes
# with mRMR, hold out a validation split, pick the best random forest by
# K-fold cross-validation on the remaining training rows, then score that
# forest on the validation split and on the test cohort.
# ---------------------------------------------------------------------------

N_SELECTED_FEATURES = 100   # number of features kept by mRMR
NUM_FOLDS = 4               # folds used to choose the best forest
VAL_FRACTION = 0.25         # share of the training rows held out for validation
RANDOM_STATE = 1            # seed for the hold-out split and the forests


def _regression_metrics(y_true, y_pred):
    """Return (pearson, kendall, spearman, mse, rmse, mae) for two 1-D vectors."""
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    spearman, _ = stats.spearmanr(y_true, y_pred)
    kendall, _ = stats.kendalltau(y_true, y_pred)
    pearson, _ = pearsonr(y_true, y_pred)
    return pearson, kendall, spearman, mse, math.sqrt(mse), mae


# Test-cohort metrics, one entry per drug.
pearson_corr = []
kendalltau_corr = []
spearmanr_corr = []
MSE_metric = []
RMSE_metric = []
MAE_metric = []
# Validation-split metrics (named "*_train" for historical reasons), one entry per drug.
pearson_corr_train = []
kendalltau_corr_train = []
spearmanr_corr_train = []
MSE_metric_train = []
RMSE_metric_train = []
MAE_metric_train = []
Sscaler = StandardScaler()
start = perf_counter()

# Cleaning the expression matrices does not depend on the drug, so do it once
# instead of on every iteration.
ccle_ctrpv2 = remove_nan(ccle_ctrpv2)
gdse_rnaseq = remove_nan(gdse_rnaseq)

# Width of each expression table (sample column + features). After the merge
# the response column is appended behind these columns, so each merged frame
# must be sliced with the width of its *own* source table.
n_train_cols = len(ccle_ctrpv2.columns)
n_test_cols = len(gdse_rnaseq.columns)

for drug in drug_names:

    # --- Responses for this drug, without samples lacking a measurement ----
    y_train = remove_nan(ccle_ctrpv2_aac[['sample', drug]])
    y_test = remove_nan(gdse_aac[['sample', drug]])

    # --- Align features and responses on the sample identifier -------------
    gdse_rnaseq_merged_df = pd.merge(gdse_rnaseq, y_test, on='sample', how='inner')
    ccle_ctrpv2_merged_df = pd.merge(ccle_ctrpv2, y_train, on='sample', how='inner')

    X_train = ccle_ctrpv2_merged_df.iloc[:, 1:n_train_cols]
    X_test = gdse_rnaseq_merged_df.iloc[:, 1:n_test_cols]

    Y_train = ccle_ctrpv2_merged_df.iloc[:, n_train_cols:]
    Y_test = gdse_rnaseq_merged_df.iloc[:, n_test_cols:]

    # --- Standardise --------------------------------------------------------
    X_train_norm = Sscaler.fit_transform(X_train)
    X_train_normalized = pd.DataFrame(X_train_norm, columns=X_train.columns)

    # NOTE(review): the scaler is re-fitted on the test cohort, i.e. the test
    # features are standardised with their own mean/variance rather than the
    # training statistics. Kept as in the original; confirm this per-cohort
    # scaling is intended, otherwise use `Sscaler.transform(X_test)`.
    X_test_norm = Sscaler.fit_transform(X_test)
    X_test_normalized = pd.DataFrame(X_test_norm, columns=X_test.columns)

    # --- Feature selection --------------------------------------------------
    # NOTE(review): mRMR sees every training row, including those that end up
    # in the validation split below (unchanged from the original).
    selected_features = mrmr.mrmr_regression(X_train_normalized, Y_train, N_SELECTED_FEATURES)
    X_train_new = X_train_normalized[selected_features]
    # Requires the selected columns to exist in the test table as well.
    X_test_new = X_test_normalized[selected_features]

    # --- Hold out a validation split ----------------------------------------
    X_split, X_val, Y_split, y_val = train_test_split(
        X_train_new, Y_train, test_size=VAL_FRACTION, random_state=RANDOM_STATE
    )
    X_fit = np.asarray(X_split)
    y_fit = np.asarray(Y_split).ravel()

    # --- Choose the best forest by K-fold cross-validation ------------------
    # One forest is trained per fold on the selected features and the one with
    # the lowest held-out MSE is kept. (Previously the index of the best fold
    # was used to pick a single tree out of a forest, so the "best model" was
    # an arbitrary depth-2 decision tree unrelated to the CV scores.)
    kfold = KFold(n_splits=NUM_FOLDS)
    best_model = None
    best_cv_mse = math.inf
    for fit_idx, holdout_idx in kfold.split(X_fit):
        candidate = RandomForestRegressor(
            max_depth=2, n_estimators=20, random_state=RANDOM_STATE
        )
        candidate.fit(X_fit[fit_idx], y_fit[fit_idx])
        fold_mse = mean_squared_error(
            y_fit[holdout_idx], candidate.predict(X_fit[holdout_idx])
        )
        if fold_mse < best_cv_mse:
            best_model, best_cv_mse = candidate, fold_mse

    # --- Validation metrics (rows never seen by the chosen forest) ----------
    predictions_train = best_model.predict(np.asarray(X_val))
    (pearson_correlation_train, kendalltau_correlation_train,
     spearmanr_correlation_train, mse_train, rmse_train,
     mae_train) = _regression_metrics(y_val, predictions_train)

    pearson_corr_train.append(pearson_correlation_train)
    kendalltau_corr_train.append(kendalltau_correlation_train)
    spearmanr_corr_train.append(spearmanr_correlation_train)
    MSE_metric_train.append(mse_train)
    RMSE_metric_train.append(rmse_train)
    MAE_metric_train.append(mae_train)

    # --- Test-cohort metrics ------------------------------------------------
    predictions = best_model.predict(np.asarray(X_test_new))
    (pearson_correlation, kendalltau_correlation, spearmanr_correlation,
     mse, rmse, mae) = _regression_metrics(Y_test, predictions)

    pearson_corr.append(pearson_correlation)
    kendalltau_corr.append(kendalltau_correlation)
    spearmanr_corr.append(spearmanr_correlation)
    MSE_metric.append(mse)
    RMSE_metric.append(rmse)
    MAE_metric.append(mae)

end = perf_counter()
    
    

100%|██████████| 100/100 [02:50<00:00,  1.71s/it]
100%|██████████| 100/100 [02:53<00:00,  1.73s/it]
100%|██████████| 100/100 [02:45<00:00,  1.65s/it]
100%|██████████| 100/100 [02:45<00:00,  1.66s/it]
100%|██████████| 100/100 [02:23<00:00,  1.44s/it]
100%|██████████| 100/100 [02:51<00:00,  1.72s/it]
100%|██████████| 100/100 [02:45<00:00,  1.65s/it]
100%|██████████| 100/100 [02:51<00:00,  1.71s/it]
100%|██████████| 100/100 [02:48<00:00,  1.69s/it]
100%|██████████| 100/100 [02:48<00:00,  1.68s/it]
100%|██████████| 100/100 [02:47<00:00,  1.68s/it]
100%|██████████| 100/100 [02:47<00:00,  1.67s/it]
100%|██████████| 100/100 [02:45<00:00,  1.66s/it]
100%|██████████| 100/100 [02:46<00:00,  1.66s/it]
100%|██████████| 100/100 [02:48<00:00,  1.68s/it]
100%|██████████| 100/100 [02:47<00:00,  1.68s/it]
100%|██████████| 100/100 [03:13<00:00,  1.94s/it]
100%|██████████| 100/100 [03:41<00:00,  2.21s/it]
100%|██████████| 100/100 [03:38<00:00,  2.19s/it]
100%|██████████| 100/100 [03:38<00:00,  2.19s/it]


In [12]:
# Wall-clock duration of the whole training loop (all drugs together).
execution_time = end - start

# `DataFrame.to_csv` does not create missing parent directories; make sure the
# output folder exists so a long run is not lost at the very last step.
os.makedirs('RF-results', exist_ok=True)

# Test-cohort metrics, one row per drug. "Time" is the total loop duration
# repeated on every row, not a per-drug timing.
df = pd.DataFrame({"Drugs": drug_names, "pearsonCor": pearson_corr , "spearmanCor": spearmanr_corr , "kendallCor": kendalltau_corr , 
                  "RMSE": RMSE_metric ,  "MSE": MSE_metric , "MAE": MAE_metric , "Time": execution_time})
df.to_csv('RF-results/Result_MRMR_RandomForrest_test.csv', index=False)

# Metrics on the validation split held out from the training cohort
# (stored in the "*_train" lists), same layout as above.
df = pd.DataFrame({"Drugs": drug_names, "pearsonCor": pearson_corr_train , "spearmanCor": spearmanr_corr_train , "kendallCor": kendalltau_corr_train , 
                  "RMSE": RMSE_metric_train ,  "MSE": MSE_metric_train , "MAE": MAE_metric_train , "Time": execution_time})
df.to_csv('RF-results/Result_MRMR_RandomForrest_train.csv', index=False)