## Download Data from Google-Drive

In [None]:
'''
File Name : Data
File Link : https://drive.google.com/file/d/1rWS8Jj19ZOzdXFR4jUjmHIx_fTRyYzOw/view?usp=share_link
File Id : '1rWS8Jj19ZOzdXFR4jUjmHIx_fTRyYzOw'

'''
!gdown --id 1rWS8Jj19ZOzdXFR4jUjmHIx_fTRyYzOw
!unzip ccle_ctrpv2_gdse.zip

## Import necessary libraries

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import cross_val_score, KFold
from genetic_selection import GeneticSelectionCV
from sklearn.metrics import mean_squared_error , mean_absolute_error
from time import perf_counter, sleep
import math
import scipy.stats as stats
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

## Read train and test data

In [20]:
# Directory holding the extracted CSV files, relative to this notebook.
DATA_DIR = "../../data"

# Training features: CCLE/CTRPv2 RNA-seq expression table (merged with the
# training response on the sample column further down).
ccle_ctrpv2 = pd.read_csv(DATA_DIR + "/ccle_ctrpv2_rnaseq_tpm.csv")
# Training targets: CCLE/CTRPv2 drug-response (AAC) table, one column per drug.
ccle_ctrpv2_aac = pd.read_csv(DATA_DIR + "/ccle_ctrpv2_aac.csv")
# Test features: GDSE RNA-seq expression table.
gdse_rnaseq = pd.read_csv(DATA_DIR + "/gdse_rnaseq_tpm.csv")
# Test targets: GDSE drug-response (AAC) table; its columns define the drug list.
gdse_aac = pd.read_csv(DATA_DIR + "/gdse_aac.csv")


## Cleaning the dataframes

In [21]:
# Snapshot every column label of the GDSE response table. The list still
# contains the leading identifier column at this point.
drug_names = gdse_aac.columns.tolist()

# read_csv labels a header-less first column "Unnamed: 0"; give it the same
# meaningful name in all four tables so they can later be joined on it.
_SAMPLE_COLUMN = {"Unnamed: 0": "sample"}
ccle_ctrpv2, gdse_rnaseq, ccle_ctrpv2_aac, gdse_aac = (
    frame.rename(columns=_SAMPLE_COLUMN)
    for frame in (ccle_ctrpv2, gdse_rnaseq, ccle_ctrpv2_aac, gdse_aac)
)

In [22]:
# Every column of the GDSE response table except the leading identifier column
# is a drug. The list is rebuilt from the frame rather than sliced in place,
# so re-running this cell no longer drops one more drug each time.
drug_names = gdse_aac.columns.values.tolist()[1:]

In [23]:
def remove_nan(data):
    """Return a copy of ``data`` without incomplete rows, re-indexed from 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that may contain NaN entries. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that contain no NaN, with a fresh 0..n-1 index.
    """
    # reset_index returns a new frame; the result has to be kept, otherwise
    # the gaps left by the dropped rows stay in the index.
    return data.dropna().reset_index(drop=True)

# Feature selection method

    Genetic algorithm
    
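**Install**:     `!pip install sklearn-genetic feature-selection-ga` (`sklearn-genetic` provides the `genetic_selection.GeneticSelectionCV` selector used in the training loop)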

In [24]:
from feature_selection_ga import FeatureSelectionGA, FitnessFunction

# Training procedure

*For each drug in the drug list:*
   1. Split the data into train and test sets, and then remove the NaN values.
   2. Merge X_train with y_train, and X_test with y_test, on the sample ID, because the expression and response tables do not cover the same samples.
   3. Normalize the training X. <br>
   
   *Running the following 6 times:*
   
       4. Applying 4-fold Cross Validation
       5. Applying the relevant features selection technique, and extract the selected features.
       6. Train the model based on the CV data and extracted features
       7. Saving the best model performed in CV and the best features
       
   8. Predicting the test set using the best model from the previous section
   9. Calculating the correlation score between the predicted drug_response and the actual drug_response

In [25]:
# Metrics on the GDSE test cohort, one entry per drug in drug_names order.
pearson_corr = []
kendalltau_corr = []
spearmanr_corr = []
MSE_metric = []
RMSE_metric = []
MAE_metric = []
# Metrics on the 25% validation split held out from the training cohort.
# The "_train" suffix is kept because the saving cell refers to these names.
pearson_corr_train = []
kendalltau_corr_train = []
spearmanr_corr_train = []
MSE_metric_train = []
RMSE_metric_train = []
MAE_metric_train = []
Sscaler = StandardScaler()


def _score_predictions(y_true, y_pred):
    """Score one prediction vector against its targets.

    Both arguments are flattened to 1-D first. Returns the 6-tuple
    ``(pearson, kendall, spearman, mse, rmse, mae)``; the p-values returned
    by the correlation functions are discarded.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    mse = mean_squared_error(y_true, y_pred)
    pearson, _ = pearsonr(y_true, y_pred)
    kendall, _ = stats.kendalltau(y_true, y_pred)
    spearman, _ = stats.spearmanr(y_true, y_pred)
    return pearson, kendall, spearman, mse, math.sqrt(mse), mean_absolute_error(y_true, y_pred)


start = perf_counter()

# Dropping incomplete expression rows does not depend on the drug, so it is
# done once here instead of once per loop iteration.
ccle_ctrpv2 = remove_nan(ccle_ctrpv2)
gdse_rnaseq = remove_nan(gdse_rnaseq)

# The slicing below assumes column 0 of each expression table is 'sample' and
# every following column is a feature. Each frame is sliced by its own width.
n_train_cols = len(ccle_ctrpv2.columns)
n_test_cols = len(gdse_rnaseq.columns)

for drug in drug_names:

    # --- Targets: keep only the samples that have a response for this drug.
    y_train = remove_nan(ccle_ctrpv2_aac[['sample', drug]])
    y_test = remove_nan(gdse_aac[['sample', drug]])

    # --- Align features and targets on the sample ID. The inner join keeps
    # only samples present in both tables; the drug column is appended last.
    gdse_rnaseq_merged_df = pd.merge(gdse_rnaseq, y_test, on='sample', how='inner')
    ccle_ctrpv2_merged_df = pd.merge(ccle_ctrpv2, y_train, on='sample', how='inner')

    X_train = ccle_ctrpv2_merged_df.iloc[:, 1:n_train_cols]
    Y_train = ccle_ctrpv2_merged_df.iloc[:, n_train_cols:]
    X_test = gdse_rnaseq_merged_df.iloc[:, 1:n_test_cols]
    Y_test = gdse_rnaseq_merged_df.iloc[:, n_test_cols:]

    # --- Standardise the features.
    X_train_norm = Sscaler.fit_transform(X_train)
    X_train_normalized = pd.DataFrame(X_train_norm, columns=X_train.columns)

    # NOTE(review): the scaler is re-fitted on the test cohort here, so the
    # test features are standardised with their own mean/variance rather than
    # with the training statistics. Behaviour is left unchanged because it
    # affects every reported score; confirm that per-cohort scaling is
    # intended (the conventional alternative is Sscaler.transform(X_test)).
    X_test_norm = Sscaler.fit_transform(X_test)
    X_test_normalized = pd.DataFrame(X_test_norm, columns=X_test.columns)

    X = np.asarray(X_train_normalized)
    Y = np.asarray(Y_train).ravel()
    Y_test_array = np.asarray(Y_test).ravel()

    # --- Model. It is not fitted on the full feature matrix here: that fit
    # was discarded anyway, since the model is refitted on the selected
    # features below before any prediction is made.
    estimators = ElasticNetCV(
        cv=4,
        random_state=0
    )

    # --- Genetic-algorithm feature selection on the full training cohort.
    # NOTE(review): no random seed is set for the genetic search, so the
    # selected features presumably differ between runs - confirm and seed if
    # reproducibility is required.
    # NOTE(review): n_gen_no_change (20) is larger than n_generations (5), so
    # the early-stopping criterion presumably never triggers - confirm.
    selectors = GeneticSelectionCV(estimators,
                                  cv=4,
                                  verbose=4,
                                  scoring="neg_mean_squared_error",
                                  max_features=300,
                                  n_population=70,
                                  crossover_proba=0.7,
                                  mutation_proba=0.4,
                                  n_generations=5,
                                  crossover_independent_proba=0.7,
                                  mutation_independent_proba=0.07,
                                  tournament_size=5,
                                  n_gen_no_change=20,
                                  caching=True,
                                  n_jobs=-4)

    selectors = selectors.fit(X, Y)
    # Column positions of the selected features, in ascending order.
    selected_features = np.where(selectors.support_)[0]
    # Keep at most 100 of them. This takes the 100 lowest column positions,
    # not the 100 "best" features: the support mask carries no ranking.
    selected_features = selected_features[:100]

    # --- Fit the final model on 75% of the training cohort and validate it
    # on the remaining 25%. The 1-D target is passed so the model does not
    # have to squeeze a single-column frame.
    X_train_new = X_train_normalized.iloc[:, selected_features]
    X_split, X_val, Y_split, y_val = train_test_split(X_train_new, Y, test_size=0.25, random_state=1)
    estimators.fit(X_split, Y_split)

    predictions_train = estimators.predict(X_val)
    (pearson_correlation_train, kendalltau_correlation_train, spearmanr_correlation_train,
     mse_train, rmse_train, mae_train) = _score_predictions(y_val, predictions_train)

    pearson_corr_train.append(pearson_correlation_train)
    kendalltau_corr_train.append(kendalltau_correlation_train)
    spearmanr_corr_train.append(spearmanr_correlation_train)
    MSE_metric_train.append(mse_train)
    RMSE_metric_train.append(rmse_train)
    MAE_metric_train.append(mae_train)

    # --- Evaluate on the test cohort. The features are picked by column
    # position, which assumes both expression tables list the same features
    # in the same order - TODO confirm.
    X_test_new = X_test_normalized.iloc[:, selected_features]
    predictions = estimators.predict(X_test_new)
    (pearson_correlation, kendalltau_correlation, spearmanr_correlation,
     mse, rmse, mae) = _score_predictions(Y_test_array, predictions)

    pearson_corr.append(pearson_correlation)
    kendalltau_corr.append(kendalltau_correlation)
    spearmanr_corr.append(spearmanr_correlation)
    MSE_metric.append(mse)
    RMSE_metric.append(rmse)
    MAE_metric.append(mae)

end = perf_counter()

Selecting features with genetic algorithm.
gen	nevals	avg                               	std                               	min                            	max                               
0  	70    	[ -0.004787 127.5        0.000598]	[  0.000312  85.856816   0.000103]	[-0.006225  1.        0.000425]	[ -0.004388 287.         0.000937]
1  	61    	[-4571.431023   821.271429  4571.428888]	[ 4981.596543   682.731529  4981.598502]	[-10000.           89.            0.000411]	[   -0.004319  1638.       10000.      ]
2  	61    	[-3428.574347   679.6       3428.571787]	[ 4746.640099   658.21376   4746.641948]	[-10000.          154.            0.000393]	[   -0.004298  2320.       10000.      ]
3  	61    	[-3428.57429    696.        3428.571796]	[ 4746.640141   644.119909  4746.641942]	[-10000.          174.            0.000489]	[   -0.004253  1658.       10000.      ]
4  	51    	[-3714.288419   727.185714  3714.286071]	[ 4831.864928   655.652419  4831.866733]	[-10000.          187.            

4  	57    	[-4142.866255   781.9       4142.859205]	[ 4925.975093   673.329237  4925.981023]	[-10000.          174.            0.002396]	[   -0.014501  1877.       10000.      ]
5  	56    	[-4285.722969   792.742857  4285.716177]	[ 4948.709073   694.297583  4948.714955]	[-10000.          172.            0.002424]	[   -0.014393  2418.       10000.      ]
Selecting features with genetic algorithm.
gen	nevals	avg                               	std                               	min                            	max                               
0  	70    	[ -0.009158 150.7        0.002153]	[  0.000168  89.278913   0.000097]	[-0.009657  2.        0.001866]	[ -0.008837 299.         0.002391]
1  	50    	[-3428.577328   597.814286  3428.572782]	[ 4746.637946   630.963872  4746.64123 ]	[-10000.           40.            0.001799]	[   -0.008768  1680.       10000.      ]
2  	57    	[-4142.862356   748.885714  4142.858318]	[ 4925.978373   680.134578  4925.981768]	[-10000.          105.            

2  	61    	[-3714.28912    732.042857  3714.286154]	[ 4831.864389   662.536456  4831.866669]	[-10000.         140.           0.00054]   	[   -0.005199  1715.       10000.      ]
3  	54    	[-3285.717893   650.7       3285.714754]	[ 4696.935255   623.966239  4696.93745 ]	[-10000.          140.            0.000549]	[   -0.005127  1659.       10000.      ]
4  	55    	[-3285.717837   695.385714  3285.714753]	[ 4696.935294   686.440119  4696.937451]	[-10000.          140.            0.000511]	[   -0.005096  2791.       10000.      ]
5  	58    	[-4714.28847    868.028571  4714.28607 ]	[ 4991.827457   675.033565  4991.829724]	[-10000.          140.            0.000503]	[   -0.005001  1706.       10000.      ]
Selecting features with genetic algorithm.
gen	nevals	avg                               	std                               	min                            	max                               
0  	70    	[ -0.005387 145.514286   0.000982]	[  0.000276  94.610139   0.000076]	[-0.006319  3.  

1  	52    	[-4000.005492   737.157143  4000.000964]	[ 4898.975001   659.547196  4898.978698]	[-10000.         110.           0.00103]	[   -0.00863  1686.      10000.     ]
2  	59    	[-4285.719483   820.828571  4285.715157]	[ 4948.712092   705.58211   4948.715838]	[-10000.         110.           0.00103]	[   -0.00863  2283.      10000.     ]
3  	53    	[-4857.147457   884.557143  4857.143653]	[ 4997.954297   677.57118   4997.957993]	[-10000.          138.            0.001256]	[   -0.008543  1688.       10000.      ]
4  	62    	[-4142.862306   794.142857  4142.858031]	[ 4925.978414   671.792066  4925.98201 ]	[-10000.          164.            0.001269]	[   -0.008481  1977.       10000.      ]
5  	61    	[-4714.290278   871.528571  4714.286508]	[ 4991.82575    683.740943  4991.829311]	[-10000.          211.            0.001348]	[   -0.008455  1685.       10000.      ]
Selecting features with genetic algorithm.
gen	nevals	avg                               	std                              

Selecting features with genetic algorithm.
gen	nevals	avg                               	std                               	min                            	max                               
0  	70    	[ -0.033689 146.614286   0.005436]	[  0.00273   89.946221   0.001016]	[-0.042381  5.        0.002788]	[ -0.028821 295.         0.007963]
1  	54    	[-4428.588808   806.257143  4428.574255]	[ 4967.22412    679.317739  4967.237095]	[-10000.           66.            0.002788]	[   -0.028821  1681.       10000.      ]
2  	51    	[-4142.874824   789.828571  4142.860262]	[ 4925.967887   674.30095   4925.980134]	[-10000.           75.            0.002788]	[   -0.026861  1681.       10000.      ]
3  	56    	[-3142.87722    664.785714  3142.861013]	[ 4642.294067   629.635357  4642.305039]	[-10000.           75.            0.002788]	[   -0.026861  1687.       10000.      ]
4  	62    	[-3571.446897   717.528571  3571.432306]	[ 4791.560579   646.905043  4791.571454]	[-10000.         144.           0.

4  	57    	[-4285.719234   853.757143  4285.715188]	[ 4948.712308   671.015487  4948.715812]	[-10000.          247.            0.001454]	[   -0.00848  1689.      10000.     ]   
5  	56    	[-3000.006009   679.442857  3000.001118]	[ 4582.571761   622.46275   4582.574963]	[-10000.          252.            0.001448]	[   -0.008435  1705.       10000.      ]
Selecting features with genetic algorithm.
gen	nevals	avg                               	std                               	min                            	max                               
0  	70    	[ -0.01106  162.114286   0.00284 ]	[  0.000281  91.976945   0.000124]	[-0.01174   1.        0.002573]	[ -0.010328 298.         0.003177]
1  	51    	[-3714.292489   711.085714  3714.287496]	[ 4831.861799   673.851419  4831.865637]	[-10000.           65.            0.002476]	[   -0.010244  1697.       10000.      ]
2  	49    	[-2428.579466   548.5       2428.573558]	[ 4288.090025   604.438175  4288.093371]	[-10000.           95.            

## Saving the results

In [26]:
import os

# Wall-clock seconds spent in the whole training loop (all drugs together).
execution_time = end - start

# to_csv does not create missing parent directories; make sure the output
# folder exists so the results of the long training run are not lost.
os.makedirs('EN-results', exist_ok=True)

# Test-cohort scores, one row per drug. "Time" is the same total run time
# repeated on every row, not a per-drug timing.
df = pd.DataFrame({"Drugs": drug_names, "pearsonCor": pearson_corr , "spearmanCor": spearmanr_corr , "kendallCor": kendalltau_corr ,
                  "RMSE": RMSE_metric ,  "MSE": MSE_metric , "MAE": MAE_metric , "Time": execution_time})
df.to_csv('EN-results/Result_Genetic_ElasticNet_test.csv', index=False)

# Scores on the validation split of the training cohort (the "*_train" lists).
df = pd.DataFrame({"Drugs": drug_names, "pearsonCor": pearson_corr_train , "spearmanCor": spearmanr_corr_train , "kendallCor": kendalltau_corr_train ,
                  "RMSE": RMSE_metric_train ,  "MSE": MSE_metric_train , "MAE": MAE_metric_train , "Time": execution_time})
df.to_csv('EN-results/Result_Genetic_ElasticNet_train.csv', index=False)

In [27]:
# Per-drug Pearson correlation between predicted and actual response on the
# test set, in the same order as drug_names.
pearson_corr

[0.6641637120258999,
 0.6222729886248682,
 0.41886408278769416,
 0.4951982297904725,
 0.25856567604057484,
 0.5426670246721199,
 0.5588691142314816,
 0.4472682844203784,
 0.30606884463453776,
 0.14178661848451407,
 0.302501216926347,
 0.4562596220121463,
 0.30648530917596006,
 0.5348669715213124,
 0.2052812232130401,
 0.20273254848413813,
 0.45370917431940805,
 0.6148542161651251,
 0.24205576643723276,
 0.32610743848582097,
 0.4111556313951093,
 0.34286964173150414,
 0.4893895341028138,
 0.2774220449510006,
 0.39686630463354533,
 0.3826417492583676,
 0.5642778071590484,
 0.4516565791562821,
 0.4452499866262914,
 0.6055475133053878,
 0.35327835382881584,
 0.5853677040114573,
 0.17722812219170633,
 0.4533816415977996,
 0.2975458249340127,
 0.5085538752530541,
 0.4983910952114226,
 0.5967366821917834]