## Calling the libraries

In [1]:
import pandas as pd
import numpy as np
import math
import itertools
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn import model_selection, metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from scipy import stats


from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, r2_score, f1_score,mean_squared_error

from sklearn.svm import SVR

from numpy.random import seed
seed(1)

### Defining the functions

In [2]:
def complete_model(trainx, trainy, testx, testy, model):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    trainx, trainy : features and targets passed to ``model.fit``.
    testx : features passed to ``model.predict``.
    testy : reference targets the predictions are compared against.
    model : estimator object exposing ``fit`` and ``predict``.

    Returns
    -------
    Whatever ``calculation(testy, predictions)`` returns.
    """
    # Fitting happens on the object passed in, so the caller's model is
    # (re)trained by this call.
    model.fit(trainx, trainy)
    predictions = model.predict(testx)
    scores = calculation(testy, predictions)
    return scores

In [3]:
def calculation(y_test, y_pred):
    """Compute regression metrics for predictions against reference values.

    Parameters
    ----------
    y_test : reference target values.
    y_pred : predicted target values.

    Returns
    -------
    list
        ``[mse, rmse, r2, pcc, p_value]`` where ``rmse`` is the square root
        of ``mse`` and ``pcc`` / ``p_value`` are the two values returned by
        ``stats.pearsonr``.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    root_squared_error = np.sqrt(squared_error)
    determination = metrics.r2_score(y_test, y_pred)
    correlation, p_value = stats.pearsonr(y_test, y_pred)
    return [squared_error, root_squared_error, determination, correlation, p_value]

In [4]:
def print_output(model_name, val, output_val):
    """Print a bold header followed by five metrics, three decimals each.

    Parameters
    ----------
    model_name : text inserted after "Optimized" in the header.
    val : text inserted before "performance" in the header.
    output_val : indexable holding, in order, MSE, RMSE, R2, PCC and p-value
        at positions 0 through 4.
    """
    # The \033[1m ... \033[0m escape codes wrap the header line.
    header = '\033[1mOptimized {} model {} performance: \033[0m'
    print(header.format(model_name, val))

    # One template per metric, listed in the same order as output_val.
    line_templates = (
        "MSE:         {0:.3f}",
        "RMSE: {0:.3f}",
        "R2: {0:.3f}",
        "PCC:    {0:.3f}",
        "p-value:    {0:.3f}",
    )
    for position, template in enumerate(line_templates):
        print(template.format(output_val[position]))

#### Read data (training files)

In [5]:
## Load the training table in one chain:
##   1. read the CSV,
##   2. drop the last column,
##   3. replace every '?' string cell with 0.
overall_data = (
    pd.read_csv("NS5A_top50_SVR_MLT_train_input.csv")
    .iloc[:, :-1]
    .replace('?', 0)
)

## Preview the first rows
overall_data.head()

Unnamed: 0,pIC50,ATSC5p,VR1_DzZ,SpMin6_Bhv,SpMin3_Bhe,C4SP3,VC-3,maxHBint4,nF8Ring,TDB10u,...,GraphFP770,MACCSFP140,PubchemFP20,KRFP19,KRFP413,KRFP1412,KRFP3550,AD2D603,KRFPC3139,KRFPC3709
0,7.0,-1.407749,362.878746,1.549652,1.778676,0,1.820869,2.800691,0.0,9.646082,...,1,1,1,0,0,0,0,0,0.0,0.0
1,10.142668,6.351432,1944.062619,1.768022,1.725425,1,3.565112,0.0,0.0,8.4838,...,1,1,1,0,0,0,0,0,0.0,0.0
2,6.522879,0.244588,460.725548,1.480215,1.753079,0,1.063571,0.0,0.0,8.677389,...,1,0,0,0,0,0,0,0,0.0,0.0
3,11.045757,-3.404421,956.331384,1.782068,1.8485,0,2.505661,0.0,0.0,8.86281,...,1,1,1,0,0,0,0,0,0.0,0.0
4,10.886057,5.213243,856.934759,1.705756,1.695694,0,1.77796,0.0,0.0,8.268452,...,1,1,1,0,0,0,0,0,0.0,0.0


#### Independent validation dataset

In [6]:
## Read the independent validation file
test = pd.read_csv("NS5A_top50_SVR_MLT_test_input.csv", low_memory=False)

## Remove the last column from the file
test = test.iloc[:, :-1]

## Replace the '?' string character in the test dataset with 0
test = test.replace('?', 0)

## Splitting the file into x and y variables [test dataset]
# X_val takes every column from position 1 onward and y_val is the 'pIC50'
# column (presumably 'pIC50' is the column at position 0 -- verify against the
# CSV header).
# np.nan_to_num returns a new array with NaN replaced by 0. Binding the names
# to that result, instead of writing in place into the arrays returned by
# `.values`, avoids failing when pandas hands back read-only views
# (Copy-on-Write) and leaves `test` itself unmodified.
X_val = np.nan_to_num(test.iloc[:, 1:].values)
y_val = np.nan_to_num(test.loc[:, 'pIC50'].values)

print(y_val)
print(X_val)

[ 7.95860731  6.21824463  7.03151705  7.15490196  6.1414628   5.69897
 11.74472749 11.26760624 11.69897    10.65757732 10.13076828 10.67778071
  9.16430943  9.82390874  6.19246497  8.22184875 10.85387196 11.09691001
  8.39794001 11.25963731 10.66354027 13.          7.23657201 10.69897
 10.67778071 11.04575749  6.92811799  6.20971484  9.46852108  8.15490196
 10.4436975  11.39794001  6.20481541 10.7212464   9.98716278 10.29242982
  6.79588002 10.69897     9.30103    11.31875876  6.60730305 11.30103
  6.36311109 11.24412514  8.40893539  9.94692156]
[[-5.44097480e+00  6.25682243e+02  1.39610826e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-5.79118133e+00  3.49163596e+02  1.39281117e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-1.02012251e+01  4.41113411e+02  1.37337906e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 6.59286281e+00  1.41319879e+03  1.70945830e+00 ...  0.00000000e+00
   1.00000000e+00  0.00000000e+00]
 [ 9.78975985e+00  4.65

#### Splitting the files in x and y variables [training dataset]

In [7]:
# X takes every column from position 1 onward and y is the 'pIC50' column
# (presumably 'pIC50' is the column at position 0 -- verify against the CSV
# header).
# np.nan_to_num returns a new array with NaN replaced by 0. Binding the names
# to that result, instead of writing in place into the arrays returned by
# `.values`, avoids failing when pandas hands back read-only views
# (Copy-on-Write) and leaves `overall_data` itself unmodified.
X = np.nan_to_num(overall_data.iloc[:, 1:].values)
y = np.nan_to_num(overall_data.loc[:, 'pIC50'].values)

print(X, y)

[[-1.40774940e+00  3.62878746e+02  1.54965151e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 6.35143180e+00  1.94406262e+03  1.76802230e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 2.44588309e-01  4.60725548e+02  1.48021483e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [-3.98222102e+00  1.10904162e+03  1.80564131e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-1.74128724e+00  1.06038028e+03  1.73389208e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-1.74128724e+00  1.06038028e+03  1.73389208e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]] [ 7.         10.1426675   6.52287875 11.04575749 10.88605665  8.1426675
 10.63827216 11.52287875 11.52287875 11.20065945  5.69897    10.88605665
 11.15490196  6.13667714 10.58502665 10.85387196 10.67778071  6.25963731
  5.69897    11.04575749  9.          8.52287875 11.30103     7.24412514
  9.7212464   8.43179828  9.71219827  5.74593555  6.16241156 10.85387196


### Call the defined functions

In [8]:
# Header for the per-split results table. The five metric columns follow the
# order returned by calculation(): MSE, RMSE, R2, PCC, p-value. The third
# column holds the mean squared error, so it is labelled 'MSE' (not 'MAE').
column_names = ['Range_val', 'Model_info', 'MSE', 'RMSE', 'R2', 'PCC', 'p-val']

In [9]:
## Change the kernels; C and gamma values for model optimization
# E.g. kernels = ['poly', 'rbf'] 
# Cs = [0.01, 0.1, 1, 10, 100]
# gammas = [0.1, 0.01, 0.001, 0.0001]

kernels = ['rbf']
Cs = [0.01, 0.1]
gammas = [0.1, 0.01]

# Number of random 90/10 splits evaluated per parameter combination; the split
# index is also used as the random_state of that split.
n_repeats = 100

# Every (kernel, C, gamma) combination to evaluate.
param = list(itertools.product(kernels, Cs, gammas))

# Collect one record per (combination, split) and build the DataFrame once at
# the end instead of appending to it row by row.
records = []
for i, params in enumerate(param):
    kernel, C, gamma = params
    model = SVR(C=C, kernel=kernel, gamma=gamma)
    # The label depends only on the parameter combination, so build it once
    # per combination rather than once per split.
    param_name = 'svm_' + 'param_' + str(i) + '_kernel_' + kernel + '_C_' + str(C) + '_gamma_' + str(gamma)
    for j in range(n_repeats):
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=j)
        # complete_model refits `model` on this split's training part and
        # returns [mse, rmse, r2, pcc, p-value] for its held-out part.
        output_val = complete_model(trainx=x_train, trainy=y_train, testx=x_test, testy=y_test, model=model)
        records.append([str(j), param_name, output_val[0], output_val[1], output_val[2], output_val[3], output_val[4]])

output = pd.DataFrame(records, columns=column_names)

# output.to_csv(r'''svm_training_metrics_all_10.csv''')

### Print the output of best performing SVM model

In [10]:
### SVM
final_svm = SVR(C=0.1, kernel='rbf', gamma=0.01)
# Split the training arrays `X` / `y` defined above (the original referenced an
# undefined lowercase `x`).
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=62)
### Optimized SVM model training performance
# Reported on the 10% held-out part of the training file (x_test / y_test).
print_output('SVM', 'training', complete_model(x_train, y_train, x_test, y_test, final_svm))
### Optimized SVM model testing performance
# complete_model refits final_svm on the same training split, then scores it on
# the independent validation arrays X_val / y_val. Uses print_output and
# x_train, which are the names actually defined in this notebook.
print_output('SVM', 'testing', complete_model(x_train, y_train, X_val, y_val, final_svm))