## Calling the libraries

In [12]:
import pandas as pd
import numpy as np
import math
import itertools
import matplotlib.pyplot as plt

from sklearn import model_selection, metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from scipy import stats


from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, r2_score, f1_score,mean_squared_error

from sklearn.svm import SVR

from numpy.random import seed
seed(1)

### Degining the functions

In [2]:
def complete_model(trainx, trainy, testx, testy, model):
    model.fit(trainx, trainy)
    y_pred = model.predict(testx)
    return calculation(testy, y_pred)

In [3]:
def calculation(y_test, y_pred):  
    mse=mean_squared_error(y_test,y_pred)
    rmse=np.sqrt(mse)
    r2=metrics.r2_score(y_test,y_pred)
    pcc=stats.pearsonr(y_test,y_pred)
    return [mse, rmse, r2, pcc[0],pcc[1]]

In [4]:
def print_output(model_name, val, output_val):
    print('\033[1mOptimized {} model {} performance: \033[0m'.format(model_name, val))
    print("MSE:         {0:.3f}".format(output_val[0]))
    print("RMSE: {0:.3f}".format(output_val[1]))
    print("R2: {0:.3f}".format(output_val[2]))
    print("PCC:    {0:.3f}".format(output_val[3]))
    print("p-value:    {0:.3f}".format(output_val[4]))

#### Read data (training files)

In [6]:
## Read the file
overall_data =pd.read_csv("../SARS/sars_boruta_grepped_features_train.csv")

### Remove last column from the file
overall_data = overall_data.iloc[:, :-1]

## Replace the string character
overall_data.replace('?',0,inplace=True)


overall_data.head()

Unnamed: 0,Name,AATS3s,AATS4s,AATSC0c,AATSC1p,MATS1p,MATS1i,VE1_Dzs,VE2_Dzs,TDB5e,...,PubchemFP497,PubchemFP609,PubchemFP622,PubchemFP674,KRFP2018,KRFP2135,KRFP4772,AD2D174,KRFPC2135,KRFPC2975
0,5.18,2.034846,1.948337,0.007198,-0.014797,-0.059715,-0.211553,0.493891,0.02245,34.024208,...,0,0,0,0,0,0,0,0,0.0,9.0
1,5.1,2.168422,2.021814,0.011219,-0.007268,-0.029721,-0.185476,0.45791,0.019909,32.854418,...,0,0,0,0,0,0,0,0,0.0,9.0
2,5.89,2.586843,2.484821,0.01104,0.001238,0.004958,-0.123107,0.285269,0.011411,35.089963,...,0,0,0,0,0,0,0,0,0.0,5.0
3,5.3,6.735119,5.824755,0.039992,-0.005964,-0.029783,-0.20113,0.229938,0.012774,43.154668,...,0,0,0,0,0,0,0,0,0.0,9.0
4,4.03,2.308201,2.173413,0.006154,-0.008504,-0.034845,-0.098218,0.413094,0.014753,35.676718,...,0,0,0,0,0,0,0,0,0.0,8.0


#### Independent validation dataset

In [7]:
test = pd.read_csv("../SARS_CoV-2/sars_cov2_boruta_grepped_features_test.csv", low_memory=False)


test = test.iloc[:, :-1]

## Replace the string character from the test dataset
test.replace('?',0,inplace=True)

## Spliting the files in x and y variables [test dataset]
X_val = test.iloc[:,1:].values
y_val = test.loc[:,'Name'].values

X_val[:] = np.nan_to_num(X_val)
y_val[:] = np.nan_to_num(y_val)

#### Spliting the files in x and y variables [training dataset]

In [9]:
x = overall_data.iloc[:,1:].values
y = overall_data.loc[:,'Name'].values

x[:] = np.nan_to_num(x)
y[:] = np.nan_to_num(y)

#print(x,y)

### Call the defined functions

In [10]:
column_names =  ['Range_val', 'Model_info','MAE', 'RMSE','R2', 'PCC','p-val']

In [11]:
output  = pd.DataFrame(columns = column_names)

## Change the kernels; C and gamma values for model optimization
# E.g. kernels = ['poly', 'rbf'] 
# Cs = [0.01, 0.1, 1, 10, 100]
# gammas = [0.1, 0.01, 0.001, 0.0001]

kernels = ['rbf']
Cs = [0.01,0.1]
gammas = [0.1, 0.01]


param = [k for k in itertools.product(kernels, Cs, gammas)]

for i in range(len(param)):
    params=param[i]
    kernel, C, gamma = params[0], params[1], params[2]
    #cv = RepeatedKFold(n_splits=10, n_repeats=repeats, random_state=1)
    model = SVR(C=C, kernel=kernel, gamma=gamma)
    for j in range(100):
        param_name = 'svm_'+'param_'+str(i)+'_kernel_'+params[0]+'_C_'+str(params[1])+'_gamma_'+str(params[2])
        #print(i, j)
        x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1, random_state=j)
        output_val = complete_model(trainx=x_train, trainy=y_train, testx=x_test, testy=y_test, model=model)
        output.loc[len(output)] = [str(j), param_name, output_val[0], output_val[1], output_val[2], output_val[3],output_val[4]]

output.to_csv(r'''svm_training_metrics_all_10.csv''')

### Print the output of best performing SVM model

In [65]:
### SVM
final_svm = SVR(C=0.1, kernel='rbf', gamma=0.01)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1, random_state=62)
### Optimized SVM model training performance
print_output('SVM', 'training', complete_model(x_train, y_train, x_test, y_test, final_svm))
### Optimized SVM model testing performance
print_result('SVM', 'testing', complete_model(X_train, y_train, X_val, y_val, final_svm))

[1mOptimized SVM model training performance: [0m
MSE:         1.217
RMSE: 1.103
R2: 0.088
PCC:    0.808
p-value:    0.000
