## Import the libraries

In [1]:
import pandas as pd
import numpy as np
import math
import itertools

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

from sklearn import model_selection, metrics
from sklearn.model_selection import train_test_split
from scipy import stats


from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, r2_score, f1_score,mean_squared_error

from sklearn.neighbors import KNeighborsRegressor

from numpy.random import seed
seed(1)

### Defining the functions

In [2]:
def complete_model(trainX, trainy, testX, testy, model):
    """Fit ``model`` on one split and score its predictions on another.

    Parameters
    ----------
    trainX, trainy
        Features and targets handed to ``model.fit``.
    testX, testy
        Features handed to ``model.predict`` and the targets the resulting
        predictions are compared against.
    model
        Object exposing ``fit`` and ``predict``; ``fit`` is called on it
        directly, so the passed-in object is the one that gets trained.

    Returns
    -------
    The value returned by ``calculation(testy, predictions)``.
    """
    model.fit(trainX, trainy)
    predictions = model.predict(testX)
    scores = calculation(testy, predictions)
    return scores

In [3]:
def calculation(y_test, y_pred):
    """Compute regression metrics for predictions against observed targets.

    Parameters
    ----------
    y_test
        Observed target values.
    y_pred
        Predicted target values, aligned with ``y_test``.

    Returns
    -------
    list
        ``[mse, rmse, r2, pcc, p_value]``: mean squared error, its square
        root, the R2 score, and the two values returned by
        ``scipy.stats.pearsonr`` (correlation coefficient and p-value).
    """
    squared_error = mean_squared_error(y_test, y_pred)
    root_squared_error = np.sqrt(squared_error)
    determination = metrics.r2_score(y_test, y_pred)
    # pearsonr yields the coefficient first and its p-value second.
    correlation, p_value = stats.pearsonr(y_test, y_pred)
    return [squared_error, root_squared_error, determination, correlation, p_value]

In [4]:
def print_output(model_name, val, output_val):
    """Print a bold header followed by five metrics at three decimals.

    Parameters
    ----------
    model_name
        Name inserted into the header line (e.g. ``'KNN'``).
    val
        Label for the evaluation stage inserted into the header line.
    output_val
        Indexable sequence whose first five items are printed, in order, as
        MSE, RMSE, R2, PCC and p-value.
    """
    header = '\033[1mOptimized {} model {} performance: \033[0m'
    print(header.format(model_name, val))

    # One template per metric, in the same order as the entries of output_val.
    metric_templates = (
        "MSE:         {0:.3f}",
        "RMSE: {0:.3f}",
        "R2: {0:.3f}",
        "PCC:    {0:.3f}",
        "p-value:    {0:.3f}",
    )
    for position, template in enumerate(metric_templates):
        print(template.format(output_val[position]))

## Read the dataset

In [5]:
## Load the training file, discard its last column, and swap every '?'
## placeholder for 0.
overall_data = (
    pd.read_csv("NS5A_top50_SVR_MLT_train_input.csv")
    .iloc[:, :-1]
    .replace('?', 0)
)

### Features are every column after the first; the target is the 'pIC50' column
X = overall_data.iloc[:, 1:].values
y = overall_data['pIC50'].values

### Overwrite both arrays in place with the output of np.nan_to_num
X[:] = np.nan_to_num(X)
y[:] = np.nan_to_num(y)

overall_data.head()

Unnamed: 0,pIC50,ATSC5p,VR1_DzZ,SpMin6_Bhv,SpMin3_Bhe,C4SP3,VC-3,maxHBint4,nF8Ring,TDB10u,...,GraphFP770,MACCSFP140,PubchemFP20,KRFP19,KRFP413,KRFP1412,KRFP3550,AD2D603,KRFPC3139,KRFPC3709
0,7.0,-1.407749,362.878746,1.549652,1.778676,0,1.820869,2.800691,0.0,9.646082,...,1,1,1,0,0,0,0,0,0.0,0.0
1,10.142668,6.351432,1944.062619,1.768022,1.725425,1,3.565112,0.0,0.0,8.4838,...,1,1,1,0,0,0,0,0,0.0,0.0
2,6.522879,0.244588,460.725548,1.480215,1.753079,0,1.063571,0.0,0.0,8.677389,...,1,0,0,0,0,0,0,0,0.0,0.0
3,11.045757,-3.404421,956.331384,1.782068,1.8485,0,2.505661,0.0,0.0,8.86281,...,1,1,1,0,0,0,0,0,0.0,0.0
4,10.886057,5.213243,856.934759,1.705756,1.695694,0,1.77796,0.0,0.0,8.268452,...,1,1,1,0,0,0,0,0,0.0,0.0


In [6]:
### Load the test file, discard its last column, and swap every '?'
### placeholder for 0.
test = (
    pd.read_csv("NS5A_top50_SVR_MLT_test_input.csv", low_memory=False)
    .iloc[:, :-1]
    .replace('?', 0)
)

## Features are every column after the first; the target is the 'pIC50' column
X_val = test.iloc[:, 1:].values
y_val = test['pIC50'].values

## Overwrite both arrays in place with the output of np.nan_to_num
X_val[:] = np.nan_to_num(X_val)
y_val[:] = np.nan_to_num(y_val)

test.head()

Unnamed: 0,pIC50,ATSC5p,VR1_DzZ,SpMin6_Bhv,SpMin3_Bhe,C4SP3,VC-3,maxHBint4,nF8Ring,TDB10u,...,GraphFP770,MACCSFP140,PubchemFP20,KRFP19,KRFP413,KRFP1412,KRFP3550,AD2D603,KRFPC3139,KRFPC3709
0,7.958607,-5.440975,625.682243,1.396108,1.699472,0,1.242232,2.761511,0.0,9.294378,...,1,1,1,0,0,0,0,0,0.0,0.0
1,6.218245,-5.791181,349.163596,1.392811,1.674385,0,1.081311,3.505246,0.0,9.760321,...,1,0,0,0,0,0,0,0,0.0,0.0
2,7.031517,-10.201225,441.113411,1.373379,1.654528,0,1.658526,2.794355,0.0,9.46597,...,1,1,1,0,0,0,0,0,0.0,0.0
3,7.154902,-3.877879,347.644681,1.366456,1.704803,0,1.190885,2.79358,0.0,9.098943,...,1,1,1,0,0,0,0,0,0.0,0.0
4,6.141463,-4.270221,566.594925,1.338042,1.580448,0,0.72492,2.827997,0.0,9.472881,...,1,1,1,0,0,0,0,0,0.0,0.0


### Evaluate the kNN models over repeated train/test splits

In [7]:
# Column labels for the per-split results table. The third column holds the
# first value returned by calculation(), which is the mean squared error,
# so it is labelled 'MSE' (it was previously mislabelled 'MAE').
column_names = ['Range_val', 'Model_info', 'MSE', 'RMSE', 'R2', 'PCC', 'p-val']

In [8]:
### Evaluate each k on 100 random 90/10 splits and collect the metrics.
# Rows are gathered in a plain list and converted to a DataFrame once at the
# end: inserting with output.loc[len(output)] inside the loop reallocates the
# frame on every row and leaves the metric columns with object dtype.
records = []

for k in [3,5,7,9,11]:
    model = KNeighborsRegressor(n_neighbors=k)
    print(model)

    for j in range(100):
        # j is both the random_state of the split and the row's 'Range_val' id.
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=j)
        output_val = complete_model(trainX=X_train, trainy=y_train, testX=X_test, testy=y_test, model=model)
        records.append([str(j), 'knn_k'+str(k), *output_val[:5]])

### dataframe holding the performance of every (k, split) combination
output = pd.DataFrame(records, columns=column_names)

# output.to_csv(r'''knn_training_metrics_all.csv''')

KNeighborsRegressor(n_neighbors=3)
KNeighborsRegressor()
KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor(n_neighbors=9)
KNeighborsRegressor(n_neighbors=11)


### Print the output of best performing kNN model

In [9]:
### KNN
# Selected configuration: k = 3, evaluated on a fixed 80/20 split of the
# training file (random_state=51).
optimized_knn = KNeighborsRegressor(n_neighbors=3)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=51)
### Optimized KNN model training performance (scored on the held-out 20% split)
# The reporting helper in this notebook is print_output; print_result is not
# defined anywhere and raised NameError.
print_output('KNN', 'training', complete_model(X_train, y_train, X_test, y_test, optimized_knn))
### Optimized KNN model testing performance (scored on the separate test file)
print_output('KNN', 'testing', complete_model(X_train, y_train, X_val, y_val, optimized_knn))