## K-Nearest Neighbours Approach



### Setup

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
# Resolve the project folders relative to the current working directory
base_dir = Path.cwd().joinpath("implementation")
data_dir = base_dir.joinpath("data/source/")
result_dir = base_dir.joinpath("data/results/")

### Load & prepare dataset

The following code must be adapted individually for each protein-ligand complex (here: ACHE).

In [3]:
# Read the ACHE train and test tables (train first, then test)
knn_data_raw_train, knn_data_raw_test = (
    pd.read_csv(data_dir / csv_path)
    for csv_path in ("ACHE/ache_smote_train.csv", "ACHE/ache_smote_test.csv")
)


In [4]:
def _as_dataset(raw_frame):
    """Turn a raw table into the feature/target dictionary used by this notebook.

    The first two columns are skipped, the last column is taken as the target
    and every column in between is treated as a feature.
    """
    return {
        'data': np.array(raw_frame.iloc[:, 2:-1]),
        'target': np.array(raw_frame.iloc[0:, -1]),
        'feature_names': raw_frame.columns[2:-1],
        'target_names': ['inactive', 'active'],
    }


knn_data_train = _as_dataset(knn_data_raw_train)
knn_data_test = _as_dataset(knn_data_raw_test)

### Apply K-Nearest Neighbours

In [5]:
# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19
parameters = {"kneighborsclassifier__n_neighbors": [k for k in range(1, 20, 2)]}
# Standardise the features as the first pipeline step, then classify with k-NN
pipe1 = make_pipeline(StandardScaler(), KNeighborsClassifier())
# 20-fold cross-validated grid search over the candidates, using all available cores
knn_models = GridSearchCV(estimator=pipe1, param_grid=parameters, cv=20, n_jobs=-1)

Fit the model for every value in the parameter grid.

In [6]:
# Run the cross-validated grid search on the training features and labels
knn_models.fit(knn_data_train["data"], knn_data_train["target"])

In [7]:
# Show the cross-validation results of every candidate as a table
pd.DataFrame(knn_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,0.044391,0.016322,0.156277,0.036461,1,{'kneighborsclassifier__n_neighbors': 1},0.777778,0.694444,0.714286,0.714286,...,0.685714,0.685714,0.828571,0.771429,0.657143,0.714286,0.714286,0.739325,0.057874,2
1,0.018323,0.013204,0.018714,0.014161,3,{'kneighborsclassifier__n_neighbors': 3},0.777778,0.666667,0.657143,0.771429,...,0.628571,0.657143,0.857143,0.828571,0.628571,0.685714,0.685714,0.723651,0.065642,9
2,0.024923,0.025731,0.029502,0.027239,5,{'kneighborsclassifier__n_neighbors': 5},0.777778,0.638889,0.657143,0.714286,...,0.714286,0.685714,0.828571,0.885714,0.685714,0.8,0.714286,0.736548,0.063811,4
3,0.013492,0.008784,0.022727,0.021372,7,{'kneighborsclassifier__n_neighbors': 7},0.75,0.611111,0.742857,0.685714,...,0.714286,0.657143,0.714286,0.771429,0.657143,0.771429,0.742857,0.719484,0.054188,10
4,0.016132,0.013306,0.017515,0.01738,9,{'kneighborsclassifier__n_neighbors': 9},0.75,0.722222,0.742857,0.714286,...,0.685714,0.742857,0.771429,0.771429,0.685714,0.714286,0.742857,0.730754,0.037794,6
5,0.010608,0.007251,0.02096,0.024629,11,{'kneighborsclassifier__n_neighbors': 11},0.75,0.694444,0.742857,0.714286,...,0.714286,0.714286,0.742857,0.771429,0.714286,0.742857,0.742857,0.743651,0.042818,1
6,0.022646,0.016842,0.019104,0.015232,13,{'kneighborsclassifier__n_neighbors': 13},0.777778,0.722222,0.714286,0.685714,...,0.714286,0.657143,0.8,0.8,0.685714,0.714286,0.771429,0.739286,0.048108,3
7,0.014803,0.009757,0.023362,0.018728,15,{'kneighborsclassifier__n_neighbors': 15},0.75,0.75,0.685714,0.657143,...,0.742857,0.657143,0.828571,0.771429,0.714286,0.714286,0.685714,0.727857,0.047643,7
8,0.016606,0.016093,0.012541,0.012092,17,{'kneighborsclassifier__n_neighbors': 17},0.75,0.722222,0.714286,0.657143,...,0.771429,0.714286,0.771429,0.8,0.657143,0.742857,0.685714,0.726468,0.045626,8
9,0.014573,0.014937,0.022242,0.022441,19,{'kneighborsclassifier__n_neighbors': 19},0.75,0.666667,0.714286,0.742857,...,0.771429,0.714286,0.771429,0.828571,0.714286,0.742857,0.657143,0.730833,0.048501,5


In [8]:
# Mean cross-validated score of the best candidate
knn_models.best_score_

0.7436507936507937

In [9]:
# Parameter setting that achieved the best mean cross-validated score
knn_models.best_params_

{'kneighborsclassifier__n_neighbors': 11}

### Score

In [10]:
# Evaluate the best pipeline found by the grid search on the test split
best_knn = knn_models.best_estimator_
test_features, test_labels = knn_data_test["data"], knn_data_test["target"]
prediction = best_knn.predict(test_features)
# Last expression: score on the test split, displayed as the cell output
best_knn.score(test_features, test_labels)

0.6843853820598007

### Prepare data for evaluation

In [11]:
# Collect the test-set features together with their true and predicted labels
# and write them to disk for the evaluation step.
features = np.asarray(knn_data_test["data"])
row_ids = np.arange(len(features))

# The two leading non-feature columns are filled with the running row number.
# The whole table is built in one step instead of growing the frame one row at
# a time, and the column labels are taken from the test frame because the rows
# are test rows.
result_df = pd.DataFrame(
    np.column_stack([row_ids, row_ids, features]),
    columns=knn_data_raw_test.columns[:-1],
    index=pd.RangeIndex(1, len(features) + 1),  # keep the 1-based row labels
)

result_df["LABEL"] = knn_data_test["target"]
result_df["PRED"] = prediction

# Create the output folder if needed so that to_csv does not fail on a fresh checkout.
output_path = result_dir / "ACHE/fe_smote_knn.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(output_path, encoding="utf-8")