## K-Nearest Neighbours Approach



### Setup

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
# Resolve all project paths relative to the directory the notebook is started from.
base_dir = Path.cwd() / "implementation"
data_dir = base_dir.joinpath("data/source/")
result_dir = base_dir.joinpath("data/results/")

### Load & prepare dataset

The following code needs to be adapted individually for each protein-ligand complex.

In [3]:
# Read the ACHE table from ache_pca.csv (the file to adapt for another complex).
ache_pca_path = data_dir / "ACHE" / "ache_pca.csv"
knn_data_raw = pd.read_csv(ache_pca_path)

knn_data_raw

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,43,44,45,46,47,48,49,50,51,LABEL
0,0,2.775103,1.569092,-1.633093,0.509205,-0.365362,0.464463,-1.344698,0.340212,1.195709,...,0.028038,0.472668,0.000417,0.032610,0.321647,0.373303,0.378937,0.053533,0.146973,active
1,1,1.160143,0.745342,3.626592,-0.482753,-0.501234,-0.702610,0.148980,1.191717,-0.315544,...,0.152880,-0.211698,-0.036652,0.121264,0.091523,0.055746,-0.028294,0.070177,0.144679,active
2,2,1.347487,-1.066134,0.930806,-0.326445,1.909936,0.292744,0.015491,0.427259,0.980201,...,0.217302,-0.109417,0.019410,-0.095346,-0.155197,0.019081,-0.141183,-0.306460,-0.062238,active
3,3,4.217589,-0.574677,1.399565,1.361755,-1.065899,0.487160,0.041603,0.652388,0.670705,...,0.157739,-0.189402,0.158445,0.220310,0.021845,-0.099789,0.070753,0.123486,0.066222,active
4,4,2.269227,-0.097871,0.590080,0.325115,1.649822,0.056987,-0.778049,0.349009,0.760214,...,0.067016,-0.050732,-0.009238,0.209047,-0.014614,-0.088393,0.046919,-0.085897,0.123251,active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,998,-0.609818,-0.656158,-0.131409,0.554176,-0.437736,-0.673396,-0.254374,-0.307080,-0.299093,...,-0.102843,0.016278,0.070193,-0.382453,0.163873,-0.657440,-0.134046,0.768515,0.256114,active
999,999,-0.418320,-0.805650,-0.109816,-0.376300,-0.549992,-0.458199,0.583562,0.367584,0.043453,...,-0.086251,-0.012426,0.085909,0.038385,0.079218,0.017346,0.007403,-0.071999,-0.063954,active
1000,1000,-1.637342,-0.955758,0.168724,0.684614,-0.258857,-0.898800,0.085315,0.578703,0.398927,...,-0.576817,0.119175,-0.115178,0.025776,-0.052349,0.095234,0.116501,0.038220,0.091057,active
1001,1001,0.573375,-0.944395,-0.090789,-0.004957,-0.256222,-0.095932,0.025505,-0.122203,0.328594,...,-0.200537,0.019137,-0.007516,0.055282,-0.002256,0.076485,-0.001794,-0.038076,-0.072265,inactive


In [4]:
# Numeric encoding of the class labels; the position of each name in
# "target_names" below matches its encoded value.
lookup = {"inactive": 0, "active": 1}

# Everything between the two leading "Unnamed" columns and the trailing label
# column is treated as a feature.
feature_frame = knn_data_raw.iloc[:, 2:-1]
label_column = knn_data_raw.iloc[0:, -1]

knn_data = dict(
    data=np.array(feature_frame),
    target=np.array([lookup[label] for label in label_column]),
    feature_names=knn_data_raw.columns[2:-1],
    target_names=["inactive", "active"],
)

Split the data into a training set and a test set.

In [5]:
# Hold out 30 % of the samples for the final evaluation; the fixed seed keeps
# the split identical between runs.
features, labels = knn_data["data"], knn_data["target"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=4232,
)

### Apply K-Nearest Neighbours

In [6]:
# Normalize the data inside a pipeline so the grid search handles scaling and
# classification as one estimator.
pipe1 = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19.
parameters = {"kneighborsclassifier__n_neighbors": list(range(1, 20, 2))}

knn_models = GridSearchCV(
    estimator=pipe1,
    param_grid=parameters,
    cv=20,
    n_jobs=-1,
)

Fit model with the provided parameters

In [7]:
# Run the grid search on the training split only; the test split is not used here.
knn_models.fit(X_train, y_train)

In [8]:
# Tabulate the cross-validation results per candidate (timings, per-split scores, rank).
pd.DataFrame(knn_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008232,0.004017,0.050594,0.023588,1,{'kneighborsclassifier__n_neighbors': 1},0.694444,0.833333,0.714286,0.657143,...,0.628571,0.771429,0.828571,0.742857,0.628571,0.714286,0.742857,0.743532,0.065118,3
1,0.008518,0.005973,0.004778,0.002062,3,{'kneighborsclassifier__n_neighbors': 3},0.638889,0.75,0.714286,0.714286,...,0.771429,0.657143,0.914286,0.8,0.571429,0.742857,0.8,0.746587,0.072552,1
2,0.005882,0.002141,0.005083,0.002545,5,{'kneighborsclassifier__n_neighbors': 5},0.722222,0.777778,0.714286,0.685714,...,0.8,0.685714,0.828571,0.771429,0.6,0.742857,0.8,0.746429,0.061843,2
3,0.006223,0.004182,0.005705,0.005919,7,{'kneighborsclassifier__n_neighbors': 7},0.75,0.777778,0.742857,0.685714,...,0.8,0.657143,0.828571,0.742857,0.6,0.714286,0.771429,0.732103,0.061187,6
4,0.004276,0.000925,0.003547,0.001331,9,{'kneighborsclassifier__n_neighbors': 9},0.722222,0.777778,0.714286,0.771429,...,0.742857,0.657143,0.8,0.714286,0.714286,0.742857,0.771429,0.732143,0.05966,5
5,0.004058,0.000899,0.003133,0.000569,11,{'kneighborsclassifier__n_neighbors': 11},0.777778,0.75,0.714286,0.771429,...,0.771429,0.657143,0.8,0.742857,0.657143,0.771429,0.742857,0.737817,0.058531,4
6,0.00454,0.001031,0.00424,0.001735,13,{'kneighborsclassifier__n_neighbors': 13},0.75,0.777778,0.685714,0.657143,...,0.742857,0.542857,0.742857,0.8,0.685714,0.742857,0.714286,0.713532,0.075183,9
7,0.004524,0.001057,0.004004,0.001825,15,{'kneighborsclassifier__n_neighbors': 15},0.722222,0.861111,0.714286,0.685714,...,0.771429,0.542857,0.742857,0.8,0.628571,0.771429,0.685714,0.719167,0.07717,7
8,0.004404,0.00119,0.003614,0.00119,17,{'kneighborsclassifier__n_neighbors': 17},0.694444,0.777778,0.685714,0.742857,...,0.742857,0.6,0.742857,0.8,0.6,0.742857,0.714286,0.710754,0.062959,10
9,0.003977,0.000896,0.00333,0.000681,19,{'kneighborsclassifier__n_neighbors': 19},0.75,0.777778,0.657143,0.742857,...,0.742857,0.6,0.742857,0.8,0.628571,0.771429,0.714286,0.716389,0.062725,8


In [9]:
# Mean cross-validated test score of the best-ranked candidate.
knn_models.best_score_

0.7465873015873017

In [10]:
# Parameter setting of the best-ranked candidate.
knn_models.best_params_

{'kneighborsclassifier__n_neighbors': 3}

### Score

In [11]:
# Predict and score on the held-out test split with the best pipeline found by
# the grid search; the accuracy is the cell's displayed value.
best_knn = knn_models.best_estimator_
prediction = best_knn.predict(X_test)
best_knn.score(X_test, y_test)

0.7242524916943521

### Prepare data for evaluation

In [12]:
# Assemble the evaluation table for the held-out samples in one vectorised step.
# (Appending row by row with ``.loc`` rebuilds the frame on every iteration and,
# because each row is passed as one mixed list, may store the integer ids as floats.)
index_columns = list(knn_data_raw.columns[:2])
feature_columns = knn_data_raw.columns[2:-1]

result_df = pd.DataFrame(X_test, columns=feature_columns)

# Recreate the two leading id columns as a running row number, in their
# original positions at the front of the table.
row_ids = np.arange(len(result_df))
for position, column_name in enumerate(index_columns):
    result_df.insert(position, column_name, row_ids)

# NOTE(review): "0" is the first feature column of the input table, not an index
# column, so this removes a feature from the exported file. Kept unchanged so the
# CSV layout stays the same for whatever reads it — confirm the drop is intended.
result_df = result_df.drop(columns=["0"])
result_df["LABEL"] = y_test
result_df["PRED"] = prediction

# Create the target folder on demand so the export also works on a fresh checkout.
output_path = result_dir / "ACHE/fe_pca_knn.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(output_path, encoding="utf-8")