## K-Nearest Neighbours Approach



### Setup

In [13]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [14]:
# Resolve every data location relative to the current working directory,
# so the notebook does not depend on an absolute, machine-specific path.
base_dir = Path(os.getcwd()).joinpath("implementation")
data_dir = base_dir.joinpath("data/source/")
result_dir = base_dir.joinpath("data/results/")

### Load & prepare dataset

The following code needs to be adapted individually for each protein-ligand complex.

In [15]:
# Read the ACHE source table; the bare expression at the end of the cell
# renders the loaded frame for a quick visual check.
ache_csv_path = data_dir.joinpath("ACHE/ache_pca.csv")
knn_data_raw = pd.read_csv(ache_csv_path)

knn_data_raw

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,21,22,23,24,25,26,27,28,29,LABEL
0,0,2.775103,1.569092,-1.633093,0.509205,-0.365362,0.464463,-1.344698,0.340212,1.195709,...,-0.426203,0.157809,-0.384176,-0.775244,-0.265311,0.172972,-0.078754,0.175435,0.157472,active
1,1,1.160143,0.745342,3.626592,-0.482753,-0.501234,-0.702610,0.148980,1.191717,-0.315544,...,-0.197789,-0.733598,0.058455,0.356964,0.004727,0.491466,0.056198,-0.304857,0.442595,active
2,2,1.347487,-1.066134,0.930806,-0.326445,1.909936,0.292744,0.015491,0.427259,0.980201,...,-0.371311,-0.441166,-0.295813,0.288092,-0.258665,-0.158248,0.097048,-0.130219,-0.302125,active
3,3,4.217589,-0.574677,1.399565,1.361755,-1.065899,0.487160,0.041603,0.652388,0.670705,...,-0.035638,0.009457,0.003479,0.175791,0.480889,-0.397306,-0.118256,-0.429759,0.046488,active
4,4,2.269227,-0.097871,0.590080,0.325115,1.649822,0.056987,-0.778049,0.349009,0.760214,...,-0.450115,-0.645917,0.283596,-0.353558,-0.340660,0.206826,-0.132083,-0.185250,0.394408,active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,998,-0.609818,-0.656158,-0.131409,0.554176,-0.437736,-0.673396,-0.254374,-0.307080,-0.299093,...,0.298212,0.244026,0.018224,0.040865,-0.106602,0.218396,-0.221769,0.094682,0.076756,active
999,999,-0.418320,-0.805650,-0.109816,-0.376300,-0.549992,-0.458199,0.583562,0.367584,0.043453,...,-0.381802,0.069187,-0.053126,0.357976,0.090901,0.410654,-0.183452,0.216238,-0.213667,active
1000,1000,-1.637342,-0.955758,0.168724,0.684614,-0.258857,-0.898800,0.085315,0.578703,0.398927,...,0.703583,0.167216,-0.588820,-0.750097,-0.244467,0.106357,-0.130821,0.226009,-0.169840,active
1001,1001,0.573375,-0.944395,-0.090789,-0.004957,-0.256222,-0.095932,0.025505,-0.122203,0.328594,...,0.046983,0.173622,-0.314616,-0.280141,-0.116199,0.507827,-0.063389,0.449591,-0.264470,inactive


In [16]:
# Integer encoding for the class labels stored in the last column.
lookup = {"inactive": 0, "active": 1}

# Positional layout of the source table: the first two columns are skipped,
# the last column is the label, everything in between is used as features.
feature_slice = slice(2, -1)

knn_data = dict(
    data=np.array(knn_data_raw.iloc[:, feature_slice]),
    target=np.array([lookup[label] for label in knn_data_raw.iloc[:, -1]]),
    feature_names=knn_data_raw.columns[feature_slice],
    target_names=["inactive", "active"],
)

Split the data into a training set (70 %) and a test set (30 %).

In [17]:
# Hold out 30 % of the samples for the final evaluation; the fixed
# random_state keeps the split identical between runs.
features, labels = knn_data["data"], knn_data["target"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=4232,
    test_size=0.3,
)

### Apply K-Nearest Neighbours

In [18]:
# Normalize the data inside a pipeline so that scaling is part of the model
# that the grid search fits and evaluates.
pipe1 = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19.
parameters = {"kneighborsclassifier__n_neighbors": list(range(1, 20, 2))}

# 20-fold cross-validation over all candidates, using every available core.
knn_models = GridSearchCV(
    estimator=pipe1,
    param_grid=parameters,
    cv=20,
    n_jobs=-1,
)

Fit the grid search on the training set, evaluating every candidate `n_neighbors` value by cross-validation.

In [19]:
# Run the cross-validated grid search on the training split.
knn_models.fit(X_train, y_train)

In [20]:
# Tabulate the cross-validation results of every candidate parameter setting.
pd.DataFrame(knn_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004034,0.001612,0.029498,0.007957,1,{'kneighborsclassifier__n_neighbors': 1},0.777778,0.666667,0.857143,0.714286,...,0.771429,0.714286,0.885714,0.742857,0.657143,0.742857,0.771429,0.743651,0.065337,3
1,0.00367,0.001126,0.003196,0.001169,3,{'kneighborsclassifier__n_neighbors': 3},0.805556,0.666667,0.8,0.742857,...,0.742857,0.685714,0.8,0.828571,0.657143,0.657143,0.742857,0.749325,0.057108,2
2,0.003399,0.00084,0.003017,0.000957,5,{'kneighborsclassifier__n_neighbors': 5},0.777778,0.722222,0.828571,0.8,...,0.714286,0.685714,0.885714,0.828571,0.6,0.571429,0.8,0.755,0.07516,1
3,0.003071,0.000638,0.002468,0.000683,7,{'kneighborsclassifier__n_neighbors': 7},0.75,0.722222,0.828571,0.771429,...,0.714286,0.628571,0.8,0.771429,0.571429,0.628571,0.742857,0.726468,0.07478,5
4,0.003296,0.000999,0.002683,0.000701,9,{'kneighborsclassifier__n_neighbors': 9},0.777778,0.722222,0.8,0.771429,...,0.714286,0.6,0.771429,0.8,0.6,0.6,0.771429,0.722143,0.078377,9
5,0.002436,0.000924,0.001915,0.00072,11,{'kneighborsclassifier__n_neighbors': 11},0.75,0.694444,0.8,0.771429,...,0.714286,0.6,0.771429,0.828571,0.571429,0.628571,0.742857,0.723651,0.079256,7
6,0.002161,0.000744,0.001799,0.00059,13,{'kneighborsclassifier__n_neighbors': 13},0.75,0.722222,0.828571,0.8,...,0.742857,0.6,0.771429,0.857143,0.571429,0.657143,0.685714,0.722183,0.081344,8
7,0.002522,0.00082,0.002107,0.000732,15,{'kneighborsclassifier__n_neighbors': 15},0.75,0.694444,0.771429,0.742857,...,0.742857,0.6,0.828571,0.8,0.628571,0.657143,0.685714,0.720794,0.070278,10
8,0.002686,0.001075,0.002167,0.000806,17,{'kneighborsclassifier__n_neighbors': 17},0.722222,0.722222,0.8,0.771429,...,0.742857,0.6,0.828571,0.8,0.628571,0.657143,0.714286,0.725079,0.071228,6
9,0.001994,0.0004,0.001668,0.000226,19,{'kneighborsclassifier__n_neighbors': 19},0.75,0.722222,0.8,0.742857,...,0.828571,0.628571,0.8,0.828571,0.6,0.628571,0.685714,0.726468,0.077461,4


In [21]:
# Mean cross-validated score of the best-ranked parameter setting.
knn_models.best_score_

0.7550000000000001

In [22]:
# Parameter setting that achieved the best mean cross-validated score.
knn_models.best_params_

{'kneighborsclassifier__n_neighbors': 5}

### Score

In [23]:
# Evaluate the best pipeline found by the grid search on the held-out test
# split: keep the predictions for the export below and display the score.
best_knn = knn_models.best_estimator_
prediction = best_knn.predict(X_test)
best_knn.score(X_test, y_test)

0.7574750830564784

### Prepare data for evaluation

In [24]:
# Collect the held-out samples together with their true and predicted labels
# and write them to a CSV file for the evaluation step.
#
# The frame is built in a single vectorised step. Appending one row per sample
# via `.loc` copies the whole frame on every iteration and leaves the column
# dtypes to implicit inference.
row_ids = np.arange(len(X_test))

# Same column layout as the source table without LABEL: two running-index
# columns followed by the feature columns. `column_stack` promotes the integer
# ids to the dtype of the feature matrix, giving one homogeneous block.
result_df = pd.DataFrame(
    np.column_stack([row_ids, row_ids, X_test]),
    columns=knn_data_raw.columns[:-1],
)

# NOTE(review): "0" is the first *feature* column of the source table, not an
# index column, so this discards a feature from the exported file. It is kept
# to leave the CSV schema unchanged -- confirm whether one of the "Unnamed"
# index columns was meant instead.
result_df = result_df.drop(columns=["0"])
result_df["LABEL"] = y_test
result_df["PRED"] = prediction

# Make sure the target folder exists so the export does not fail on a fresh
# checkout where no results have been written yet.
output_path = result_dir / "ACHE/fe_pca_knn.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(output_path, encoding="utf-8")