## K-Nearest Neighbors Approach



### Setup

In [13]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [14]:
# Derive all project directories from the current working directory.
base_dir = Path.cwd() / "implementation"
data_dir, result_dir = (
    base_dir / sub_path for sub_path in ("data/source/", "data/results/")
)

### Load & prepare dataset

The following code must be adapted individually for each protein-ligand complex (here: MOAB).

In [15]:
# Read the MOAB table from the source directory; the bare name on the last
# line makes the notebook render a preview of the loaded frame.
moab_csv = data_dir / "MOAB/MOAB.csv"
knn_data_raw = pd.read_csv(moab_csv)

knn_data_raw

Unnamed: 0,INDEX,NAME,Salt_Bridge:GLU84B,Hydrophobic_Interaction:GLU84B,Halogen_Bond:GLN206B,Hydrophobic_Interaction:LEU328B,Water_Bridge:THR201B,Halogen_Bond:SER200B,Hydrophobic_Interaction:TYR435A,Hydrogen_Bond:PRO102B,...,Pi-Stacking:PHE168B,Pi-Stacking:TYR326B,Hydrogen_Bond:ASN203B,Hydrophobic_Interaction:GLN206A,Hydrophobic_Interaction:THR201B,Hydrophobic_Interaction:THR202B,Water_Bridge:GLU84B,Hydrophobic_Interaction:PRO104B,Hydrophobic_Interaction:PHE103B,LABEL
0,1,CHEMBL583128|actives_final|sdf|78,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,2,0,0,active
1,2,CHEMBL583128|actives_final|sdf|79,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,2,0,0,active
2,3,ZINC32575615|decoys_final|sdf|262,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,2,0,0,inactive
3,4,CHEMBL45069|actives_final|sdf|163,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,1,active
4,5,ZINC36683565|decoys_final|sdf|17,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,1,inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,367,CHEMBL174289|actives_final|sdf|17,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,active
367,368,CHEMBL552680|actives_final|sdf|91,0,0,0,0,1,0,1,1,...,0,0,0,1,0,0,0,0,1,active
368,369,CHEMBL522271|actives_final|sdf|2,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,active
369,370,CHEMBL26138|actives_final|sdf|107,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,active


In [16]:
# Map the textual class labels onto integer targets.
lookup = {"inactive": 0, "active": 1}

# Column layout relied on here: the first two columns are skipped (the result
# export further down treats them as INDEX and NAME), the last column holds
# the class label, and everything in between is used as a feature.
feature_frame = knn_data_raw.iloc[:, 2:-1]
label_column = knn_data_raw.iloc[:, -1]

knn_data = {
    "data": np.array(feature_frame),
    "target": np.array([lookup[label] for label in label_column]),
    "feature_names": feature_frame.columns,
    "target_names": ["inactive", "active"],
}

Split the data into a training set and a test set (70 % / 30 %).

In [17]:
# Hold out 30 % of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
features, labels = knn_data["data"], knn_data["target"]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=4232
)

### Apply K-Nearest Neighbors

In [18]:
# Normalize the data inside a pipeline so the scaler is fitted together with
# the classifier. make_pipeline names the KNN step "kneighborsclassifier",
# which is why the grid key below carries that prefix.
pipe1 = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19.
odd_k_values = list(range(1, 20, 2))
parameters = {"kneighborsclassifier__n_neighbors": odd_k_values}

# Search the grid with 20-fold cross-validation.
knn_models = GridSearchCV(pipe1, param_grid=parameters, cv=20, n_jobs=-1)

Fit the model for every candidate value in the parameter grid.

In [19]:
# Run the cross-validated grid search on the training split only.
knn_models.fit(X_train, y_train)

In [20]:
# Show the cross-validation results as a table: one row per candidate with
# timings, per-split test scores, mean/std test score and rank.
pd.DataFrame(knn_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,0.026696,0.020918,0.025448,0.017783,1,{'kneighborsclassifier__n_neighbors': 1},0.615385,0.923077,0.461538,0.923077,...,0.769231,0.846154,0.615385,0.538462,0.307692,0.538462,0.666667,0.660256,0.146597,10
1,0.021193,0.023662,0.023149,0.021887,3,{'kneighborsclassifier__n_neighbors': 3},0.615385,0.769231,0.461538,0.846154,...,0.769231,0.769231,0.538462,0.538462,0.461538,0.615385,0.75,0.699038,0.132577,9
2,0.025718,0.01995,0.025174,0.023663,5,{'kneighborsclassifier__n_neighbors': 5},0.615385,0.846154,0.615385,0.769231,...,0.769231,0.846154,0.615385,0.692308,0.538462,0.538462,0.666667,0.702564,0.10708,8
3,0.023738,0.016469,0.019663,0.022952,7,{'kneighborsclassifier__n_neighbors': 7},0.615385,0.846154,0.692308,0.692308,...,0.846154,0.769231,0.769231,0.615385,0.615385,0.615385,0.583333,0.717628,0.097057,4
4,0.033748,0.02492,0.024737,0.021621,9,{'kneighborsclassifier__n_neighbors': 9},0.615385,0.846154,0.692308,0.769231,...,0.923077,0.769231,0.615385,0.692308,0.538462,0.615385,0.75,0.729808,0.109881,1
5,0.020823,0.016868,0.02009,0.014072,11,{'kneighborsclassifier__n_neighbors': 11},0.615385,0.769231,0.692308,0.769231,...,0.846154,0.769231,0.615385,0.692308,0.538462,0.538462,0.75,0.714423,0.094069,5
6,0.039013,0.021264,0.025637,0.016953,13,{'kneighborsclassifier__n_neighbors': 13},0.692308,0.769231,0.615385,0.692308,...,0.846154,0.692308,0.769231,0.692308,0.538462,0.615385,0.75,0.722115,0.081697,2
7,0.03615,0.031049,0.020016,0.019343,15,{'kneighborsclassifier__n_neighbors': 15},0.538462,0.769231,0.538462,0.692308,...,0.846154,0.692308,0.846154,0.615385,0.461538,0.692308,0.75,0.702885,0.103467,7
8,0.031606,0.025934,0.018269,0.016958,17,{'kneighborsclassifier__n_neighbors': 17},0.538462,0.769231,0.615385,0.769231,...,0.846154,0.692308,0.769231,0.692308,0.461538,0.692308,0.75,0.714423,0.097163,5
9,0.016387,0.013388,0.015476,0.013797,19,{'kneighborsclassifier__n_neighbors': 19},0.615385,0.692308,0.692308,0.769231,...,0.769231,0.692308,0.769231,0.692308,0.538462,0.615385,0.75,0.718269,0.07355,3


In [21]:
# Mean cross-validated test score of the top-ranked candidate.
knn_models.best_score_

0.7298076923076923

In [22]:
# Parameter setting of the top-ranked candidate.
knn_models.best_params_

{'kneighborsclassifier__n_neighbors': 9}

### Score

In [23]:
# Evaluate the best pipeline found by the grid search on the held-out test
# split; the score on the last line is what the notebook displays.
best_pipeline = knn_models.best_estimator_
prediction = best_pipeline.predict(X_test)
best_pipeline.score(X_test, y_test)

0.6785714285714286

### Prepare data for evaluation

In [24]:
# Assemble the evaluation table for the test split and write it to disk.
#
# Column layout (unchanged): every column of the raw table except NAME and the
# trailing label column, followed by LABEL (true class) and PRED (predicted
# class). The first column receives the running position of the sample inside
# the test split, not the identifier from the raw table.
result_columns = knn_data_raw.columns[:-1].drop("NAME")

# Build the frame in one step instead of appending row by row: growing a
# DataFrame through .loc in a loop re-allocates on every iteration.
result_df = pd.DataFrame(X_test, columns=result_columns[1:])
result_df.insert(0, result_columns[0], np.arange(len(result_df)))

result_df["LABEL"] = y_test
result_df["PRED"] = prediction

# Make sure the target directory exists so the export does not fail on a
# fresh checkout.
output_path = result_dir / "MOAB/baseline_knn.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(output_path, encoding="utf-8")