## K-Nearest Neighbour Approach



### Setup

In [13]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [14]:
# read dynamic path
base_dir = Path(os.getcwd()).parents[3]
data_dir = base_dir / "data/source/"
result_dir = base_dir / "data/results/"

### Load & prepare dataset

The following code needs to be adapted individually for each protein-ligand complex.

In [15]:
# Read the table for the DPP4 complex from the source-data folder.
dpp4_csv_path = data_dir / "DPP4/DPP4.csv"
knn_data_raw = pd.read_csv(dpp4_csv_path)

# Show the loaded table as the cell output.
knn_data_raw

Unnamed: 0,INDEX,NAME,Pi-Cation_Interaction:HIS740A,Halogen_Bond:ASP709A,Halogen_Bond:VAL546A,Hydrogen_Bond:GLY741A,Water_Bridge:GLU204A,Water_Bridge:ARG125A,Halogen_Bond:ARG358A,Hydrophobic_Interaction:ALA743A,...,Hydrogen_Bond:ARG669A,Hydrogen_Bond:ASN710A,Hydrogen_Bond:GLU204A,Hydrophobic_Interaction:ARG125A,Halogen_Bond:SER630A,Water_Bridge:ASP739A,Salt_Bridge:ARG358A,Water_Bridge:GLU205A,Hydrophobic_Interaction:ASP739A,LABEL
0,1,CHEMBL386369|actives_final|sdf|444,0,0,0,0,0,0,0,0,...,2,1,0,0,0,0,1,0,0,active
1,2,ZINC38935877|decoys_final|sdf|121,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,inactive
2,3,ZINC63159848|decoys_final|sdf|138,0,0,0,0,0,3,0,0,...,0,1,0,0,0,0,0,0,0,inactive
3,4,ZINC23079060|decoys_final|sdf|264,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,inactive
4,5,CHEMBL290337|actives_final|sdf|331,0,0,0,0,0,3,0,0,...,1,1,0,0,0,0,0,0,0,active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1690,1691,ZINC49729498|decoys_final|sdf|645,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
1691,1692,ZINC43263233|decoys_final|sdf|584,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
1692,1693,ZINC36962060|decoys_final|sdf|615,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
1693,1694,CHEMBL564854|actives_final|sdf|55,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,active


In [16]:
# Numeric encoding of the class labels found in the last column.
lookup = {"inactive": 0, "active": 1}

# Columns 0 and 1 are skipped and the last column holds the label,
# so everything in between is treated as a feature.
feature_frame = knn_data_raw.iloc[:, 2:-1]
label_column = knn_data_raw.iloc[:, -1]

# Bundle features, encoded targets and their names in one dictionary.
knn_data = dict(
    data=np.array(feature_frame),
    target=np.array([lookup[label] for label in label_column]),
    feature_names=feature_frame.columns,
    target_names=["inactive", "active"],
)

Split the data into a training set and a test set.

In [17]:
# Hold out 30 % of the samples for testing; the fixed random_state keeps the
# split identical between runs.
features, labels = knn_data["data"], knn_data["target"]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=4232
)

### Apply K-Nearest Neighbours

In [18]:
# Standardize the data inside a pipeline so that scaling and the classifier
# are always fitted together.
pipe1 = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19.
parameters = {"kneighborsclassifier__n_neighbors": list(range(1, 20, 2))}

# Grid search over the candidates with 20-fold cross-validation, using all cores.
knn_models = GridSearchCV(pipe1, parameters, cv=20, n_jobs=-1)

Fit the model with the provided parameter grid.

In [19]:
# Run the grid search: each candidate n_neighbors is evaluated with 20-fold
# cross-validation, using the training split only.
knn_models.fit(X_train, y_train)

In [20]:
# Tabulate the cross-validation results per candidate: timings, per-split
# test scores, mean/std test score and rank.
pd.DataFrame(knn_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,0.021246,0.008051,0.010162,0.005236,1,{'kneighborsclassifier__n_neighbors': 1},0.733333,0.566667,0.733333,0.716667,...,0.745763,0.677966,0.677966,0.728814,0.677966,0.711864,0.661017,0.700819,0.055439,4
1,0.023578,0.008207,0.012821,0.008598,3,{'kneighborsclassifier__n_neighbors': 3},0.6,0.55,0.733333,0.766667,...,0.711864,0.661017,0.627119,0.644068,0.661017,0.728814,0.644068,0.679605,0.057745,10
2,0.022305,0.009463,0.015795,0.00891,5,{'kneighborsclassifier__n_neighbors': 5},0.65,0.583333,0.733333,0.833333,...,0.762712,0.694915,0.644068,0.677966,0.677966,0.711864,0.711864,0.69904,0.063926,5
3,0.022284,0.006359,0.012238,0.006008,7,{'kneighborsclassifier__n_neighbors': 7},0.666667,0.65,0.783333,0.8,...,0.79661,0.728814,0.627119,0.694915,0.677966,0.728814,0.677966,0.709929,0.060066,2
4,0.025199,0.009516,0.01698,0.006962,9,{'kneighborsclassifier__n_neighbors': 9},0.65,0.683333,0.766667,0.85,...,0.813559,0.694915,0.59322,0.677966,0.711864,0.728814,0.745763,0.714181,0.062055,1
5,0.021368,0.0101,0.011441,0.006481,11,{'kneighborsclassifier__n_neighbors': 11},0.683333,0.65,0.766667,0.8,...,0.779661,0.677966,0.627119,0.677966,0.711864,0.728814,0.694915,0.703249,0.054319,3
6,0.018546,0.004398,0.010742,0.003355,13,{'kneighborsclassifier__n_neighbors': 13},0.683333,0.683333,0.8,0.816667,...,0.728814,0.677966,0.644068,0.694915,0.711864,0.677966,0.644068,0.692189,0.063984,8
7,0.022323,0.006389,0.01023,0.003619,15,{'kneighborsclassifier__n_neighbors': 15},0.683333,0.666667,0.716667,0.8,...,0.762712,0.661017,0.627119,0.711864,0.762712,0.711864,0.644068,0.692274,0.061255,7
8,0.020181,0.004781,0.011533,0.004875,17,{'kneighborsclassifier__n_neighbors': 17},0.7,0.683333,0.766667,0.833333,...,0.762712,0.677966,0.644068,0.677966,0.79661,0.711864,0.677966,0.697232,0.067573,6
9,0.021827,0.004889,0.013299,0.004121,19,{'kneighborsclassifier__n_neighbors': 19},0.716667,0.683333,0.716667,0.8,...,0.728814,0.677966,0.627119,0.711864,0.79661,0.711864,0.694915,0.690537,0.063416,9


In [21]:
# Mean cross-validated test score of the best-ranked candidate.
knn_models.best_score_

0.714180790960452

In [22]:
# Parameter setting that achieved the best mean cross-validated score.
knn_models.best_params_

{'kneighborsclassifier__n_neighbors': 9}

### Score

In [23]:
# Use the best pipeline found by the grid search on the held-out test split.
best_knn = knn_models.best_estimator_
prediction = best_knn.predict(X_test)

# Cell output: score of the best pipeline on the test split.
best_knn.score(X_test, y_test)

0.6895874263261297

### Prepare data for evaluation

In [24]:
# Assemble the evaluation table: one row per test sample holding a running
# index, the feature values, the true label and the predicted label.
# Column layout: every source column except the last (label) one and "NAME".
result_columns = knn_data_raw.columns[:-1].drop("NAME")

# Build the frame from the test matrix in a single step instead of appending
# one row at a time with .loc, which re-allocates the frame on every iteration.
result_df = pd.DataFrame(X_test, columns=result_columns[1:])

# NOTE: the first column holds the position of the sample within the test
# split (0..n-1), not the INDEX value of the source CSV.
result_df.insert(0, result_columns[0], np.arange(len(result_df)))

result_df["LABEL"] = y_test
result_df["PRED"] = prediction

# Create the output folder if it is missing so the export does not fail.
result_path = result_dir / "DPP4/baseline_knn.csv"
result_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(result_path, encoding="utf-8")