## K-Nearest Neighbors Approach



### Setup

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
# read dynamic path
base_dir = Path(os.getcwd()).parents[3]
data_dir = base_dir / "data/source/"
result_dir = base_dir / "data/results/"

### Load & prepare dataset

The following code needs to be adapted individually for each protein-ligand complex.

In [3]:
# Read the SEH table from the source data directory and display it.
seh_csv_path = data_dir / "SEH/SEH.csv"
knn_data_raw = pd.read_csv(seh_csv_path)

knn_data_raw

Unnamed: 0,INDEX,NAME,Hydrophobic_Interaction:PHE387A,Hydrogen_Bond:TYR466A,Hydrogen_Bond:TRP336A,Salt_Bridge:HIS524A,Hydrophobic_Interaction:ASN472A,Hydrogen_Bond:ASN472A,Water_Bridge:ALA365A,Pi-Cation_Interaction:HIS524A,...,Water_Bridge:PHE267A,Hydrophobic_Interaction:PRO371A,Hydrophobic_Interaction:LEU428A,Hydrophobic_Interaction:ALA365A,Hydrogen_Bond:LEU408A,Hydrogen_Bond:TYR383A,Hydrogen_Bond:TYR343A,Hydrophobic_Interaction:HIS524A,Halogen_Bond:ASN472A,LABEL
0,1,IA_147.cdx|SeH_inactivess_minimized|sdf|48,1,1,1,0,0,0,0,0,...,0,0,0,0,0,2,0,0,1,inactive
1,2,IA_10|SeH_inactivess_minimized|sdf|1,0,1,0,1,0,0,0,0,...,2,0,0,0,0,2,0,0,0,inactive
2,3,A_103|SEH_inhibs_minimized|sdf|5,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,active
3,4,A_76|SEH_inhibs_minimized|sdf|46,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,active
4,5,IA_12|SeH_inactivess_minimized|sdf|22,0,1,0,1,1,0,0,0,...,1,1,0,0,0,0,0,0,0,inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,194,IA_178.cdx|SeH_inactivess_minimized|sdf|79,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
194,195,IA_211.cdx|SeH_inactivess_minimized|sdf|109,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
195,196,IA_155.cdx|SeH_inactivess_minimized|sdf|56,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
196,197,IA_163.cdx|SeH_inactivess_minimized|sdf|65,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive


In [4]:
# Integer encoding of the textual activity labels used as classification target.
lookup = {"inactive": 0, "active": 1}

# Per the table displayed above, the first two columns (INDEX, NAME) identify a
# row and the last column is LABEL; everything in between is used as a feature.
feature_frame = knn_data_raw.iloc[:, 2:-1]
label_column = knn_data_raw.iloc[:, -1]

# Bundle features and targets in a dict; an unknown label raises KeyError.
knn_data = dict(
    data=np.array(feature_frame),
    target=np.array([lookup[label] for label in label_column]),
    feature_names=feature_frame.columns,
    target_names=["inactive", "active"],
)

Split into training and test sets

In [5]:
# Hold out 30 % of the samples for testing; the fixed seed keeps the split reproducible.
features, labels = knn_data["data"], knn_data["target"]
split_parts = train_test_split(features, labels, test_size=0.3, random_state=4232)
X_train, X_test, y_train, y_test = split_parts

### Apply K-Nearest Neighbors

In [6]:
# Normalize the data inside a pipeline so that scaling is part of the model
# that the grid search fits and evaluates.
pipe1 = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Candidate neighborhood sizes: the odd values 1, 3, ..., 19.
parameters = {"kneighborsclassifier__n_neighbors": list(range(1, 20, 2))}

# 20-fold cross-validated search over the candidates, using all available cores.
knn_models = GridSearchCV(estimator=pipe1, param_grid=parameters, cv=20, n_jobs=-1)

Fit the grid search over the provided parameter values

In [7]:
# Run the cross-validated grid search on the training split only.
knn_models.fit(X_train, y_train)

In [8]:
# Show the per-candidate cross-validation results (timings, per-split scores, rank) as a table.
pd.DataFrame(knn_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,0.015264,0.016883,0.10066,0.103081,1,{'kneighborsclassifier__n_neighbors': 1},0.714286,0.571429,0.571429,0.571429,...,0.714286,0.285714,0.428571,0.571429,0.714286,0.5,0.5,0.621429,0.180843,10
1,0.022362,0.024811,0.026184,0.019937,3,{'kneighborsclassifier__n_neighbors': 3},0.571429,0.571429,0.857143,0.714286,...,0.714286,0.857143,0.714286,0.571429,0.714286,1.0,0.833333,0.720238,0.137987,9
2,0.006813,0.00562,0.015717,0.013783,5,{'kneighborsclassifier__n_neighbors': 5},0.714286,0.714286,0.857143,0.714286,...,0.571429,0.857143,0.714286,0.714286,0.714286,0.833333,0.833333,0.733333,0.101183,8
3,0.012464,0.012507,0.022922,0.021376,7,{'kneighborsclassifier__n_neighbors': 7},0.857143,0.857143,0.857143,0.714286,...,0.571429,0.857143,0.714286,0.714286,0.714286,0.833333,0.833333,0.747619,0.086307,2
4,0.026617,0.025636,0.016168,0.013617,9,{'kneighborsclassifier__n_neighbors': 9},0.857143,0.857143,0.857143,0.714286,...,0.714286,0.857143,0.714286,0.714286,0.714286,0.833333,0.833333,0.754762,0.07682,1
5,0.022382,0.021937,0.015314,0.018948,11,{'kneighborsclassifier__n_neighbors': 11},0.857143,0.857143,0.857143,0.714286,...,0.714286,0.714286,0.714286,0.714286,0.714286,0.833333,0.833333,0.740476,0.069375,7
6,0.021734,0.021882,0.017677,0.010453,13,{'kneighborsclassifier__n_neighbors': 13},0.857143,0.857143,0.857143,0.714286,...,0.714286,0.714286,0.714286,0.714286,0.714286,0.833333,0.833333,0.747619,0.058029,2
7,0.02272,0.023754,0.025862,0.02231,15,{'kneighborsclassifier__n_neighbors': 15},0.857143,0.857143,0.857143,0.714286,...,0.714286,0.714286,0.714286,0.714286,0.714286,0.833333,0.833333,0.747619,0.058029,2
8,0.015291,0.01128,0.022407,0.015759,17,{'kneighborsclassifier__n_neighbors': 17},0.857143,0.857143,0.857143,0.714286,...,0.714286,0.714286,0.714286,0.714286,0.714286,0.833333,0.833333,0.747619,0.058029,2
9,0.009705,0.010104,0.016173,0.012633,19,{'kneighborsclassifier__n_neighbors': 19},0.857143,0.857143,0.857143,0.714286,...,0.714286,0.714286,0.714286,0.714286,0.714286,0.833333,0.833333,0.747619,0.058029,2


In [9]:
# Mean cross-validated score of the top-ranked candidate (matches `mean_test_score` of rank 1 above).
knn_models.best_score_

0.7547619047619047

In [10]:
# Parameter setting of the top-ranked candidate.
knn_models.best_params_

{'kneighborsclassifier__n_neighbors': 9}

### Score

In [11]:
# Use the pipeline that the grid search selected for the best parameter setting.
best_knn = knn_models.best_estimator_

# Predicted classes for the held-out test set (kept for the export below) ...
prediction = best_knn.predict(X_test)
# ... and the score of that pipeline on the same held-out data (mean accuracy for a classifier).
best_knn.score(X_test, y_test)

0.7333333333333333

### Prepare data for evaluation

In [12]:
# Assemble the evaluation table for the test split and write it to disk.
#
# Columns: INDEX, all feature columns, LABEL (true class) and PRED (predicted class).
# NOTE: INDEX holds the row position within the test split (0..n-1), not the INDEX
# value of the source table, because the train/test split only carries the feature
# matrix and targets.

# All source columns except the trailing LABEL and the NAME identifier:
# the leading INDEX column followed by the feature columns.
result_columns = knn_data_raw.columns[:-1].drop("NAME")

# Build the frame in one step instead of appending row by row with `.loc`,
# which is quadratic in the number of rows and leaves object-typed columns.
result_df = pd.DataFrame(X_test, columns=result_columns[1:])
result_df.insert(0, result_columns[0], np.arange(len(X_test)))

result_df["LABEL"] = y_test
result_df["PRED"] = prediction

# Make sure the target folder exists so that the export does not fail on a fresh checkout.
result_path = result_dir / "SEH/baseline_knn.csv"
result_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(result_path, encoding="utf-8")