## K-Nearest Neighbours Approach



### Setup

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
# Resolve the project directories relative to the current working directory
base_dir = Path.cwd().joinpath("implementation")
data_dir = base_dir.joinpath("data/source/")
result_dir = base_dir.joinpath("data/results/")

### Load & prepare dataset

The following code must be adapted individually for each protein-ligand complex (input file, column layout, and output path).

In [3]:
# Load the ACHE CSV from the source-data directory
ache_csv = data_dir / "ACHE/ache_freq.csv"
knn_data_raw = pd.read_csv(ache_csv)

# Last expression of the cell: render the loaded frame
knn_data_raw

Unnamed: 0.1,Unnamed: 0,INDEX,NAME,Water_Bridge:ASP74A,Hydrophobic_Interaction:TYR341A,Hydrophobic_Interaction:TRP86A,Hydrophobic_Interaction:TRP286A,Hydrophobic_Interaction:PHE338A,Pi-Stacking:TRP86A,Hydrophobic_Interaction:TYR337A,...,Hydrogen_Bond:TYR337A,Water_Bridge:HIS447A,Pi-Stacking:HIS447A,Hydrogen_Bond:GLY122A,Hydrophobic_Interaction:VAL294A,Hydrogen_Bond:TYR133A,Water_Bridge:SER203A,Pi-Stacking:TYR124A,Water_Bridge:ALA204A,LABEL
0,0,1,CHEMBL397271|actives_final|sdf|151,4,1,1,0,1,3,1,...,0,0,0,0,0,0,0,0,0,active
1,1,2,CHEMBL481|actives_final|sdf|20,3,1,1,1,0,0,1,...,0,0,0,0,0,0,0,1,0,active
2,2,3,CHEMBL244230|actives_final|sdf|54,3,1,1,1,0,0,1,...,0,0,0,1,0,0,0,0,0,active
3,3,4,CHEMBL1094633|actives_final|sdf|85,6,1,1,1,1,0,1,...,0,1,0,0,0,0,0,1,0,active
4,4,5,CHEMBL191386|actives_final|sdf|308,4,1,1,1,1,1,1,...,0,0,0,1,0,0,0,0,1,active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,998,999,CHEMBL576005|actives_final|sdf|279,1,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,active
999,999,1000,CHEMBL153865|actives_final|sdf|341,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,active
1000,1000,1001,CHEMBL146674|actives_final|sdf|297,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,active
1001,1001,1002,ZINC04195090|decoys_final2|sdf|324,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive


In [4]:
# Map the textual class labels of the last column to integer targets
lookup = {"inactive": 0, "active": 1}

# Features are every column between the three leading non-feature columns
# (row ids and NAME, per the displayed frame) and the trailing LABEL column.
# Slicing once keeps the matrix and its column names aligned; the previous
# code took the names from columns[2:-1], which was one entry too long and
# shifted by one relative to the data.
feature_frame = knn_data_raw.iloc[:, 3:-1]

# Bunch-like dict holding everything the model cells need:
#   data          -- 2-D array, one row per sample, one column per feature
#   target        -- 1-D integer array (0 = inactive, 1 = active)
#   feature_names -- column labels matching the columns of `data`
#   target_names  -- class names, indexed by target value
knn_data = {
    "data": feature_frame.to_numpy(),
    # An unknown label raises KeyError instead of being silently encoded
    "target": np.array([lookup[label] for label in knn_data_raw.iloc[:, -1]]),
    "feature_names": feature_frame.columns,
    "target_names": ["inactive", "active"],
}

Split the data into a training set and a test set.

In [5]:
# Hold out 30 % of the samples for testing; the fixed seed makes the split reproducible
features, targets = knn_data["data"], knn_data["target"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.3,
    random_state=4232,
)

### Apply K-Nearest Neighbours

In [6]:
# Candidate neighbourhood sizes: the odd numbers 1, 3, ..., 19
odd_k_values = list(range(1, 20, 2))
parameters = {"kneighborsclassifier__n_neighbors": odd_k_values}

# Standardise the data inside the pipeline so scaling is part of the model
pipe1 = make_pipeline(StandardScaler(), KNeighborsClassifier())

# 20-fold cross-validated grid search over the candidates, using all CPU cores
knn_models = GridSearchCV(
    estimator=pipe1,
    param_grid=parameters,
    cv=20,
    n_jobs=-1,
)

Fit the grid search on the training set, evaluating every candidate parameter value by cross-validation.

In [7]:
# Run the cross-validated grid search on the training split only;
# the test split stays untouched until the scoring cell.
knn_models.fit(X_train, y_train)

In [8]:
# Tabulate the cross-validation results, one row per candidate n_neighbors
pd.DataFrame(knn_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005973,0.002019,0.03452,0.007733,1,{'kneighborsclassifier__n_neighbors': 1},0.75,0.722222,0.828571,0.685714,...,0.685714,0.8,0.8,0.828571,0.6,0.571429,0.742857,0.73504,0.076688,10
1,0.005181,0.002357,0.004071,0.001144,3,{'kneighborsclassifier__n_neighbors': 3},0.75,0.666667,0.742857,0.685714,...,0.857143,0.685714,0.8,0.828571,0.542857,0.657143,0.742857,0.750833,0.0744,8
2,0.003937,0.001205,0.003459,0.001027,5,{'kneighborsclassifier__n_neighbors': 5},0.75,0.777778,0.771429,0.771429,...,0.828571,0.6,0.857143,0.857143,0.628571,0.771429,0.714286,0.766389,0.071426,2
3,0.003379,0.000366,0.003329,0.000941,7,{'kneighborsclassifier__n_neighbors': 7},0.722222,0.722222,0.828571,0.771429,...,0.8,0.714286,0.828571,0.828571,0.571429,0.8,0.714286,0.763651,0.067748,3
4,0.004465,0.001864,0.003661,0.001736,9,{'kneighborsclassifier__n_neighbors': 9},0.777778,0.722222,0.857143,0.828571,...,0.828571,0.714286,0.828571,0.8,0.542857,0.771429,0.742857,0.766429,0.07887,1
5,0.003875,0.00098,0.00335,0.000929,11,{'kneighborsclassifier__n_neighbors': 11},0.777778,0.722222,0.828571,0.771429,...,0.771429,0.685714,0.8,0.828571,0.571429,0.742857,0.742857,0.746429,0.073873,9
6,0.003636,0.000923,0.003298,0.000756,13,{'kneighborsclassifier__n_neighbors': 13},0.833333,0.694444,0.771429,0.771429,...,0.771429,0.685714,0.828571,0.828571,0.6,0.8,0.742857,0.756389,0.069521,5
7,0.003702,0.00149,0.002538,0.000521,15,{'kneighborsclassifier__n_neighbors': 15},0.833333,0.722222,0.8,0.771429,...,0.771429,0.657143,0.828571,0.857143,0.6,0.771429,0.714286,0.754921,0.068507,6
8,0.003441,0.000697,0.002732,0.000606,17,{'kneighborsclassifier__n_neighbors': 17},0.805556,0.694444,0.828571,0.742857,...,0.8,0.685714,0.857143,0.857143,0.628571,0.742857,0.714286,0.752143,0.067484,7
9,0.002861,0.000603,0.002413,0.00055,19,{'kneighborsclassifier__n_neighbors': 19},0.777778,0.722222,0.8,0.742857,...,0.828571,0.685714,0.857143,0.828571,0.657143,0.714286,0.685714,0.757857,0.062107,4


In [9]:
# Mean cross-validated score of the best-ranked candidate
knn_models.best_score_

0.7664285714285715

In [10]:
# Parameter setting that achieved the best mean cross-validated score
knn_models.best_params_

{'kneighborsclassifier__n_neighbors': 9}

### Score

In [11]:
# Evaluate the best pipeline found by the grid search on the held-out test split
best_pipeline = knn_models.best_estimator_
prediction = best_pipeline.predict(X_test)

# Last expression of the cell: score of the best pipeline on the test split
best_pipeline.score(X_test, y_test)

0.7109634551495017

### Prepare data for evaluation

In [12]:
# Column layout of the export: every raw column except NAME and the trailing
# LABEL. The first two remaining columns are id columns, the rest are features.
result_columns = knn_data_raw.columns[:-1].drop("NAME")
id_columns, feature_columns = result_columns[:2], result_columns[2:]

# Build the frame in one shot instead of appending row by row with .loc,
# which re-allocates the frame on every iteration.
result_df = pd.DataFrame(X_test, columns=feature_columns)

# Both id columns receive the positional index within the test split.
# NOTE(review): these are not the ids of the original rows, so predictions
# cannot be traced back to compound names from this file alone -- confirm
# that the evaluation step does not need that mapping.
row_ids = np.arange(len(X_test))
for position, id_column in enumerate(id_columns):
    result_df.insert(position, id_column, row_ids)

# Ground truth and model prediction side by side for the evaluation step
result_df["LABEL"] = y_test
result_df["PRED"] = prediction

# Create the target directory if needed so the export does not fail on a fresh checkout
output_path = result_dir / "ACHE/fe_freq_knn.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(output_path, encoding="utf-8")