## Random Forest Approach



### Setup

In [1]:
import pandas as pd
import imp
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

import os 
from pathlib import Path


  import imp


In [2]:
# Anchor every project folder at the directory the notebook is started from.
base_dir = Path.cwd().joinpath("implementation")
data_dir = base_dir.joinpath("data/source/")
result_dir = base_dir.joinpath("data/results/")

### Load & prepare dataset

The following code needs to be adapted individually for each protein-ligand complex.

In [3]:
# Read the interaction table of the MOAB complex from the source data folder.
rf_data_raw = pd.read_csv(filepath_or_buffer=data_dir.joinpath("MOAB/MOAB.csv"))

# Show the (truncated) frame as the cell output.
rf_data_raw

Unnamed: 0,INDEX,NAME,Salt_Bridge:GLU84B,Hydrophobic_Interaction:GLU84B,Halogen_Bond:GLN206B,Hydrophobic_Interaction:LEU328B,Water_Bridge:THR201B,Halogen_Bond:SER200B,Hydrophobic_Interaction:TYR435A,Hydrogen_Bond:PRO102B,...,Pi-Stacking:PHE168B,Pi-Stacking:TYR326B,Hydrogen_Bond:ASN203B,Hydrophobic_Interaction:GLN206A,Hydrophobic_Interaction:THR201B,Hydrophobic_Interaction:THR202B,Water_Bridge:GLU84B,Hydrophobic_Interaction:PRO104B,Hydrophobic_Interaction:PHE103B,LABEL
0,1,CHEMBL583128|actives_final|sdf|78,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,2,0,0,active
1,2,CHEMBL583128|actives_final|sdf|79,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,2,0,0,active
2,3,ZINC32575615|decoys_final|sdf|262,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,2,0,0,inactive
3,4,CHEMBL45069|actives_final|sdf|163,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,1,active
4,5,ZINC36683565|decoys_final|sdf|17,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,1,inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,367,CHEMBL174289|actives_final|sdf|17,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,active
367,368,CHEMBL552680|actives_final|sdf|91,0,0,0,0,1,0,1,1,...,0,0,0,1,0,0,0,0,1,active
368,369,CHEMBL522271|actives_final|sdf|2,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,active
369,370,CHEMBL26138|actives_final|sdf|107,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,active


In [4]:
# Numeric encoding of the class labels.
lookup = dict(inactive=0, active=1)

# Bundle features and encoded labels in one dictionary.
# The first two columns and the last column of the raw table are excluded from
# the features (in the displayed table these are INDEX, NAME and LABEL).
rf_data = dict(
    data=np.array(rf_data_raw.iloc[:, 2:-1]),
    target=np.array([lookup[label] for label in rf_data_raw.iloc[:, -1]]),
    feature_names=rf_data_raw.columns[2:-1],
    target_names=['inactive', 'active'],
)


Split into training and test sets

In [5]:
# Hold out 30 % of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    rf_data['data'],
    rf_data['target'],
    random_state=4232,
    test_size=0.3,
)

### Apply Random Forest

In [6]:
# Candidate tree depths for the grid search: 3, 6, ..., 27.
parameters = {'max_depth': list(range(3, 28, 3))}

# Seed the forest so the grid search is reproducible across kernel restarts.
# Without a fixed random_state every run draws different bootstrap samples and
# feature subsets, so the CV scores and the selected depth can change between runs.
rf_models = GridSearchCV(
    RandomForestClassifier(n_estimators=1001, oob_score=True, random_state=4232),
    parameters,
    cv=20,      # 20-fold cross-validation on the training set
    n_jobs=-1,  # use all available CPU cores
)


Fit model with the provided parameters

In [7]:
# Run the cross-validated grid search over all max_depth candidates on the training split.
rf_models.fit(X_train, y_train)

In [8]:
# Tabulate the cross-validation results (timings, per-split scores, ranks) of every candidate.
pd.DataFrame(rf_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,3.271019,0.207361,0.085293,0.010089,3,{'max_depth': 3},0.615385,0.692308,0.692308,0.692308,...,0.692308,0.692308,0.769231,0.692308,0.615385,0.538462,0.666667,0.702564,0.092236,9
1,3.633142,0.292345,0.103997,0.032549,6,{'max_depth': 6},0.615385,0.769231,0.692308,0.846154,...,0.692308,0.692308,0.769231,0.692308,0.692308,0.615385,0.75,0.741346,0.088435,7
2,3.984999,0.46299,0.121996,0.048066,9,{'max_depth': 9},0.615385,0.769231,0.692308,0.846154,...,0.769231,0.769231,0.846154,0.692308,0.692308,0.615385,0.75,0.745192,0.073197,6
3,5.543078,0.861881,0.229029,0.128262,12,{'max_depth': 12},0.615385,0.769231,0.692308,0.846154,...,0.769231,0.769231,0.846154,0.769231,0.615385,0.615385,0.75,0.760577,0.087384,1
4,7.930717,0.893435,0.163967,0.063851,15,{'max_depth': 15},0.615385,0.769231,0.692308,0.846154,...,0.769231,0.769231,0.846154,0.769231,0.615385,0.615385,0.75,0.756731,0.095047,2
5,5.16658,0.573956,0.120694,0.031899,18,{'max_depth': 18},0.615385,0.769231,0.692308,0.846154,...,0.846154,0.769231,0.846154,0.692308,0.692308,0.538462,0.75,0.749038,0.10559,5
6,5.131516,0.536494,0.151589,0.054307,21,{'max_depth': 21},0.538462,0.769231,0.692308,0.846154,...,0.846154,0.692308,0.846154,0.692308,0.615385,0.615385,0.75,0.741346,0.109375,7
7,8.544644,1.523364,0.264967,0.070928,24,{'max_depth': 24},0.615385,0.769231,0.692308,0.846154,...,0.846154,0.769231,0.846154,0.769231,0.692308,0.615385,0.666667,0.752564,0.100058,4
8,9.350906,0.736654,0.240498,0.053296,27,{'max_depth': 27},0.615385,0.769231,0.692308,0.846154,...,0.846154,0.769231,0.846154,0.692308,0.692308,0.538462,0.75,0.752885,0.107636,3


In [9]:
# Mean cross-validated score of the best-ranked candidate.
rf_models.best_score_

0.760576923076923

In [10]:
# Parameter setting that achieved the best mean cross-validated score.
rf_models.best_params_

{'max_depth': 12}

### Score

In [11]:
# Evaluate the model selected by the grid search on the held-out test set.
best_forest = rf_models.best_estimator_
prediction = best_forest.predict(X_test)

# Score of the selected model on the test set, shown as the cell output.
best_forest.score(X_test, y_test)

0.7589285714285714

### Prepare Data for evaluation

In [12]:
# Collect the test-set features, true labels and predictions in one table for evaluation.
# Feature columns are all raw columns except the trailing LABEL column and INDEX / NAME.
feature_columns = rf_data_raw.columns[:-1].drop(["INDEX", "NAME"])

# Build the frame in a single step instead of appending row by row with .loc,
# which re-allocates the frame on every insertion and leaves object-dtype columns.
result_df = pd.DataFrame(X_test, columns=feature_columns)

# INDEX is the running position within the test set (0..n-1), not the INDEX of the raw table.
result_df.insert(0, "INDEX", np.arange(len(result_df)))

result_df["LABEL"] = y_test
result_df["PRED"] = prediction

# Create the output folder if needed so the export does not fail on a fresh checkout.
output_path = result_dir / "MOAB/baseline_rf.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(output_path, encoding="utf-8")