## Random Forest Approach



### Setup

In [1]:
import pandas as pd
import imp
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

import os 
from pathlib import Path


  import imp


In [2]:
# Resolve all project directories relative to the current working directory,
# so the notebook does not depend on an absolute, machine-specific path.
base_dir = Path.cwd().joinpath("implementation")
data_dir = base_dir.joinpath("data/source/")      # input datasets
result_dir = base_dir.joinpath("data/results/")   # exported predictions and scores

### Load & prepare dataset

The following code needs to be adapted individually for each protein-ligand complex.

In [3]:
# Load the ACHE interaction table for this notebook.
ache_csv_path = data_dir / "ACHE/ache_non_hydrop.csv"
rf_data_raw = pd.read_csv(ache_csv_path)

# Show the raw frame for a quick visual check of the columns.
rf_data_raw

Unnamed: 0.1,Unnamed: 0,INDEX,NAME,Pi-Cation_Interaction:TRP86A,Water_Bridge:GLY120A,Pi-Stacking:TRP86A,Halogen_Bond:THR75A,Halogen_Bond:TRP286A,Hydrogen_Bond:ASN87A,Hydrogen_Bond:GLY120A,...,Hydrogen_Bond:TYR72A,Water_Bridge:THR83A,Hydrogen_Bond:GLN291A,Halogen_Bond:GLY120A,Water_Bridge:THR75A,Pi-Cation_Interaction:TYR341A,Hydrogen_Bond:SER125A,Water_Bridge:ALA204A,Pi-Stacking:TYR124A,LABEL
0,0,1,CHEMBL397271|actives_final|sdf|151,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,active
1,1,2,CHEMBL481|actives_final|sdf|20,2,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,active
2,2,3,CHEMBL244230|actives_final|sdf|54,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,active
3,3,4,CHEMBL1094633|actives_final|sdf|85,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,active
4,4,5,CHEMBL191386|actives_final|sdf|308,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,998,999,CHEMBL576005|actives_final|sdf|279,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,active
999,999,1000,CHEMBL153865|actives_final|sdf|341,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,active
1000,1000,1001,CHEMBL146674|actives_final|sdf|297,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,active
1001,1001,1002,ZINC04195090|decoys_final2|sdf|324,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive


In [4]:
# Encode the textual activity label as the integer class used for training.
lookup = {'inactive': 0, 'active': 1}

# Column layout of rf_data_raw: three leading non-feature columns (an unnamed
# leftover index column, INDEX, NAME), then the interaction-count features,
# and finally LABEL as the last column.
FIRST_FEATURE_COL = 3
feature_frame = rf_data_raw.iloc[:, FIRST_FEATURE_COL:-1]

rf_data = {'data': np.array(feature_frame),
           # A label missing from `lookup` raises KeyError instead of being
           # silently turned into a missing value.
           'target': np.array([lookup[y] for y in rf_data_raw.iloc[:, -1]]),
           # Names come from the same slice as 'data' so that feature_names[k]
           # describes data[:, k]; slicing the names from column 2 would also
           # pull in NAME and shift every name by one position.
           'feature_names': feature_frame.columns,
           'target_names': ['inactive', 'active']}


Split into training and test sets

In [5]:
# Hold out 30 % of the samples for the final evaluation; the fixed seed keeps
# the partition identical between runs.
features, labels = rf_data['data'], rf_data['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=4232,
)

### Apply Random Forest

In [6]:
# Only the maximum tree depth is tuned: candidates 3, 6, ..., 27.
parameters = {'max_depth': list(range(3, 28, 3))}

# Seeding the forest makes the cross-validation scores, and therefore the
# selected depth, reproducible across kernel restarts. Without random_state
# each run draws different bootstrap samples and feature subsets.
rf_models = GridSearchCV(
    RandomForestClassifier(n_estimators=1001, oob_score=True, random_state=4232),
    parameters,
    cv=20,       # 20-fold cross-validation per candidate depth
    n_jobs=-1,   # use all available CPU cores
)


Fit model with the provided parameters

In [7]:
# Run the grid search on the training split only; the test split stays unseen
# until the final scoring step.
rf_models.fit(X_train, y_train)

In [8]:
# Tabulate the cross-validation results: one row per candidate max_depth with
# timings, per-split scores, mean/std test score and the resulting rank.
pd.DataFrame(rf_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,3.93378,0.587903,0.164908,0.059153,3,{'max_depth': 3},0.833333,0.666667,0.771429,0.742857,...,0.742857,0.771429,0.857143,0.742857,0.714286,0.742857,0.657143,0.750714,0.058772,9
1,5.298435,0.357443,0.099207,0.0313,6,{'max_depth': 6},0.861111,0.694444,0.828571,0.742857,...,0.714286,0.742857,0.857143,0.685714,0.685714,0.771429,0.657143,0.764921,0.070298,8
2,3.823606,0.260203,0.085913,0.02108,9,{'max_depth': 9},0.861111,0.694444,0.828571,0.771429,...,0.742857,0.8,0.885714,0.771429,0.685714,0.714286,0.657143,0.779206,0.066588,7
3,4.097902,0.24357,0.10133,0.033818,12,{'max_depth': 12},0.805556,0.722222,0.857143,0.828571,...,0.742857,0.742857,0.914286,0.771429,0.714286,0.742857,0.685714,0.792103,0.064814,6
4,4.148638,0.338411,0.096029,0.015077,15,{'max_depth': 15},0.805556,0.694444,0.857143,0.8,...,0.742857,0.771429,0.914286,0.8,0.742857,0.771429,0.685714,0.795,0.059616,5
5,4.440167,0.264769,0.101483,0.028927,18,{'max_depth': 18},0.777778,0.722222,0.885714,0.771429,...,0.8,0.771429,0.885714,0.857143,0.714286,0.742857,0.685714,0.799286,0.062271,2
6,4.863537,0.397737,0.109136,0.025066,21,{'max_depth': 21},0.777778,0.75,0.885714,0.771429,...,0.771429,0.771429,0.885714,0.857143,0.714286,0.771429,0.714286,0.80496,0.055352,1
7,5.253603,0.373917,0.11651,0.030363,24,{'max_depth': 24},0.75,0.75,0.857143,0.771429,...,0.771429,0.742857,0.885714,0.828571,0.714286,0.771429,0.714286,0.799286,0.058244,2
8,5.029231,0.343384,0.077113,0.026333,27,{'max_depth': 27},0.722222,0.75,0.857143,0.771429,...,0.771429,0.742857,0.885714,0.857143,0.657143,0.8,0.714286,0.796468,0.060665,4


In [13]:
# Store this model's best mean cross-validated score in the ACHE validation
# results CSV (columns read here: the "Index" index column and "Name"; the
# remaining column receives the score).
val_csv = result_dir / "ACHE/val/ache_val.csv"
model_name = "fe_nonhydrop_rf"

df = pd.read_csv(val_csv, sep=",", index_col="Index")

# Remove any entry written by an earlier run of this cell so that re-running
# the notebook replaces the score instead of appending a duplicate row.
df = df[df["Name"] != model_name].copy()

# Use the first free integer label at or after len(df). Using len(df) blindly
# would overwrite an existing row whenever the stored index is not exactly
# 0..n-1 (for example a 1-based index or one with gaps).
next_label = len(df)
while next_label in df.index:
    next_label += 1
df.loc[next_label] = [model_name, rf_models.best_score_]

# `index` is a boolean switch; the header text of the index column is given
# through `index_label`.
df.to_csv(val_csv, sep=",", index=True, index_label="Index")

In [10]:
# Hyper-parameter setting that achieved the best mean cross-validation score.
rf_models.best_params_

{'max_depth': 21}

### Score

In [11]:
# Evaluate the refitted best model on the held-out test split.
best_rf = rf_models.best_estimator_
prediction = best_rf.predict(X_test)

# Mean accuracy on the test split (displayed as the cell output).
best_rf.score(X_test, y_test)

0.8006644518272426

### Prepare Data for evaluation

In [12]:
# Column layout of the export: the raw columns without LABEL (last) and NAME,
# i.e. two leading bookkeeping columns followed by the feature columns.
result_columns = rf_data_raw.columns[:-1].drop("NAME")
id_columns, feature_columns = result_columns[:2], result_columns[2:]

# Build the frame in a single constructor call instead of appending one row
# at a time with .loc, which re-allocates the frame on every iteration and
# leaves object-dtype columns behind. Row labels start at 1.
row_positions = np.arange(len(X_test))
result_df = pd.DataFrame(X_test, columns=feature_columns, index=row_positions + 1)

# Both bookkeeping columns carry the 0-based position of the sample within
# the test split; the identifiers of the raw table are not available here
# because only the feature matrix was split.
for position, column in enumerate(id_columns):
    result_df.insert(position, column, row_positions)

# Ground truth and model output side by side for the later evaluation.
result_df["LABEL"] = y_test
result_df["PRED"] = prediction

result_df.to_csv(result_dir/"ACHE/fe_nonhydrop_rf.csv",encoding="utf-8")