## Random Forest Approach



### Setup

In [12]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV


In [13]:
## adapt this directory to your needs
# TODO-> change path to be dynamic 
base_dir = '/home/bac/activity_prediction/implementation/'
data_dir = base_dir + 'data/'

### load & prepare dataset

the following code needs to be adapted for each protein-ligand complex individually

In [14]:
rf_data_raw = pd.read_csv(data_dir+"MOAB/MOAB.csv")

rf_data_raw

Unnamed: 0,INDEX,NAME,Salt_Bridge:GLU84B,Hydrophobic_Interaction:GLU84B,Halogen_Bond:GLN206B,Hydrophobic_Interaction:LEU328B,Water_Bridge:THR201B,Halogen_Bond:SER200B,Hydrophobic_Interaction:TYR435A,Hydrogen_Bond:PRO102B,...,Pi-Stacking:PHE168B,Pi-Stacking:TYR326B,Hydrogen_Bond:ASN203B,Hydrophobic_Interaction:GLN206A,Hydrophobic_Interaction:THR201B,Hydrophobic_Interaction:THR202B,Water_Bridge:GLU84B,Hydrophobic_Interaction:PRO104B,Hydrophobic_Interaction:PHE103B,LABEL
0,1,CHEMBL583128|actives_final|sdf|78,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,2,0,0,active
1,2,CHEMBL583128|actives_final|sdf|79,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,2,0,0,active
2,3,ZINC32575615|decoys_final|sdf|262,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,2,0,0,inactive
3,4,CHEMBL45069|actives_final|sdf|163,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,1,active
4,5,ZINC36683565|decoys_final|sdf|17,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,1,inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,367,CHEMBL174289|actives_final|sdf|17,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,active
367,368,CHEMBL552680|actives_final|sdf|91,0,0,0,0,1,0,1,1,...,0,0,0,1,0,0,0,0,1,active
368,369,CHEMBL522271|actives_final|sdf|2,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,active
369,370,CHEMBL26138|actives_final|sdf|107,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,active


In [15]:
lookup = {'inactive':0,'active':1}

rf_data = {'data': np.array(rf_data_raw.iloc[:, 2:-1]),
             'target': np.array([lookup[y] for y in rf_data_raw.iloc[0:,-1]]),
             'feature_names': rf_data_raw.columns[2:-1],
             'target_names': ['inactive', 'active']}


split into train- and test-set

In [16]:
X_train, X_test, y_train, y_test = train_test_split(rf_data['data'], rf_data['target'],
                                                    test_size=0.3, random_state=4232)

### Apply Random Forrest

In [17]:
parameters = {'max_depth': [i for i in range(3, 28,3)]}
rf_models = GridSearchCV(RandomForestClassifier(n_estimators=1001, oob_score=True), parameters, cv=20, n_jobs=-1)


Fit model with the provided parameters

In [18]:
rf_models.fit(X_train, y_train)

In [19]:
pd.DataFrame(rf_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,2.641181,0.20241,0.077588,0.015937,3,{'max_depth': 3},0.615385,0.692308,0.692308,0.769231,...,0.692308,0.692308,0.769231,0.692308,0.615385,0.538462,0.666667,0.710256,0.084304,9
1,2.97142,0.221285,0.076057,0.013724,6,{'max_depth': 6},0.615385,0.769231,0.692308,0.846154,...,0.692308,0.692308,0.769231,0.692308,0.692308,0.615385,0.75,0.7375,0.081878,7
2,2.873929,0.145974,0.072821,0.014667,9,{'max_depth': 9},0.615385,0.769231,0.692308,0.846154,...,0.769231,0.769231,0.846154,0.692308,0.692308,0.615385,0.75,0.752885,0.075289,3
3,3.251892,0.309582,0.084875,0.015618,12,{'max_depth': 12},0.615385,0.769231,0.692308,0.846154,...,0.769231,0.769231,0.846154,0.769231,0.615385,0.615385,0.75,0.760577,0.087384,1
4,2.972884,0.132803,0.07796,0.011515,15,{'max_depth': 15},0.615385,0.769231,0.692308,0.846154,...,0.846154,0.692308,0.846154,0.769231,0.692308,0.538462,0.75,0.749038,0.10559,4
5,2.930447,0.147579,0.078944,0.015202,18,{'max_depth': 18},0.615385,0.769231,0.692308,0.846154,...,0.846154,0.692308,0.846154,0.769231,0.538462,0.615385,0.75,0.741346,0.109375,6
6,2.86685,0.109102,0.07093,0.011297,21,{'max_depth': 21},0.615385,0.769231,0.692308,0.846154,...,0.846154,0.769231,0.846154,0.692308,0.538462,0.538462,0.75,0.7375,0.112344,7
7,2.789195,0.109309,0.072138,0.01501,24,{'max_depth': 24},0.615385,0.769231,0.692308,0.846154,...,0.846154,0.769231,0.846154,0.692308,0.692308,0.615385,0.75,0.760577,0.100014,2
8,2.430081,0.32836,0.047758,0.017226,27,{'max_depth': 27},0.615385,0.769231,0.692308,0.846154,...,0.846154,0.692308,0.846154,0.692308,0.692308,0.538462,0.75,0.745192,0.106184,5


In [20]:
rf_models.best_score_

0.760576923076923

In [21]:
rf_models.best_params_

{'max_depth': 12}

In [22]:
rf_models.best_estimator_.score(X_test,y_test)

0.9922779922779923