## Random Forest Approach



### Setup

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV


In [2]:
## adapt this directory to your needs
# TODO-> change path to be dynamic 
base_dir = '/home/bac/activity_prediction/implementation/'
data_dir = base_dir + 'data/'

### load & prepare dataset

the following code needs to be adapted for each protein-ligand complex individually

In [3]:
rf_data_raw = pd.read_csv(data_dir+"COX1/COX1.csv")

rf_data_raw

Unnamed: 0,INDEX,NAME,Hydrogen_Bond:ALA527A,Halogen_Bond:ASP110A,Hydrophobic_Interaction:ILE345A,Hydrophobic_Interaction:TYR466A,Hydrogen_Bond:GLU520A,Hydrogen_Bond:VAL116A,Pi-Stacking:PHE205A,Hydrophobic_Interaction:LEU115A,...,Hydrophobic_Interaction:PRO86A,Hydrogen_Bond:TYR348A,Halogen_Bond:TYR355A,Hydrophobic_Interaction:PHE201A,Hydrophobic_Interaction:PRO363A,Hydrogen_Bond:GLN351A,Hydrogen_Bond:SER353A,Water_Bridge:PRO86A,Halogen_Bond:LEU112A,LABEL
0,1,CHEMBL314337|COX1_actives_final_part2|sdf|163,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,active
1,2,CHEMBL305971|COX1_actives_final_part2|sdf|13,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,active
2,3,ZINC00705195|decoys_final|sdf|51,0,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,0,inactive
3,4,ZINC17088576|decoys_final|sdf|59,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,inactive
4,5,ZINC06204226|decoys_final|sdf|97,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033,1034,ZINC53927572|decoys_final|sdf|36,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
1034,1035,ZINC48571703|decoys_final|sdf|388,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,inactive
1035,1036,ZINC50122389|decoys_final_part3|sdf|257,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
1036,1037,ZINC60280739|decoys_final_part3|sdf|282,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive


In [4]:
lookup = {'inactive':0,'active':1}

rf_data = {'data': np.array(rf_data_raw.iloc[:, 2:-1]),
             'target': np.array([lookup[y] for y in rf_data_raw.iloc[0:,-1]]),
             'feature_names': rf_data_raw.columns[2:-1],
             'target_names': ['inactive', 'active']}


split into train- and test-set

In [5]:
X_train, X_test, y_train, y_test = train_test_split(rf_data['data'], rf_data['target'],
                                                    test_size=0.3, random_state=4232)

### Apply Random Forrest

In [6]:
parameters = {'max_depth': [i for i in range(3, 28,3)]}
rf_models = GridSearchCV(RandomForestClassifier(n_estimators=1001, oob_score=True), parameters, cv=20, n_jobs=-1)


Fit model with the provided parameters

In [7]:
rf_models.fit(X_train, y_train)

In [8]:
pd.DataFrame(rf_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,3.733133,0.373888,0.088495,0.023069,3,{'max_depth': 3},0.702703,0.702703,0.702703,0.72973,...,0.722222,0.722222,0.722222,0.722222,0.722222,0.722222,0.722222,0.72042,0.007897,8
1,3.764315,0.29973,0.095193,0.022102,6,{'max_depth': 6},0.702703,0.702703,0.702703,0.72973,...,0.722222,0.722222,0.722222,0.722222,0.722222,0.722222,0.722222,0.72042,0.007897,8
2,3.73704,0.388574,0.080989,0.017516,9,{'max_depth': 9},0.702703,0.702703,0.702703,0.72973,...,0.722222,0.75,0.722222,0.722222,0.722222,0.722222,0.722222,0.721809,0.010199,7
3,4.28725,0.569949,0.101561,0.019246,12,{'max_depth': 12},0.72973,0.702703,0.702703,0.72973,...,0.75,0.75,0.75,0.694444,0.75,0.694444,0.694444,0.728716,0.02575,6
4,4.380821,0.407449,0.092041,0.022635,15,{'max_depth': 15},0.72973,0.72973,0.702703,0.702703,...,0.777778,0.75,0.75,0.638889,0.777778,0.638889,0.694444,0.730068,0.041275,5
5,4.691603,0.590709,0.111506,0.035509,18,{'max_depth': 18},0.756757,0.72973,0.702703,0.675676,...,0.805556,0.722222,0.75,0.666667,0.75,0.694444,0.694444,0.735511,0.041229,3
6,5.755445,0.449566,0.13481,0.03987,21,{'max_depth': 21},0.756757,0.72973,0.702703,0.675676,...,0.805556,0.722222,0.777778,0.638889,0.75,0.666667,0.694444,0.739677,0.052609,1
7,5.55687,0.488,0.110507,0.021382,24,{'max_depth': 24},0.756757,0.756757,0.702703,0.702703,...,0.805556,0.694444,0.722222,0.638889,0.75,0.694444,0.694444,0.739602,0.048498,2
8,4.916724,0.763367,0.073272,0.026312,27,{'max_depth': 27},0.756757,0.756757,0.702703,0.675676,...,0.805556,0.694444,0.722222,0.638889,0.75,0.666667,0.694444,0.734122,0.051287,4


In [9]:
rf_models.best_score_

0.7396771771771772

In [10]:
rf_models.best_params_

{'max_depth': 21}

In [11]:
rf_models.best_estimator_.score(X_test,y_test)

0.987603305785124