## Random Forest Approach



### Setup

In [13]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV


In [14]:
## adapt this directory to your needs
# TODO-> change path to be dynamic 
base_dir = '/home/bac/activity_prediction/implementation/'
data_dir = base_dir + 'data/'

### load & prepare dataset

the following code needs to be adapted for each protein-ligand complex individually

In [15]:
rf_data_raw = pd.read_csv(data_dir+"ACHE/ache.csv")

rf_data_raw

Unnamed: 0,INDEX,NAME,Hydrophobic_Interaction:PHE297A,Pi-Cation_Interaction:TRP86A,Water_Bridge:GLY120A,Pi-Stacking:TRP86A,Hydrophobic_Interaction:TYR341A,Halogen_Bond:THR75A,Halogen_Bond:TRP286A,Hydrogen_Bond:ASN87A,...,Water_Bridge:THR83A,Hydrogen_Bond:GLN291A,Halogen_Bond:GLY120A,Water_Bridge:THR75A,Pi-Cation_Interaction:TYR341A,Hydrogen_Bond:SER125A,Water_Bridge:ALA204A,Pi-Stacking:TYR124A,Hydrophobic_Interaction:GLU202A,LABEL
0,1,CHEMBL397271|actives_final|sdf|151,0,0,0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,active
1,2,CHEMBL481|actives_final|sdf|20,1,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,active
2,3,CHEMBL244230|actives_final|sdf|54,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,active
3,4,CHEMBL1094633|actives_final|sdf|85,0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,active
4,5,CHEMBL191386|actives_final|sdf|308,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,999,CHEMBL576005|actives_final|sdf|279,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,active
999,1000,CHEMBL153865|actives_final|sdf|341,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,active
1000,1001,CHEMBL146674|actives_final|sdf|297,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,active
1001,1002,ZINC04195090|decoys_final2|sdf|324,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive


In [16]:
lookup = {'inactive':0,'active':1}

rf_data = {'data': np.array(rf_data_raw.iloc[:, 2:-1]),
             'target': np.array([lookup[y] for y in rf_data_raw.iloc[0:,-1]]),
             'feature_names': rf_data_raw.columns[2:-1],
             'target_names': ['inactive', 'active']}


split into train- and test-set

In [17]:
X_train, X_test, y_train, y_test = train_test_split(rf_data['data'], rf_data['target'],
                                                    test_size=0.3, random_state=4232)

### Apply Random Forrest

In [18]:
parameters = {'max_depth': [i for i in range(3, 28,3)]}
rf_models = GridSearchCV(RandomForestClassifier(n_estimators=1001, oob_score=True), parameters, cv=20, n_jobs=-1)


Fit model with the provided parameters

In [19]:
rf_models.fit(X_train, y_train)

In [20]:
pd.DataFrame(rf_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,2.893259,0.233919,0.074987,0.017726,3,{'max_depth': 3},0.805556,0.722222,0.857143,0.828571,...,0.714286,0.8,0.8,0.771429,0.685714,0.742857,0.714286,0.75496,0.053737,9
1,2.879136,0.091875,0.064741,0.011438,6,{'max_depth': 6},0.833333,0.805556,0.885714,0.771429,...,0.742857,0.8,0.885714,0.8,0.742857,0.8,0.771429,0.797659,0.05431,8
2,3.191365,0.234401,0.092928,0.015302,9,{'max_depth': 9},0.833333,0.805556,0.914286,0.771429,...,0.8,0.8,0.885714,0.828571,0.771429,0.828571,0.771429,0.819087,0.058719,7
3,4.262116,0.095154,0.09342,0.01693,12,{'max_depth': 12},0.833333,0.833333,0.914286,0.771429,...,0.8,0.771429,0.914286,0.8,0.742857,0.828571,0.8,0.829048,0.071728,6
4,4.3294,0.050713,0.085162,0.013882,15,{'max_depth': 15},0.861111,0.861111,0.914286,0.771429,...,0.8,0.742857,0.942857,0.828571,0.742857,0.828571,0.828571,0.840397,0.074277,1
5,4.416293,0.166578,0.09857,0.016977,18,{'max_depth': 18},0.805556,0.833333,0.914286,0.771429,...,0.8,0.771429,0.914286,0.8,0.714286,0.828571,0.828571,0.83623,0.071497,4
6,4.390105,0.078138,0.087688,0.015547,21,{'max_depth': 21},0.805556,0.861111,0.914286,0.771429,...,0.828571,0.828571,0.942857,0.8,0.742857,0.8,0.8,0.837619,0.069968,2
7,4.45143,0.167957,0.101255,0.024569,24,{'max_depth': 24},0.833333,0.833333,0.914286,0.771429,...,0.8,0.771429,0.942857,0.8,0.771429,0.8,0.8,0.837619,0.072295,2
8,4.236897,0.24979,0.067452,0.024884,27,{'max_depth': 27},0.805556,0.833333,0.914286,0.771429,...,0.8,0.771429,0.914286,0.8,0.771429,0.828571,0.8,0.833373,0.071174,5


In [21]:
rf_models.best_score_

0.8403968253968255

In [22]:
rf_models.best_params_

{'max_depth': 15}

In [23]:
rf_models.best_estimator_.score(X_train,y_train)

0.9928774928774928