## Random Forest Approach



### Setup

In [12]:
import os

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV


In [13]:
## adapt this directory to your needs
# The project root is resolved dynamically: set the ACTIVITY_PREDICTION_BASE_DIR
# environment variable to point at your own checkout. When the variable is
# unset (or empty) the original hard-coded location is used, so existing
# setups keep working unchanged.
base_dir = (os.environ.get('ACTIVITY_PREDICTION_BASE_DIR')
            or '/home/bac/activity_prediction/implementation/')
# Paths are built further down by plain string concatenation
# (data_dir + "..."), so the directory strings must end with a slash.
if not base_dir.endswith('/'):
    base_dir += '/'
data_dir = base_dir + 'data/'

### Load & prepare dataset

The following code needs to be adapted individually for each protein-ligand complex.

In [14]:
# Location of the DPP4 table inside the data directory; the rendered frame
# shows INDEX and NAME columns, per-interaction count columns and a final
# LABEL column (active / inactive).
dpp4_csv_path = data_dir + "DPP4/DPP4.csv"
rf_data_raw = pd.read_csv(dpp4_csv_path)

# Bare trailing expression so the notebook renders the loaded frame.
rf_data_raw

Unnamed: 0,INDEX,NAME,Pi-Cation_Interaction:HIS740A,Halogen_Bond:ASP709A,Halogen_Bond:VAL546A,Hydrogen_Bond:GLY741A,Water_Bridge:GLU204A,Water_Bridge:ARG125A,Halogen_Bond:ARG358A,Hydrophobic_Interaction:ALA743A,...,Hydrogen_Bond:ARG669A,Hydrogen_Bond:ASN710A,Hydrogen_Bond:GLU204A,Hydrophobic_Interaction:ARG125A,Halogen_Bond:SER630A,Water_Bridge:ASP739A,Salt_Bridge:ARG358A,Water_Bridge:GLU205A,Hydrophobic_Interaction:ASP739A,LABEL
0,1,CHEMBL386369|actives_final|sdf|444,0,0,0,0,0,0,0,0,...,2,1,0,0,0,0,1,0,0,active
1,2,ZINC38935877|decoys_final|sdf|121,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,inactive
2,3,ZINC63159848|decoys_final|sdf|138,0,0,0,0,0,3,0,0,...,0,1,0,0,0,0,0,0,0,inactive
3,4,ZINC23079060|decoys_final|sdf|264,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,inactive
4,5,CHEMBL290337|actives_final|sdf|331,0,0,0,0,0,3,0,0,...,1,1,0,0,0,0,0,0,0,active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1690,1691,ZINC49729498|decoys_final|sdf|645,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
1691,1692,ZINC43263233|decoys_final|sdf|584,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
1692,1693,ZINC36962060|decoys_final|sdf|615,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
1693,1694,CHEMBL564854|actives_final|sdf|55,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,active


In [15]:
# Integer encoding for the two values of the LABEL column.
lookup = {'inactive': 0, 'active': 1}

# Positional slices of the raw frame: the last column carries the labels,
# the first two columns are skipped, and everything in between is used as
# the feature matrix.
label_column = rf_data_raw.iloc[:, -1]
feature_frame = rf_data_raw.iloc[:, 2:-1]

# Bundle features, encoded labels and their names in a single dict.
# A label that is missing from `lookup` raises a KeyError here.
rf_data = {
    'data': np.array(feature_frame),
    'target': np.array([lookup[label] for label in label_column]),
    'feature_names': feature_frame.columns,
    'target_names': ['inactive', 'active'],
}


Split into training and test sets

In [16]:
# Hold out 30% of the samples for the final evaluation; the fixed
# random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    rf_data['data'],
    rf_data['target'],
    random_state=4232,
    test_size=0.3,
)

### Apply Random Forest

In [17]:
# Candidate tree depths explored by the grid search: 3, 6, ..., 27.
parameters = {'max_depth': list(range(3, 28, 3))}

# 20-fold cross-validated grid search over max_depth, using all available
# cores (n_jobs=-1). Each candidate is a forest of 1001 trees.
# random_state seeds the forest's bootstrap sampling and per-split feature
# selection so that re-running the notebook reproduces the same scores; the
# value is the seed already used for the train/test split.
# NOTE: outputs recorded from an earlier, unseeded run may differ slightly
# from what a re-run of this cell produces.
rf_models = GridSearchCV(
    RandomForestClassifier(n_estimators=1001, oob_score=True, random_state=4232),
    parameters,
    cv=20,
    n_jobs=-1,
)


Fit the grid search: one cross-validated random forest per candidate `max_depth`.

In [18]:
# Run the grid search on the training split only; the test split is not
# touched until the final evaluation.
rf_models.fit(X=X_train, y=y_train)

In [19]:
# Tabulate the grid-search results: one row per max_depth candidate with
# timings, per-split test scores, mean/std test score and rank (see the
# rendered table).
pd.DataFrame(rf_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,4.955121,0.369739,0.1206,0.030125,3,{'max_depth': 3},0.733333,0.816667,0.766667,0.75,...,0.779661,0.610169,0.677966,0.711864,0.745763,0.762712,0.694915,0.720056,0.060086,9
1,5.497189,0.392893,0.101229,0.023624,6,{'max_depth': 6},0.7,0.783333,0.833333,0.766667,...,0.813559,0.694915,0.677966,0.745763,0.79661,0.779661,0.728814,0.752175,0.061217,8
2,5.479358,0.431871,0.105912,0.024401,9,{'max_depth': 9},0.75,0.783333,0.883333,0.8,...,0.847458,0.728814,0.728814,0.779661,0.847458,0.79661,0.711864,0.77572,0.062441,7
3,5.344108,0.532012,0.115316,0.039572,12,{'max_depth': 12},0.75,0.8,0.866667,0.85,...,0.864407,0.745763,0.762712,0.830508,0.864407,0.813559,0.779661,0.797641,0.057488,6
4,5.576003,0.417987,0.123433,0.031929,15,{'max_depth': 15},0.75,0.816667,0.833333,0.833333,...,0.881356,0.745763,0.762712,0.830508,0.847458,0.813559,0.762712,0.799364,0.051897,5
5,6.093907,0.568683,0.117491,0.0223,18,{'max_depth': 18},0.75,0.8,0.883333,0.833333,...,0.881356,0.745763,0.762712,0.830508,0.847458,0.830508,0.79661,0.806907,0.052463,4
6,6.091084,0.463585,0.117769,0.01955,21,{'max_depth': 21},0.733333,0.816667,0.883333,0.85,...,0.864407,0.745763,0.762712,0.847458,0.847458,0.830508,0.813559,0.807726,0.053283,3
7,6.073078,0.47194,0.121388,0.017002,24,{'max_depth': 24},0.733333,0.833333,0.866667,0.866667,...,0.847458,0.728814,0.762712,0.864407,0.847458,0.830508,0.813559,0.811102,0.050279,1
8,5.289572,0.51131,0.076395,0.026611,27,{'max_depth': 27},0.75,0.816667,0.85,0.85,...,0.881356,0.762712,0.762712,0.830508,0.847458,0.830508,0.813559,0.810311,0.049662,2


In [20]:
# Best mean cross-validated score among the candidates; it corresponds to the
# mean_test_score of the rank-1 row in the results table.
rf_models.best_score_

0.8111016949152543

In [21]:
# Parameter setting that achieved the best mean cross-validated score.
rf_models.best_params_

{'max_depth': 24}

In [22]:
# Accuracy of the refit best model on the held-out test split. With the
# default scoring and refit settings used when the search was constructed,
# the search object's own score() delegates to best_estimator_.score.
# NOTE(review): the recorded output (0.99915...) equals 1185/1186, i.e. it
# looks like it was computed on 1186 samples, whereas a 30% hold-out of the
# 1694 rows should contain roughly 509. Together with the large gap to the
# cross-validated score (~0.81) this suggests stale kernel state or scoring
# on training-sized data - re-run from a fresh kernel and confirm.
rf_models.score(X_test, y_test)

0.9991568296795953