## Random Forest Approach



### Setup

In [1]:
import pandas as pd
import imp
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

import os 
from pathlib import Path


  import imp


In [2]:
# Derive all project directories from the current working directory
base_dir = Path.cwd().joinpath("implementation")
data_dir = base_dir.joinpath("data/source/")      # input data sets
result_dir = base_dir.joinpath("data/results/")   # model outputs

### Load & prepare dataset

The following code needs to be adapted individually for each protein-ligand complex.

In [3]:
# Read the interaction table for the DPP4 complex from the source directory
dpp4_csv_path = data_dir / "DPP4/DPP4.csv"
rf_data_raw = pd.read_csv(dpp4_csv_path)

# Show the loaded frame as the cell output
rf_data_raw

Unnamed: 0,INDEX,NAME,Pi-Cation_Interaction:HIS740A,Halogen_Bond:ASP709A,Halogen_Bond:VAL546A,Hydrogen_Bond:GLY741A,Water_Bridge:GLU204A,Water_Bridge:ARG125A,Halogen_Bond:ARG358A,Hydrophobic_Interaction:ALA743A,...,Hydrogen_Bond:ARG669A,Hydrogen_Bond:ASN710A,Hydrogen_Bond:GLU204A,Hydrophobic_Interaction:ARG125A,Halogen_Bond:SER630A,Water_Bridge:ASP739A,Salt_Bridge:ARG358A,Water_Bridge:GLU205A,Hydrophobic_Interaction:ASP739A,LABEL
0,1,CHEMBL386369|actives_final|sdf|444,0,0,0,0,0,0,0,0,...,2,1,0,0,0,0,1,0,0,active
1,2,ZINC38935877|decoys_final|sdf|121,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,inactive
2,3,ZINC63159848|decoys_final|sdf|138,0,0,0,0,0,3,0,0,...,0,1,0,0,0,0,0,0,0,inactive
3,4,ZINC23079060|decoys_final|sdf|264,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,inactive
4,5,CHEMBL290337|actives_final|sdf|331,0,0,0,0,0,3,0,0,...,1,1,0,0,0,0,0,0,0,active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1690,1691,ZINC49729498|decoys_final|sdf|645,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
1691,1692,ZINC43263233|decoys_final|sdf|584,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
1692,1693,ZINC36962060|decoys_final|sdf|615,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,inactive
1693,1694,CHEMBL564854|actives_final|sdf|55,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,active


In [4]:
# Numeric encoding of the class labels found in the last column
lookup = {'inactive': 0, 'active': 1}

# The first two columns and the last column are excluded from the features;
# the last column holds the label.
feature_frame = rf_data_raw.iloc[:, 2:-1]
label_column = rf_data_raw.iloc[0:, -1]

rf_data = {
    'data': np.array(feature_frame),
    'target': np.array([lookup[label] for label in label_column]),
    'feature_names': rf_data_raw.columns[2:-1],
    'target_names': ['inactive', 'active'],
}


Split the data into a training set and a test set

In [5]:
# Hold out 30 % of the samples for testing; the fixed seed keeps the split stable
features = rf_data['data']
labels = rf_data['target']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=4232,
)

### Apply Random Forest

In [6]:
# Hyper-parameter grid: candidate maximum tree depths 3, 6, ..., 27
parameters = {'max_depth': list(range(3, 28, 3))}

# 20-fold cross-validated grid search over a 1001-tree random forest.
# random_state pins the forest's bootstrap sampling and feature selection so
# that re-running the notebook reproduces the same scores; without it the
# reported CV scores, selected depth and test score change on every run.
# The seed value matches the one used for the train/test split.
rf_models = GridSearchCV(
    RandomForestClassifier(n_estimators=1001, oob_score=True, random_state=4232),
    parameters,
    cv=20,
    n_jobs=-1,  # use all available CPU cores
)


Fit the grid search over the provided parameter grid

In [7]:
# Run the cross-validated grid search on the training split only
rf_models.fit(X_train, y_train)

In [8]:
# Tabulate the grid-search results: one row per candidate max_depth with
# fit/score timings, per-split test scores, mean/std test score and rank
pd.DataFrame(rf_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,3.982717,0.486419,0.080367,0.02084,3,{'max_depth': 3},0.733333,0.8,0.8,0.75,...,0.779661,0.627119,0.677966,0.661017,0.779661,0.728814,0.728814,0.718347,0.060103,9
1,3.770069,0.18711,0.09319,0.021886,6,{'max_depth': 6},0.733333,0.783333,0.85,0.733333,...,0.830508,0.711864,0.677966,0.728814,0.79661,0.830508,0.711864,0.758941,0.06182,8
2,4.095029,0.174453,0.085982,0.023184,9,{'max_depth': 9},0.75,0.816667,0.85,0.766667,...,0.847458,0.711864,0.745763,0.762712,0.813559,0.779661,0.694915,0.774025,0.061708,7
3,4.367976,0.176267,0.09909,0.03453,12,{'max_depth': 12},0.766667,0.833333,0.833333,0.833333,...,0.881356,0.711864,0.745763,0.79661,0.864407,0.813559,0.745763,0.794251,0.05754,6
4,4.44747,0.152914,0.086543,0.01461,15,{'max_depth': 15},0.75,0.816667,0.85,0.866667,...,0.847458,0.728814,0.728814,0.79661,0.864407,0.813559,0.745763,0.796766,0.05494,5
5,5.879855,1.158913,0.224331,0.115399,18,{'max_depth': 18},0.75,0.833333,0.85,0.85,...,0.864407,0.762712,0.779661,0.847458,0.847458,0.813559,0.779661,0.80774,0.046376,2
6,8.591077,0.718871,0.152222,0.051264,21,{'max_depth': 21},0.75,0.8,0.866667,0.833333,...,0.898305,0.728814,0.779661,0.830508,0.847458,0.813559,0.813559,0.808602,0.054245,1
7,7.271932,0.422428,0.131779,0.029249,24,{'max_depth': 24},0.733333,0.816667,0.866667,0.85,...,0.864407,0.745763,0.779661,0.830508,0.847458,0.79661,0.813559,0.806907,0.048122,4
8,6.117839,0.500641,0.084206,0.033236,27,{'max_depth': 27},0.733333,0.816667,0.866667,0.866667,...,0.830508,0.762712,0.745763,0.847458,0.830508,0.830508,0.779661,0.807712,0.049119,3


In [9]:
# Mean cross-validated test score of the best-ranked parameter setting
rf_models.best_score_

0.8086016949152542

In [10]:
# Parameter setting that achieved the best mean cross-validated test score
rf_models.best_params_

{'max_depth': 21}

### Score

In [11]:
# Evaluate the best model found by the grid search on the held-out test split
best_rf = rf_models.best_estimator_

prediction = best_rf.predict(X_test)   # predicted class per test sample
best_rf.score(X_test, y_test)          # displayed as the cell output

0.7721021611001965

### Prepare Data for evaluation

In [12]:
# Assemble the test-set features together with the true and predicted labels
# and persist them for the downstream evaluation.
#
# Column layout (same as before): the first column (INDEX) holds the running
# position of the sample within the test split, followed by one column per
# feature, then LABEL (ground truth) and PRED (model prediction).
result_columns = rf_data_raw.columns[:-1].drop("NAME")

# Build the frame in one step instead of appending row by row with .loc,
# which re-allocates the frame on every insertion (quadratic in the row count).
result_df = pd.DataFrame(X_test, columns=result_columns[1:])
result_df.insert(0, result_columns[0], range(len(result_df)))

result_df["LABEL"] = y_test
result_df["PRED"] = prediction

# Make sure the target directory exists; to_csv does not create it.
output_path = result_dir / "DPP4/baseline_rf.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(output_path, encoding="utf-8")