## Random Forest Approach



### Setup

In [14]:
import pandas as pd
import imp
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

import os 
from pathlib import Path


In [15]:
# Resolve all project directories relative to the current working directory.
base_dir = Path(os.getcwd()).joinpath("implementation")
data_dir = base_dir.joinpath("data/source/")      # source data location
result_dir = base_dir.joinpath("data/results/")   # result file location

### Load & prepare dataset

The following code needs to be adapted individually for each protein-ligand complex.

In [16]:
# Load the train and test CSV files of the DPP4 data set from the source directory.
rf_data_raw_train, rf_data_raw_test = (
    pd.read_csv(data_dir / csv_file)
    for csv_file in ("DPP4/dpp4_smote_train.csv", "DPP4/dpp4_smote_test.csv")
)



In [17]:

def _as_dataset(raw_frame):
    """Split a raw data frame into a dictionary of features and target.

    The first two columns are skipped, the last column is used as the target
    and every column in between is used as a feature.

    Parameters
    ----------
    raw_frame : pandas.DataFrame
        Frame as loaded from one of the source CSV files.

    Returns
    -------
    dict
        ``data``: array of the feature columns,
        ``target``: array of the last column,
        ``feature_names``: column labels of the feature columns,
        ``target_names``: the fixed list ``['inactive', 'active']``.
    """
    # NOTE(review): presumably the two skipped columns are identifier columns
    # and the target is encoded as 0 = inactive, 1 = active -- confirm against
    # the CSV layout.
    return {'data': np.array(raw_frame.iloc[:, 2:-1]),
            'target': np.array(raw_frame.iloc[:, -1]),
            'feature_names': raw_frame.columns[2:-1],
            'target_names': ['inactive', 'active']}


# Train and test frames go through the same helper so that their column
# handling cannot drift apart.
rf_data_train = _as_dataset(rf_data_raw_train)
rf_data_test = _as_dataset(rf_data_raw_test)


### Apply Random Forest

In [18]:
# Candidate tree depths for the grid search: 3, 6, ..., 27.
parameters = {'max_depth': list(range(3, 28, 3))}

# A fixed random_state makes the forest's randomness repeatable, so the
# cross-validation table and the selected depth can be reproduced after a
# kernel restart. Scores may differ slightly from earlier unseeded runs.
rf_models = GridSearchCV(
    RandomForestClassifier(n_estimators=1001, oob_score=True, random_state=42),
    parameters,
    cv=20,
    n_jobs=-1,
)


Fit the grid search on the training data using the parameter grid defined above.

In [19]:
# Run the grid search on the training features and labels.
rf_models.fit(X=rf_data_train["data"], y=rf_data_train["target"])

In [20]:
# Show the cross-validation results as a table, one row per parameter setting.
pd.DataFrame.from_dict(rf_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,4.213925,0.445349,0.094143,0.016424,3,{'max_depth': 3},0.7,0.8,0.783333,0.716667,...,0.813559,0.627119,0.694915,0.728814,0.779661,0.762712,0.728814,0.72346,0.056604,9
1,4.425104,0.323687,0.097636,0.02156,6,{'max_depth': 6},0.7,0.8,0.833333,0.733333,...,0.830508,0.711864,0.694915,0.745763,0.79661,0.830508,0.711864,0.75048,0.06611,8
2,4.630815,0.338745,0.096472,0.025061,9,{'max_depth': 9},0.75,0.783333,0.85,0.833333,...,0.830508,0.728814,0.728814,0.745763,0.830508,0.79661,0.728814,0.777415,0.055516,7
3,4.873995,0.31477,0.099314,0.023168,12,{'max_depth': 12},0.75,0.833333,0.85,0.833333,...,0.898305,0.728814,0.762712,0.813559,0.847458,0.813559,0.762712,0.798517,0.058483,6
4,5.391364,0.406927,0.115459,0.031881,15,{'max_depth': 15},0.75,0.816667,0.85,0.85,...,0.864407,0.762712,0.762712,0.813559,0.847458,0.813559,0.762712,0.802712,0.05062,5
5,5.626844,0.497399,0.112985,0.021122,18,{'max_depth': 18},0.75,0.816667,0.85,0.816667,...,0.864407,0.745763,0.762712,0.864407,0.847458,0.830508,0.79661,0.806088,0.047269,3
6,6.065153,0.613658,0.147716,0.04002,21,{'max_depth': 21},0.75,0.833333,0.9,0.866667,...,0.864407,0.745763,0.762712,0.847458,0.864407,0.830508,0.813559,0.815325,0.054545,1
7,7.098557,0.483734,0.151613,0.051448,24,{'max_depth': 24},0.766667,0.8,0.833333,0.85,...,0.864407,0.745763,0.779661,0.847458,0.830508,0.830508,0.813559,0.809463,0.047357,2
8,6.152998,0.688418,0.090304,0.02864,27,{'max_depth': 27},0.733333,0.8,0.883333,0.866667,...,0.847458,0.779661,0.779661,0.864407,0.830508,0.813559,0.79661,0.805226,0.049475,4


In [21]:
# Display the parameter setting selected by the grid search.
rf_models.best_params_

{'max_depth': 21}

### Score

In [22]:
# Use the estimator selected by the grid search for the held-out test data.
best_forest = rf_models.best_estimator_
prediction = best_forest.predict(X=rf_data_test["data"])

# Last expression of the cell: the estimator's score on the test set.
best_forest.score(X=rf_data_test["data"], y=rf_data_test["target"])

0.7662082514734774

### Prepare Data for evaluation

In [23]:
# Assemble the evaluation table in one step instead of appending row by row
# with `.loc`, which rebuilds the frame on every iteration.
test_features = np.asarray(rf_data_test["data"])
n_samples = len(test_features)
running_ids = np.arange(n_samples)

# The first two columns hold the 0-based row number as placeholder values
# (they are not taken from the test CSV); the remaining columns are the
# feature values. Row labels start at 1.
result_df = pd.DataFrame(
    np.column_stack([running_ids, running_ids, test_features]),
    columns=rf_data_raw_train.columns[:-1],
    index=pd.RangeIndex(1, n_samples + 1),
)

result_df["LABEL"] = rf_data_test["target"]
result_df["PRED"] = prediction

# Create the output folder on demand so the export cannot fail on a missing
# directory after the grid search has already been run.
result_path = result_dir/"DPP4/fe_smote_rf.csv"
result_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(result_path,encoding="utf-8")