## Random Forest Approach



### Setup

In [1]:
import pandas as pd
import imp
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

import os 
from pathlib import Path


  import imp


In [2]:
# read dynamic path
base_dir = Path(os.getcwd())/"implementation"
data_dir = base_dir / "data/source/"
result_dir = base_dir / "data/results/"

### load & prepare dataset

the following code needs to be adapted for each protein-ligand complex individually

In [3]:
rf_data_raw_train = pd.read_csv(data_dir / "ACHE/ache_smote_train.csv")
rf_data_raw_test = pd.read_csv(data_dir / "ACHE/ache_smote_test.csv")



In [4]:

rf_data_train = {'data': np.array(rf_data_raw_train.iloc[:, 2:-1]),
             'target': np.array(rf_data_raw_train.iloc[0:,-1]),
             'feature_names': rf_data_raw_train.columns[2:-1],
             'target_names': ['inactive', 'active']}

rf_data_test = {'data': np.array(rf_data_raw_test.iloc[:, 2:-1]),
             'target': np.array(rf_data_raw_test.iloc[0:,-1]),
             'feature_names': rf_data_raw_test.columns[2:-1],
             'target_names': ['inactive', 'active']}


### Apply Random Forrest

In [5]:
parameters = {'max_depth': [i for i in range(3, 28,3)]}
rf_models = GridSearchCV(RandomForestClassifier(n_estimators=1001, oob_score=True), parameters, cv=20, n_jobs=-1)


Fit model with the provided parameters

In [6]:
rf_models.fit(rf_data_train["data"], rf_data_train["target"])

In [7]:
pd.DataFrame(rf_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,11.129931,0.684999,0.246124,0.059594,3,{'max_depth': 3},0.805556,0.694444,0.857143,0.857143,...,0.742857,0.8,0.857143,0.771429,0.628571,0.742857,0.742857,0.760714,0.064484,9
1,10.437465,0.8195,0.264168,0.059856,6,{'max_depth': 6},0.833333,0.805556,0.885714,0.771429,...,0.771429,0.8,0.885714,0.8,0.771429,0.8,0.8,0.801944,0.053949,8
2,12.674569,0.493132,0.309189,0.08611,9,{'max_depth': 9},0.833333,0.833333,0.914286,0.771429,...,0.771429,0.8,0.885714,0.828571,0.828571,0.828571,0.771429,0.823333,0.061073,7
3,13.559566,0.554487,0.307562,0.083342,12,{'max_depth': 12},0.833333,0.833333,0.914286,0.8,...,0.8,0.8,0.885714,0.8,0.714286,0.828571,0.8,0.82619,0.066981,6
4,15.35609,1.037551,0.427354,0.139605,15,{'max_depth': 15},0.833333,0.833333,0.914286,0.771429,...,0.8,0.771429,0.942857,0.828571,0.742857,0.828571,0.8,0.834762,0.072595,2
5,17.621574,0.958945,0.28462,0.051062,18,{'max_depth': 18},0.861111,0.861111,0.914286,0.771429,...,0.8,0.8,0.914286,0.828571,0.742857,0.8,0.8,0.834683,0.069892,3
6,16.756791,1.013024,0.393331,0.091875,21,{'max_depth': 21},0.833333,0.861111,0.914286,0.771429,...,0.771429,0.742857,0.942857,0.828571,0.714286,0.828571,0.8,0.833294,0.069628,4
7,10.958792,1.154002,0.173135,0.05537,24,{'max_depth': 24},0.833333,0.833333,0.914286,0.771429,...,0.8,0.828571,0.942857,0.8,0.742857,0.8,0.8,0.834762,0.07031,1
8,7.978935,0.777646,0.195319,0.067897,27,{'max_depth': 27},0.777778,0.833333,0.914286,0.771429,...,0.8,0.8,0.914286,0.8,0.771429,0.828571,0.8,0.831984,0.063536,5


In [8]:
rf_models.best_score_

0.834761904761905

In [9]:
rf_models.best_params_

{'max_depth': 24}

### Score

In [10]:
prediction = rf_models.best_estimator_.predict(rf_data_test["data"])
rf_models.best_estimator_.score(rf_data_test["data"],rf_data_test["target"])

0.8039867109634552

### Prepare Data for evaluation

In [11]:
result_df = pd.DataFrame(columns=rf_data_raw_train.columns[:-1])

for i, row in enumerate(rf_data_test["data"]):
    data = [i,i]
    data.extend(row)
    result_df.loc[i+1] = data

result_df["LABEL"] = rf_data_test["target"]
result_df["PRED"] = prediction

result_df.to_csv(result_dir/"ACHE/fe_smote_rf.csv",encoding="utf-8")