## Random Forest Approach



### Setup

In [1]:
import pandas as pd
import imp
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

import os 
from pathlib import Path


  import imp


In [2]:
# Derive every project directory from the current working directory so the
# notebook contains no absolute, machine-specific paths.
base_dir = Path(os.getcwd()).joinpath("implementation")
# Directory the input CSV files are read from.
data_dir = base_dir.joinpath("data/source/")
# Directory the scores and predictions are written to.
result_dir = base_dir.joinpath("data/results/")

### Load & prepare dataset

The following cells are specific to one protein–ligand complex (here: ACHE) and must be adapted for each complex individually.

In [3]:
# Locate the ACHE train/test tables (presumably SMOTE-balanced, judging by
# the file names - TODO confirm) and load them as raw DataFrames.
train_csv = data_dir / "ACHE/ache_smote_train.csv"
test_csv = data_dir / "ACHE/ache_smote_test.csv"

rf_data_raw_train = pd.read_csv(train_csv)
rf_data_raw_test = pd.read_csv(test_csv)



In [4]:

def _to_dataset(raw_df):
    """Split a raw table into a scikit-learn style dataset dictionary.

    Column layout as implied by the slicing: the first two columns are
    skipped (presumably identifier columns - TODO confirm), the last column
    is the class label and every column in between is a feature.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Table as read from the train or test CSV.

    Returns
    -------
    dict
        ``data``          - 2-D array of the feature columns,
        ``target``        - 1-D array of the last column,
        ``feature_names`` - column labels belonging to ``data``,
        ``target_names``  - class names (assumes label 0 = inactive and
                            label 1 = active - TODO confirm).
    """
    features = raw_df.iloc[:, 2:-1]
    return {
        'data': np.array(features),
        'target': np.array(raw_df.iloc[:, -1]),
        'feature_names': features.columns,
        'target_names': ['inactive', 'active'],
    }


# Train and test share the same layout, so both go through the same helper
# instead of two copy-pasted dictionary literals.
rf_data_train = _to_dataset(rf_data_raw_train)
rf_data_test = _to_dataset(rf_data_raw_test)


### Apply Random Forest

In [5]:
# Hyper-parameter grid: maximum tree depth 3, 6, ..., 27.
parameters = {'max_depth': list(range(3, 28, 3))}

# A random forest is stochastic (bootstrap samples, random feature subsets).
# Without a fixed seed every "Restart & Run All" yields different scores and
# possibly a different best depth, so the forest is seeded explicitly.
RF_SEED = 42

# Grid search with 20-fold cross-validation over the depth grid; n_jobs=-1
# lets scikit-learn use all available processors.
rf_models = GridSearchCV(
    RandomForestClassifier(n_estimators=1001, oob_score=True, random_state=RF_SEED),
    parameters,
    cv=20,
    n_jobs=-1,
)


Fit the grid search on the training data; every `max_depth` candidate is evaluated with 20-fold cross-validation.

In [6]:
# Run the cross-validated grid search on the training features and labels.
rf_models.fit(
    X=rf_data_train["data"],
    y=rf_data_train["target"],
)

In [7]:
# Show the per-candidate timings and per-fold scores of the grid search.
cv_results = pd.DataFrame(rf_models.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,3.347633,0.267404,0.085347,0.028451,3,{'max_depth': 3},0.805556,0.75,0.857143,0.828571,...,0.742857,0.771429,0.857143,0.771429,0.657143,0.742857,0.742857,0.764921,0.057126,9
1,3.654907,0.256142,0.091614,0.012452,6,{'max_depth': 6},0.833333,0.805556,0.885714,0.771429,...,0.742857,0.8,0.885714,0.8,0.742857,0.828571,0.771429,0.79623,0.053852,8
2,4.010587,0.312557,0.10174,0.02912,9,{'max_depth': 9},0.861111,0.777778,0.885714,0.771429,...,0.8,0.8,0.885714,0.828571,0.771429,0.828571,0.771429,0.817659,0.058043,7
3,4.336065,0.321541,0.103555,0.025443,12,{'max_depth': 12},0.861111,0.805556,0.914286,0.8,...,0.8,0.771429,0.857143,0.8,0.714286,0.828571,0.8,0.823333,0.063655,6
4,4.584424,0.338807,0.118221,0.024331,15,{'max_depth': 15},0.861111,0.861111,0.914286,0.8,...,0.8,0.8,0.942857,0.8,0.714286,0.8,0.8,0.836111,0.070044,3
5,5.168833,0.283914,0.115393,0.031105,18,{'max_depth': 18},0.861111,0.861111,0.914286,0.771429,...,0.771429,0.828571,0.914286,0.8,0.742857,0.8,0.8,0.83754,0.070746,1
6,4.78907,0.40481,0.103357,0.017837,21,{'max_depth': 21},0.833333,0.833333,0.914286,0.771429,...,0.8,0.771429,0.942857,0.8,0.742857,0.8,0.8,0.834762,0.074263,4
7,4.676196,0.359755,0.111811,0.033757,24,{'max_depth': 24},0.777778,0.861111,0.914286,0.771429,...,0.8,0.771429,0.942857,0.8,0.742857,0.828571,0.8,0.83623,0.074784,2
8,4.434109,0.393427,0.066866,0.031837,27,{'max_depth': 27},0.777778,0.861111,0.914286,0.771429,...,0.8,0.771429,0.914286,0.8,0.742857,0.8,0.8,0.829087,0.069825,5


In [12]:
# Record this model's best mean cross-validation score in the ACHE
# validation-score table (one row per model: name, score).
val_csv = result_dir/"ACHE/val/ache_val.csv"
model_name = "fe_smote_rf"

df = pd.read_csv(val_csv, sep=",", index_col="Index")

# Re-running the cell must not append a duplicate row: if an earlier run
# already stored this model, its row is updated in place.
existing_labels = df.index[df["Name"] == model_name]
if len(existing_labels) > 0:
    row_label = existing_labels[0]
else:
    # len(df) is only guaranteed to be unused when the index is 0..n-1.
    # With any other index (e.g. 1-based) it would overwrite an existing
    # row, so step forward to the first label that is really free.
    row_label = len(df)
    while row_label in df.index:
        row_label += 1

df.loc[row_label] = [model_name, rf_models.best_score_]

# `index` expects a boolean; the header of the index column is given via
# `index_label`, so the file can be read back with index_col="Index".
df.to_csv(val_csv, sep=",", index=True, index_label="Index")

In [9]:
# Parameter setting from the grid that achieved the best cross-validation result.
rf_models.best_params_

{'max_depth': 18}

### Score

In [10]:
# Evaluate the best forest found by the grid search on the held-out test split.
best_rf = rf_models.best_estimator_
X_test = rf_data_test["data"]
y_test = rf_data_test["target"]

# Class predictions are kept for the result export further down.
prediction = best_rf.predict(X_test)

# Test-set score of the classifier (displayed as the cell output).
best_rf.score(X_test, y_test)

0.8006644518272426

### Prepare Data for evaluation

In [11]:
# Assemble the evaluation table in one step. Appending one row at a time
# with `.loc` copies the whole frame on every iteration (quadratic in the
# number of test samples) and leaves the resulting dtypes to pandas'
# version-dependent enlargement rules.
test_features = rf_data_test["data"]
n_samples = len(test_features)
all_columns = rf_data_raw_train.columns[:-1]  # every column except the label

result_df = pd.DataFrame(
    test_features,
    columns=all_columns[2:],
    index=range(1, n_samples + 1),  # 1-based row labels, as before
)

# The two leading columns are not part of the feature matrix; as before,
# both are filled with the 0-based position of the sample in the test set.
# They are stored as integers, while features keep the test matrix dtype.
row_numbers = np.arange(n_samples)
result_df.insert(0, all_columns[1], row_numbers)
result_df.insert(0, all_columns[0], row_numbers)

# Ground truth and model output side by side for the later evaluation.
result_df["LABEL"] = rf_data_test["target"]
result_df["PRED"] = prediction

result_df.to_csv(result_dir/"ACHE/fe_smote_rf.csv",encoding="utf-8")