## Random Forest Approach



### Setup

In [15]:
import pandas as pd
import imp
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

import os 
from pathlib import Path


In [16]:
# All locations are resolved relative to the directory the kernel was started in,
# so no absolute, machine-specific path is hard-coded.
base_dir = Path.cwd() / "implementation"
data_dir = base_dir.joinpath("data/source/")      # input tables
result_dir = base_dir.joinpath("data/results/")   # exported predictions

### Load & prepare dataset

The following code needs to be adapted individually for each protein-ligand complex.

In [17]:
# Read the pre-split COX1 train and test tables.
# NOTE(review): the file names suggest SMOTE-resampled data - confirm that the
# test split is meant to be resampled as well.
rf_data_raw_train = pd.read_csv(data_dir.joinpath("COX1/cox1_smote_train.csv"))
rf_data_raw_test = pd.read_csv(data_dir.joinpath("COX1/cox1_smote_test.csv"))



In [18]:

def _as_dataset(raw_table):
    """Split a raw table into a scikit-learn style dataset dictionary.

    The first two columns of ``raw_table`` are skipped, the last column is
    used as the target and every column in between is used as a feature.

    Parameters
    ----------
    raw_table : pandas.DataFrame
        Table as read from one of the source CSV files.

    Returns
    -------
    dict
        ``data`` (array of feature values), ``target`` (array of the last
        column), ``feature_names`` (column labels of the feature columns) and
        ``target_names``.
    """
    feature_columns = raw_table.iloc[:, 2:-1]
    return {
        'data': np.array(feature_columns),
        'target': np.array(raw_table.iloc[0:, -1]),
        'feature_names': feature_columns.columns,
        # presumably label 0 = inactive and 1 = active - verify against the data
        'target_names': ['inactive', 'active'],
    }


rf_data_train = _as_dataset(rf_data_raw_train)
rf_data_test = _as_dataset(rf_data_raw_test)


### Apply Random Forest

In [19]:
# Fixed seed for the forest's internal randomness, so that re-running the
# notebook from a fresh kernel reproduces the same cross-validation scores,
# the same selected depth and the same test score.
RF_RANDOM_STATE = 42

# Candidate tree depths to search over: 3, 6, ..., 27.
parameters = {'max_depth': list(range(3, 28, 3))}

rf_models = GridSearchCV(
    RandomForestClassifier(
        n_estimators=1001,
        oob_score=True,
        random_state=RF_RANDOM_STATE,
    ),
    parameters,
    cv=20,      # 20 cross-validation splits per candidate depth
    n_jobs=-1,  # run the fits in parallel on all available processors
)


Fit model with the provided parameters

In [20]:
# Run the grid search: every candidate depth is cross-validated on the training split.
rf_models.fit(X=rf_data_train["data"], y=rf_data_train["target"])

In [21]:
# Tabulate the cross-validation results (one row per candidate max_depth).
cv_results = pd.DataFrame(rf_models.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,3.187149,0.224939,0.07494,0.01464,3,{'max_depth': 3},0.702703,0.702703,0.702703,0.72973,...,0.722222,0.722222,0.722222,0.722222,0.722222,0.722222,0.722222,0.72042,0.007897,8
1,3.529264,0.205538,0.078937,0.010314,6,{'max_depth': 6},0.702703,0.702703,0.702703,0.72973,...,0.722222,0.722222,0.722222,0.722222,0.722222,0.722222,0.722222,0.72042,0.007897,8
2,3.858047,0.304966,0.100984,0.035628,9,{'max_depth': 9},0.702703,0.702703,0.702703,0.72973,...,0.722222,0.75,0.722222,0.722222,0.722222,0.722222,0.722222,0.721809,0.010199,7
3,4.386565,0.296092,0.101372,0.032625,12,{'max_depth': 12},0.72973,0.702703,0.702703,0.72973,...,0.75,0.75,0.75,0.694444,0.75,0.666667,0.694444,0.727327,0.02531,5
4,5.112981,0.466862,0.111621,0.026693,15,{'max_depth': 15},0.72973,0.702703,0.702703,0.675676,...,0.777778,0.75,0.75,0.638889,0.777778,0.638889,0.694444,0.725938,0.041944,6
5,5.767569,0.448735,0.114115,0.036069,18,{'max_depth': 18},0.702703,0.702703,0.702703,0.675676,...,0.833333,0.75,0.75,0.638889,0.75,0.666667,0.694444,0.732845,0.049722,4
6,5.071594,0.381003,0.108791,0.013737,21,{'max_depth': 21},0.72973,0.72973,0.702703,0.675676,...,0.805556,0.722222,0.75,0.638889,0.75,0.694444,0.694444,0.738326,0.047076,1
7,5.121971,0.338031,0.10421,0.021507,24,{'max_depth': 24},0.756757,0.72973,0.702703,0.675676,...,0.777778,0.694444,0.75,0.638889,0.75,0.694444,0.694444,0.736899,0.047326,2
8,4.513702,0.52164,0.062947,0.019609,27,{'max_depth': 27},0.756757,0.756757,0.702703,0.675676,...,0.805556,0.722222,0.722222,0.638889,0.75,0.666667,0.694444,0.736899,0.049013,2


In [22]:
# Parameter setting that achieved the best mean cross-validation score.
rf_models.best_params_

{'max_depth': 21}

### Score

In [23]:
# Evaluate the best estimator found by the grid search on the test split.
best_forest = rf_models.best_estimator_
test_features, test_labels = rf_data_test["data"], rf_data_test["target"]

prediction = best_forest.predict(test_features)
best_forest.score(test_features, test_labels)  # mean accuracy on the test split

0.7628205128205128

### Prepare Data for evaluation

In [24]:
# Rebuild a table with the column layout of the source file (all columns except
# the last one), then append the true labels and the model predictions.
test_features = np.asarray(rf_data_test["data"])
n_test_rows = len(test_features)
row_numbers = np.arange(n_test_rows)

# The first two columns of the source layout are filled with the 0-based row
# number. Stacking them with the feature matrix builds the whole frame in one
# step instead of growing it row by row with .loc.
result_df = pd.DataFrame(
    np.column_stack([row_numbers, row_numbers, test_features]),
    columns=rf_data_raw_train.columns[:-1],
    index=pd.RangeIndex(1, n_test_rows + 1),  # rows stay labelled 1..n
)

result_df["LABEL"] = rf_data_test["target"]
result_df["PRED"] = prediction

# Create the output folder if needed so the export does not fail on a fresh checkout.
result_path = result_dir / "COX1/fe_smote_rf.csv"
result_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(result_path, encoding="utf-8")