## Random Forest Approach



### Setup

In [13]:
import pandas as pd
import imp
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

import os 
from pathlib import Path


In [14]:
# read dynamic path
base_dir = Path(os.getcwd())/"implementation"
data_dir = base_dir / "data/source/"
result_dir = base_dir / "data/results/"

### load & prepare dataset

the following code needs to be adapted for each protein-ligand complex individually

In [15]:
rf_data_raw = pd.read_csv(data_dir / "ACHE/ache_pca.csv")

rf_data_raw

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,21,22,23,24,25,26,27,28,29,LABEL
0,0,2.775103,1.569092,-1.633093,0.509205,-0.365362,0.464463,-1.344698,0.340212,1.195709,...,-0.426203,0.157809,-0.384176,-0.775244,-0.265311,0.172972,-0.078754,0.175435,0.157472,active
1,1,1.160143,0.745342,3.626592,-0.482753,-0.501234,-0.702610,0.148980,1.191717,-0.315544,...,-0.197789,-0.733598,0.058455,0.356964,0.004727,0.491466,0.056198,-0.304857,0.442595,active
2,2,1.347487,-1.066134,0.930806,-0.326445,1.909936,0.292744,0.015491,0.427259,0.980201,...,-0.371311,-0.441166,-0.295813,0.288092,-0.258665,-0.158248,0.097048,-0.130219,-0.302125,active
3,3,4.217589,-0.574677,1.399565,1.361755,-1.065899,0.487160,0.041603,0.652388,0.670705,...,-0.035638,0.009457,0.003479,0.175791,0.480889,-0.397306,-0.118256,-0.429759,0.046488,active
4,4,2.269227,-0.097871,0.590080,0.325115,1.649822,0.056987,-0.778049,0.349009,0.760214,...,-0.450115,-0.645917,0.283596,-0.353558,-0.340660,0.206826,-0.132083,-0.185250,0.394408,active
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,998,-0.609818,-0.656158,-0.131409,0.554176,-0.437736,-0.673396,-0.254374,-0.307080,-0.299093,...,0.298212,0.244026,0.018224,0.040865,-0.106602,0.218396,-0.221769,0.094682,0.076756,active
999,999,-0.418320,-0.805650,-0.109816,-0.376300,-0.549992,-0.458199,0.583562,0.367584,0.043453,...,-0.381802,0.069187,-0.053126,0.357976,0.090901,0.410654,-0.183452,0.216238,-0.213667,active
1000,1000,-1.637342,-0.955758,0.168724,0.684614,-0.258857,-0.898800,0.085315,0.578703,0.398927,...,0.703583,0.167216,-0.588820,-0.750097,-0.244467,0.106357,-0.130821,0.226009,-0.169840,active
1001,1001,0.573375,-0.944395,-0.090789,-0.004957,-0.256222,-0.095932,0.025505,-0.122203,0.328594,...,0.046983,0.173622,-0.314616,-0.280141,-0.116199,0.507827,-0.063389,0.449591,-0.264470,inactive


In [16]:
lookup = {'inactive':0,'active':1}

rf_data = {'data': np.array(rf_data_raw.iloc[:, 2:-1]),
             'target': np.array([lookup[y] for y in rf_data_raw.iloc[0:,-1]]),
             'feature_names': rf_data_raw.columns[2:-1],
             'target_names': ['inactive', 'active']}


split into train- and test-set

In [17]:
X_train, X_test, y_train, y_test = train_test_split(rf_data['data'], rf_data['target'],
                                                    test_size=0.3, random_state=4232)

### Apply Random Forrest

In [18]:
parameters = {'max_depth': [i for i in range(3, 28,3)]}
rf_models = GridSearchCV(RandomForestClassifier(n_estimators=1001, oob_score=True), parameters, cv=20, n_jobs=-1)


Fit model with the provided parameters

In [19]:
rf_models.fit(X_train, y_train)

In [20]:
pd.DataFrame(rf_models.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,6.176049,0.545802,0.133498,0.044148,3,{'max_depth': 3},0.805556,0.722222,0.8,0.685714,...,0.742857,0.685714,0.828571,0.685714,0.628571,0.8,0.685714,0.737817,0.064433,9
1,7.446863,0.418767,0.104184,0.034882,6,{'max_depth': 6},0.833333,0.777778,0.828571,0.685714,...,0.742857,0.657143,0.914286,0.714286,0.657143,0.885714,0.685714,0.781984,0.077988,8
2,8.245101,0.627416,0.114942,0.0259,9,{'max_depth': 9},0.833333,0.777778,0.828571,0.685714,...,0.742857,0.714286,0.914286,0.714286,0.714286,0.828571,0.742857,0.784841,0.063031,7
3,8.76888,0.394039,0.108641,0.031858,12,{'max_depth': 12},0.888889,0.805556,0.828571,0.685714,...,0.742857,0.685714,0.914286,0.714286,0.742857,0.828571,0.771429,0.797579,0.069568,2
4,8.563093,0.526138,0.116421,0.040368,15,{'max_depth': 15},0.861111,0.805556,0.828571,0.685714,...,0.771429,0.714286,0.942857,0.714286,0.685714,0.857143,0.714286,0.797619,0.071278,1
5,8.072697,0.407387,0.105916,0.037675,18,{'max_depth': 18},0.833333,0.777778,0.828571,0.657143,...,0.742857,0.714286,0.914286,0.714286,0.714286,0.8,0.742857,0.789127,0.068533,6
6,8.020528,0.365695,0.101539,0.031905,21,{'max_depth': 21},0.861111,0.805556,0.828571,0.685714,...,0.742857,0.714286,0.914286,0.714286,0.685714,0.828571,0.714286,0.793333,0.071863,4
7,8.098024,0.295777,0.107752,0.040296,24,{'max_depth': 24},0.888889,0.75,0.828571,0.685714,...,0.742857,0.714286,0.914286,0.714286,0.657143,0.8,0.742857,0.790516,0.077864,5
8,7.634572,0.370951,0.071577,0.021183,27,{'max_depth': 27},0.833333,0.777778,0.828571,0.714286,...,0.742857,0.714286,0.914286,0.742857,0.685714,0.828571,0.771429,0.793413,0.065746,3


In [21]:
rf_models.best_score_

0.7976190476190476

In [22]:
rf_models.best_params_

{'max_depth': 15}

### Score

In [23]:
prediction = rf_models.best_estimator_.predict(X_test)
rf_models.best_estimator_.score(X_test,y_test)

0.7740863787375415

### Prepare Data for evaluation

In [24]:
result_df = pd.DataFrame(columns=rf_data_raw.columns[:-1])

for i, row in enumerate(X_test):
    data = [i,i]
    data.extend(row)
    result_df.loc[i+1] = data

result_df["LABEL"] = y_test
result_df["PRED"] = prediction

result_df.to_csv(result_dir/"ACHE/fe_pca_rf.csv",encoding="utf-8")