IMPLEMENTATION

In [1]:
import numpy as np
import os
os.chdir("..")
data = np.loadtxt(('data/preprocessed-dataset.csv'), delimiter=',', skiprows=1)
os.chdir("methods")
x = data[:,1:35]
y = data[:,35]

In [2]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.2)

In [3]:
from sklearn.ensemble import RandomForestClassifier

rf_model=RandomForestClassifier()
rf_model.fit(x_train, y_train)

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score

y_predict=rf_model.predict(x_test)

print('Accuracy: {}'.format(accuracy_score(y_test, y_predict)))
print('Precision: {}'.format(precision_score(y_test, y_predict)))
print('Recall: {}'.format(recall_score(y_test, y_predict)))
print(confusion_matrix(y_test, y_predict))
print(roc_auc_score(y_test, y_predict))

Accuracy: 0.8892794376098418
Precision: 0.8832807570977917
Recall: 0.9150326797385621
[[226  37]
 [ 26 280]]
0.8871741345460871


OPTIMIZATION

In [5]:
import numpy as np
n_estimators=[int(x) for x in np.linspace(start=10, stop=100, num=10)]
max_features=['auto','sqrt']
max_depth=[2,4]
min_samples_split=[2,5]
min_samples_leaf=[1,2]
bootstrap=[True,False]

param_grid={'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap}


In [6]:
rf_model=RandomForestClassifier()
RandomForestClassifier().get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [7]:
from sklearn.model_selection import GridSearchCV
rf_grid=GridSearchCV(rf_model, param_grid=param_grid, cv=10, verbose=2, n_jobs=4)
rf_grid.fit(x_train, y_train)

Fitting 10 folds for each of 320 candidates, totalling 3200 fits


In [8]:
rf_grid.best_params_

{'bootstrap': True,
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 90}

TESTING OPTIMIZATION


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score
rf_model=RandomForestClassifier(bootstrap=True, max_depth=4, max_features='sqrt', min_samples_leaf=2, min_samples_split=2, n_estimators=20)
rf_model.fit(x_train, y_train)
y_predict=rf_model.predict(x_test)

print('Accuracy: {}'.format(accuracy_score(y_test, y_predict)))
print('Precision: {}'.format(precision_score(y_test, y_predict)))
print('Recall: {}'.format(recall_score(y_test, y_predict)))
print(confusion_matrix(y_test, y_predict))
print(roc_auc_score(y_test, y_predict))

Accuracy: 0.8734622144112478
Precision: 0.8726114649681529
Recall: 0.8954248366013072
[[223  40]
 [ 32 274]]
0.8716667909242278
