In [44]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

%run util.ipynb

# Random Forest (RF) classifier

In [45]:
# Reference (RandomForestClassifier API docs): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# Load Data

In [46]:
# Load the inputs X and the targets Y that the rest of the notebook splits and fits on.
# NOTE(review): get_data() is not defined in this notebook; presumably it is provided
# by util.ipynb, which is executed via %run in the import cell — TODO confirm.
X, Y = get_data()

In [47]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

# Fit the standardizer on the training rows only, then apply that same fitted
# transform to both splits so no test-set statistics reach the scaler.
# NOTE(review): tree ensembles are generally insensitive to feature scaling, so this
# step is likely optional for the random forest below — confirm before removing.
scalar = StandardScaler().fit(Xtrain)
Xtrain = scalar.transform(Xtrain)
Xtest = scalar.transform(Xtest)

In [48]:
# Baseline model: a random forest with library-default hyperparameters and a fixed seed.
# fit() returns the estimator itself, so construction and training can be chained.
clf = RandomForestClassifier(random_state=42).fit(Xtrain, Ytrain)

# Show the fitted estimator as the cell output.
clf

In [49]:
# Predict labels for the held-out rows with the baseline forest.
Ypred = clf.predict(Xtest)

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=Ytest, y_pred=Ypred)

0.8229166666666666

### Hyperparameter tuning

In [50]:
# Inspect the baseline forest's current hyperparameters to see which ones the
# grid search below can vary.
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [51]:
# Candidate values for the tree-shape hyperparameters.
# 5 depths x 3 leaf sizes x 5 split sizes = 75 combinations to evaluate.
param_grid_rf = dict(
    max_depth=[5, 10, 15, 20, 25],
    min_samples_leaf=range(1, 4),    # 1, 2, 3
    min_samples_split=range(3, 8),   # 3, 4, 5, 6, 7
)

# Exhaustive search over the grid, scored by accuracy with 10-fold cross-validation;
# n_jobs=-1 runs the fits in parallel on all available cores.
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

In [52]:
# Run the cross-validated grid search on the training split only; the test split
# is not touched here.
# NOTE(review): Xtrain was standardized with statistics from the whole training
# split before this call, so each validation fold shares scaling statistics with
# its training folds. Wrapping scaler + model in a Pipeline would avoid that.
grid_search_rf.fit(Xtrain, Ytrain)

In [53]:
# Best hyperparameter combination found by the search and its mean
# cross-validated accuracy (measured on the training split's folds, not on Xtest).
grid_search_rf.best_params_, grid_search_rf.best_score_

({'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5},
 np.float64(0.7938735177865612))

In [54]:
# Take the estimator that the grid search refit with the best parameters.
model = grid_search_rf.best_estimator_

# Score the tuned model on the same held-out rows used for the baseline, so the
# two accuracies are directly comparable. Note that Ypred is overwritten here.
Ypred = model.predict(Xtest)
accuracy_score(y_true=Ytest, y_pred=Ypred)

0.8125

In [55]:
# Per-class precision / recall / F1 for the tuned model's test predictions.
# classification_report returns a formatted string, so print() is used to render
# its line breaks.
print(classification_report(y_true=Ytest, y_pred=Ypred))

              precision    recall  f1-score   support

           0       0.81      0.84      0.82        50
           1       0.82      0.78      0.80        46

    accuracy                           0.81        96
   macro avg       0.81      0.81      0.81        96
weighted avg       0.81      0.81      0.81        96



In [56]:
### Check for overfitting: compare training accuracy with the test accuracy above

In [57]:
# Predict on the rows the model was trained on. GridSearchCV.predict delegates to
# the refit best estimator, so this is the tuned model's training-set output.
YpredTrain = grid_search_rf.predict(Xtrain)

# Training accuracy; a large gap between this and the test accuracy indicates
# that the model is overfitting the training data.
accuracy_score(y_true=Ytrain, y_pred=YpredTrain)

1.0