# Random Forest with Feature selection comparison

In [None]:

from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from pickle import dump
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the training data from CSV: the full feature matrix, the "_sk" and
# "_cs" feature subsets (used by the SelectKBest and correlation-based cells
# further down), and the encoded training labels -- read in that order.
X_train_scaled, X_train_scaled_sk, X_train_scaled_cs, encoded_y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "test_train_data/X_train_scaled.csv",
        "test_train_data/X_train_scaled_sk.csv",
        "test_train_data/X_train_scaled_cs.csv",
        "test_train_data/encoded_y_train.csv",
    )
)

In [None]:
# Load the held-out test data from CSV: the full feature matrix, the "_sk"
# and "_cs" feature subsets (used by the SelectKBest and correlation-based
# cells further down), and the encoded test labels -- read in that order.
X_test_scaled, X_test_scaled_sk, X_test_scaled_cs, encoded_y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "test_train_data/X_test_scaled.csv",
        "test_train_data/X_test_scaled_sk.csv",
        "test_train_data/X_test_scaled_cs.csv",
        "test_train_data/encoded_y_test.csv",
    )
)

In [None]:
# Random forest on ALL features (200 trees) -- the baseline for the
# feature-selection comparison.
#
# random_state is pinned so the reported accuracy is reproducible across
# kernel restarts; an unseeded forest is grown differently on every run,
# which makes small differences between feature sets impossible to interpret.
rf = RandomForestClassifier(n_estimators=200, random_state=42)

# The labels were loaded with pd.read_csv, i.e. as a DataFrame; np.ravel
# flattens them into the 1-D array passed as y.
# NOTE(review): assumes the label CSV holds a single column -- confirm.
rf.fit(X_train_scaled, np.ravel(encoded_y_train))

# Score on the held-out test set; as the last expression this is the
# cell's displayed output.
rf.score(X_test_scaled, np.ravel(encoded_y_test))

In [None]:
# Random forest on the SelectKBest feature subset (200 trees).
#
# random_state is pinned so the reported accuracy is reproducible across
# kernel restarts; an unseeded forest is grown differently on every run,
# which makes small differences between feature sets impossible to interpret.
rf_sk = RandomForestClassifier(n_estimators=200, random_state=42)

# The labels were loaded with pd.read_csv, i.e. as a DataFrame; np.ravel
# flattens them into the 1-D array passed as y.
# NOTE(review): assumes the label CSV holds a single column -- confirm.
rf_sk.fit(X_train_scaled_sk, np.ravel(encoded_y_train))

# Score on the held-out test set (same feature subset); as the last
# expression this is the cell's displayed output.
rf_sk.score(X_test_scaled_sk, np.ravel(encoded_y_test))

In [None]:
# Random forest on the correlation-based feature subset (200 trees).
#
# random_state is pinned so the reported accuracy is reproducible across
# kernel restarts; an unseeded forest is grown differently on every run,
# which makes small differences between feature sets impossible to interpret.
rf_cs = RandomForestClassifier(n_estimators=200, random_state=42)

# The labels were loaded with pd.read_csv, i.e. as a DataFrame; np.ravel
# flattens them into the 1-D array passed as y.
# NOTE(review): assumes the label CSV holds a single column -- confirm.
rf_cs.fit(X_train_scaled_cs, np.ravel(encoded_y_train))

# Score on the held-out test set (same feature subset); as the last
# expression this is the cell's displayed output.
rf_cs.score(X_test_scaled_cs, np.ravel(encoded_y_test))

In [None]:
# Inspect the hyperparameters of the all-features forest: get_params()
# returns them as a dict, which pprint lays out one entry per line.
from pprint import pprint

rf_params = rf.get_params()
pprint(rf_params)

In [None]:
# Adjust # of trees: same all-features data, but only 10 estimators.
#
# This experiment gets its own name instead of rebinding `rf`: `rf` is used
# further down as the grid-search base estimator and is pickled as
# 'randomforest_trained_all.pkl', so overwriting it here would silently
# replace the 200-tree baseline with this throwaway model.
# random_state is pinned so the score is reproducible between runs.
rf_10_trees = RandomForestClassifier(n_estimators=10, random_state=42)
rf_10_trees.fit(X_train_scaled, np.ravel(encoded_y_train))

# Score on the held-out test set; displayed as the cell output.
rf_10_trees.score(X_test_scaled, np.ravel(encoded_y_test))

In [None]:
# Adjust # of max features for node splitting: 10 trees, each split
# considering 'sqrt' of the available features.
#
# This experiment gets its own name instead of rebinding `rf`: `rf` is used
# further down as the grid-search base estimator and is pickled as
# 'randomforest_trained_all.pkl', so overwriting it here would silently
# replace the 200-tree baseline with this throwaway model.
# random_state is pinned so the score is reproducible between runs.
rf_10_trees_sqrt = RandomForestClassifier(
    n_estimators=10, max_features='sqrt', random_state=42
)
rf_10_trees_sqrt.fit(X_train_scaled, np.ravel(encoded_y_train))

# Score on the held-out test set; displayed as the cell output.
rf_10_trees_sqrt.score(X_test_scaled, np.ravel(encoded_y_test))

### Hyperparameter tuning: better to use a grid search (`GridSearchCV`) on the best feature selection — very time-consuming (~1 min runtime)

In [None]:
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid for the exhaustive search.
# NOTE(review): the values are said to come from the results of a random
# search, but no random search is visible in this notebook -- confirm.
# Size: 1 * 4 * 2 * 3 * 3 * 4 = 288 candidates, i.e. 864 fits at cv=3.
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)

# 3-fold cross-validated grid search over `param_grid`, using `rf` as the
# estimator; n_jobs=-1 asks for all available cores, verbose=3 logs progress.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Run the grid search on the all-features training set. The labels are
# flattened to 1-D first; the fitted search object is the cell's output.
y_train_flat = np.ravel(encoded_y_train)
grid.fit(X_train_scaled, y_train_flat)

In [None]:
# Show the hyperparameter combination that the grid search selected.
best_params = grid.best_params_
print(best_params)

In [None]:
# Predict labels for the all-features test set with the fitted grid search.
predictions = grid.predict(X=X_test_scaled)

In [None]:
# Score the fitted grid search on the held-out test set and report it
# to three decimals.
test_accuracy = grid.score(X_test_scaled, np.ravel(encoded_y_test))
print('Test Acc: %.3f' % test_accuracy)

In [None]:
# Persist the three fitted forests (all features, SelectKBest subset,
# correlation-based subset) as pickle files in the working directory.
#
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, instead of being left open until garbage
# collection.
#
# NOTE(review): `rf` is saved as whatever estimator is bound to that name
# when this cell runs; the tuned model found by the grid search
# (`grid.best_estimator_`) is not what gets saved here -- confirm that is
# the intent.
for model, pkl_path in (
    (rf, 'randomforest_trained_all.pkl'),
    (rf_sk, 'randomforest_trained_sk.pkl'),
    (rf_cs, 'randomforest_trained_cs.pkl'),
):
    with open(pkl_path, 'wb') as pkl_file:
        dump(model, pkl_file)