## Random Forest

## Install and import necessary packages

In [24]:
# You may need to install xgboost (it's not part of the sklearn package)
# !conda install xgboost 

In [25]:
# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

np.random.seed(1)

## Load data 

In [26]:
df = pd.read_csv('data/UniversalBank.csv')
df.head(5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


## Explore the dataset

In [27]:
# Explore the dataset
# read the first row of the dataset 
print(df.head())
print(df.columns)
print(df.describe())
print(df.info())

   ID  Age  Experience  Income  ZIP Code  Family  CCAvg  Education  Mortgage  \
0   1   25           1      49     91107       4    1.6          1         0   
1   2   45          19      34     90089       3    1.5          1         0   
2   3   39          15      11     94720       1    1.0          1         0   
3   4   35           9     100     94112       1    2.7          2         0   
4   5   35           8      45     91330       4    1.0          2         0   

   Personal Loan  Securities Account  CD Account  Online  CreditCard  
0              0                   1           0       0           0  
1              0                   1           0       0           0  
2              0                   0           0       0           0  
3              0                   0           0       0           0  
4              0                   0           0       0           1  
Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education'

## Clean/transform data (where necessary)

In [28]:
# based on findings from data exploration, we need to clean up colum names, as there are some leading whitespace characters
df.columns = [s.strip() for s in df.columns] 
df.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

Drop the columns we are not using as predictors (see previous notebooks -- we are given a subset of input variables to consider)

In [29]:
df = df.drop(columns=['ID', 'ZIP Code'])

In [30]:
# translation education categories into dummy vars
df = df.join(pd.get_dummies(df['Education'], prefix='Edu', drop_first=True))
df.drop('Education', axis=1, inplace = True)

df.head(3)

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard,Edu_2,Edu_3
0,25,1,49,4,1.6,0,0,1,0,0,0,0,0
1,45,19,34,3,1.5,0,0,1,0,0,0,0,0
2,39,15,11,1,1.0,0,0,0,0,0,0,0,0


## Split data intro training and validation sets

In [31]:
# construct datasets for analysis
target = 'Personal Loan'
predictors = list(df.columns)
predictors.remove(target)
X = df[predictors]
y = df[target]

In [32]:
# create the training set and the test set 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=1)

In [33]:
X_train.to_csv('data/ub_train_X.csv', index=False)
y_train.to_csv('data/ub_train_y.csv', index=False)
X_test.to_csv('data/ub_test_X.csv', index=False)
y_test.to_csv('data/ub_test_y.csv', index=False)

In [34]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

## Prediction with RandomForest (using default parameters)

* n_estimators: The number of trees in the forsest
    - A deeper tree might increase the performance, but also the complexity and chances to overfit.
    - The value must be an integer greater than 0. Default is 100.  
* max_depth: The maximum depth per tree. 
    - Deeper trees might increase the performance, but also the complexity and chances to overfit.
    - The value must be an integer greater than 0. Default is None, which allows the tree to grow without constraint.

In [35]:
rforest = RandomForestClassifier()

In [36]:
_ = rforest.fit(X_train, y_train)

In [37]:
y_pred = rforest.predict(X_test)

In [38]:
model_preds = rforest.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Random_forest", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Random_forest,0.983333,0.984375,0.845638,0.909747


## Random forest with random search

In [39]:
score_measure = "recall"
rf = RandomForestClassifier()
kfolds = 5
param_grid = {
    'min_samples_split': [2,3,4,5,6,8,10],  
    'min_samples_leaf': [1, 2,3, 4],
    'max_leaf_nodes': np.arange(5, 250), 
    'max_features' : ['auto','sqrt'],
    'bootstrap' : [True , False],
    'max_depth': [10,20,30,40,50,60,70,None]
} 
  
grid = RandomizedSearchCV(rf, param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train, y_train)
print(f"The best {score_measure} score is {grid.best_score_}")
print(f"... with parameters: {grid.best_params_}")

bestRecallTree = grid.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END bootstrap=True, max_depth=60, max_features=sqrt, max_leaf_nodes=195, min_samples_leaf=4, min_samples_split=6;, score=0.989 total time=   0.2s
[CV 2/5] END bootstrap=True, max_depth=60, max_features=sqrt, max_leaf_nodes=195, min_samples_leaf=4, min_samples_split=6;, score=0.984 total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=60, max_features=sqrt, max_leaf_nodes=195, min_samples_leaf=4, min_samples_split=6;, score=0.983 total time=   0.2s
[CV 4/5] END bootstrap=True, max_depth=60, max_features=sqrt, max_leaf_nodes=195, min_samples_leaf=4, min_samples_split=6;, score=0.989 total time=   0.2s
[CV 5/5] END bootstrap=True, max_depth=60, max_features=sqrt, max_leaf_nodes=195, min_samples_leaf=4, min_samples_split=6;, score=0.989 total time=   0.2s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=sqrt, max_leaf_nodes=12, min_samples_leaf=3, min_samples_split=5;, score=0.976 total time=   0.1s
[CV 2/5] E

In [40]:
model_preds = bestRecallTree.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Random_Rforest", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Random_forest,0.983333,0.984375,0.845638,0.909747
0,Random_Rforest,0.981333,0.968992,0.838926,0.899281


### Features

In [42]:
np.round(grid.best_estimator_.feature_importances_,2)

array([0.03, 0.02, 0.33, 0.13, 0.18, 0.03, 0.  , 0.06, 0.  , 0.01, 0.11,
       0.1 ])

## RF with Grid

In [46]:
score_measure = "recall"
rf = RandomForestClassifier()
kfolds = 5
param_grid = {
    'min_samples_split': [2,3],  
    'min_samples_leaf': [1, 2],
    'max_leaf_nodes': np.arange(5, 8), 
    'max_features' : ['auto','sqrt'],
    'bootstrap' : [True , False],
    'max_depth': [10,None]
} 
  
grid = GridSearchCV(rf, param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train, y_train)
print(f"The best {score_measure} score is {grid.best_score_}")
print(f"... with parameters: {grid.best_params_}")

bestRecallTree = grid.best_estimator_

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5] END bootstrap=True, max_depth=10, max_features=auto, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=2;, score=0.936 total time=   0.1s
[CV 2/5] END bootstrap=True, max_depth=10, max_features=auto, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=2;, score=0.937 total time=   0.1s
[CV 3/5] END bootstrap=True, max_depth=10, max_features=auto, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=2;, score=0.937 total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=10, max_features=auto, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=2;, score=0.934 total time=   0.1s
[CV 5/5] END bootstrap=True, max_depth=10, max_features=auto, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=2;, score=0.943 total time=   0.1s
[CV 1/5] END bootstrap=True, max_depth=10, max_features=auto, max_leaf_nodes=5, min_samples_leaf=1, min_samples_split=3;, score=0.923 total time=   0.1s
[CV 2/5] END bootstr

In [47]:
model_preds = bestRecallTree.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Grid_Rforest", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Random_forest,0.983333,0.984375,0.845638,0.909747
0,Random_Rforest,0.981333,0.968992,0.838926,0.899281
0,Grid_Rforest,0.95,1.0,0.496644,0.663677


### Features

In [48]:
np.round(grid.best_estimator_.feature_importances_,2)

array([0.  , 0.  , 0.33, 0.12, 0.23, 0.04, 0.  , 0.09, 0.  , 0.  , 0.09,
       0.09])

In [49]:
performance.sort_values(by='Accuracy')

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Grid_Rforest,0.95,1.0,0.496644,0.663677
0,Random_Rforest,0.981333,0.968992,0.838926,0.899281
0,Random_forest,0.983333,0.984375,0.845638,0.909747
