# 1. Lab random forests

# Loading libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor #regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler

import matplotlib.pyplot as plt
import dtreeviz
import graphviz
import graphviz.backend as be
from IPython.display import Image, display_svg, SVG
import warnings
warnings.filterwarnings( "ignore", module = "matplotlib\..*" )

## Read & Load data

In [2]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
target = pd.read_csv('target.csv')

In [3]:
#concat
data = pd.concat([target, numerical, categorical], axis=1) 

In [4]:
data.head(1)

Unnamed: 0,TARGET_B,TARGET_D,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,0,0.0,0,60.0,5,9,0,0,39,34,...,37,12,92,8,94,2,95,12,89,11


In [5]:
data.isna().sum()

TARGET_B        0
TARGET_D        0
TCODE           0
AGE             0
INCOME          0
               ..
MAXRDATE_MM     0
LASTDATE_YR     0
LASTDATE_MM     0
FIRSTDATE_YR    0
FIRSTDATE_MM    0
Length: 339, dtype: int64

In [6]:
data['TARGET_B'].value_counts()

TARGET_B
0    90569
1     4843
Name: count, dtype: int64

## Instructions

**1.-** 

Apply the Random Forest algorithm to predict the TARGET_B. 

Please note that this column suffers from class imbalance. 

Fix the class imbalance using upsampling.

In [7]:
X = data.drop(['TARGET_B'], axis=1)
y = data['TARGET_B']

In [8]:
#class imbalance

ros = RandomOverSampler(sampling_strategy="not majority") 
X_res, y_res = ros.fit_resample(X, y)

In [9]:
X_res = X_res.select_dtypes(include='number') #df.select_dtypes(include=numerics)

In [10]:
#train_split

X = X_res
y = y_res

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

X_train.columns = X.columns
X_test.columns = X.columns

In [11]:
#RandomForestClassifier

randomforest = RandomForestClassifier(n_estimators=10)
randomforest.fit(X_train, y_train)

In [12]:
print("The R2 of the model in the TRAIN set is: {:.2f}".format(randomforest.score(X_train,y_train)))
print("The R2 of the model in the TEST  set is: {:.2f}".format(randomforest.score(X_test,y_test)))

The R2 of the model in the TRAIN set is: 1.00
The R2 of the model in the TEST  set is: 1.00


**2.-** 

Discuss the model predictions and it's impact in the bussiness scenario. 

Is the cost of a false positive equals to the cost of the false negative? 

How much the money the company will not earn because of missclassifications made by the model?

In [None]:
Donnor donates -- Target_B = 1
Donnor not donates -- Target_B = 0

In [None]:
Confusion matrix



**3.-** 

Sklearn classification models are trained to maximize the accuracy. 

However, another error metric will be more relevant here. Which one? 

Please checkout make_scorer alongside with GridSearchCV in order to train the model to maximize the error metric of interest in this case.

In [13]:
n_estimators = [int(x) for x in range(50,100,150)]

max_features = ['auto', 'sqrt']

max_depth = [3,5]

min_samples_split = [10, 20]

min_samples_leaf = [10, 20]

bootstrap = [True, False]

In [14]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(param_grid)

{'n_estimators': [50], 'max_features': ['auto', 'sqrt'], 'max_depth': [3, 5], 'min_samples_split': [10, 20], 'min_samples_leaf': [10, 20], 'bootstrap': [True, False]}


In [16]:
from sklearn.model_selection import GridSearchCV

rf_Grid = GridSearchCV(estimator = randomforest, param_grid = param_grid, cv = 5, verbose=20, n_jobs = 6)

In [None]:
rf_Grid.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [None]:
rf_Grid.best_params_

In [None]:
print (f'Train Accuracy - : {rf_Grid.score(X_train,y_train):.3f}')
print (f'Test Accuracy - : {rf_Grid.score(X_test,y_test):.3f}')