# **Random forests - Bank marketing campaign**

**Classification problem**

In [15]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Load the campaign data (semicolon-separated); 'y' is the yes/no target.
df = pd.read_csv('/content/bank_mark_campaign.csv', sep=';')

# Treat the literal string 'unknown' as a missing value.
df = df.replace('unknown', np.nan)

# Split the features into three groups, one per preprocessing branch:
#   col_nan - columns containing missing values (imputed, then one-hot encoded)
#   col_num - numeric columns as reported by describe() (standardised)
#   col_cat - every remaining non-target column (one-hot encoded)
# NOTE(review): a numeric column with missing values would land in both
# col_nan and col_num and be transformed twice - confirm none exists.
col_nan = df.columns[df.isna().any(axis=0)].to_list()
col_num = df.describe().columns.to_list()
col_cat = df.columns.difference(col_nan + col_num + ['y']).to_list()

na_treat = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('oneh', OneHotEncoder(drop='first'))])

preprocessor = ColumnTransformer([
    ('na_tr', na_treat, col_nan),
    ('cat_tr', OneHotEncoder(drop='first'), col_cat),
    ('scale_tr', StandardScaler(), col_num)],
    remainder='passthrough')

# Fixed seed so that re-running the cell grows the same forest and
# reproduces the printed metrics.
pipe = Pipeline([
    ('pre', preprocessor),
    ('rf', RandomForestClassifier(random_state=45))])

X = df.drop('y', axis=1)
y = df['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

pipe.fit(X_train, y_train)

# Training-set metrics; 'yes' is treated as the positive class.
y_pred = pipe.predict(X_train)

acur = accuracy_score(y_train, y_pred)
print(f'Accuracy= {acur}')
cm = confusion_matrix(y_train, y_pred, labels=['yes', 'no'])
print(cm)
recall = recall_score(y_train, y_pred, pos_label='yes')
print(f'Recall= {recall}')


Accuracy= 1.0
[[ 3700     0]
 [    0 29250]]
Recall= 1.0


In [16]:
# Score the pipeline on the held-out split: compute every metric first,
# then print them in the usual order (accuracy, confusion matrix, recall).
y_pred2 = pipe.predict(X_test)

acur = accuracy_score(y_test, y_pred2)
cm = confusion_matrix(y_test, y_pred2, labels=['yes', 'no'])
recall = recall_score(y_test, y_pred2, pos_label='yes')

print(f'Accuracy= {acur}')
print(cm)
print(f'Recall= {recall}')

Accuracy= 0.9100509832483612
[[ 429  511]
 [ 230 7068]]
Recall= 0.4563829787234043


The model is overfitting: its predictive performance on the two sets is very different. It fits the training set perfectly, noise included, but does not fit unseen data well, as the test-set results show.

In fact, the model is very bad. To be good, the recall should ideally be close to 1, and on the test set it is only about 0.46.

WHAT CAN BE HAPPENING?

Maybe the dataset is imbalanced. Let's see.


In [19]:
# Relative frequency of each target class (fractions, not counts).
y.value_counts(normalize=True)

no     0.887346
yes    0.112654
Name: y, dtype: float64

Yes, it is. One way to address this is to weight the classes (`class_weight='balanced'`), so that mistakes on the minority class cost more than mistakes on the majority class.

In [20]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Load the campaign data (semicolon-separated); 'y' is the yes/no target.
df = pd.read_csv('/content/bank_mark_campaign.csv', sep=';')

# Treat the literal string 'unknown' as a missing value.
df = df.replace('unknown', np.nan)

# Split the features into three groups, one per preprocessing branch:
#   col_nan - columns containing missing values (imputed, then one-hot encoded)
#   col_num - numeric columns as reported by describe() (standardised)
#   col_cat - every remaining non-target column (one-hot encoded)
# NOTE(review): a numeric column with missing values would land in both
# col_nan and col_num and be transformed twice - confirm none exists.
col_nan = df.columns[df.isna().any(axis=0)].to_list()
col_num = df.describe().columns.to_list()
col_cat = df.columns.difference(col_nan + col_num + ['y']).to_list()

na_treat = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('oneh', OneHotEncoder(drop='first'))])

preprocessor = ColumnTransformer([
    ('na_tr', na_treat, col_nan),
    ('cat_tr', OneHotEncoder(drop='first'), col_cat),
    ('scale_tr', StandardScaler(), col_num)],
    remainder='passthrough')

# class_weight='balanced' re-weights the classes to counter the imbalance;
# the fixed seed makes the forest (and the printed metrics) reproducible.
pipe = Pipeline([
    ('pre', preprocessor),
    ('rf', RandomForestClassifier(class_weight='balanced', random_state=45))])

X = df.drop('y', axis=1)
y = df['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

pipe.fit(X_train, y_train)

# Training-set metrics; 'yes' is treated as the positive class.
y_pred = pipe.predict(X_train)

acur = accuracy_score(y_train, y_pred)
print(f'Accuracy= {acur}')
cm = confusion_matrix(y_train, y_pred, labels=['yes', 'no'])
print(cm)
recall = recall_score(y_train, y_pred, pos_label='yes')
print(f'Recall= {recall}')


Accuracy= 0.9999696509863429
[[ 3699     1]
 [    0 29250]]
Recall= 0.9997297297297297


In [21]:
# Score the pipeline on the held-out split: compute every metric first,
# then print them in the usual order (accuracy, confusion matrix, recall).
y_pred2 = pipe.predict(X_test)

acur = accuracy_score(y_test, y_pred2)
cm = confusion_matrix(y_test, y_pred2, labels=['yes', 'no'])
recall = recall_score(y_test, y_pred2, pos_label='yes')

print(f'Accuracy= {acur}')
print(cm)
print(f'Recall= {recall}')

Accuracy= 0.9095654285020636
[[ 396  544]
 [ 201 7097]]
Recall= 0.42127659574468085


Doing the grid search:

In [22]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Load the campaign data (semicolon-separated); 'y' is the yes/no target.
df = pd.read_csv('/content/bank_mark_campaign.csv', sep=';')

# Treat the literal string 'unknown' as a missing value.
df = df.replace('unknown', np.nan)

# Split the features into three groups, one per preprocessing branch:
#   col_nan - columns containing missing values (imputed, then one-hot encoded)
#   col_num - numeric columns as reported by describe() (standardised)
#   col_cat - every remaining non-target column (one-hot encoded)
# NOTE(review): a numeric column with missing values would land in both
# col_nan and col_num and be transformed twice - confirm none exists.
col_nan = df.columns[df.isna().any(axis=0)].to_list()
col_num = df.describe().columns.to_list()
col_cat = df.columns.difference(col_nan + col_num + ['y']).to_list()

na_treat = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('oneh', OneHotEncoder(drop='first'))])

preprocessor = ColumnTransformer([
    ('na_tr', na_treat, col_nan),
    ('cat_tr', OneHotEncoder(drop='first'), col_cat),
    ('scale_tr', StandardScaler(), col_num)],
    remainder='passthrough')

# Candidate cost-complexity pruning strengths tried by the grid search.
hyper = {
    'ccp_alpha': [0.001, 0.01, 0.1, 0.2, 0.5]
}

# The forest is seeded so the search picks the same candidate on every run.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# classifier's default score (accuracy); with this class imbalance a
# recall-oriented scorer may match the goal of the analysis better.
# NOTE(review): 'pre' is fitted on the whole training split before the search
# runs, so the CV folds share imputation/scaling statistics.
pipe = Pipeline([
    ('pre', preprocessor),
    ('grid', GridSearchCV(RandomForestClassifier(class_weight="balanced", random_state=45), hyper, cv=5))])

X = df.drop('y', axis=1)
y = df['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

pipe.fit(X_train, y_train)

# Training-set metrics; 'yes' is treated as the positive class.
y_pred = pipe.predict(X_train)

acur = accuracy_score(y_train, y_pred)
print(f'Accuracy= {acur}')
cm = confusion_matrix(y_train, y_pred, labels=['yes', 'no'])
print(cm)
recall = recall_score(y_train, y_pred, pos_label='yes')
print(f'Recall= {recall}')


Accuracy= 0.8298330804248862
[[ 3526   174]
 [ 5433 23817]]
Recall= 0.952972972972973


In [23]:
# Score the pipeline on the held-out split: compute every metric first,
# then print them in the usual order (accuracy, confusion matrix, recall).
y_pred2 = pipe.predict(X_test)

acur = accuracy_score(y_test, y_pred2)
cm = confusion_matrix(y_test, y_pred2, labels=['yes', 'no'])
recall = recall_score(y_test, y_pred2, pos_label='yes')

print(f'Accuracy= {acur}')
print(cm)
print(f'Recall= {recall}')

Accuracy= 0.8241077931536781
[[ 884   56]
 [1393 5905]]
Recall= 0.9404255319148936


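Only now is the model good: the test recall (0.94) is close to the training recall (0.95), although accuracy drops to about 0.82 because more 'no' cases are predicted as 'yes'.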

# Regression trees and Random forests - Data car radios

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

import datetime

# Load the car-radio data set from the Excel workbook.
df = pd.read_excel('/content/data_carradios.xlsx')

def get_ages(col, ref_date=None):
  """Convert a frame of dates into elapsed whole years.

  Parameters
  ----------
  col : pandas.DataFrame
      Datetime column(s); the ColumnTransformer in this cell passes ['bdate'].
  ref_date : datetime-like, optional
      Moment the ages are measured against. Defaults to the current time,
      which makes the feature depend on when the cell is run; pass a fixed
      date for reproducible results.

  Returns
  -------
  pandas.DataFrame
      Same index and column names as ``col``, holding the number of whole
      years as floats.
  """
  if ref_date is None:
    ref_date = datetime.datetime.now()
  births = pd.DataFrame(col)
  # pandas >= 2.0 refuses to cast timedelta64[ns] to '<m8[Y]' (only s/ms/us/ns
  # are supported), so derive the years from the whole days elapsed instead.
  elapsed_days = (pd.Timestamp(ref_date) - births).apply(lambda s: s.dt.days)
  # 365.2425 days = mean length of a Gregorian year.
  return np.floor(elapsed_days / 365.2425)

# Age branch: dates -> elapsed years (get_ages) -> standardised.
ager = Pipeline(steps=[
    ('ages', FunctionTransformer(func=get_ages, feature_names_out='one-to-one')),
    ('scale', StandardScaler()),
])

def get_weekdays(col):
  """Return the weekday number (Monday=0 ... Sunday=6) of the first column
  of ``col`` as a one-column DataFrame."""
  first_column = col.iloc[:, 0]
  return pd.DataFrame(first_column.dt.weekday)

# Weekday branch: 'datep' -> weekday number -> one-hot encoded.
weeker = Pipeline([
    ('weekd', FunctionTransformer(get_weekdays, feature_names_out='one-to-one')),
    ('oneh', OneHotEncoder(drop='first'))
])

preprocessor = ColumnTransformer([
    ('ages_tr', ager, ['bdate']),
    ('weekd_tr', weeker, ['datep']),
    ('team_tr', OneHotEncoder(drop='first'), ['team']),
    ('scaler', StandardScaler(), ['prized', 'prizeq'])],
    remainder='passthrough')

# 'perc_defec' is the regression target; every other column is a feature.
X = df.drop('perc_defec', axis=1)
y = df['perc_defec']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Seeded so the fitted tree is reproducible across runs.
pipe = Pipeline([
    ('pre', preprocessor),
    ('tree', DecisionTreeRegressor(random_state=45))])

pipe.fit(X_train, y_train)

# Report MAE / RMSE / R2 for the training split first, then for the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    y_pred = pipe.predict(features)

    mae = mean_absolute_error(target, y_pred)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rsme = mean_squared_error(target, y_pred) ** 0.5
    r2 = r2_score(target, y_pred)

    print(f'MAE= {mae}')
    print(f'RSME= {rsme}')
    print(f'R2= {r2}')

MAE= 2.6502624234792984
RSME= 3.676162025277643
R2= 0.940571315604777
MAE= 3.904061677489177
RSME= 5.212007208008061
R2= 0.8846262266400601


R2 => training set => 0.94

R2 => test set => 0.88

The two values differ enough that we can suspect overfitting: the model does not behave on the test set (unseen data) the way it behaves on the training set.

We have to play with the complexity of the tree using parameter `ccp_alpha`.

We need to do a grid search, define several values for the hyperparameter in order to find the best value of the hyperparameter.

In [3]:
from sklearn.model_selection import GridSearchCV

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

import datetime

# Load the car-radio data set from the Excel workbook.
df = pd.read_excel('/content/data_carradios.xlsx')

def get_ages(col, ref_date=None):
  """Convert a frame of dates into elapsed whole years.

  Parameters
  ----------
  col : pandas.DataFrame
      Datetime column(s); the ColumnTransformer in this cell passes ['bdate'].
  ref_date : datetime-like, optional
      Moment the ages are measured against. Defaults to the current time,
      which makes the feature depend on when the cell is run; pass a fixed
      date for reproducible results.

  Returns
  -------
  pandas.DataFrame
      Same index and column names as ``col``, holding the number of whole
      years as floats.
  """
  if ref_date is None:
    ref_date = datetime.datetime.now()
  births = pd.DataFrame(col)
  # pandas >= 2.0 refuses to cast timedelta64[ns] to '<m8[Y]' (only s/ms/us/ns
  # are supported), so derive the years from the whole days elapsed instead.
  elapsed_days = (pd.Timestamp(ref_date) - births).apply(lambda s: s.dt.days)
  # 365.2425 days = mean length of a Gregorian year.
  return np.floor(elapsed_days / 365.2425)

# Age branch: dates -> elapsed years (get_ages) -> standardised.
ager = Pipeline(steps=[
    ('ages', FunctionTransformer(func=get_ages, feature_names_out='one-to-one')),
    ('scale', StandardScaler()),
])

def get_weekdays(col):
  """Return the weekday number (Monday=0 ... Sunday=6) of the first column
  of ``col`` as a one-column DataFrame."""
  first_column = col.iloc[:, 0]
  return pd.DataFrame(first_column.dt.weekday)

# Weekday branch: 'datep' -> weekday number -> one-hot encoded.
weeker = Pipeline([
    ('weekd', FunctionTransformer(get_weekdays, feature_names_out='one-to-one')),
    ('oneh', OneHotEncoder(drop='first'))
])

preprocessor = ColumnTransformer([
    ('ages_tr', ager, ['bdate']),
    ('weekd_tr', weeker, ['datep']),
    ('team_tr', OneHotEncoder(drop='first'), ['team']),
    ('scaler', StandardScaler(), ['prized', 'prizeq'])],
    remainder='passthrough')

# 'perc_defec' is the regression target; every other column is a feature.
X = df.drop('perc_defec', axis=1)
y = df['perc_defec']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Candidate cost-complexity pruning strengths tried by the grid search.
hyper = ({
    'ccp_alpha': [0.001, 0.003, 0.1, 0.3, 0.5]
})

# The tree is seeded so the search picks the same candidate on every run.
# NOTE(review): 'pre' is fitted on the whole training split before the search
# runs, so the CV folds share scaling statistics.
pipe = Pipeline([
    ('pre', preprocessor),
    ('grid', GridSearchCV(DecisionTreeRegressor(random_state=45), hyper, cv=5))])

pipe.fit(X_train, y_train)

# Report MAE / RMSE / R2 for the training split first, then for the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    y_pred = pipe.predict(features)

    mae = mean_absolute_error(target, y_pred)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rsme = mean_squared_error(target, y_pred) ** 0.5
    r2 = r2_score(target, y_pred)

    print(f'MAE= {mae}')
    print(f'RSME= {rsme}')
    print(f'R2= {r2}')

MAE= 3.2898610178578025
RSME= 4.271822451632366
R2= 0.9197521852026403
MAE= 3.6986164844013847
RSME= 4.921032250811609
R2= 0.8971487666569046


Overfitting no longer seems to be present, since the model's predictive performance is now similar on both sets.

There is a set of hyperparameters in the professor's note that we can use for the final assignment!!!

# **RANDOM FORESTS**

* they work for both classification and regression problems;

* they are especially good for classification;

* they combine multiple trees;

* they may be computationally heavy.

See tablet.


**RANDOM FORESTS - REGRESSION**

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
#from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import datetime

# Load the car-radio data set from the Excel workbook.
df = pd.read_excel('/content/data_carradios.xlsx')

def get_ages(col, ref_date=None):
  """Convert a frame of dates into elapsed whole years.

  Parameters
  ----------
  col : pandas.DataFrame
      Datetime column(s); the ColumnTransformer in this cell passes ['bdate'].
  ref_date : datetime-like, optional
      Moment the ages are measured against. Defaults to the current time,
      which makes the feature depend on when the cell is run; pass a fixed
      date for reproducible results.

  Returns
  -------
  pandas.DataFrame
      Same index and column names as ``col``, holding the number of whole
      years as floats.
  """
  if ref_date is None:
    ref_date = datetime.datetime.now()
  births = pd.DataFrame(col)
  # pandas >= 2.0 refuses to cast timedelta64[ns] to '<m8[Y]' (only s/ms/us/ns
  # are supported), so derive the years from the whole days elapsed instead.
  elapsed_days = (pd.Timestamp(ref_date) - births).apply(lambda s: s.dt.days)
  # 365.2425 days = mean length of a Gregorian year.
  return np.floor(elapsed_days / 365.2425)

# Age branch: dates -> elapsed years (get_ages) -> standardised.
ager = Pipeline(steps=[
    ('ages', FunctionTransformer(func=get_ages, feature_names_out='one-to-one')),
    ('scale', StandardScaler()),
])

def get_weekdays(col):
  """Return the weekday number (Monday=0 ... Sunday=6) of the first column
  of ``col`` as a one-column DataFrame."""
  first_column = col.iloc[:, 0]
  return pd.DataFrame(first_column.dt.weekday)

# Weekday branch: 'datep' -> weekday number -> one-hot encoded.
weeker = Pipeline([
    ('weekd', FunctionTransformer(get_weekdays, feature_names_out='one-to-one')),
    ('oneh', OneHotEncoder(drop='first'))
])

preprocessor = ColumnTransformer([
    ('ages_tr', ager, ['bdate']),
    ('weekd_tr', weeker, ['datep']),
    ('team_tr', OneHotEncoder(drop='first'), ['team']),
    ('scaler', StandardScaler(), ['prized', 'prizeq'])],
    remainder='passthrough')

# 'perc_defec' is the regression target; every other column is a feature.
X = df.drop('perc_defec', axis=1)
y = df['perc_defec']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Fixed seed so that re-running the cell grows the same forest and
# reproduces the printed metrics.
pipe = Pipeline([
    ('pre', preprocessor),
    ('rf', RandomForestRegressor(random_state=45))])

pipe.fit(X_train, y_train)

# Report MAE / RMSE / R2 for the training split first, then for the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    y_pred = pipe.predict(features)

    mae = mean_absolute_error(target, y_pred)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rsme = mean_squared_error(target, y_pred) ** 0.5
    r2 = r2_score(target, y_pred)

    print(f'MAE= {mae}')
    print(f'RSME= {rsme}')
    print(f'R2= {r2}')

MAE= 2.7736142523148026
RSME= 3.7237695107788875
R2= 0.939022107456038
MAE= 3.7842542658143907
RSME= 5.0773780396263115
R2= 0.8905095900145092


Again, we can be a little bit suspicious of overfitting because there is some difference between the R2 of both sets.

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
#from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import datetime

# Load the car-radio data set from the Excel workbook.
df = pd.read_excel('/content/data_carradios.xlsx')

def get_ages(col, ref_date=None):
  """Convert a frame of dates into elapsed whole years.

  Parameters
  ----------
  col : pandas.DataFrame
      Datetime column(s); the ColumnTransformer in this cell passes ['bdate'].
  ref_date : datetime-like, optional
      Moment the ages are measured against. Defaults to the current time,
      which makes the feature depend on when the cell is run; pass a fixed
      date for reproducible results.

  Returns
  -------
  pandas.DataFrame
      Same index and column names as ``col``, holding the number of whole
      years as floats.
  """
  if ref_date is None:
    ref_date = datetime.datetime.now()
  births = pd.DataFrame(col)
  # pandas >= 2.0 refuses to cast timedelta64[ns] to '<m8[Y]' (only s/ms/us/ns
  # are supported), so derive the years from the whole days elapsed instead.
  elapsed_days = (pd.Timestamp(ref_date) - births).apply(lambda s: s.dt.days)
  # 365.2425 days = mean length of a Gregorian year.
  return np.floor(elapsed_days / 365.2425)

# Age branch: dates -> elapsed years (get_ages) -> standardised.
ager = Pipeline(steps=[
    ('ages', FunctionTransformer(func=get_ages, feature_names_out='one-to-one')),
    ('scale', StandardScaler()),
])

def get_weekdays(col):
  """Return the weekday number (Monday=0 ... Sunday=6) of the first column
  of ``col`` as a one-column DataFrame."""
  first_column = col.iloc[:, 0]
  return pd.DataFrame(first_column.dt.weekday)

# Weekday branch: 'datep' -> weekday number -> one-hot encoded.
weeker = Pipeline([
    ('weekd', FunctionTransformer(get_weekdays, feature_names_out='one-to-one')),
    ('oneh', OneHotEncoder(drop='first'))
])

preprocessor = ColumnTransformer([
    ('ages_tr', ager, ['bdate']),
    ('weekd_tr', weeker, ['datep']),
    ('team_tr', OneHotEncoder(drop='first'), ['team']),
    ('scaler', StandardScaler(), ['prized', 'prizeq'])],
    remainder='passthrough')

# 'perc_defec' is the regression target; every other column is a feature.
X = df.drop('perc_defec', axis=1)
y = df['perc_defec']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Grid of pruning strengths and forest sizes; every combination is tried.
hyper = ({
    'ccp_alpha': [0.001, 0.003, 0.1, 0.3, 0.5],
    'n_estimators': [10, 50, 100, 150]
})

# The forest is seeded so the search picks the same candidate on every run.
# NOTE(review): GridSearchCV is not imported in this cell; it comes from the
# separate import cell, which must have been run first.
# NOTE(review): 'pre' is fitted on the whole training split before the search
# runs, so the CV folds share scaling statistics.
pipe = Pipeline([
    ('pre', preprocessor),
    ('grid', GridSearchCV(RandomForestRegressor(random_state=45), hyper, cv=5))])

pipe.fit(X_train, y_train)

# Report MAE / RMSE / R2 for the training split first, then for the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    y_pred = pipe.predict(features)

    mae = mean_absolute_error(target, y_pred)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rsme = mean_squared_error(target, y_pred) ** 0.5
    r2 = r2_score(target, y_pred)

    print(f'MAE= {mae}')
    print(f'RSME= {rsme}')
    print(f'R2= {r2}')

MAE= 3.280953030291423
RSME= 4.264505828858355
R2= 0.9200268408922576
MAE= 3.727294990382348
RSME= 4.93468758066848
R2= 0.8965771727033631


What is the best hyperparameter and the best number of trees?

In [14]:
# Hyperparameter combination selected by the grid search step of the fitted pipeline.
pipe.named_steps['grid'].best_params_

{'ccp_alpha': 0.5, 'n_estimators': 50}