In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# 0. Notebook description
In the previous notebook, we trained simple models on our dataset. We used f1 macro as our scoring metric, to account for the unbalanced class ratio in our multiclass problem (patients who returned to hospital within 30 days of a hospital encounter were far less frequent than patients who returned after 30 days, or not at all). The simple models with optimal hyperparameters only performed marginally better than a baseline estimator trained on one feature.
 
In this notebook, we create 3 ensemble models, perform hyperparameter tuning for each, and evaluate their performance on the validation set. 
- Voting classifier (we select the simple models with the best-performing hyperparameters from the previous notebook to use in a voting classifier)
- Adaboost classifier
- Random forest classifier

# 1. Load training and validation set

In [3]:
train = pd.read_csv("train_set.csv")
train

Unnamed: 0,most_frequent__race_AfricanAmerican,most_frequent__race_Asian,most_frequent__race_Caucasian,most_frequent__race_Hispanic,most_frequent__race_Other,none__payer_code_BC,none__payer_code_CH,none__payer_code_CM,none__payer_code_CP,none__payer_code_DM,...,numerical_values__time_in_hospital,numerical_values__num_lab_procedures,numerical_values__num_procedures,numerical_values__num_medications,numerical_values__number_outpatient,numerical_values__number_emergency,numerical_values__number_inpatient,numerical_values__number_diagnoses,index,readmitted
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.800245,-0.250665,1.573791,-0.614414,-0.295480,-0.204066,-0.504822,0.823550,15256,NO
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.884843,1.178358,-0.781626,0.506926,-0.295480,-0.204066,1.906044,0.823550,1354,NO
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.558878,0.974212,0.984937,-0.988194,-0.295480,-0.204066,-0.504822,0.823550,50947,<30
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,-1.526577,0.396083,1.379079,-0.295480,-0.204066,-0.504822,0.823550,81212,>30
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.137263,-0.301701,-0.781626,-1.112788,0.513414,-0.204066,-0.504822,-1.237903,39135,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63572,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.232914,0.157627,-0.781626,-0.365228,-0.295480,-0.204066,-0.504822,-1.237903,23828,>30
63573,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,-0.403774,-0.781626,-0.489821,-0.295480,-0.204066,-0.504822,-0.207176,39819,>30
63574,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.569931,1.535613,0.984937,1.628266,-0.295480,-0.204066,-0.504822,0.823550,21066,NO
63575,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.800245,1.331467,-0.781626,-0.116041,-0.295480,-0.204066,-0.504822,-0.722540,89048,NO


In [4]:
validation = pd.read_csv("validation_set.csv")
validation

Unnamed: 0,most_frequent__race_AfricanAmerican,most_frequent__race_Asian,most_frequent__race_Caucasian,most_frequent__race_Hispanic,most_frequent__race_Other,none__payer_code_BC,none__payer_code_CH,none__payer_code_CM,none__payer_code_CP,none__payer_code_DM,...,numerical_values__time_in_hospital,numerical_values__num_lab_procedures,numerical_values__num_procedures,numerical_values__num_medications,numerical_values__number_outpatient,numerical_values__number_emergency,numerical_values__number_inpatient,numerical_values__number_diagnoses,index,readmitted
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,-0.505847,-0.781626,-0.988194,-0.295480,-0.204066,-0.504822,0.823550,66543,<30
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,0.004518,-0.781626,-0.863601,-0.295480,-0.204066,0.298800,-0.722540,13478,NO
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1.137263,-0.505847,-0.781626,-1.860348,2.131202,0.824058,-0.504822,-0.722540,98787,NO
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.137263,-1.679687,-0.781626,-0.240634,-0.295480,-0.204066,-0.504822,-0.722540,67353,NO
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,0.412810,-0.781626,-0.240634,-0.295480,-0.204066,2.709666,-1.237903,34878,<30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15890,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,-0.148592,0.396083,0.382332,-0.295480,-0.204066,-0.504822,-2.268630,7170,NO
15891,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,1.280431,-0.192771,-0.365228,-0.295480,-0.204066,1.102422,0.823550,53317,>30
15892,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.126210,-0.097555,-0.781626,-1.361975,-0.295480,-0.204066,-0.504822,-1.753267,6302,>30
15893,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.463228,-0.709993,0.396083,-0.489821,-0.295480,-0.204066,-0.504822,-1.237903,79203,NO


In [5]:
X_train = train.drop("readmitted", axis=1)
y_train = train.readmitted
X_validation = validation.drop("readmitted", axis=1)
y_validation = validation.readmitted


We select the 25 k-best features that will be used for the different ensembles.

In [6]:
selector = SelectKBest(k=25)
selector.fit(X_train, train.readmitted)
best_features = selector.get_feature_names_out()

We also define a function that takes training data, a hyperparameter grid, and classifier, and performs a grid search to find the optimal hyperparameters for the ensemble. The function then returns the best hyperparameters.

In [7]:
def get_best_hyperparams(X_train, y_train, param_grid, clf):
    cv = 5

    # Select K Best Features
    selector = SelectKBest(k=25)
    selector.fit(X_train, y_train)

    # Get the selected feature names
    best_features = selector.get_feature_names_out()

    # Use the best features for training
    train_features = X_train[best_features]
    print(train_features.shape)

    # Perform GridSearchCV to find the best parameters
    gridsearch = GridSearchCV(clf, param_grid, cv=cv, scoring='f1_macro', verbose=3)
    gridsearch.fit(train_features, y_train)
    
    # Print the best parameters and the best score
    print("Best params: ", gridsearch.best_params_, "Best score:", gridsearch.best_score_)

# 2. Apply different ensembles

## 2.1 Voting Classifier

We tried various combinations for the voting classifier.

To get started, we first took the five best classifiers from our second notebook. As hard voting uses the majority vote, we can only use odd numbers of base learners.

In [35]:
# 5 base learners
log_clf = LogisticRegression(C=10.0, class_weight='balanced', penalty='l2')
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(alpha=0.1, binarize=0.0)
knn_clf = KNeighborsClassifier(algorithm='auto', n_neighbors=5, weights='uniform')
dt_clf = DecisionTreeClassifier(criterion='gini', max_depth=14)
# build the ensemble
voting_clf = VotingClassifier(
estimators=[ ('lr', log_clf),
('gaussian', gnb_clf),
('bernoulli', bernoulli_clf),
('knn', knn_clf),
('dt', dt_clf)],
voting='hard'
)

# train the ensemble
# base learners are cloned, and clones are trained
voting_clf.fit(X_train[best_features], y_train)

# Evaluate the ensemble
ensemble_pred = voting_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.26      0.11      0.15      1810
         >30       0.47      0.44      0.45      5681
          NO       0.63      0.74      0.68      8404

    accuracy                           0.56     15895
   macro avg       0.45      0.43      0.43     15895
weighted avg       0.53      0.56      0.54     15895



The f1 macro average score for five best performing classifiers is 0.43.

As a second step, we take the three best performing classifiers from the second notebook with hard voting. As hard voting uses the majority vote, we can only use odd numbers of base learners.

In [39]:
# 3 base learners with BernoulliNB as third base learner
log_clf = LogisticRegression(C=10.0, class_weight='balanced', penalty='l2')
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(alpha=0.1, binarize=0.0)
# build the ensemble
voting_clf = VotingClassifier(
estimators=[ ('lr', log_clf),
('gaussian', gnb_clf),
('bernoulli', bernoulli_clf)],
voting='hard'
)

# train the ensemble
# base learners are cloned, and clones are trained
voting_clf.fit(X_train[best_features], y_train)

# Evaluate the ensemble
ensemble_pred = voting_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.24      0.18      0.21      1810
         >30       0.47      0.41      0.44      5681
          NO       0.63      0.72      0.67      8404

    accuracy                           0.55     15895
   macro avg       0.45      0.44      0.44     15895
weighted avg       0.53      0.55      0.53     15895



The f1 macro average for three base learners with BernoulliNB as third base learner is 0.44 and is therefore slightly better than for five base learners.

In [37]:
# 3 base learners with KNeighborsClassifier as third base learner
log_clf = LogisticRegression(C=10.0, class_weight='balanced', penalty='l2')
gnb_clf = GaussianNB(var_smoothing=1e-05)
knn_clf = KNeighborsClassifier(algorithm='auto', n_neighbors=5, weights='uniform')
# build the ensemble
voting_clf = VotingClassifier(
estimators=[ ('lr', log_clf),
('gaussian', gnb_clf),
('knn', knn_clf)],
voting='hard'
)

# train the ensemble
# base learners are cloned, and clones are trained
voting_clf.fit(X_train[best_features], y_train)

# Evaluate the ensemble
ensemble_pred = voting_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.22      0.26      0.24      1810
         >30       0.48      0.36      0.41      5681
          NO       0.63      0.71      0.67      8404

    accuracy                           0.53     15895
   macro avg       0.44      0.44      0.44     15895
weighted avg       0.53      0.53      0.53     15895



The f1 macro average for three base learners with KNeighborsClassifier as third base learner is also 0.44.

In [38]:
# 3 base learners with DecisionTreeClassifier as third base learner
log_clf = LogisticRegression(C=10.0, class_weight='balanced', penalty='l2')
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(alpha=0.1, binarize=0.0)
dt_clf = DecisionTreeClassifier(criterion='gini', max_depth=14)
# build the ensemble
voting_clf = VotingClassifier(
estimators=[ ('lr', log_clf),
('gaussian', gnb_clf),
('bernoulli', bernoulli_clf),
('dt', dt_clf)],
voting='hard'
)

# train the ensemble
# base learners are cloned, and clones are trained
voting_clf.fit(X_train[best_features], y_train)

# Evaluate the ensemble
ensemble_pred = voting_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.29      0.14      0.19      1810
         >30       0.47      0.44      0.46      5681
          NO       0.63      0.72      0.67      8404

    accuracy                           0.56     15895
   macro avg       0.46      0.43      0.44     15895
weighted avg       0.53      0.56      0.54     15895



The f1 macro average for three base learners with DecisionTreeClassifier as third base learner is also 0.44.

Finally, we use four base learners that return probabilities in order to run the voting classifier with soft voting

In [8]:
# 4 base learners with soft voting
log_clf = LogisticRegression(C=10.0, class_weight='balanced', penalty='l2')
svc_clf = SVC(C=1.0, gamma=0.1, kernel='rbf', max_iter=1000, probability=True)
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(alpha=0.1, binarize=0.0)
# build the ensemble
voting_clf = VotingClassifier(
estimators=[ ('lr', log_clf),
('gaussian', gnb_clf),
('svc', svc_clf),
('bernoulli', bernoulli_clf)],
voting='soft'
)

# train the ensemble
# base learners are cloned, and clones are trained
voting_clf.fit(X_train[best_features], y_train)

# Evaluate the ensemble
ensemble_pred = voting_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.32      0.12      0.17      1810
         >30       0.47      0.40      0.43      5681
          NO       0.61      0.76      0.68      8404

    accuracy                           0.56     15895
   macro avg       0.47      0.43      0.43     15895
weighted avg       0.53      0.56      0.53     15895



The f1 macro average with soft voting is 0.43 and is therefore slightly worse than with hard voting.

## 2.2 Adaboost Classifier

Next, we applied the AdaBoost classifier. We used the Adaboost Classifier with various hyperparameters to find the best ones. After that we trained the AdaBoost classifier on the training data and evaluated it on the validation set.

In [9]:
param_grid = {
    'n_estimators': np.arange(50, 201, 50),
    'learning_rate': np.linspace(0.1, 1.0, 5),
    'algorithm': ['SAMME', 'SAMME.R']
}

get_best_hyperparams(X_train, y_train, param_grid, AdaBoostClassifier())

(63577, 25)
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=50;, score=0.329 total time=   2.7s
[CV 2/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=50;, score=0.329 total time=   2.7s
[CV 3/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=50;, score=0.331 total time=   2.9s
[CV 4/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=50;, score=0.330 total time=   3.8s
[CV 5/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=50;, score=0.331 total time=   4.8s
[CV 1/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=100;, score=0.329 total time=   9.6s
[CV 2/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=100;, score=0.334 total time=   9.9s
[CV 3/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=100;, score=0.331 total time=   9.8s
[CV 4/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=100;, score=0.330 total time=  11.0s
[CV 5/5] END algorithm=SAMME, learning_r



[CV 1/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=50;, score=0.352 total time=   3.2s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=50;, score=0.352 total time=   3.3s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=50;, score=0.351 total time=   3.2s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=50;, score=0.351 total time=   3.2s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=50;, score=0.350 total time=   3.2s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=100;, score=0.356 total time=   6.4s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=100;, score=0.359 total time=   6.4s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=100;, score=0.356 total time=   6.4s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=100;, score=0.354 total time=   6.4s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=100;, score=0.355 total time=   6.5s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=150;, score=0.362 total time=   9.5s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=150;, score=0.365 total time=  10.2s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=150;, score=0.363 total time=  10.1s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=150;, score=0.361 total time=   9.8s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=150;, score=0.364 total time=   9.7s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=200;, score=0.366 total time=  13.0s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=200;, score=0.372 total time=  13.1s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=200;, score=0.366 total time=  13.1s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=200;, score=0.366 total time=  13.0s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.1, n_estimators=200;, score=0.368 total time=  13.0s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=50;, score=0.365 total time=   3.2s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=50;, score=0.370 total time=   3.2s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=50;, score=0.366 total time=   3.3s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=50;, score=0.365 total time=   3.3s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=50;, score=0.367 total time=   3.2s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=100;, score=0.376 total time=   6.4s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=100;, score=0.379 total time=   6.4s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=100;, score=0.376 total time=   6.5s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=100;, score=0.377 total time=   6.5s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=100;, score=0.372 total time=   6.5s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=150;, score=0.379 total time=   9.6s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=150;, score=0.382 total time=   9.7s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=150;, score=0.379 total time=   9.8s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=150;, score=0.380 total time=   9.9s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=150;, score=0.378 total time=   9.8s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=200;, score=0.381 total time=  12.9s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=200;, score=0.385 total time=  13.3s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=200;, score=0.382 total time=  13.3s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=200;, score=0.380 total time=  13.4s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.325, n_estimators=200;, score=0.380 total time=  13.2s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=50;, score=0.378 total time=   3.2s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=50;, score=0.380 total time=   3.3s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=50;, score=0.378 total time=   3.2s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=50;, score=0.378 total time=   3.3s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=50;, score=0.376 total time=   3.2s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=100;, score=0.382 total time=   6.6s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=100;, score=0.384 total time=   6.4s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=100;, score=0.384 total time=   6.5s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=100;, score=0.381 total time=   6.5s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=100;, score=0.380 total time=   6.4s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=150;, score=0.385 total time=   9.6s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=150;, score=0.386 total time=   9.8s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=150;, score=0.385 total time=   9.9s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=150;, score=0.384 total time=   9.8s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=150;, score=0.382 total time=  10.0s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=200;, score=0.385 total time=  12.8s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=200;, score=0.386 total time=  12.9s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=200;, score=0.386 total time=  12.9s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=200;, score=0.383 total time=  12.9s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.55, n_estimators=200;, score=0.382 total time=  12.9s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=50;, score=0.387 total time=   3.2s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=50;, score=0.384 total time=   3.3s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=50;, score=0.383 total time=   3.5s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=50;, score=0.385 total time=   3.3s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=50;, score=0.384 total time=   3.3s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=100;, score=0.388 total time=   6.4s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=100;, score=0.389 total time=   6.5s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=100;, score=0.386 total time=   6.5s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=100;, score=0.386 total time=   6.5s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=100;, score=0.384 total time=   6.5s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=150;, score=0.387 total time=   9.6s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=150;, score=0.389 total time=   9.7s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=150;, score=0.385 total time=   9.7s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=150;, score=0.386 total time=  10.0s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=150;, score=0.385 total time=  10.0s




[CV 1/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=200;, score=0.388 total time=  13.0s




[CV 2/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=200;, score=0.388 total time=  12.9s




[CV 3/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=200;, score=0.387 total time=  13.2s




[CV 4/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=200;, score=0.384 total time=  13.4s




[CV 5/5] END algorithm=SAMME.R, learning_rate=0.775, n_estimators=200;, score=0.385 total time=  12.9s




[CV 1/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=50;, score=0.386 total time=   3.2s




[CV 2/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=50;, score=0.389 total time=   3.2s




[CV 3/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=50;, score=0.388 total time=   3.2s




[CV 4/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=50;, score=0.388 total time=   3.2s




[CV 5/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=50;, score=0.386 total time=   3.2s




[CV 1/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=100;, score=0.389 total time=   6.6s




[CV 2/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=100;, score=0.389 total time=   6.4s




[CV 3/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=100;, score=0.386 total time=   6.4s




[CV 4/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=100;, score=0.388 total time=   6.4s




[CV 5/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=100;, score=0.385 total time=   6.4s




[CV 1/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=150;, score=0.390 total time=   9.6s




[CV 2/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=150;, score=0.392 total time=   9.7s




[CV 3/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=150;, score=0.387 total time=   9.7s




[CV 4/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=150;, score=0.388 total time=   9.6s




[CV 5/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=150;, score=0.386 total time=  10.0s




[CV 1/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=200;, score=0.390 total time=  13.2s




[CV 2/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=200;, score=0.390 total time=  13.2s




[CV 3/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=200;, score=0.388 total time=  13.0s




[CV 4/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=200;, score=0.386 total time=  13.1s




[CV 5/5] END algorithm=SAMME.R, learning_rate=1.0, n_estimators=200;, score=0.386 total time=  13.0s




Best params:  {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 150} Best score: 0.3884299711754948


In [10]:
# Create a weak learner (base estimator), usually a decision tree stump
base_estimator = DecisionTreeClassifier(max_depth=1)
# Create an AdaBoost classifier
adaboost_clf = AdaBoostClassifier(algorithm='SAMME.R', learning_rate=1.0, n_estimators=150)
# Train the AdaBoost classifier on the training data
adaboost_clf.fit(X_train[best_features], y_train)
# Make predictions on the validation set
y_pred = adaboost_clf.predict(X_validation[best_features])
# Evaluate the f1_macro of the AdaBoost classifier
f1_macro = f1_score(y_validation, y_pred, average='macro')
print(f"f1 macro: {f1_macro:.2f}")



f1 macro: 0.39


## 2.3 Random Forest Classifier

Finally, we applied the Random Forest classifier. We used the Random Forest Classifier with various hyperparameters to find the best ones. After that we trained the Random Forest classifier on the training data and evaluated it on the validation set.

In [7]:
param_grid = {
    'n_estimators': [50, 100, 150],  
    'max_depth': [None, 10, 50, 100],
    'min_samples_split': [2, 5],  
    'min_samples_leaf': [1, 2],  
    'max_features': ['sqrt', 'log2'],  
    'bootstrap': [True]  
}

get_best_hyperparams(X_train, y_train, param_grid, RandomForestClassifier())

(63577, 25)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.388 total time=   3.6s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.391 total time=   3.4s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.395 total time=   3.7s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.395 total time=   3.3s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.393 total time=   3.2s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.383 total time=   6.7s
[

In [8]:
random_forest_clf = RandomForestClassifier(bootstrap=True, max_depth=50, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=50)
random_forest_clf.fit(X_train[best_features], y_train)
# Make predictions on the validation set
y_pred = random_forest_clf.predict(X_validation[best_features])
# Evaluate the f1_macro of the AdaBoost classifier
f1_macro = f1_score(y_validation, y_pred, average='macro')
print(f"f1 macro: {f1_macro:.2f}")

f1 macro: 0.39


## 2.4 Stacking

We tried various combinations for the stacking classifier.

To get started, we first took all of the estimators from our fourth notebook.

In [42]:
# 6 base learners
log_clf = LogisticRegression(C=10.0, class_weight='balanced', penalty='l2')
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(alpha=0.1, binarize=0.0)
knn_clf = KNeighborsClassifier(algorithm='auto', n_neighbors=5, weights='uniform')
dt_clf = DecisionTreeClassifier(criterion='gini', max_depth=14)
svc_clf = SVC(C=1.0, gamma=0.1, kernel='rbf', max_iter=1000)
# build the ensemble
stacking_clf = StackingClassifier(
estimators=[ ('lr', log_clf),
('gaussian', gnb_clf),
('bernoulli', bernoulli_clf),
('knn', knn_clf),
('dt', dt_clf),
('svc', svc_clf)],
cv=5
)

# train the ensemble
# base learners are cloned, and clones are trained
stacking_clf.fit(X_train[best_features], y_train)

# Evaluate the ensemble
ensemble_pred = stacking_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.47      0.03      0.06      1810
         >30       0.50      0.35      0.41      5681
          NO       0.60      0.84      0.70      8404

    accuracy                           0.57     15895
   macro avg       0.52      0.41      0.39     15895
weighted avg       0.55      0.57      0.52     15895



The f1 macro average for six base learners is 0.39.

Secondly, we took the five best estimators from our second notebook.

In [43]:
# 5 base learners
log_clf = LogisticRegression(C=10.0, class_weight='balanced', penalty='l2')
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(alpha=0.1, binarize=0.0)
knn_clf = KNeighborsClassifier(algorithm='auto', n_neighbors=5, weights='uniform')
dt_clf = DecisionTreeClassifier(criterion='gini', max_depth=14)
# build the ensemble
stacking_clf = StackingClassifier(
estimators=[ ('lr', log_clf),
('gaussian', gnb_clf),
('bernoulli', bernoulli_clf),
('knn', knn_clf),
('dt', dt_clf)],
cv=5
)

# train the ensemble
# base learners are cloned, and clones are trained
stacking_clf.fit(X_train[best_features], y_train)

# Evaluate the ensemble
ensemble_pred = stacking_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.48      0.03      0.06      1810
         >30       0.50      0.34      0.40      5681
          NO       0.60      0.85      0.70      8404

    accuracy                           0.57     15895
   macro avg       0.53      0.41      0.39     15895
weighted avg       0.55      0.57      0.52     15895



The f1 macro average for five base learners is 0.39 and is therefore no worse than for six base learners.

Then, we took the four best estimators from our second notebook.

In [44]:
# 4 base learners
log_clf = LogisticRegression(C=10.0, class_weight='balanced', penalty='l2')
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(alpha=0.1, binarize=0.0)
knn_clf = KNeighborsClassifier(algorithm='auto', n_neighbors=5, weights='uniform')
# build the ensemble
stacking_clf = StackingClassifier(
estimators=[ ('lr', log_clf),
('gaussian', gnb_clf),
('bernoulli', bernoulli_clf),
('knn', knn_clf)],
cv=5
)

# train the ensemble
# base learners are cloned, and clones are trained
stacking_clf.fit(X_train[best_features], y_train)

# Evaluate the ensemble
ensemble_pred = stacking_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.47      0.03      0.06      1810
         >30       0.50      0.34      0.41      5681
          NO       0.60      0.85      0.70      8404

    accuracy                           0.57     15895
   macro avg       0.52      0.41      0.39     15895
weighted avg       0.55      0.57      0.52     15895



The f1 macro average for four base learners is 0.39 and is therefore no worse than for five base learners.

Then, we took the three best estimators from our second notebook.

In [45]:
# 3 base learners
log_clf = LogisticRegression(C=10.0, class_weight='balanced', penalty='l2')
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(alpha=0.1, binarize=0.0)
# build the ensemble
stacking_clf = StackingClassifier(
estimators=[ ('lr', log_clf),
('gaussian', gnb_clf),
('bernoulli', bernoulli_clf)],
cv=5
)

# train the ensemble
# base learners are cloned, and clones are trained
stacking_clf.fit(X_train[best_features], y_train)

# Evaluate the ensemble
ensemble_pred = stacking_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.48      0.03      0.06      1810
         >30       0.50      0.33      0.40      5681
          NO       0.60      0.85      0.70      8404

    accuracy                           0.57     15895
   macro avg       0.52      0.41      0.39     15895
weighted avg       0.55      0.57      0.52     15895



The f1 macro average for three base learners is 0.39 and is therefore no worse than for four base learners.

# 4. Model selection
The experiments in this notebook found that the best performing models, in order, are as follows:

1. Voting Classifier with three base learners and hard voting
2. 
3. 
4. 

**Takeaways:**
