In [66]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# 0. Notebook description
In the previous notebook, we trained simple models on our dataset. We used f1 macro as our scoring metric, to account for the unbalanced class ratio in our multiclass problem (patients who returned to hospital within 30 days of a hospital encounter were far less frequent than patients who returned after 30 days, or not at all). The simple models with optimal hyperparameters only performed marginally better than a baseline estimator trained on one feature.
 
In this notebook, we create four kinds of ensemble models, perform hyperparameter tuning where applicable, and evaluate their performance on the validation set. 
- Different versions of a voting classifier (we select the simple models with the best-performing hyperparameters from the previous notebook to use in a voting classifier)
- AdaBoost classifier
- Random forest classifier
- Different versions of a stacking classifier (built from the same tuned simple models)

# 1. Load training and validation set

In [67]:
# Read the training split from disk; the bare name on the last line renders it.
train = pd.read_csv(filepath_or_buffer="train_set_small.csv")
train

Unnamed: 0,index,most_frequent__race_AfricanAmerican,most_frequent__race_Asian,most_frequent__race_Caucasian,most_frequent__race_Hispanic,most_frequent__race_Other,none__payer_code_BC,none__payer_code_CH,none__payer_code_CM,none__payer_code_CP,...,numerical_values__time_in_hospital,numerical_values__num_lab_procedures,numerical_values__num_procedures,numerical_values__num_medications,numerical_values__number_outpatient,numerical_values__number_emergency,numerical_values__number_inpatient,numerical_values__number_diagnoses,index.1,readmitted
0,54336,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.137263,0.361773,-0.781626,-1.735755,-0.29548,-0.204066,-0.504822,-1.237903,67320,>30
1,12097,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.126210,0.719029,-0.781626,-0.365228,-0.29548,-0.204066,-0.504822,-1.237903,4817,NO
2,59811,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,0.770065,0.396083,0.008552,-0.29548,-0.204066,-0.504822,0.823550,79655,NO
3,27815,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.569931,2.301161,0.984937,1.254486,-0.29548,0.824058,-0.504822,0.823550,34440,>30
4,60487,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.547825,0.004518,2.751500,0.506926,-0.29548,-0.204066,-0.504822,0.823550,82921,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6352,34630,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,-1.373468,0.984937,-0.365228,-0.29548,-0.204066,-0.504822,0.823550,47427,>30
6353,4509,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.126210,1.484577,-0.781626,-0.489821,-0.29548,-0.204066,1.102422,0.308187,2251,<30
6354,6658,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,0.565919,-0.781626,-0.739008,-0.29548,-0.204066,-0.504822,-1.237903,29927,NO
6355,13247,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.800245,1.127321,-0.192771,-1.361975,-0.29548,-0.204066,-0.504822,0.308187,12890,NO


In [68]:
# Read the validation split from disk; the bare name on the last line renders it.
validation = pd.read_csv(filepath_or_buffer="validation_set.csv")
validation

Unnamed: 0,most_frequent__race_AfricanAmerican,most_frequent__race_Asian,most_frequent__race_Caucasian,most_frequent__race_Hispanic,most_frequent__race_Other,none__payer_code_BC,none__payer_code_CH,none__payer_code_CM,none__payer_code_CP,none__payer_code_DM,...,numerical_values__time_in_hospital,numerical_values__num_lab_procedures,numerical_values__num_procedures,numerical_values__num_medications,numerical_values__number_outpatient,numerical_values__number_emergency,numerical_values__number_inpatient,numerical_values__number_diagnoses,index,readmitted
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,-0.505847,-0.781626,-0.988194,-0.295480,-0.204066,-0.504822,0.823550,66543,<30
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,0.004518,-0.781626,-0.863601,-0.295480,-0.204066,0.298800,-0.722540,13478,NO
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-1.137263,-0.505847,-0.781626,-1.860348,2.131202,0.824058,-0.504822,-0.722540,98787,NO
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.137263,-1.679687,-0.781626,-0.240634,-0.295480,-0.204066,-0.504822,-0.722540,67353,NO
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,0.412810,-0.781626,-0.240634,-0.295480,-0.204066,2.709666,-1.237903,34878,<30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15890,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,-0.148592,0.396083,0.382332,-0.295480,-0.204066,-0.504822,-2.268630,7170,NO
15891,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.463228,1.280431,-0.192771,-0.365228,-0.295480,-0.204066,1.102422,0.823550,53317,>30
15892,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.126210,-0.097555,-0.781626,-1.361975,-0.295480,-0.204066,-0.504822,-1.753267,6302,>30
15893,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.463228,-0.709993,0.396083,-0.489821,-0.295480,-0.204066,-0.504822,-1.237903,79203,NO


In [69]:
# Bookkeeping columns that identify rows instead of describing patients.
# "Unnamed: 0" is the name pandas gives an unnamed CSV column (typically a
# saved index); "index" / "index.1" presumably come from earlier reset_index
# calls -- TODO confirm against the notebook that wrote the CSV files.
# Leaving them in would let SelectKBest and the models treat row ids as features.
ID_COLUMNS = ["Unnamed: 0", "index", "index.1"]

# Targets: the readmission label with classes "<30", ">30" and "NO".
y_train = train.readmitted
y_validation = validation.readmitted

# Features: everything except the label and the row identifiers.
# errors="ignore" because the two files need not carry the same identifier
# columns (the displayed validation frame shows no "index.1").
X_train = train.drop(columns="readmitted").drop(columns=ID_COLUMNS, errors="ignore")
X_validation = validation.drop(columns="readmitted").drop(columns=ID_COLUMNS, errors="ignore")


We select the 25 best features (using `SelectKBest` with `k=25`); these are used for all of the ensembles below.

In [70]:
# Score every column of X_train against the readmission label and keep the
# names of the 25 highest-scoring ones; all ensembles below train on these.
selector = SelectKBest(k=25).fit(X_train, y_train)
best_features = selector.get_feature_names_out()

We also define a function that takes training data, a hyperparameter grid, and a classifier, and performs a grid search to find the optimal hyperparameters for the ensemble. The function prints the best hyperparameters together with their cross-validated f1 macro score.

In [71]:
def get_best_hyperparams(X_train, y_train, param_grid, clf, k=25, cv=5,
                         scoring='f1_macro', verbose=3):
    """Grid-search ``clf`` on the ``k`` best features and return the best parameters.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must support selection by a list of column names.
    y_train : array-like
        Training labels aligned with ``X_train``.
    param_grid : dict or list of dict
        Hyperparameter grid passed unchanged to ``GridSearchCV``.
    clf : estimator
        Unfitted scikit-learn classifier to tune.
    k : int, default=25
        Number of features kept by ``SelectKBest``.
    cv : int, default=5
        Number of cross-validation folds.
    scoring : str, default='f1_macro'
        Metric optimised by the grid search.
    verbose : int, default=3
        Verbosity forwarded to ``GridSearchCV``; lower it to shorten the log.

    Returns
    -------
    dict
        ``best_params_`` of the fitted grid search.
    """
    # Rank the features against the label and keep the k highest-scoring ones.
    # NOTE(review): the selector sees the whole training set before it is split
    # into folds, so the CV scores may be slightly optimistic. Wrapping selector
    # and classifier in a Pipeline would avoid that, but it changes the
    # param_grid key names callers have to use.
    selector = SelectKBest(k=k)
    selector.fit(X_train, y_train)
    best_features = selector.get_feature_names_out()

    # Restrict the training data to the selected columns.
    train_features = X_train[best_features]
    print(train_features.shape)

    # Exhaustive search over param_grid, scored with the chosen metric.
    gridsearch = GridSearchCV(clf, param_grid, cv=cv, scoring=scoring, verbose=verbose)
    gridsearch.fit(train_features, y_train)

    # Report the winner, then hand it back so callers can reuse it directly.
    print("Best params: ", gridsearch.best_params_, "Best score:", gridsearch.best_score_)
    return gridsearch.best_params_

# 2. Apply different ensembles

In this section, we apply different methods of ensembles that we have been introduced to in class to find out which method performs best with the top 25 features.

## 2.1 Voting Classifier

We tried various combinations for the voting classifier.

To get started, we first took the five best classifiers from our second notebook. Hard voting predicts the class that receives the most votes, so we use an odd number of base learners to make ties less likely (with three classes, ties are still possible).

In [61]:
# Hard-voting ensemble over five base learners.
log_clf = LogisticRegression(penalty='l2', C=10.0, class_weight='balanced')
svc_clf = SVC(kernel='rbf', C=1.0, gamma=0.1, max_iter=1000)
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(binarize=0.0, alpha=0.1)
knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto')

# (name, estimator) pairs handed to the ensemble.
base_learners = [
    ('lr', log_clf),
    ('svc', svc_clf),
    ('gaussian', gnb_clf),
    ('bernoulli', bernoulli_clf),
    ('knn', knn_clf),
]

# Build and train in one step; fit() works on clones of the base learners
# and returns the fitted ensemble itself.
voting_clf = VotingClassifier(estimators=base_learners, voting='hard').fit(
    X_train[best_features], y_train
)

# Score the ensemble on the validation split (macro f1 plus per-class report).
ensemble_pred = voting_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.27      0.11      0.16      1810
         >30       0.45      0.49      0.47      5681
          NO       0.63      0.67      0.65      8404

    accuracy                           0.54     15895
   macro avg       0.45      0.42      0.43     15895
weighted avg       0.52      0.54      0.53     15895



The f1 macro average score for the five best-performing classifiers is 0.43.

As a second step, we use hard voting over the three best-performing classifiers from the second notebook, again keeping the number of base learners odd to make ties less likely.

In [62]:
# Hard-voting ensemble over three base learners.
log_clf = LogisticRegression(penalty='l2', C=10.0, class_weight='balanced')
svc_clf = SVC(kernel='rbf', C=1.0, gamma=0.1, max_iter=1000)
gnb_clf = GaussianNB(var_smoothing=1e-05)

# Assemble the ensemble from (name, estimator) pairs.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('gaussian', gnb_clf),
        ('svc', svc_clf),
    ],
    voting='hard',
)

# Training fits clones of the base learners on the selected features.
voting_clf.fit(X_train[best_features], y_train)

# Predict on the validation split and report macro f1 / per-class metrics.
ensemble_pred = voting_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.21      0.21      0.21      1810
         >30       0.43      0.47      0.45      5681
          NO       0.64      0.60      0.62      8404

    accuracy                           0.51     15895
   macro avg       0.43      0.43      0.43     15895
weighted avg       0.52      0.51      0.51     15895



The f1 macro average for three base learners is also 0.43 and is therefore no worse than for five base learners.

Finally, we use four base learners that return probabilities in order to run the voting classifier with soft voting.

In [64]:
# Soft-voting ensemble over four base learners that expose predict_proba.
log_clf = LogisticRegression(C=10.0, class_weight='balanced', penalty='l2')
# probability=True makes SVC fit an internal, randomly shuffled cross-validation
# to calibrate probabilities; the fixed random_state keeps the cell reproducible.
svc_clf = SVC(C=1.0, gamma=0.1, kernel='rbf', max_iter=1000, probability=True,
              random_state=42)
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(alpha=0.1, binarize=0.0)

# build the ensemble; 'soft' averages the predicted class probabilities
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('gaussian', gnb_clf),
        ('svc', svc_clf),
        ('bernoulli', bernoulli_clf),
    ],
    voting='soft',
)

# train the ensemble
# base learners are cloned, and clones are trained
voting_clf.fit(X_train[best_features], y_train)

# Evaluate the ensemble on the validation split
ensemble_pred = voting_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.28      0.12      0.17      1810
         >30       0.43      0.56      0.49      5681
          NO       0.64      0.59      0.61      8404

    accuracy                           0.53     15895
   macro avg       0.45      0.42      0.42     15895
weighted avg       0.52      0.53      0.52     15895



The f1 macro average with soft voting is 0.42 and is therefore slightly worse than with hard voting.

## 2.2 Adaboost Classifier

Next, we applied the AdaBoost classifier. We used the Adaboost Classifier with various hyperparameters to find the best ones. After that we trained the AdaBoost classifier on the training data and evaluated it on the validation set.

In [35]:
# AdaBoost search space: 4 x 5 x 2 = 40 candidates.
# NOTE(review): recent scikit-learn releases deprecate/remove 'SAMME.R';
# check the installed version before re-running this cell.
param_grid = {
    'n_estimators': np.arange(50, 201, 50),      # 50, 100, 150, 200
    'learning_rate': np.linspace(0.1, 1.0, 5),   # 0.1, 0.325, 0.55, 0.775, 1.0
    'algorithm': ['SAMME', 'SAMME.R']
}

# Seed the estimator so repeated searches rank the candidates identically.
get_best_hyperparams(X_train, y_train, param_grid, AdaBoostClassifier(random_state=42))

(6357, 25)
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=50;, score=0.357 total time=   0.2s
[CV 2/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=50;, score=0.350 total time=   0.2s
[CV 3/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=50;, score=0.347 total time=   0.2s
[CV 4/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=50;, score=0.336 total time=   0.2s
[CV 5/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=50;, score=0.344 total time=   0.2s
[CV 1/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=100;, score=0.357 total time=   0.4s
[CV 2/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=100;, score=0.350 total time=   0.5s
[CV 3/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=100;, score=0.347 total time=   0.4s
[CV 4/5] END algorithm=SAMME, learning_rate=0.1, n_estimators=100;, score=0.355 total time=   0.4s
[CV 5/5] END algorithm=SAMME, learning_ra

In [36]:
# AdaBoost with hyperparameters taken from the grid searched above.
# No base estimator is passed, so scikit-learn uses its default weak learner
# (a depth-1 decision tree); a separately built stump is therefore not needed.
# random_state pins the boosting run so the reported score is reproducible.
adaboost_clf = AdaBoostClassifier(algorithm='SAMME.R', learning_rate=0.775,
                                  n_estimators=200, random_state=42)
# Train the AdaBoost classifier on the selected training features
adaboost_clf.fit(X_train[best_features], y_train)
# Make predictions on the validation set
y_pred = adaboost_clf.predict(X_validation[best_features])
# Evaluate the f1_macro of the AdaBoost classifier
f1_macro = f1_score(y_validation, y_pred, average='macro')
print(f"f1 macro: {f1_macro:.2f}")

f1 macro: 0.38


## 2.3 Random Forest Classifier

Finally, we applied the Random Forest classifier. We used the Random Forest Classifier with various hyperparameters to find the best ones. After that we trained the Random Forest classifier on the training data and evaluated it on the validation set.

In [37]:
# Random forest search space: 3 x 4 x 2 x 2 x 2 x 1 = 96 candidates.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 50, 100],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True]
}

# Bootstrap sampling and feature sub-sampling are random; seeding the forest
# makes the candidate ranking repeatable between runs.
get_best_hyperparams(X_train, y_train, param_grid, RandomForestClassifier(random_state=42))

(6357, 25)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.406 total time=   0.2s
[CV 2/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.401 total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.374 total time=   0.2s
[CV 4/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.362 total time=   0.2s
[CV 5/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.389 total time=   0.2s
[CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.393 total time=   0.4s
[C

In [38]:
# Random forest with hyperparameters taken from the grid searched above.
# random_state pins bootstrap and feature sampling so the score is reproducible.
random_forest_clf = RandomForestClassifier(bootstrap=True, max_depth=50, max_features='sqrt',
                                           min_samples_leaf=1, min_samples_split=2,
                                           n_estimators=100, random_state=42)
random_forest_clf.fit(X_train[best_features], y_train)
# Make predictions on the validation set
y_pred = random_forest_clf.predict(X_validation[best_features])
# Evaluate the f1_macro of the Random Forest classifier
f1_macro = f1_score(y_validation, y_pred, average='macro')
print(f"f1 macro: {f1_macro:.2f}")

f1 macro: 0.38


## 2.4 Stacking

We tried various combinations for the stacking classifier.

To get started, we first took all of the estimators from our second notebook.

In [76]:
# Stacking ensemble over six base learners.
log_clf = LogisticRegression(C=10.0, class_weight='balanced', penalty='l2')
svc_clf = SVC(C=1.0, gamma=0.1, kernel='rbf', max_iter=1000)
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(alpha=0.1, binarize=0.0)
knn_clf = KNeighborsClassifier(algorithm='auto', n_neighbors=5, weights='uniform')
# Tree construction involves random choices; seed it so this cell's score
# can be reproduced.
dt_clf = DecisionTreeClassifier(criterion='gini', max_depth=14, random_state=42)

# build the ensemble; no final_estimator is given, so the library default
# meta-learner is used on the base learners' 5-fold cross-validated predictions
stacking_clf = StackingClassifier(
    estimators=[
        ('lr', log_clf),
        ('svc', svc_clf),
        ('gaussian', gnb_clf),
        ('bernoulli', bernoulli_clf),
        ('knn', knn_clf),
        ('dt', dt_clf),
    ],
    cv=5,
)

# train the ensemble
# base learners are cloned, and clones are trained
stacking_clf.fit(X_train[best_features], y_train)

# Evaluate the ensemble on the validation split
ensemble_pred = stacking_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.39      0.04      0.08      1810
         >30       0.48      0.35      0.40      5681
          NO       0.60      0.83      0.70      8404

    accuracy                           0.57     15895
   macro avg       0.49      0.41      0.39     15895
weighted avg       0.53      0.57      0.52     15895



The f1 macro average for six base learners is 0.39.

Secondly, we took the five best estimators from our second notebook.

In [77]:
# Stacking ensemble over five base learners.
log_clf = LogisticRegression(penalty='l2', C=10.0, class_weight='balanced')
svc_clf = SVC(kernel='rbf', C=1.0, gamma=0.1, max_iter=1000)
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(binarize=0.0, alpha=0.1)
knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto')

# (name, estimator) pairs forming the first level of the stack.
base_learners = [
    ('lr', log_clf),
    ('svc', svc_clf),
    ('gaussian', gnb_clf),
    ('bernoulli', bernoulli_clf),
    ('knn', knn_clf),
]

# Build and train in one step; fit() trains clones of the base learners
# and returns the fitted ensemble itself.
stacking_clf = StackingClassifier(estimators=base_learners, cv=5).fit(
    X_train[best_features], y_train
)

# Score the ensemble on the validation split (macro f1 plus per-class report).
ensemble_pred = stacking_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.39      0.04      0.08      1810
         >30       0.48      0.35      0.41      5681
          NO       0.60      0.83      0.70      8404

    accuracy                           0.57     15895
   macro avg       0.49      0.41      0.39     15895
weighted avg       0.53      0.57      0.52     15895



The f1 macro average for five base learners is 0.39 and is therefore no worse than for six base learners.

Then, we took the four best estimators from our second notebook.

In [80]:
# Stacking ensemble over four base learners.
log_clf = LogisticRegression(penalty='l2', C=10.0, class_weight='balanced')
svc_clf = SVC(kernel='rbf', C=1.0, gamma=0.1, max_iter=1000)
gnb_clf = GaussianNB(var_smoothing=1e-05)
bernoulli_clf = BernoulliNB(binarize=0.0, alpha=0.1)

# Assemble the stack from (name, estimator) pairs with 5-fold CV.
stacking_clf = StackingClassifier(
    estimators=[
        ('lr', log_clf),
        ('svc', svc_clf),
        ('gaussian', gnb_clf),
        ('bernoulli', bernoulli_clf),
    ],
    cv=5,
)

# Training fits clones of the base learners on the selected features.
stacking_clf.fit(X_train[best_features], y_train)

# Predict on the validation split and report macro f1 / per-class metrics.
ensemble_pred = stacking_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.41      0.05      0.08      1810
         >30       0.48      0.35      0.41      5681
          NO       0.60      0.83      0.70      8404

    accuracy                           0.57     15895
   macro avg       0.50      0.41      0.40     15895
weighted avg       0.54      0.57      0.52     15895



The f1 macro average for four base learners is 0.40 and is therefore slightly better than with five base learners.

Finally, we took the three best estimators from our second notebook.

In [81]:
# Stacking ensemble over three base learners.
log_clf = LogisticRegression(penalty='l2', C=10.0, class_weight='balanced')
svc_clf = SVC(kernel='rbf', C=1.0, gamma=0.1, max_iter=1000)
gnb_clf = GaussianNB(var_smoothing=1e-05)

# (name, estimator) pairs forming the first level of the stack.
base_learners = [
    ('lr', log_clf),
    ('svc', svc_clf),
    ('gaussian', gnb_clf),
]
stacking_clf = StackingClassifier(estimators=base_learners, cv=5)

# fit() trains clones of the base learners on the selected features.
stacking_clf.fit(X_train[best_features], y_train)

# Validation-set predictions, macro f1 and the per-class report.
ensemble_pred = stacking_clf.predict(X_validation[best_features])
ensemble_f1 = f1_score(y_validation, ensemble_pred, average='macro')  # or 'weighted'
print(classification_report(y_validation, ensemble_pred))

              precision    recall  f1-score   support

         <30       0.41      0.04      0.08      1810
         >30       0.48      0.34      0.40      5681
          NO       0.60      0.83      0.70      8404

    accuracy                           0.57     15895
   macro avg       0.50      0.41      0.39     15895
weighted avg       0.54      0.57      0.52     15895



The f1 macro average for three base learners is 0.39 and is therefore slightly worse than with four base learners.

# 3. Model selection
The experiments in this notebook found that the best performing models, in order, are as follows:

1. Voting Classifier with three base learners and hard voting
2. Stacking Classifier with four base learners
3. Adaboost Classifier / Random Forest Classifier 

**Takeaways:**

- The Voting Classifier with three base learners and hard voting achieved the highest f1 macro average score (0.43, the same score as the five-learner hard-voting variant, but with fewer base learners), making it the best performing model in this experiment.
- The Stacking Classifier with four base learners also performed well, slightly better than configurations with more or fewer base learners.
- Both the Adaboost Classifier and Random Forest Classifier reached an f1 macro score of 0.38 on the validation set, below the voting and stacking ensembles; they may still be viable alternatives depending on the specific requirements and constraints of the problem.
- Ensemble methods generally improved the performance over individual classifiers, highlighting the benefit of combining multiple models to leverage their strengths.