# Hyperparameter Tuning

In [1]:
import os

# Work from the project root so the `src.*` imports and the relative
# `data/...` and `models/...` paths used below resolve.
# Guarded on the presence of `src` so that re-running this cell does not
# climb one directory higher on every execution.
if not os.path.isdir('src'):
    os.chdir('..')

In [19]:
import pickle
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from src.hyperparameter_tuning import tune_hyperparameters
from src.model_training import build_preprocessor, build_full_pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Both the initial cleaned data (df) and the modified version of the data after the Feature Analysis (df2) are included here for hyperparameter tuning

In [240]:
# Load the cleaned dataset and separate the `at_risk` target from the features
df = pd.read_csv('data/processed_data/df_clean.csv')

y = df['at_risk']
X = df.drop(columns='at_risk')

In [241]:
# Hold out 20% of the rows for final evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column groups by dtype, read from the training frame only
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_features = X_train.select_dtypes(include=['number']).columns.tolist()

# Preprocessor shared by every model pipeline built below
preprocessor = build_preprocessor(numeric_features, categorical_features)

In [242]:
# Combine the shared preprocessor with each candidate estimator.
# NOTE(review): build_full_pipeline presumably names the estimator step
# 'model' (the grids below address parameters as `model__...`) -- confirm.
# Stochastic estimators are seeded with the same value as the train/test
# split so that search results can be reproduced on a fresh kernel.
nb_model = build_full_pipeline(preprocessor, GaussianNB())  # GaussianNB takes no seed
nn_model = build_full_pipeline(preprocessor, MLPClassifier(random_state=42))
xgb_model = build_full_pipeline(preprocessor, XGBClassifier(random_state=42))
gb_model = build_full_pipeline(preprocessor, GradientBoostingClassifier(random_state=42))
rf_model = build_full_pipeline(preprocessor, RandomForestClassifier(random_state=42))

## XGBoost Tuning

### 1. Using Recall as scoring metric

In [182]:
# XGBoost search space for the recall-scored run.
# Keyword names become the dict keys, so each keeps its `model__` prefix.
xgb_param_grid = dict(
    model__n_estimators=[50, 100, 200],
    model__max_depth=[3, 5],
    model__learning_rate=[0.2, 0.3],
    model__scale_pos_weight=[5, 7, 9],  # Adjust based on the class imbalance
)

In [183]:
# Search the XGBoost grid, ranking candidates by recall
tuned_xgb_model = tune_hyperparameters(
    xgb_model,
    xgb_param_grid,
    X_train,
    y_train,
    scoring='recall',
)

# Winning parameter combination from the search
best_xgb_params = tuned_xgb_model.best_params_
print("Best XGBoost hyperparameters for Recall Score:", best_xgb_params)

Best XGBoost hyperparameters for Recall Score: {'model__learning_rate': 0.3, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__scale_pos_weight': 9}


In [184]:
# Tabulate every candidate evaluated by the recall-scored XGBoost search
cv_results = pd.DataFrame(tuned_xgb_model.cv_results_)

# Keep the best estimator found by the search
best_model_xgb = tuned_xgb_model.best_estimator_

# Show the five candidates with the highest mean cross-validated score
top_models = cv_results.sort_values('mean_test_score', ascending=False).iloc[:5]
print("Top Models:")
print(top_models.loc[:, ['params', 'mean_test_score', 'std_test_score']])

Top Models:
                                               params  mean_test_score  \
23  {'model__learning_rate': 0.3, 'model__max_dept...         0.736611   
20  {'model__learning_rate': 0.3, 'model__max_dept...         0.727700   
5   {'model__learning_rate': 0.2, 'model__max_dept...         0.727700   
2   {'model__learning_rate': 0.2, 'model__max_dept...         0.724759   
1   {'model__learning_rate': 0.2, 'model__max_dept...         0.718745   

    std_test_score  
23        0.026142  
20        0.046948  
5         0.036450  
2         0.043169  
1         0.048144  


In [243]:
# Predict on the held-out split with the recall-tuned XGBoost pipeline
y_pred_best = best_model_xgb.predict(X_test)

# Compute accuracy, the per-class report and the confusion matrix in one pass
accuracy_best, classification_rep_best, conf_matrix_best = (
    metric(y_test, y_pred_best)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9427753934191703
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97      1322
           1       0.48      0.80      0.60        76

    accuracy                           0.94      1398
   macro avg       0.74      0.88      0.79      1398
weighted avg       0.96      0.94      0.95      1398


Confusion Matrix:
 [[1257   65]
 [  15   61]]


### 2. Using roc_auc as scoring metric

In [258]:
# XGBoost search space for the roc_auc-scored run.
# Keyword names become the dict keys, so each keeps its `model__` prefix.
xgb_param_grid2 = dict(
    model__n_estimators=[200, 300, 400],
    model__max_depth=[3, 4],
    model__learning_rate=[0.3, 0.4, 0.5],
    model__scale_pos_weight=[5, 7, 9],  # Adjust based on the class imbalance
)

In [255]:
# Search the second XGBoost grid, ranking candidates by ROC AUC
tuned_xgb_model_roc = tune_hyperparameters(
    xgb_model,
    xgb_param_grid2,
    X_train,
    y_train,
    scoring='roc_auc',
)

# Winning parameter combination from the search
best_xgb_params_roc = tuned_xgb_model_roc.best_params_
print("Best XGBoost hyperparameters for ROC_AUC Score:", best_xgb_params_roc)

Best XGBoost hyperparameters for ROC_AUC Score: {'model__learning_rate': 0.4, 'model__max_depth': 3, 'model__n_estimators': 300, 'model__scale_pos_weight': 9}


In [256]:
# Tabulate every candidate evaluated by the roc_auc-scored XGBoost search
cv_results = pd.DataFrame(tuned_xgb_model_roc.cv_results_)

# Keep the best estimator found by the search
best_model_xgb_roc = tuned_xgb_model_roc.best_estimator_

# Show the five candidates with the highest mean cross-validated score
top_models = cv_results.sort_values('mean_test_score', ascending=False).iloc[:5]
print("Top Models:")
print(top_models.loc[:, ['params', 'mean_test_score', 'std_test_score']])

Top Models:
                                              params  mean_test_score  \
1  {'model__learning_rate': 0.4, 'model__max_dept...         0.941505   
0  {'model__learning_rate': 0.35, 'model__max_dep...         0.938204   
2  {'model__learning_rate': 0.45, 'model__max_dep...         0.933655   

   std_test_score  
1        0.008915  
0        0.008977  
2        0.016366  


In [257]:
# Predict on the held-out split with the roc_auc-tuned XGBoost pipeline
y_pred_best_roc = best_model_xgb_roc.predict(X_test)

# Compute accuracy, the per-class report and the confusion matrix in one pass
accuracy_best, classification_rep_best, conf_matrix_best = (
    metric(y_test, y_pred_best_roc)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9713876967095851
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      1322
           1       0.72      0.78      0.75        76

    accuracy                           0.97      1398
   macro avg       0.85      0.88      0.87      1398
weighted avg       0.97      0.97      0.97      1398


Confusion Matrix:
 [[1299   23]
 [  17   59]]


### Saving the best XGBoost Model

While the best model for recall on the under-represented class comes from the tuning run that used 'recall' as the scoring metric, the best model across all metrics is the one tuned with 'roc_auc' as the scoring metric. Although we want as many at-risk students as possible to be correctly identified, we don't want that to come at the cost of misclassifying students who are not at risk as at-risk. The second model strikes the better balance.

In [260]:
# joblib.dump does not create missing directories, so make sure the
# output folder exists before writing to it.
os.makedirs('models', exist_ok=True)

# Save the recall-tuned model and its best parameters
joblib.dump(best_model_xgb, 'models/best_model_xgb.pkl')
joblib.dump(best_xgb_params, 'models/best_xgb_params.pkl')

# Save the roc_auc-tuned model and its best parameters
joblib.dump(best_model_xgb_roc, 'models/best_model_xgb_roc.pkl')
joblib.dump(best_xgb_params_roc, 'models/best_xgb_params_roc.pkl')

['models/best_xgb_params_roc.pkl']

## Naive Bayes Tuning
- There is not much tuning to do for NB, but we'll make some changes to the "priors" parameter to see if we can come up with even a slightly improved model from the one before

In [265]:
# Gaussian Naive Bayes search space: the default (None) plus explicit
# two-class prior pairs from [0.1, 0.9] through [0.9, 0.1].
nb_param_grid = dict(
    model__priors=[
        None,
        [0.1, 0.9], [0.2, 0.8], [0.3, 0.7],
        [0.4, 0.6], [0.5, 0.5], [0.6, 0.4],
        [0.7, 0.3], [0.8, 0.2], [0.9, 0.1],
    ],
)

In [266]:
# Search the Naive Bayes priors, ranking candidates by ROC AUC
tuned_nb_model_roc = tune_hyperparameters(
    nb_model,
    nb_param_grid,
    X_train,
    y_train,
    scoring='roc_auc',
)

# Winning parameter combination from the search
best_nb_params_roc = tuned_nb_model_roc.best_params_
print("Best Naive Bayes hyperparameters for ROC_AUC Score:", best_nb_params_roc)

Best Naive Bayes hyperparameters for ROC_AUC Score: {'model__priors': None}


In [267]:
# Tabulate every candidate evaluated by the Naive Bayes search
cv_results = pd.DataFrame(tuned_nb_model_roc.cv_results_)

# Keep the best estimator found by the search
best_model = tuned_nb_model_roc.best_estimator_

# Show the five candidates with the highest mean cross-validated score
top_models = cv_results.sort_values('mean_test_score', ascending=False).iloc[:5]
print("Top Models:")
print(top_models.loc[:, ['params', 'mean_test_score', 'std_test_score']])

Top Models:
                          params  mean_test_score  std_test_score
0        {'model__priors': None}         0.699506        0.013018
1  {'model__priors': [0.1, 0.9]}         0.699506        0.013018
2  {'model__priors': [0.2, 0.8]}         0.699506        0.013018
3  {'model__priors': [0.3, 0.7]}         0.699506        0.013018
4  {'model__priors': [0.4, 0.6]}         0.699506        0.013018


In [268]:
# Predict on the held-out split with the selected Naive Bayes pipeline
y_pred_best = best_model.predict(X_test)

# Compute accuracy, the per-class report and the confusion matrix in one pass
accuracy_best, classification_rep_best, conf_matrix_best = (
    metric(y_test, y_pred_best)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.5565092989985694
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.54      0.70      1322
           1       0.10      0.86      0.17        76

    accuracy                           0.56      1398
   macro avg       0.54      0.70      0.44      1398
weighted avg       0.94      0.56      0.67      1398


Confusion Matrix:
 [[713 609]
 [ 11  65]]


### Observations:
- This model is no better than the one we had previously, in that it contains more false negatives. The grid search scored every `priors` setting identically (ROC AUC 0.699506) and returned the default `priors=None`. Although the NB model has a substantially low accuracy (with many false positives), we used it in the ensemble meta model because of its high recall rate for the under-represented class.
- When we retrain the ensemble model, we will stick with the out-of-the-box NB model.

## Neural Network Tuning

#### 1. Using 'roc_auc' for scoring

In [279]:
# Neural-net search space for the roc_auc-scored run.
# NOTE(review): scikit-learn documents MLPClassifier's `learning_rate`
# schedule as used only when solver='sgd'; with 'adam' it is likely a
# no-op here -- confirm before relying on it.
nn_param_grid = dict(
    model__hidden_layer_sizes=[(50,), (100,), (50, 50), (200,)],
    model__alpha=[0.01, 0.05, 0.1],
    model__learning_rate=['adaptive'],
    model__solver=['adam'],
    model__max_iter=[500, 600],
)

In [276]:
# Search the neural-net grid, ranking candidates by ROC AUC
tuned_nn_model_roc = tune_hyperparameters(
    nn_model,
    nn_param_grid,
    X_train,
    y_train,
    scoring='roc_auc',
)

# Winning parameter combination from the search
best_nn_params_roc = tuned_nn_model_roc.best_params_
print("Best Neural Net hyperparameters for ROC_AUC Score:", best_nn_params_roc)

Best Neural Net hyperparameters for ROC_AUC Score: {'model__alpha': 0.05, 'model__hidden_layer_sizes': (100,), 'model__learning_rate': 'adaptive', 'model__max_iter': 500, 'model__solver': 'adam'}


In [277]:
# Tabulate every candidate evaluated by the roc_auc-scored neural-net search
cv_results = pd.DataFrame(tuned_nn_model_roc.cv_results_)

# Keep the best estimator found by the search
best_model_nn_roc = tuned_nn_model_roc.best_estimator_

# Show the five candidates with the highest mean cross-validated score
top_models = cv_results.sort_values('mean_test_score', ascending=False).iloc[:5]
print("Top Models:")
print(top_models.loc[:, ['params', 'mean_test_score', 'std_test_score']])

Top Models:
                                              params  mean_test_score  \
1  {'model__alpha': 0.05, 'model__hidden_layer_si...         0.931744   
3  {'model__alpha': 0.05, 'model__hidden_layer_si...         0.929255   
0  {'model__alpha': 0.05, 'model__hidden_layer_si...         0.924486   
2  {'model__alpha': 0.05, 'model__hidden_layer_si...         0.919635   

   std_test_score  
1        0.006954  
3        0.007944  
0        0.008172  
2        0.010047  


In [278]:
# Predict on the held-out split with the roc_auc-tuned neural-net pipeline
y_pred_best = best_model_nn_roc.predict(X_test)

# Compute accuracy, the per-class report and the confusion matrix in one pass
accuracy_best, classification_rep_best, conf_matrix_best = (
    metric(y_test, y_pred_best)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9670958512160229
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      1322
           1       0.78      0.55      0.65        76

    accuracy                           0.97      1398
   macro avg       0.88      0.77      0.81      1398
weighted avg       0.96      0.97      0.96      1398


Confusion Matrix:
 [[1310   12]
 [  34   42]]


#### 2. Using 'f1' for scoring

In [289]:
# Neural-net search space for the f1-scored run (same values as the
# roc_auc grid).
# NOTE(review): scikit-learn documents MLPClassifier's `learning_rate`
# schedule as used only when solver='sgd'; with 'adam' it is likely a
# no-op here -- confirm before relying on it.
nn_param_grid2 = dict(
    model__hidden_layer_sizes=[(50,), (100,), (50, 50), (200,)],
    model__alpha=[0.01, 0.05, 0.1],
    model__learning_rate=['adaptive'],
    model__solver=['adam'],
    model__max_iter=[500, 600],
)

In [286]:
# Search the second neural-net grid, ranking candidates by F1
tuned_nn_model_f1 = tune_hyperparameters(
    nn_model,
    nn_param_grid2,
    X_train,
    y_train,
    scoring='f1',
)

# Winning parameter combination from the search
best_nn_params_f1 = tuned_nn_model_f1.best_params_
print("Best Neural Net hyperparameters for F1 Score:", best_nn_params_f1)

Best Neural Net hyperparameters for F1 Score: {'model__alpha': 0.01, 'model__hidden_layer_sizes': (50,), 'model__learning_rate': 'adaptive', 'model__max_iter': 500, 'model__solver': 'adam'}


In [287]:
# Tabulate every candidate evaluated by the f1-scored neural-net search
cv_results = pd.DataFrame(tuned_nn_model_f1.cv_results_)

# Keep the best estimator found by the search
best_model_nn_f1 = tuned_nn_model_f1.best_estimator_

# Show the five candidates with the highest mean cross-validated score
top_models = cv_results.sort_values('mean_test_score', ascending=False).iloc[:5]
print("Top Models:")
print(top_models.loc[:, ['params', 'mean_test_score', 'std_test_score']])

Top Models:
                                              params  mean_test_score  \
1  {'model__alpha': 0.01, 'model__hidden_layer_si...         0.610509   
0  {'model__alpha': 0.001, 'model__hidden_layer_s...         0.580973   

   std_test_score  
1        0.051827  
0        0.061445  


In [288]:
# Predict on the held-out split with the f1-tuned neural-net pipeline
y_pred_best = best_model_nn_f1.predict(X_test)

# Compute accuracy, the per-class report and the confusion matrix in one pass
accuracy_best, classification_rep_best, conf_matrix_best = (
    metric(y_test, y_pred_best)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9685264663805436
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      1322
           1       0.77      0.61      0.68        76

    accuracy                           0.97      1398
   macro avg       0.87      0.80      0.83      1398
weighted avg       0.97      0.97      0.97      1398


Confusion Matrix:
 [[1308   14]
 [  30   46]]


### Saving the best Neural Net Model

In [284]:
# joblib.dump does not create missing directories, so make sure the
# output folder exists before writing to it.
os.makedirs('models', exist_ok=True)

# Save the f1-tuned neural-net model and its best parameters
joblib.dump(best_model_nn_f1, 'models/best_model_nn_f1.pkl')
joblib.dump(best_nn_params_f1, 'models/best_nn_params_f1.pkl')

['models/best_nn_params_f1.pkl']

## Random Forest Tuning

#### 1. Using 'roc_auc' for scoring

In [301]:
# Random Forest search space for the roc_auc-scored run.
# class_weight candidates: 'balanced' plus explicit weights of 2, 3 and 4
# on class 1.
rf_param_grid = dict(
    model__n_estimators=[400, 500, 600],
    model__min_samples_split=[3, 5],
    model__min_samples_leaf=[1, 2],
    model__max_features=['sqrt'],
    model__class_weight=['balanced', {0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 4}],
)

In [302]:
# Search the Random Forest grid, ranking candidates by ROC AUC
tuned_rf_model_roc = tune_hyperparameters(
    rf_model,
    rf_param_grid,
    X_train,
    y_train,
    scoring='roc_auc',
)

# Winning parameter combination from the search
best_rf_params_roc = tuned_rf_model_roc.best_params_
print("Best Random Forest hyperparameters for ROC_AUC Score:", best_rf_params_roc)

Best Random Forest hyperparameters for ROC_AUC Score: {'model__class_weight': {0: 1, 1: 2}, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 3, 'model__n_estimators': 500}


In [303]:
# Tabulate every candidate evaluated by the roc_auc-scored Random Forest search
cv_results = pd.DataFrame(tuned_rf_model_roc.cv_results_)

# Keep the best estimator found by the search
best_model_rf_roc = tuned_rf_model_roc.best_estimator_

# Show the five candidates with the highest mean cross-validated score
top_models = cv_results.sort_values('mean_test_score', ascending=False).iloc[:5]
print("Top Models:")
print(top_models.loc[:, ['params', 'mean_test_score', 'std_test_score']])

Top Models:
                                               params  mean_test_score  \
13  {'model__class_weight': {0: 1, 1: 2}, 'model__...         0.948203   
28  {'model__class_weight': {0: 1, 1: 3}, 'model__...         0.946693   
1   {'model__class_weight': 'balanced', 'model__ma...         0.946017   
15  {'model__class_weight': {0: 1, 1: 2}, 'model__...         0.945685   
2   {'model__class_weight': 'balanced', 'model__ma...         0.945515   

    std_test_score  
13        0.003776  
28        0.004543  
1         0.004731  
15        0.005345  
2         0.004854  


In [304]:
# Predict on the held-out split with the roc_auc-tuned Random Forest pipeline
y_pred_best = best_model_rf_roc.predict(X_test)

# Compute accuracy, the per-class report and the confusion matrix in one pass
accuracy_best, classification_rep_best, conf_matrix_best = (
    metric(y_test, y_pred_best)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9642346208869814
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1322
           1       0.93      0.37      0.53        76

    accuracy                           0.96      1398
   macro avg       0.95      0.68      0.75      1398
weighted avg       0.96      0.96      0.96      1398


Confusion Matrix:
 [[1320    2]
 [  48   28]]


#### 2. Using recall score for under-represented class (pos =1)

In [305]:
# Random Forest search space for the recall-scored run: only n_estimators
# and class_weight vary; the other parameters are pinned to one value.
rf_param_grid2 = dict(
    model__n_estimators=[200, 300, 500],
    model__min_samples_split=[3],
    model__min_samples_leaf=[1],
    model__max_features=['sqrt'],
    model__class_weight=['balanced', {0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 4}],
)

In [306]:
# Tune hyperparameters, ranking candidates by recall on the positive
# (label 1) class. scikit-learn's built-in 'recall' scorer is binary recall
# with pos_label=1, so it replaces make_scorer(recall_score, pos_label=1);
# neither of those names is imported in this notebook, which raised a
# NameError on a fresh kernel.
tuned_rf_model_recall = tune_hyperparameters(rf_model, rf_param_grid2, X_train, y_train,
                                             scoring='recall')

# Get the best hyperparameters
best_rf_params_recall = tuned_rf_model_recall.best_params_

# Print or use the best hyperparameters as needed
print("Best Random Forest hyperparameters for Recall Score:", best_rf_params_recall)

Best Random Forest hyperparameters for Recall Score: {'model__class_weight': 'balanced', 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 3, 'model__n_estimators': 300}


In [307]:
# Tabulate every candidate evaluated by the recall-scored Random Forest search
cv_results = pd.DataFrame(tuned_rf_model_recall.cv_results_)

# Keep the best estimator found by the search
best_model_rf_recall = tuned_rf_model_recall.best_estimator_

# Show the five candidates with the highest mean cross-validated score
top_models = cv_results.sort_values('mean_test_score', ascending=False).iloc[:5]
print("Top Models:")
print(top_models.loc[:, ['params', 'mean_test_score', 'std_test_score']])

Top Models:
                                               params  mean_test_score  \
1   {'model__class_weight': 'balanced', 'model__ma...         0.405180   
0   {'model__class_weight': 'balanced', 'model__ma...         0.393415   
2   {'model__class_weight': 'balanced', 'model__ma...         0.393371   
11  {'model__class_weight': {0: 1, 1: 4}, 'model__...         0.393284   
9   {'model__class_weight': {0: 1, 1: 4}, 'model__...         0.390430   

    std_test_score  
1         0.062496  
0         0.057679  
2         0.058802  
11        0.058147  
9         0.062475  


In [308]:
# Predict on the held-out split with the recall-tuned Random Forest pipeline
y_pred_best = best_model_rf_recall.predict(X_test)

# Compute accuracy, the per-class report and the confusion matrix in one pass
accuracy_best, classification_rep_best, conf_matrix_best = (
    metric(y_test, y_pred_best)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9613733905579399
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1322
           1       0.82      0.37      0.51        76

    accuracy                           0.96      1398
   macro avg       0.89      0.68      0.74      1398
weighted avg       0.96      0.96      0.95      1398


Confusion Matrix:
 [[1316    6]
 [  48   28]]


### Saving the best Random Forest Model

In this case, we were not able to tune a Random Forest model that performs better than the out-of-the-box model, so no tuned Random Forest model is saved.

## Gradient Boost Tuning

#### 1. Using 'roc_auc' for scoring

In [313]:
# Gradient Boosting search space for the roc_auc-scored run.
# min_samples_split, min_samples_leaf and subsample are intentionally not
# searched here; only the three parameters below vary.
gb_param_grid = dict(
    model__n_estimators=[400, 500, 600],
    model__max_depth=[3, 4],
    model__learning_rate=[0.1, 0.2],
)

In [314]:
# Search the Gradient Boosting grid, ranking candidates by ROC AUC
tuned_gb_model_roc = tune_hyperparameters(
    gb_model,
    gb_param_grid,
    X_train,
    y_train,
    scoring='roc_auc',
)

# Winning parameter combination from the search
best_gb_params_roc = tuned_gb_model_roc.best_params_
print("Best Gradient Boost hyperparameters for ROC_AUC Score:", best_gb_params_roc)

Best Gradient Boost hyperparameters for ROC_AUC Score: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 400}


In [315]:
# Tabulate every candidate evaluated by the Gradient Boosting search
cv_results = pd.DataFrame(tuned_gb_model_roc.cv_results_)

# Keep the best estimator found by the search
best_model_gb_roc = tuned_gb_model_roc.best_estimator_

# Show the five candidates with the highest mean cross-validated score
top_models = cv_results.sort_values('mean_test_score', ascending=False).iloc[:5]
print("Top Models:")
print(top_models.loc[:, ['params', 'mean_test_score', 'std_test_score']])

Top Models:
                                              params  mean_test_score  \
0  {'model__learning_rate': 0.1, 'model__max_dept...         0.932363   
1  {'model__learning_rate': 0.1, 'model__max_dept...         0.931222   
2  {'model__learning_rate': 0.1, 'model__max_dept...         0.930737   
7  {'model__learning_rate': 0.2, 'model__max_dept...         0.920476   
4  {'model__learning_rate': 0.1, 'model__max_dept...         0.920429   

   std_test_score  
0        0.013141  
1        0.013914  
2        0.013362  
7        0.018715  
4        0.009307  


In [316]:
# Predict on the held-out split with the roc_auc-tuned Gradient Boosting pipeline
y_pred_best = best_model_gb_roc.predict(X_test)

# Compute accuracy, the per-class report and the confusion matrix in one pass
accuracy_best, classification_rep_best, conf_matrix_best = (
    metric(y_test, y_pred_best)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9706723891273248
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      1322
           1       0.81      0.61      0.69        76

    accuracy                           0.97      1398
   macro avg       0.89      0.80      0.84      1398
weighted avg       0.97      0.97      0.97      1398


Confusion Matrix:
 [[1311   11]
 [  30   46]]


### Saving the best Gradient Boost Model

In [317]:
# joblib.dump does not create missing directories, so make sure the
# output folder exists before writing to it.
os.makedirs('models', exist_ok=True)

# Save the roc_auc-tuned Gradient Boosting model and its best parameters
joblib.dump(best_model_gb_roc, 'models/best_model_gb_roc.pkl')
joblib.dump(best_gb_params_roc, 'models/best_gb_params_roc.pkl')

['models/best_gb_params_roc.pkl']