# Hyperparameter Tuning

In [1]:
import os
os.chdir('..')

In [19]:
import pickle
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from src.hyperparameter_tuning import tune_hyperparameters
from src.model_training import build_preprocessor, build_full_pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Both the initial cleaned data (df) and the modified version of the data after the Feature Analysis (df2) are included here for hyperparameter tuning

In [170]:
# Use this code in the event that it's needed
df = pd.read_csv('data/processed_data/df_cleaned.csv')
df2 = pd.read_csv('data/processed_data/df_modified.csv')

X = df.drop('at_risk', axis=1)
X2 = df2.drop('at_risk', axis=1)

y = df.at_risk
y2 = df2.at_risk

In [171]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

numeric_features = X_train.select_dtypes(include=['number']).columns.tolist()
numeric_features2 = X_train2.select_dtypes(include=['number']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
categorical_features2 = X_train2.select_dtypes(include=['object']).columns.tolist()

preprocessor = build_preprocessor(numeric_features, categorical_features)
preprocessor2 = build_preprocessor(numeric_features2, categorical_features2)

In [172]:
nb_model = build_full_pipeline(preprocessor,GaussianNB())
nn_model = build_full_pipeline(preprocessor,MLPClassifier())
xgb_model = build_full_pipeline(preprocessor,XGBClassifier())
gb_model = build_full_pipeline(preprocessor, GradientBoostingClassifier())
rf_model = build_full_pipeline(preprocessor, RandomForestClassifier())

nb_model2 = build_full_pipeline(preprocessor2, GaussianNB())
nn_model2 = build_full_pipeline(preprocessor2, MLPClassifier())
xgb_model2 = build_full_pipeline(preprocessor2, XGBClassifier())
gb_model2 = build_full_pipeline(preprocessor2, GradientBoostingClassifier())
rf_model2 = build_full_pipeline(preprocessor2, RandomForestClassifier())

## XGB Tuning

### 1. Using Recall as scoring metric

In [173]:
# Hyperparameter grid for XGBoost
xgb_param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [2, 3, 5],
    'model__learning_rate': [0.1, 0.2],
    'model__scale_pos_weight': [1, 3, 5]  # Adjust based on the class imbalance
}

#### df_cleaned

In [8]:
# Tune hyperparameters
tuned_xgb_model_recall = tune_hyperparameters(xgb_model, xgb_param_grid, X_train, y_train, scoring='recall')

# Get the best hyperparameters
best_xgb_params_recall = tuned_xgb_model_recall.best_params_

# Print or use the best hyperparameters as needed
print("Best XGBoost hyperparameters for Recall Score:", best_xgb_params_recall)

Best XGBoost hyperparameters: {'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__scale_pos_weight': 5}


In [9]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_xgb_model_recall.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_xgb_model_recall.best_estimator_

Top Models:
                                               params  mean_test_score  \
59  {'model__learning_rate': 0.2, 'model__max_dept...         0.677305   
62  {'model__learning_rate': 0.2, 'model__max_dept...         0.677305   
35  {'model__learning_rate': 0.1, 'model__max_dept...         0.665496   
56  {'model__learning_rate': 0.2, 'model__max_dept...         0.665408   
38  {'model__learning_rate': 0.1, 'model__max_dept...         0.653600   

    std_test_score  
59        0.044918  
62        0.049499  
35        0.034971  
56        0.042583  
38        0.054818  

Best Model Accuracy: 0.9570815450643777
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      1322
           1       0.58      0.75      0.66        76

    accuracy                           0.96      1398
   macro avg       0.78      0.86      0.82      1398
weighted avg       0.96      0.96      0.96      1398


Confusion Mat

In [None]:
# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

### Observations:
- Compared to the base XGBoost performance seen in the model_training_notebook, we do have some improvement to the recall score of the under-represented class.  It does come at the cost of more false positives, but worth the nearly 20% increase in recall.

#### df_modified

In [182]:
# Hyperparameter grid for XGBoost
xgb_param_grid2 = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5],
    'model__learning_rate': [0.2, 0.3],
    'model__scale_pos_weight': [5, 7, 9]  # Adjust based on the class imbalance
}

In [183]:
# Tune hyperparameters
tuned_xgb_model_modified = tune_hyperparameters(xgb_model2, xgb_param_grid2, X_train2, y_train2, scoring='recall')

# Get the best hyperparameters
best_xgb_params_modified = tuned_xgb_model_modified.best_params_

# Print or use the best hyperparameters as needed
print("Best XGBoost hyperparameters for Recall Score:", best_xgb_params_modified)

Best XGBoost hyperparameters for Recall Score: {'model__learning_rate': 0.3, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__scale_pos_weight': 9}


In [184]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_xgb_model_modified.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model_xgb_modified = tuned_xgb_model_modified.best_estimator_

Top Models:
                                               params  mean_test_score  \
23  {'model__learning_rate': 0.3, 'model__max_dept...         0.736611   
20  {'model__learning_rate': 0.3, 'model__max_dept...         0.727700   
5   {'model__learning_rate': 0.2, 'model__max_dept...         0.727700   
2   {'model__learning_rate': 0.2, 'model__max_dept...         0.724759   
1   {'model__learning_rate': 0.2, 'model__max_dept...         0.718745   

    std_test_score  
23        0.026142  
20        0.046948  
5         0.036450  
2         0.043169  
1         0.048144  


In [185]:
# Evaluate the best model
y_pred_best2 = best_model_xgb_modified.predict(X_test2)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test2, y_pred_best2)
classification_rep_best = classification_report(y_test2, y_pred_best2)
conf_matrix_best = confusion_matrix(y_test2, y_pred_best2)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9427753934191703
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97      1322
           1       0.48      0.80      0.60        76

    accuracy                           0.94      1398
   macro avg       0.74      0.88      0.79      1398
weighted avg       0.96      0.94      0.95      1398


Confusion Matrix:
 [[1257   65]
 [  15   61]]


In [186]:
# Save the models and their best parameters
joblib.dump(best_model_xgb_modified, 'models/best_model_xgb_modified.pkl')
joblib.dump(best_xgb_params_modified, 'models/best_xgb_params_modified.pkl')

['models/best_xgb_params_modified.pkl']

### 2. Using roc_auc as scoring metric

#### df_cleaned

In [9]:
# Tune hyperparameters
tuned_xgb_model_roc = tune_hyperparameters(xgb_model, xgb_param_grid, X_train, y_train, scoring='roc_auc')

# Get the best hyperparameters
best_xgb_params_roc = tuned_xgb_model_roc.best_params_

# Print or use the best hyperparameters as needed
print("Best XGBoost hyperparameters for ROC_AUC Score:", best_xgb_params_roc)

Best XGBoost hyperparameters: {'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__scale_pos_weight': 1}


In [10]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_xgb_model_roc.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_xgb_model_roc.best_estimator_

Top Models:
                                               params  mean_test_score  \
60  {'model__learning_rate': 0.2, 'model__max_dept...         0.938204   
62  {'model__learning_rate': 0.2, 'model__max_dept...         0.937879   
44  {'model__learning_rate': 0.1, 'model__max_dept...         0.936835   
33  {'model__learning_rate': 0.1, 'model__max_dept...         0.935167   
57  {'model__learning_rate': 0.2, 'model__max_dept...         0.935029   

    std_test_score  
60        0.009629  
62        0.005715  
44        0.009016  
33        0.008165  
57        0.010094  

Best Model Accuracy: 0.969241773962804
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      1322
           1       0.79      0.59      0.68        76

    accuracy                           0.97      1398
   macro avg       0.88      0.79      0.83      1398
weighted avg       0.97      0.97      0.97      1398


Confusion Matr

In [None]:
# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

### Observations:
- No increase in recall from the base model.

#### df_modified

In [187]:
# Hyperparameter grid for XGBoost
xgb_param_grid2 = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [3, 5],
    'model__learning_rate': [0.2, 0.3],
    'model__scale_pos_weight': [1, 3, 5, 7]  # Adjust based on the class imbalance
}

In [188]:
# Tune hyperparameters
tuned_xgb_model_mod_roc = tune_hyperparameters(xgb_model2, xgb_param_grid2, X_train2, y_train2, scoring='roc_auc')

# Get the best hyperparameters
best_xgb_params_mod_roc = tuned_xgb_model_mod_roc.best_params_

# Print or use the best hyperparameters as needed
print("Best XGBoost hyperparameters for ROC_AUC Score:", best_xgb_params_mod_roc)

Best XGBoost hyperparameters for ROC_AUC Score: {'model__learning_rate': 0.3, 'model__max_depth': 3, 'model__n_estimators': 300, 'model__scale_pos_weight': 7}


In [189]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_xgb_model_mod_roc.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model_xgb_mod_roc = tuned_xgb_model_mod_roc.best_estimator_

Top Models:
                                               params  mean_test_score  \
35  {'model__learning_rate': 0.3, 'model__max_dept...         0.939251   
11  {'model__learning_rate': 0.2, 'model__max_dept...         0.939159   
10  {'model__learning_rate': 0.2, 'model__max_dept...         0.938476   
18  {'model__learning_rate': 0.2, 'model__max_dept...         0.938160   
31  {'model__learning_rate': 0.3, 'model__max_dept...         0.937831   

    std_test_score  
35        0.009519  
11        0.009015  
10        0.007850  
18        0.008472  
31        0.010016  


In [190]:
# Evaluate the best model
y_pred_best2 = best_model_xgb_mod_roc.predict(X_test2)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test2, y_pred_best2)
classification_rep_best = classification_report(y_test2, y_pred_best2)
conf_matrix_best = confusion_matrix(y_test2, y_pred_best2)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9635193133047211
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1322
           1       0.65      0.70      0.68        76

    accuracy                           0.96      1398
   macro avg       0.82      0.84      0.83      1398
weighted avg       0.96      0.96      0.96      1398


Confusion Matrix:
 [[1294   28]
 [  23   53]]


## Naive Bayes Tuning
- There is not much tuning to do for NB, but we'll make some changes to the "priors" parameter to see if we can come up with even a slightly improved model from the one before

In [None]:
# Hyperparameter grid for Gaussian Naive Bayes
nb_param_grid = {
    'model__priors': [None, [0.1, 0.9], [0.5, 0.5], [0.9, 0.1]]
}

In [32]:
# Tune hyperparameters
tuned_nb_model_roc = tune_hyperparameters(nb_model, nb_param_grid, X_train, y_train, scoring='roc_auc')


# Get the best hyperparameters
best_nb_params_roc = tuned_nb_model_roc.best_params_


# Print or use the best hyperparameters as needed
print("Best Naive Bayes hyperparameters for ROC_AUC Score:", best_nb_params_roc)

Best Naive Bayes hyperparameters: {'model__priors': None}


In [33]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_nb_model_roc.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_nb_model_roc.best_estimator_

Top Models:
                          params  mean_test_score  std_test_score
0        {'model__priors': None}         0.738422        0.026415
1  {'model__priors': [0.1, 0.9]}         0.738422        0.026415
2  {'model__priors': [0.5, 0.5]}         0.738422        0.026415
3  {'model__priors': [0.9, 0.1]}         0.738422        0.026415

Best Model Accuracy: 0.6609442060085837
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.65      0.78      1322
           1       0.11      0.78      0.20        76

    accuracy                           0.66      1398
   macro avg       0.55      0.72      0.49      1398
weighted avg       0.93      0.66      0.75      1398


Confusion Matrix:
 [[865 457]
 [ 17  59]]


In [None]:
# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

### Obeservations:
- This model is no better than the one we had previously in that, it contains more false negatives.  Despite the fact that the NB model does have a substantially low accuracy (with many false positives), we used it in the ensemble meta model because of it's high recall rate for the under-represented class.
- When we retrain the ensemble model, we will stick with the out-of-the-box NB model.

## Neural Network Tuning

#### 1. Using 'roc_auc' for scoring

#### a) df_cleaned

In [None]:
nn_param_grid = {
    'model__hidden_layer_sizes': [(50,), (100,)],
    'model__alpha': [0.01, 0.1, 0.2],
    'model__learning_rate': ['invscaling', 'adaptive'],
    'model__solver': ['adam'],
    'model__max_iter': [200, 400, 500],
}

In [12]:
# Tune hyperparameters
tuned_nn_model_roc = tune_hyperparameters(nn_model, nn_param_grid, X_train, y_train, scoring='roc_auc')


# Get the best hyperparameters
best_nn_params_roc = tuned_nn_model_roc.best_params_


# Print or use the best hyperparameters as needed
print("Best Neural Net hyperparameters for ROC_AUC Score:", best_nn_params_roc)

Best Neural Net hyperparameters: {'model__alpha': 0.1, 'model__hidden_layer_sizes': (100,), 'model__learning_rate': 'adaptive', 'model__max_iter': 500, 'model__solver': 'adam'}


In [13]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_nn_model_roc.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_nn_model_roc.best_estimator_

Top Models:
                                                params  mean_test_score  \
251  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.932271   
221  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.932210   
169  {'model__alpha': 0.01, 'model__hidden_layer_si...         0.931967   
223  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.931725   
233  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.931582   

     std_test_score  
251        0.008816  
221        0.009603  
169        0.008716  
223        0.009118  
233        0.008125  

Best Model Accuracy: 0.9649499284692418
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1322
           1       0.86      0.42      0.57        76

    accuracy                           0.96      1398
   macro avg       0.92      0.71      0.77      1398
weighted avg       0.96      0.96      0.96      1398


C



In [None]:
# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

#### b) df_modified

In [196]:
nn_param_grid2 = {
    'model__hidden_layer_sizes': [(50,), (100,), (50, 50),],
    'model__alpha': [0.01, 0.1],
    'model__learning_rate': ['adaptive'],
    'model__solver': ['adam'],
    'model__max_iter': [400, 500, 600],
}

In [200]:
# Tune hyperparameters
tuned_nn_model_mod_roc = tune_hyperparameters(nn_model2, nn_param_grid2, X_train2, y_train2, scoring='roc_auc')


# Get the best hyperparameters
best_nn_params_mod_roc = tuned_nn_model_mod_roc.best_params_


# Print or use the best hyperparameters as needed
print("Best Neural Net hyperparameters for ROC_AUC Score:", best_nn_params_mod_roc)

Best Neural Net hyperparameters for ROC_AUC Score: {'model__alpha': 0.1, 'model__hidden_layer_sizes': (100,), 'model__learning_rate': 'adaptive', 'model__max_iter': 500, 'model__solver': 'adam'}


In [201]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_nn_model_mod_roc.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model_nn_mod_roc = tuned_nn_model_mod_roc.best_estimator_

Top Models:
                                               params  mean_test_score  \
13  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.930651   
16  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.930200   
1   {'model__alpha': 0.01, 'model__hidden_layer_si...         0.929340   
12  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.928897   
9   {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.928891   

    std_test_score  
13        0.011047  
16        0.008871  
1         0.009728  
12        0.006092  
9         0.010054  


In [202]:
# Evaluate the best model
y_pred_best2 = best_model_nn_mod_roc.predict(X_test2)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test2, y_pred_best2)
classification_rep_best = classification_report(y_test2, y_pred_best2)
conf_matrix_best = confusion_matrix(y_test2, y_pred_best2)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9656652360515021
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      1322
           1       0.79      0.50      0.61        76

    accuracy                           0.97      1398
   macro avg       0.88      0.75      0.80      1398
weighted avg       0.96      0.97      0.96      1398


Confusion Matrix:
 [[1312   10]
 [  38   38]]


#### 2. Using 'f1' for scoring

#### a) df_cleaned

In [16]:
# Tune hyperparameters
tuned_nn_model_f1 = tune_hyperparameters(nn_model, nn_param_grid, X_train, y_train, scoring='f1')


# Get the best hyperparameters
best_nn_params_f1 = tuned_nn_model_f1.best_params_


# Print or use the best hyperparameters as needed
print("Best Neural Net hyperparameters for F1 Score:", best_nn_params_f1)

Best Neural Net hyperparameters: {'model__alpha': 0.01, 'model__hidden_layer_sizes': (100,), 'model__learning_rate': 'adaptive', 'model__max_iter': 400, 'model__solver': 'adam'}


In [17]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_nn_model_f1.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_nn_model_f1.best_estimator_

Top Models:
                                               params  mean_test_score  \
10  {'model__alpha': 0.01, 'model__hidden_layer_si...         0.612771   
1   {'model__alpha': 0.01, 'model__hidden_layer_si...         0.607381   
7   {'model__alpha': 0.01, 'model__hidden_layer_si...         0.599458   
14  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.597875   
8   {'model__alpha': 0.01, 'model__hidden_layer_si...         0.596106   

    std_test_score  
10        0.047941  
1         0.053718  
7         0.051740  
14        0.046307  
8         0.061256  

Best Model Accuracy: 0.9678111587982833
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1322
           1       0.72      0.67      0.69        76

    accuracy                           0.97      1398
   macro avg       0.85      0.83      0.84      1398
weighted avg       0.97      0.97      0.97      1398


Confusion Mat



In [None]:
# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

#### b) df_modified

In [196]:
nn_param_grid2 = {
    'model__hidden_layer_sizes': [(50,), (100,), (50, 50),],
    'model__alpha': [0.01, 0.1],
    'model__learning_rate': ['adaptive'],
    'model__solver': ['adam'],
    'model__max_iter': [400, 500, 600],
}

In [197]:
# Tune hyperparameters
tuned_nn_model_mod_f1 = tune_hyperparameters(nn_model2, nn_param_grid2, X_train2, y_train2, scoring='f1')


# Get the best hyperparameters
best_nn_params_mod_f1 = tuned_nn_model_mod_f1.best_params_


# Print or use the best hyperparameters as needed
print("Best Neural Net hyperparameters for F1 Score:", best_nn_params_mod_f1)

Best Neural Net hyperparameters for F1 Score: {'model__alpha': 0.01, 'model__hidden_layer_sizes': (50,), 'model__learning_rate': 'adaptive', 'model__max_iter': 500, 'model__solver': 'adam'}


In [198]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_nn_model_mod_f1.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model_nn_mod_f1 = tuned_nn_model_mod_f1.best_estimator_

Top Models:
                                              params  mean_test_score  \
1  {'model__alpha': 0.01, 'model__hidden_layer_si...         0.614777   
5  {'model__alpha': 0.01, 'model__hidden_layer_si...         0.604971   
0  {'model__alpha': 0.01, 'model__hidden_layer_si...         0.598024   
2  {'model__alpha': 0.01, 'model__hidden_layer_si...         0.590230   
8  {'model__alpha': 0.01, 'model__hidden_layer_si...         0.589899   

   std_test_score  
1        0.056036  
5        0.049803  
0        0.058534  
2        0.057334  
8        0.049547  


In [199]:
# Evaluate the best model
y_pred_best2 = best_model_nn_mod_f1.predict(X_test2)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test2, y_pred_best2)
classification_rep_best = classification_report(y_test2, y_pred_best2)
conf_matrix_best = confusion_matrix(y_test2, y_pred_best2)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9713876967095851
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      1322
           1       0.82      0.61      0.70        76

    accuracy                           0.97      1398
   macro avg       0.90      0.80      0.84      1398
weighted avg       0.97      0.97      0.97      1398


Confusion Matrix:
 [[1312   10]
 [  30   46]]


In [203]:
# Save the models and their best parameters
joblib.dump(best_model_nn_mod_f1, 'models/best_model_nn_modified.pkl')
joblib.dump(best_nn_params_mod_f1, 'models/best_nn_params_modified.pkl')

['models/best_nn_params_modified.pkl']

## Random Forest Tuning

#### 1. Using 'roc_auc' for scoring

#### a) df_cleaned

In [116]:
rf_param_grid = {
    'model__n_estimators': [50, 100, 200, 300],
    'model__min_samples_split': [2, 3],
    'model__min_samples_leaf': [1, 2],
    'model__max_features': ['sqrt'],
}

In [123]:
# Tune hyperparameters
tuned_rf_model_roc = tune_hyperparameters(rf_model, rf_param_grid, X_train, y_train, scoring = 'roc_auc')


# Get the best hyperparameters
best_rf_params_roc = tuned_rf_model_roc.best_params_


# Print or use the best hyperparameters as needed
print("Best Random Forest hyperparameters for ROC_AUC Score:", best_rf_params_roc)

Best Random Forest hyperparameters for ROC_AUC Score: {'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 3, 'model__n_estimators': 300}


In [124]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_rf_model_roc.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_rf_model_roc.best_estimator_

Top Models:
                                               params  mean_test_score  \
7   {'model__max_features': 'sqrt', 'model__min_sa...         0.943244   
5   {'model__max_features': 'sqrt', 'model__min_sa...         0.942753   
2   {'model__max_features': 'sqrt', 'model__min_sa...         0.942538   
3   {'model__max_features': 'sqrt', 'model__min_sa...         0.942102   
15  {'model__max_features': 'sqrt', 'model__min_sa...         0.941010   

    std_test_score  
7         0.006298  
5         0.006421  
2         0.010510  
3         0.007518  
15        0.008933  


In [125]:
# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9656652360515021
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1322
           1       0.97      0.38      0.55        76

    accuracy                           0.97      1398
   macro avg       0.97      0.69      0.76      1398
weighted avg       0.97      0.97      0.96      1398


Confusion Matrix:
 [[1321    1]
 [  47   29]]


#### b) df_modified

In [219]:
rf_param_grid2 = {
    'model__n_estimators': [200, 300, 400, 500],
    'model__min_samples_split': [3, 5, 6],
    'model__min_samples_leaf': [1, 2],
    'model__max_features': ['sqrt'],
}

In [220]:
# Tune hyperparameters
tuned_rf_model_mod_roc = tune_hyperparameters(rf_model2, rf_param_grid2, X_train2, y_train2, scoring = 'roc_auc')


# Get the best hyperparameters
best_rf_params_mod_roc = tuned_rf_model_mod_roc.best_params_


# Print or use the best hyperparameters as needed
print("Best Random Forest hyperparameters for ROC_AUC Score:", best_rf_params_mod_roc)

Best Random Forest hyperparameters for ROC_AUC Score: {'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 3, 'model__n_estimators': 400}


In [221]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_rf_model_mod_roc.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model_rf_mod_roc = tuned_rf_model_mod_roc.best_estimator_

Top Models:
                                              params  mean_test_score  \
2  {'model__max_features': 'sqrt', 'model__min_sa...         0.945960   
1  {'model__max_features': 'sqrt', 'model__min_sa...         0.945637   
7  {'model__max_features': 'sqrt', 'model__min_sa...         0.945626   
3  {'model__max_features': 'sqrt', 'model__min_sa...         0.945365   
5  {'model__max_features': 'sqrt', 'model__min_sa...         0.945195   

   std_test_score  
2        0.008073  
1        0.005652  
7        0.005627  
3        0.004738  
5        0.005328  


In [222]:
# Evaluate the best model
y_pred_best2 = best_model_rf_mod_roc.predict(X_test2)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test2, y_pred_best2)
classification_rep_best = classification_report(y_test2, y_pred_best2)
conf_matrix_best = confusion_matrix(y_test2, y_pred_best2)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9642346208869814
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1322
           1       0.91      0.38      0.54        76

    accuracy                           0.96      1398
   macro avg       0.94      0.69      0.76      1398
weighted avg       0.96      0.96      0.96      1398


Confusion Matrix:
 [[1319    3]
 [  47   29]]


In [223]:
# Save the models and their best parameters
joblib.dump(best_model_rf_mod_roc, 'models/best_model_rf_modified.pkl')
joblib.dump(best_rf_params_mod_roc, 'models/best_rf_params_modified.pkl')

['models/best_rf_params_modified.pkl']

#### 2. Using recall score for under-represented class (pos =1)

#### a) df_cleaned

In [129]:
from sklearn.metrics import make_scorer, recall_score

# Tune hyperparameters
tuned_rf_model_recall = tune_hyperparameters(rf_model, rf_param_grid, X_train, y_train, 
                                          scoring = make_scorer(recall_score, pos_label=1))


# Get the best hyperparameters
best_rf_params_recall = tuned_rf_model_recall.best_params_


# Print or use the best hyperparameters as needed
print("Best Random Forest hyperparameters for Recall Score:", best_rf_params_recall)

Best Random Forest hyperparameters for Recall Score: {'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 50}


In [130]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_rf_model_recall.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_rf_model_recall.best_estimator_

Top Models:
                                              params  mean_test_score  \
0  {'model__max_features': 'sqrt', 'model__min_sa...         0.414047   
3  {'model__max_features': 'sqrt', 'model__min_sa...         0.414047   
2  {'model__max_features': 'sqrt', 'model__min_sa...         0.411062   
5  {'model__max_features': 'sqrt', 'model__min_sa...         0.411018   
6  {'model__max_features': 'sqrt', 'model__min_sa...         0.408077   

   std_test_score  
0        0.046326  
3        0.051625  
2        0.063153  
5        0.054760  
6        0.061148  


In [131]:
# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9649499284692418
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1322
           1       1.00      0.36      0.52        76

    accuracy                           0.96      1398
   macro avg       0.98      0.68      0.75      1398
weighted avg       0.97      0.96      0.96      1398


Confusion Matrix:
 [[1322    0]
 [  49   27]]


### Results:
- no signifcant change to model performance compared to the base model

#### b) df_modified

In [211]:
rf_param_grid2 = {
    'model__n_estimators': [200, 300, 500],
    'model__min_samples_split': [2, 3, 5],
    'model__min_samples_leaf': [1, 2],
    'model__max_features': ['sqrt'],
}

In [212]:
# Tune hyperparameters
tuned_rf_model_mod_recall = tune_hyperparameters(rf_model2, rf_param_grid2, X_train2, y_train2, 
                                          scoring = make_scorer(recall_score, pos_label=1))


# Get the best hyperparameters
best_rf_params_mod_recall = tuned_rf_model_mod_recall.best_params_


# Print or use the best hyperparameters as needed
print("Best Random Forest hyperparameters for Recall Score:", best_rf_params_mod_recall)

Best Random Forest hyperparameters for Recall Score: {'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 3, 'model__n_estimators': 300}


In [213]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_rf_model_mod_recall.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model_rf_mod_recall = tuned_rf_model_mod_recall.best_estimator_

Top Models:
                                              params  mean_test_score  \
4  {'model__max_features': 'sqrt', 'model__min_sa...         0.384504   
0  {'model__max_features': 'sqrt', 'model__min_sa...         0.381563   
3  {'model__max_features': 'sqrt', 'model__min_sa...         0.381519   
2  {'model__max_features': 'sqrt', 'model__min_sa...         0.375637   
1  {'model__max_features': 'sqrt', 'model__min_sa...         0.372651   

   std_test_score  
4        0.039647  
0        0.039149  
3        0.045856  
2        0.052568  
1        0.051340  


In [214]:
# Evaluate the best model
y_pred_best2 = best_model_rf_mod_recall.predict(X_test2)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test2, y_pred_best2)
classification_rep_best = classification_report(y_test2, y_pred_best2)
conf_matrix_best = confusion_matrix(y_test2, y_pred_best2)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9628040057224606
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1322
           1       0.90      0.36      0.51        76

    accuracy                           0.96      1398
   macro avg       0.93      0.68      0.75      1398
weighted avg       0.96      0.96      0.96      1398


Confusion Matrix:
 [[1319    3]
 [  49   27]]


## Gradient Boost Tuning

#### 1. Using 'roc_auc' for scoring

#### a) df_cleaned

In [162]:
gb_param_grid = {
    'model__n_estimators': [300],
    'model__max_depth': [3],
    #'model__min_samples_split': [2, 5],
    #'model__min_samples_leaf': [2, 3],
    #'model__subsample': [0.8, 0.9],
    'model__learning_rate': [0.1]
}

In [163]:
# Tune hyperparameters
tuned_gb_model_roc = tune_hyperparameters(gb_model, gb_param_grid, X_train, y_train, scoring='roc_auc')


# Get the best hyperparameters
best_gb_params_roc = tuned_gb_model_roc.best_params_


# Print or use the best hyperparameters as needed
print("Best Gradient Boost hyperparameters for ROC_AUC Score:", best_gb_params_roc)

Best Gradient Boost hyperparameters for ROC_AUC Score: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 300}


In [164]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_gb_model_roc.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_gb_model_roc.best_estimator_

Top Models:
                                              params  mean_test_score  \
0  {'model__learning_rate': 0.1, 'model__max_dept...          0.93398   

   std_test_score  
0        0.016516  


In [165]:
# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9742489270386266
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      1322
           1       0.86      0.63      0.73        76

    accuracy                           0.97      1398
   macro avg       0.92      0.81      0.86      1398
weighted avg       0.97      0.97      0.97      1398


Confusion Matrix:
 [[1314    8]
 [  28   48]]


### Results:
- We have a substantial improvement in recall on the under-represented class.

#### b) df_modified

In [233]:
gb_param_grid2 = {
    'model__n_estimators': [400, 500, 600],
    'model__max_depth': [3, 4],
    #'model__min_samples_split': [2, 5],
    #'model__min_samples_leaf': [2, 3],
    #'model__subsample': [0.8, 0.9],
    'model__learning_rate': [0.1, 0.2]
}

In [234]:
# Tune hyperparameters
tuned_gb_model_mod_roc = tune_hyperparameters(gb_model2, gb_param_grid2, X_train2, y_train2, scoring='roc_auc')


# Get the best hyperparameters
best_gb_params_mod_roc = tuned_gb_model_mod_roc.best_params_


# Print or use the best hyperparameters as needed
print("Best Gradient Boost hyperparameters for ROC_AUC Score:", best_gb_params_mod_roc)

Best Gradient Boost hyperparameters for ROC_AUC Score: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 400}


In [235]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_gb_model_mod_roc.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model_gb_mod_roc = tuned_gb_model_mod_roc.best_estimator_

Top Models:
                                              params  mean_test_score  \
0  {'model__learning_rate': 0.1, 'model__max_dept...         0.934056   
1  {'model__learning_rate': 0.1, 'model__max_dept...         0.932812   
2  {'model__learning_rate': 0.1, 'model__max_dept...         0.932633   
7  {'model__learning_rate': 0.2, 'model__max_dept...         0.922623   
6  {'model__learning_rate': 0.2, 'model__max_dept...         0.922213   

   std_test_score  
0        0.012834  
1        0.013407  
2        0.013093  
7        0.019075  
6        0.017607  


In [236]:
# Evaluate the best model
y_pred_best2 = best_model_gb_mod_roc.predict(X_test2)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test2, y_pred_best2)
classification_rep_best = classification_report(y_test2, y_pred_best2)
conf_matrix_best = confusion_matrix(y_test2, y_pred_best2)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)


Best Model Accuracy: 0.9706723891273248
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      1322
           1       0.81      0.61      0.69        76

    accuracy                           0.97      1398
   macro avg       0.89      0.80      0.84      1398
weighted avg       0.97      0.97      0.97      1398


Confusion Matrix:
 [[1311   11]
 [  30   46]]


In [232]:
# Save the models and their best parameters
joblib.dump(best_model_gb_mod_roc, 'models/best_model_gb_modified.pkl')
joblib.dump(best_gb_params_mod_roc, 'models/best_gb_params_modified.pkl')

['models/best_gb_params_modified.pkl']

## Saving Tuned Models With Improved Recall
- We will unpack these to use in the model_training_notebook when we re-train our ensemble models

In [40]:
# Save the models and their best parameters
joblib.dump(tuned_xgb_model_recall, 'models/tuned_xgb_model.pkl')
joblib.dump(best_xgb_params_recall, 'models/best_xgb_params.pkl')

joblib.dump(tuned_nn_model_roc, 'models/tuned_nn_model.pkl')
joblib.dump(best_nn_params_roc, 'models/best_nn_params.pkl')

joblib.dump(tuned_gb_model_roc, 'models/tuned_gb_model.pkl')
joblib.dump(best_gb_params_roc, 'models/best_gb_params.pkl')

['models/best_nn_params.pkl']

