# Hyperparameter Tuning

In [1]:
import os
os.chdir('..')

In [2]:
import pickle
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from src.hyperparameter_tuning import tune_hyperparameters
from src.model_training import build_preprocessor, build_full_pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# ... (Read data, df_construct, and other necessary steps)

In [4]:
# Use this code in the event that it's needed
df = pd.read_csv('data/processed_data/df_cleaned.csv')
X = df.drop('at_risk', axis=1)
y = df.at_risk

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
numeric_features = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

preprocessor = build_preprocessor(numeric_features, categorical_features)
model = RandomForestClassifier()
full_pipeline = build_full_pipeline(preprocessor, model)

In [18]:
grid_search = tune_hyperparameters(full_pipeline, {}, X_train, y_train)

In [19]:
# Analyze grid search results
cv_results = pd.DataFrame(grid_search.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = grid_search.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

Top Models:
  params  mean_test_score  std_test_score
0     {}         0.940868        0.015465

Best Model Accuracy: 0.9635193133047211
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1322
           1       0.93      0.36      0.51        76

    accuracy                           0.96      1398
   macro avg       0.95      0.68      0.75      1398
weighted avg       0.96      0.96      0.96      1398


Confusion Matrix:
 [[1320    2]
 [  49   27]]


In [9]:
top_models['params'][54]

{'model__class_weight': None,
 'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 5,
 'model__n_estimators': 300}

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from hyperparameter_tuning import tune_hyperparameters

# Example for RandomForestClassifier
rf_pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())
rf_param_grid = {
    'randomforestclassifier__n_estimators': [30, 50, 100, 200, 300],
    'randomforestclassifier__max_depth': [None, 2, 3, 5, 10, 20, 30],
    'randomforestclassifier__class_weight': [None, 'balanced'],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'randomforestclassifier__max_features': ['auto', 'sqrt', 'log2']
}
rf_grid_search = tune_hyperparameters(rf_pipeline, rf_param_grid, X_train, y_train)

# Example for GradientBoostingClassifier
gb_pipeline = make_pipeline(StandardScaler(), GradientBoostingClassifier())
gb_param_grid = {
    'gradientboostingclassifier__n_estimators': [50, 100, 200],
    'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.2],
    'gradientboostingclassifier__max_depth': [3, 5, 7],
    'gradientboostingclassifier__min_samples_split': [2, 5, 10],
    'gradientboostingclassifier__min_samples_leaf': [1, 2, 4],
    'gradientboostingclassifier__max_features': ['auto', 'sqrt', 'log2']
}
gb_grid_search = tune_hyperparameters(gb_pipeline, gb_param_grid, X_train, y_train)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
numeric_features = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

preprocessor = build_preprocessor(numeric_features, categorical_features)

In [7]:
# Split the data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

nb_model = build_full_pipeline(preprocessor,GaussianNB())
nn_model = build_full_pipeline(preprocessor,MLPClassifier())
xgb_model = build_full_pipeline(preprocessor,XGBClassifier())

In [15]:
# Hyperparameter grid for XGBoost
xgb_param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__scale_pos_weight': [1, 3, 5]  # Adjust based on the class imbalance
}

# Hyperparameter grid for Gaussian Naive Bayes
nb_param_grid = {
    'model__priors': [None, [0.1, 0.9], [0.5, 0.5], [0.9, 0.1]]
}

nn_param_grid = {
    'model__hidden_layer_sizes': [(50,), (100,)],
    'model__alpha': [0.01, 0.1, 0.2],
    'model__learning_rate': ['invscaling', 'adaptive'],
    'model__solver': ['adam'],
    'model__max_iter': [200, 400, 500],
}

## XGB Tuning

### 1. Using Recall as scoring metric

In [8]:
# Tune hyperparameters
tuned_xgb_model = tune_hyperparameters(xgb_model, xgb_param_grid, X_train, y_train, scoring='recall')

# Get the best hyperparameters
best_xgb_params = tuned_xgb_model.best_params_

# Print or use the best hyperparameters as needed
print("Best XGBoost hyperparameters:", best_xgb_params)

Best XGBoost hyperparameters: {'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__scale_pos_weight': 5}


In [9]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_xgb_model.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_xgb_model.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

Top Models:
                                               params  mean_test_score  \
59  {'model__learning_rate': 0.2, 'model__max_dept...         0.677305   
62  {'model__learning_rate': 0.2, 'model__max_dept...         0.677305   
35  {'model__learning_rate': 0.1, 'model__max_dept...         0.665496   
56  {'model__learning_rate': 0.2, 'model__max_dept...         0.665408   
38  {'model__learning_rate': 0.1, 'model__max_dept...         0.653600   

    std_test_score  
59        0.044918  
62        0.049499  
35        0.034971  
56        0.042583  
38        0.054818  

Best Model Accuracy: 0.9570815450643777
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      1322
           1       0.58      0.75      0.66        76

    accuracy                           0.96      1398
   macro avg       0.78      0.86      0.82      1398
weighted avg       0.96      0.96      0.96      1398


Confusion Mat

### Observations:
- Compared to the base XGBoost performance seen in the model_training_notebook, we do have some improvement to the recall score of the under-represented class.  It does come at the cost of more false positives, but worth the nearly 20% increase in recall.

### 2. Using roc_auc as scoring metric

In [9]:
# Tune hyperparameters
tuned_xgb_model = tune_hyperparameters(xgb_model, xgb_param_grid, X_train, y_train, scoring='roc_auc')

# Get the best hyperparameters
best_xgb_params = tuned_xgb_model.best_params_

# Print or use the best hyperparameters as needed
print("Best XGBoost hyperparameters:", best_xgb_params)

Best XGBoost hyperparameters: {'model__learning_rate': 0.2, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__scale_pos_weight': 1}


In [10]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_xgb_model.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_xgb_model.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

Top Models:
                                               params  mean_test_score  \
60  {'model__learning_rate': 0.2, 'model__max_dept...         0.938204   
62  {'model__learning_rate': 0.2, 'model__max_dept...         0.937879   
44  {'model__learning_rate': 0.1, 'model__max_dept...         0.936835   
33  {'model__learning_rate': 0.1, 'model__max_dept...         0.935167   
57  {'model__learning_rate': 0.2, 'model__max_dept...         0.935029   

    std_test_score  
60        0.009629  
62        0.005715  
44        0.009016  
33        0.008165  
57        0.010094  

Best Model Accuracy: 0.969241773962804
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      1322
           1       0.79      0.59      0.68        76

    accuracy                           0.97      1398
   macro avg       0.88      0.79      0.83      1398
weighted avg       0.97      0.97      0.97      1398


Confusion Matr

### Observations:
- No increase in recall from the base model.

## Naive Bayes Tuning
- There is not much tuning to do for NB, but we'll make some changes to the "priors" parameter to see if we can come up with even a slightly improved model from the one before

In [32]:
# Tune hyperparameters
tuned_nb_model = tune_hyperparameters(nb_model, nb_param_grid, X_train, y_train)


# Get the best hyperparameters
best_nb_params = tuned_nb_model.best_params_


# Print or use the best hyperparameters as needed
print("Best Naive Bayes hyperparameters:", best_nb_params)

Best Naive Bayes hyperparameters: {'model__priors': None}


In [33]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_nb_model.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_nb_model.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

Top Models:
                          params  mean_test_score  std_test_score
0        {'model__priors': None}         0.738422        0.026415
1  {'model__priors': [0.1, 0.9]}         0.738422        0.026415
2  {'model__priors': [0.5, 0.5]}         0.738422        0.026415
3  {'model__priors': [0.9, 0.1]}         0.738422        0.026415

Best Model Accuracy: 0.6609442060085837
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.65      0.78      1322
           1       0.11      0.78      0.20        76

    accuracy                           0.66      1398
   macro avg       0.55      0.72      0.49      1398
weighted avg       0.93      0.66      0.75      1398


Confusion Matrix:
 [[865 457]
 [ 17  59]]


### Obeservations:
- This model is no better than the one we had previously in that, it contains more false negatives.  Despite the fact that the NB model does have a substantially low accuracy (with many false positives), we used it in the ensemble meta model because of it's high recall rate for the under-represented class.
- When we retrain the ensemble model, we will stick with the out-of-the-box NB model.

## Neural Network Tuning

#### 1. Using 'roc_auc' for scoring

In [12]:
# Tune hyperparameters
tuned_nn_model = tune_hyperparameters(nn_model, nn_param_grid, X_train, y_train, scoring='roc_auc')


# Get the best hyperparameters
best_nn_params = tuned_nn_model.best_params_


# Print or use the best hyperparameters as needed
print("Best Neural Net hyperparameters:", best_nn_params)

Best Neural Net hyperparameters: {'model__alpha': 0.1, 'model__hidden_layer_sizes': (100,), 'model__learning_rate': 'adaptive', 'model__max_iter': 500, 'model__solver': 'adam'}


In [13]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_nn_model.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_nn_model.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

Top Models:
                                                params  mean_test_score  \
251  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.932271   
221  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.932210   
169  {'model__alpha': 0.01, 'model__hidden_layer_si...         0.931967   
223  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.931725   
233  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.931582   

     std_test_score  
251        0.008816  
221        0.009603  
169        0.008716  
223        0.009118  
233        0.008125  

Best Model Accuracy: 0.9649499284692418
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1322
           1       0.86      0.42      0.57        76

    accuracy                           0.96      1398
   macro avg       0.92      0.71      0.77      1398
weighted avg       0.96      0.96      0.96      1398


C



#### 2. Using 'f1' for scoring

In [16]:
# Tune hyperparameters
tuned_nn_model = tune_hyperparameters(nn_model, nn_param_grid, X_train, y_train, scoring='f1')


# Get the best hyperparameters
best_nn_params = tuned_nn_model.best_params_


# Print or use the best hyperparameters as needed
print("Best Neural Net hyperparameters:", best_nn_params)

Best Neural Net hyperparameters: {'model__alpha': 0.01, 'model__hidden_layer_sizes': (100,), 'model__learning_rate': 'adaptive', 'model__max_iter': 400, 'model__solver': 'adam'}


In [17]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_nn_model.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_nn_model.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

Top Models:
                                               params  mean_test_score  \
10  {'model__alpha': 0.01, 'model__hidden_layer_si...         0.612771   
1   {'model__alpha': 0.01, 'model__hidden_layer_si...         0.607381   
7   {'model__alpha': 0.01, 'model__hidden_layer_si...         0.599458   
14  {'model__alpha': 0.1, 'model__hidden_layer_siz...         0.597875   
8   {'model__alpha': 0.01, 'model__hidden_layer_si...         0.596106   

    std_test_score  
10        0.047941  
1         0.053718  
7         0.051740  
14        0.046307  
8         0.061256  

Best Model Accuracy: 0.9678111587982833
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      1322
           1       0.72      0.67      0.69        76

    accuracy                           0.97      1398
   macro avg       0.85      0.83      0.84      1398
weighted avg       0.97      0.97      0.97      1398


Confusion Mat



## Saving the Tuned Models
- We will unpack these to use in the model_training_notebook when we re-train our meta model

In [40]:
# Save the models and best parameters
joblib.dump(tuned_xgb_model, 'models/tuned_xgb_model.pkl')
joblib.dump(best_xgb_params, 'models/best_xgb_params.pkl')

joblib.dump(tuned_nn_model, 'models/tuned_nn_model.pkl')
joblib.dump(best_nn_params, 'models/best_nn_params.pkl')

['models/best_nn_params.pkl']

