# Hyperparameter Tuning

In [1]:
import os
os.chdir('..')

In [22]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from src.hyperparameter_tuning import tune_hyperparameters
from src.model_training import build_preprocessor, build_full_pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# ... (Read data, df_construct, and other necessary steps)

In [4]:
# Use this code in the event that it's needed
df = pd.read_csv('data/processed_data/df_cleaned.csv')
X = df.drop('at_risk', axis=1)
y = df.at_risk

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
numeric_features = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

preprocessor = build_preprocessor(numeric_features, categorical_features)
model = RandomForestClassifier()
full_pipeline = build_full_pipeline(preprocessor, model)

In [18]:
grid_search = tune_hyperparameters(full_pipeline, {}, X_train, y_train)

In [19]:
# Analyze grid search results
cv_results = pd.DataFrame(grid_search.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = grid_search.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

Top Models:
  params  mean_test_score  std_test_score
0     {}         0.940868        0.015465

Best Model Accuracy: 0.9635193133047211
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1322
           1       0.93      0.36      0.51        76

    accuracy                           0.96      1398
   macro avg       0.95      0.68      0.75      1398
weighted avg       0.96      0.96      0.96      1398


Confusion Matrix:
 [[1320    2]
 [  49   27]]


In [9]:
top_models['params'][54]

{'model__class_weight': None,
 'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 5,
 'model__n_estimators': 300}

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from hyperparameter_tuning import tune_hyperparameters

# Example for RandomForestClassifier
rf_pipeline = make_pipeline(StandardScaler(), RandomForestClassifier())
rf_param_grid = {
    'randomforestclassifier__n_estimators': [30, 50, 100, 200, 300],
    'randomforestclassifier__max_depth': [None, 2, 3, 5, 10, 20, 30],
    'randomforestclassifier__class_weight': [None, 'balanced'],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'randomforestclassifier__max_features': ['auto', 'sqrt', 'log2']
}
rf_grid_search = tune_hyperparameters(rf_pipeline, rf_param_grid, X_train, y_train)

# Example for GradientBoostingClassifier
gb_pipeline = make_pipeline(StandardScaler(), GradientBoostingClassifier())
gb_param_grid = {
    'gradientboostingclassifier__n_estimators': [50, 100, 200],
    'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.2],
    'gradientboostingclassifier__max_depth': [3, 5, 7],
    'gradientboostingclassifier__min_samples_split': [2, 5, 10],
    'gradientboostingclassifier__min_samples_leaf': [1, 2, 4],
    'gradientboostingclassifier__max_features': ['auto', 'sqrt', 'log2']
}
gb_grid_search = tune_hyperparameters(gb_pipeline, gb_param_grid, X_train, y_train)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
numeric_features = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

preprocessor = build_preprocessor(numeric_features, categorical_features)

In [23]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

nb_model = build_full_pipeline(preprocessor,GaussianNB())
nn_model = build_full_pipeline(preprocessor,MLPClassifier())
xgb_model = build_full_pipeline(preprocessor,XGBClassifier())

In [36]:
# Hyperparameter grid for XGBoost
xgb_param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__scale_pos_weight': [1, 3, 5]  # Adjust based on the class imbalance
}

# Hyperparameter grid for Gaussian Naive Bayes
nb_param_grid = {
    'model__priors': [None, [0.1, 0.9], [0.5, 0.5], [0.9, 0.1]]
}

nn_param_grid = {
    'model__hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'model__activation': ['relu', 'tanh'],
    'model__alpha': [0.0001, 0.001, 0.01],
    'model__learning_rate': ['constant', 'invscaling', 'adaptive'],
    'model__solver': ['sgd', 'adam'],
    'model__max_iter': [200, 400, 600],
}

## XGB Tuning

In [27]:
# Tune hyperparameters
tuned_xgb_model = tune_hyperparameters(xgb_model, xgb_param_grid, X_train, y_train)


# Get the best hyperparameters
best_xgb_params = tuned_xgb_model.best_params_


# Print or use the best hyperparameters as needed
print("Best XGBoost hyperparameters:", best_xgb_params)

Best XGBoost hyperparameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 100, 'model__scale_pos_weight': 3}


In [28]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_xgb_model.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_xgb_model.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

Top Models:
                                               params  mean_test_score  \
40  {'model__learning_rate': 0.1, 'model__max_dept...         0.937401   
37  {'model__learning_rate': 0.1, 'model__max_dept...         0.935185   
38  {'model__learning_rate': 0.1, 'model__max_dept...         0.934876   
35  {'model__learning_rate': 0.1, 'model__max_dept...         0.933457   
41  {'model__learning_rate': 0.1, 'model__max_dept...         0.933407   

    std_test_score  
40        0.020689  
37        0.021410  
38        0.020055  
35        0.024558  
41        0.020956  

Best Model Accuracy: 0.9699570815450643
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      1322
           1       0.76      0.66      0.70        76

    accuracy                           0.97      1398
   macro avg       0.87      0.82      0.84      1398
weighted avg       0.97      0.97      0.97      1398


Confusion Mat

### Observations:
- Compared to the base XGBoost performance seen in the model_training_notebook, we do have some improvement to the recall score of the under-represented class.  It does come at the cost of a slight increase in false positives, but worth the nearly 8% increase in recall.

## Naive Bayes Tuning
- There is not much tuning to do for NB, but we'll make some changes to the "priors" parameter to see if we can come up with even a slightly improved model from the one before

In [32]:
# Tune hyperparameters
tuned_nb_model = tune_hyperparameters(nb_model, nb_param_grid, X_train, y_train)


# Get the best hyperparameters
best_nb_params = tuned_nb_model.best_params_


# Print or use the best hyperparameters as needed
print("Best Naive Bayes hyperparameters:", best_nb_params)

Best Naive Bayes hyperparameters: {'model__priors': None}


In [33]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_nb_model.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_nb_model.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

Top Models:
                          params  mean_test_score  std_test_score
0        {'model__priors': None}         0.738422        0.026415
1  {'model__priors': [0.1, 0.9]}         0.738422        0.026415
2  {'model__priors': [0.5, 0.5]}         0.738422        0.026415
3  {'model__priors': [0.9, 0.1]}         0.738422        0.026415

Best Model Accuracy: 0.6609442060085837
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.65      0.78      1322
           1       0.11      0.78      0.20        76

    accuracy                           0.66      1398
   macro avg       0.55      0.72      0.49      1398
weighted avg       0.93      0.66      0.75      1398


Confusion Matrix:
 [[865 457]
 [ 17  59]]


### Obeservations:
- This model is no better than the one we had previously in that, it contains more false negatives.  Despite the fact that the NB model does have a substantially low accuracy (with many false positives), we used it in the ensemble meta model because of it's high recall rate for the under-represented class.
- When we retrain the ensemble model, we will stick with the out-of-the-box NB model.

## Neural Network Tuning

In [None]:
# Tune hyperparameters
tuned_nn_model = tune_hyperparameters(nn_model, nn_param_grid, X_train, y_train)


# Get the best hyperparameters
best_nn_params = tuned_nn_model.best_params_


# Print or use the best hyperparameters as needed
print("Best Neural Net hyperparameters:", best_nn_params)

In [None]:
# Analyze grid search results
cv_results = pd.DataFrame(tuned_nn_model.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = tuned_nn_model.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)