# Hyperparameter Tuning

In [1]:
import os

# Later cells import from the `src` package and read `data/...` with paths
# relative to the working directory, so the working directory must be the
# project root. Only step up when `src` is not visible yet: this keeps the
# cell idempotent, so re-running it does not keep climbing the directory tree.
if not os.path.isdir('src'):
    os.chdir('..')

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.data_preprocessing import df_construct, add_eng_values, alter_term_gender
from src.model_training import build_preprocessor, build_model, build_full_pipeline
from src.hyperparameter_tuning import tune_hyperparameters
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Placeholder: read the raw data, then run df_construct and the other required preprocessing steps here.

In [4]:
# Load the previously cleaned dataset from disk, then separate the target
# column (`at_risk`) from the remaining columns, which become the features.
df = pd.read_csv('data/processed_data/df_cleaned.csv')
y = df['at_risk']
X = df.drop(columns='at_risk')

In [9]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions of the target the same in both splits, and the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# Partition the training columns by dtype: numeric columns go to one list,
# object-typed columns to the other.
numeric_features = list(X_train.select_dtypes(include='number').columns)
categorical_features = list(X_train.select_dtypes(include='object').columns)

# Assemble the pieces with the project helpers: a preprocessor built from the
# two column lists, a model, and the pipeline that combines them.
preprocessor = build_preprocessor(
    numeric_features,
    categorical_features,
)
model = build_model()
full_pipeline = build_full_pipeline(
    preprocessor,
    model,
)

In [11]:
# Run the hyperparameter search on the training split only. The returned
# object is inspected further down through `cv_results_` and
# `best_estimator_`.
# NOTE(review): the search space and CV settings live in
# src/hyperparameter_tuning.py and are not visible here.
grid_search = tune_hyperparameters(
    full_pipeline,
    X_train,
    y_train,
)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/jlo/anaconda3/envs/newenv/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/hv/bb146n9d0yv30b1xnd05pwcr0000gn/T/ipykernel_58487/4028494003.py", line 1, in <module>
    grid_search = tune_hyperparameters(full_pipeline, X_train, y_train)
  File "/Users/jlo/Desktop/LHL/LHLProjects/LHL-Capstone-Project/src/hyperparameter_tuning.py", line 18, in tune_hyperparameters
  File "/Users/jlo/anaconda3/envs/newenv/lib/python3.8/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/jlo/anaconda3/envs/newenv/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 898, in fit
    self._run_search(evaluate_candidates)
  File "/Users/jlo/anaconda3/envs/newenv/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 1419, in _run_search
    evaluate_candid

In [8]:
# Turn the raw search results into a DataFrame, one row per candidate.
cv_results = pd.DataFrame(grid_search.cv_results_)

# Keep the five candidates with the highest mean cross-validated score.
top_models = cv_results.sort_values('mean_test_score', ascending=False).head(5)
print("Top Models:")
print(top_models.loc[:, ['params', 'mean_test_score', 'std_test_score']])

# Take the best estimator found by the search and predict on the held-out
# test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Compute the test-set metrics: overall accuracy, the per-class report and
# the confusion matrix.
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

# Report the metrics.
print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

Top Models:
                                                params  mean_test_score  \
36   {'model__class_weight': None, 'model__max_dept...         0.962783   
581  {'model__class_weight': 'balanced', 'model__ma...         0.962782   
583  {'model__class_weight': 'balanced', 'model__ma...         0.962603   
39   {'model__class_weight': None, 'model__max_dept...         0.962425   
618  {'model__class_weight': 'balanced', 'model__ma...         0.962246   

     std_test_score  
36         0.004307  
581        0.004902  
583        0.004729  
39         0.003361  
618        0.004411  

Best Model Accuracy: 0.9656652360515021
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1322
           1       0.91      0.41      0.56        76

    accuracy                           0.97      1398
   macro avg       0.94      0.70      0.77      1398
weighted avg       0.96      0.97      0.96      1398


C

In [9]:
# Show the full hyperparameter dict of the top-ranked candidate. Positional
# access is used because the row labels of `top_models` are candidate indices
# that change from run to run, so a hard-coded label can raise KeyError.
top_models['params'].iloc[0]

{'model__class_weight': None,
 'model__max_depth': None,
 'model__max_features': 'sqrt',
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 5,
 'model__n_estimators': 300}

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Import through the `src` package, matching the import cell at the top of the
# notebook; a bare `hyperparameter_tuning` module only resolves if `src/`
# itself happens to be on sys.path.
from src.hyperparameter_tuning import tune_hyperparameters

# Same seed as the train/test split, so repeated searches are comparable.
SEED = 42

# NOTE(review): both pipelines start with StandardScaler, which cannot scale
# string columns. An earlier cell collects object-typed columns of X_train as
# `categorical_features`; if that list is non-empty, these pipelines need the
# project preprocessor (or an encoder) instead — confirm.

# NOTE(review): the earlier call in this notebook is
# tune_hyperparameters(full_pipeline, X_train, y_train). Confirm that the
# current signature accepts a parameter grid as its second positional argument
# before running the calls below.

# Example for RandomForestClassifier
rf_pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=SEED))
rf_param_grid = {
    'randomforestclassifier__n_estimators': [30, 50, 100, 200, 300],
    'randomforestclassifier__max_depth': [None, 2, 3, 5, 10, 20, 30],
    'randomforestclassifier__class_weight': [None, 'balanced'],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for
    # classifiers it was an alias of 'sqrt', so dropping it loses no candidate.
    'randomforestclassifier__max_features': ['sqrt', 'log2']
}
rf_grid_search = tune_hyperparameters(rf_pipeline, rf_param_grid, X_train, y_train)

# Example for GradientBoostingClassifier
gb_pipeline = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=SEED))
gb_param_grid = {
    'gradientboostingclassifier__n_estimators': [50, 100, 200],
    'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.2],
    'gradientboostingclassifier__max_depth': [3, 5, 7],
    'gradientboostingclassifier__min_samples_split': [2, 5, 10],
    'gradientboostingclassifier__min_samples_leaf': [1, 2, 4],
    # 'auto' removed for the same reason as in the random-forest grid above.
    'gradientboostingclassifier__max_features': ['sqrt', 'log2']
}
gb_grid_search = tune_hyperparameters(gb_pipeline, gb_param_grid, X_train, y_train)