# Hyperparameter Tuning

In [1]:
import os
os.chdir('..')

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.data_preprocessing import df_construct, add_eng_values, alter_term_gender
from src.model_training import build_preprocessor, build_model, build_full_pipeline
from src.hyperparameter_tuning import tune_hyperparameters
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# ... (Read data, df_construct, and other necessary steps)

In [4]:
# Use this code in the event that it's needed
df = pd.read_csv('data/processed_data/df_cleaned.csv')
X = df.drop('at_risk', axis=1)
y = df.at_risk

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
numeric_features = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

preprocessor = build_preprocessor(numeric_features, categorical_features)
model = build_model()
full_pipeline = build_full_pipeline(preprocessor, model)

In [15]:
grid_search = tune_hyperparameters(full_pipeline, X_train, y_train)

In [16]:
# Analyze grid search results
cv_results = pd.DataFrame(grid_search.cv_results_)

# Display the top models based on mean test scores
top_models = cv_results.sort_values(by='mean_test_score', ascending=False).head()
print("Top Models:")
print(top_models[['params', 'mean_test_score', 'std_test_score']])

# Select the best model
best_model = grid_search.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)

# Print accuracy and classification report for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model Accuracy: {accuracy_best}")
print("Best Model Classification Report:\n", classification_rep_best)
print('\nConfusion Matrix:\n', conf_matrix_best)

Top Models:
                                              params  mean_test_score  \
8  {'model__max_depth': 20, 'model__n_estimators'...         0.944787   
6  {'model__max_depth': 20, 'model__n_estimators'...         0.944752   
2  {'model__max_depth': None, 'model__n_estimator...         0.942552   
1  {'model__max_depth': None, 'model__n_estimator...         0.942532   
7  {'model__max_depth': 20, 'model__n_estimators'...         0.941864   

   std_test_score  
8        0.006190  
6        0.008126  
2        0.004447  
1        0.005986  
7        0.006204  

Best Model Accuracy: 0.9649499284692418
Best Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1322
           1       0.97      0.37      0.53        76

    accuracy                           0.96      1398
   macro avg       0.97      0.68      0.76      1398
weighted avg       0.96      0.96      0.96      1398


Confusion Matrix:
 [[1321