In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, chi2, SelectKBest
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.svm import SVC

print("Hyperparameter Tuning:")

# Read the dataset and separate the "target" label column from the
# remaining columns, which are used as model inputs.
df_encoded = pd.read_csv('data/heart_disease.csv')
y = df_encoded["target"]
X = df_encoded.drop(columns="target")

print(f"Data shape: {X.shape}")

# SPLIT THE DATA
# The hold-out rows are carved off BEFORE any feature selection so that the
# selectors below are fitted on training rows only. Fitting them on the full
# dataset would let test labels influence which columns are kept and make
# the reported test accuracy optimistic (data leakage). The split depends
# only on y, the row count and random_state, so it picks the same rows it
# would pick if it were applied to the reduced frame.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 1) Feature Importance using Random Forest
# Kept for inspection only; it does not take part in the selection below.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_full, y_train)
importances = rf.feature_importances_
feat_importances = pd.Series(importances, index=X.columns).sort_values(ascending=False)

# 2) Recursive Feature Elimination (RFE)
model = LogReg(max_iter=5000, solver="saga", random_state=42)
rfe = RFE(model, n_features_to_select=10)
X_rfe = rfe.fit_transform(X_train_full, y_train)
selected_features_rfe = X.columns[rfe.support_]

# 3) Chi-Square Test
# chi2 rejects negative inputs, hence the absolute value.
X_chi2 = SelectKBest(score_func=chi2, k=10)
X_chi2_fit = X_chi2.fit_transform(X_train_full.abs(), y_train)
chi2_features = X.columns[X_chi2.get_support()]

# 4) Final reduced dataset
# Union of both selections, emitted in the original column order. Iterating
# a set of strings can yield a different order on every interpreter run
# (hash randomization), which would shuffle the model's input columns and
# make results and the saved model's expected column order unreproducible.
chosen_features = set(selected_features_rfe) | set(chi2_features)
selected_features = [col for col in X.columns if col in chosen_features]
X_reduced = X[selected_features]

print(f"Selected {len(selected_features)} features")

# Restrict both partitions to the selected columns.
X_train = X_train_full[selected_features]
X_test = X_test_full[selected_features]

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

# Untuned reference models: their test accuracy is the yardstick the tuned
# models are compared against further down.
baseline_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
baseline_svm = SVC(probability=True, random_state=42).fit(X_train, y_train)

# Hold-out predictions and accuracy, one model at a time.
y_pred_rf_baseline = baseline_rf.predict(X_test)
baseline_acc_rf = accuracy_score(y_test, y_pred_rf_baseline)

y_pred_svm_baseline = baseline_svm.predict(X_test)
baseline_acc_svm = accuracy_score(y_test, y_pred_svm_baseline)

# One row per model, a single "Accuracy" column.
df_results = pd.DataFrame(
    {"Accuracy": [baseline_acc_rf, baseline_acc_svm]},
    index=["Random Forest", "SVM"],
)

print(f"Baseline Random Forest Accuracy: {baseline_acc_rf:.3f}")
print(f"Baseline SVM Accuracy: {baseline_acc_svm:.3f}")

# Hyperparameter Tuning
# The search spaces are intentionally tiny so the searches finish quickly.
param_grid_rf = dict(n_estimators=[50, 100], max_depth=[10, None])
param_grid_svm = dict(C=[1, 10], kernel=["rbf"])

# Random Forest: exhaustive search over the grid, scored by accuracy with
# 2-fold cross-validation on the training partition.
print("Tuning Random Forest with GridSearchCV")
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=2,
    scoring="accuracy",
    n_jobs=-1,
).fit(X_train, y_train)
print("Random Forest tuning completed!")

# SVM: randomized search drawing n_iter=2 candidates from the SVM space,
# with the same scoring and fold count as above.
print("Tuning SVM with RandomizedSearchCV")
rand_svm = RandomizedSearchCV(
    estimator=SVC(probability=True, random_state=42),
    param_distributions=param_grid_svm,
    n_iter=2,
    cv=2,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)
print("SVM tuning completed!")

# Best estimator found by each search.
best_rf = grid_rf.best_estimator_
best_svm = rand_svm.best_estimator_

# Score the tuned models on the same hold-out rows as the baselines.
y_pred_rf_tuned = best_rf.predict(X_test)
tuned_acc_rf = accuracy_score(y_test, y_pred_rf_tuned)

y_pred_svm_tuned = best_svm.predict(X_test)
tuned_acc_svm = accuracy_score(y_test, y_pred_svm_tuned)

# Comparison table: one record per model with baseline accuracy, tuned
# accuracy, their difference and the winning hyperparameters.
comparison_data = [
    {
        'Model': name,
        'Baseline_Accuracy': before,
        'Tuned_Accuracy': after,
        'Improvement': after - before,
        'Best_Params': params,
    }
    for name, before, after, params in (
        ("Random Forest", baseline_acc_rf, tuned_acc_rf, grid_rf.best_params_),
        ("SVM", baseline_acc_svm, tuned_acc_svm, rand_svm.best_params_),
    )
]

df_comparison = pd.DataFrame(comparison_data)

print("\nPerformance Comparison: ")
print(df_comparison.round(3))

# Deliverable: Best performing model with optimized hyperparameters
# idxmax returns the first row on a tie, so Random Forest wins when both
# tuned accuracies are equal.
best_model_row = df_comparison.loc[df_comparison['Tuned_Accuracy'].idxmax()]
print(f"\nBEST PERFORMING MODEL: {best_model_row['Model']}")
print(f"OPTIMIZED ACCURACY: {best_model_row['Tuned_Accuracy']:.3f}")
print(f"OPTIMIZED HYPERPARAMETERS: {best_model_row['Best_Params']}")

# Map the winning row back to the fitted estimator object.
if best_model_row['Model'] == "Random Forest":
    best_model = best_rf
else:
    best_model = best_svm

# Save the model as .pkl file
filename = "models/final_model.pkl"
# joblib.dump does not create missing parent directories; without this the
# whole run would fail at the very last step on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(best_model, filename)
print(f"Model saved as: {filename}")


# Grouped bar chart: for every model, the baseline bar sits just left of the
# tick and the tuned bar just right of it.
plt.figure(figsize=(8, 5))
x = range(len(df_comparison))
width = 0.35
half_width = width / 2

bar_series = (
    (-half_width, 'Baseline_Accuracy', 'Baseline', 'skyblue'),
    (half_width, 'Tuned_Accuracy', 'Tuned', 'lightcoral'),
)
for offset, column, legend_label, colour in bar_series:
    positions = [tick + offset for tick in x]
    plt.bar(positions, df_comparison[column], width,
            label=legend_label, alpha=0.7, color=colour)

plt.title('Baseline vs Tuned Model Performance')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.xticks(x, df_comparison['Model'])
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nHyperparameter tuning completed successfully!")
