In [None]:
# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# 2. Load Data
# The path is relative to the working directory the script/notebook runs from.
df = pd.read_csv('../data/heart.csv')

# 3. EDA
# Histogram of the 'age' column.
sns.histplot(df['age'])
plt.title('Age Distribution')
plt.show()

# Pairwise correlation heatmap. Restrict to numeric/bool columns explicitly:
# pandas >= 2.0 raises on non-numeric columns in corr() instead of silently
# dropping them, so a single string column would otherwise abort the EDA.
plt.figure(figsize=(12, 10))
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f")
plt.show()

# 4. Preprocessing
# Separate the feature columns from the 'target' label column.
X = df.drop('target', axis=1)
y = df['target']
# If any categorical columns: one-hot encode here
# e.g.: X = pd.get_dummies(X, columns=['cp','thal'], drop_first=True)

# 5. Split
# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance. Fitting the scaler on the full dataset leaks test-set
# statistics into training and into the scaler that is later persisted.
# The split depends only on y, test_size and random_state, so the same rows
# end up in each partition as when splitting an already-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn scaling parameters from the training rows only ...
X_train = scaler.fit_transform(X_train_raw)
# ... and apply those same parameters to the held-out rows.
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler; kept so any later
# code that still refers to X_scaled continues to work.
X_scaled = scaler.transform(X)

# 6. Model & Hyperparameter Tuning
# Candidate hyperparameters; the grid search evaluates every combination
# (2 * 3 * 2 = 12 settings) with 5-fold cross-validation, ranked by F1.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
)
rfc = RandomForestClassifier(random_state=42)
grid = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid.fit(X_train, y_train)

# Keep the winning estimator for evaluation and persistence below.
best = grid.best_estimator_
print("Best params:", grid.best_params_)

# 7. Evaluate
# Score the tuned model on the held-out split.
y_pred = best.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Confusion matrix rendered as an annotated heatmap with integer cell labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# 8. Save Model & Scaler
# joblib.dump does not create missing parent directories; make sure the
# output folder exists so a fresh checkout does not fail after training.
import os
os.makedirs('../model', exist_ok=True)

# Persist the tuned classifier and the scaler as separate artifacts.
joblib.dump(best, '../model/heart_model.pkl')
joblib.dump(scaler, '../model/scaler.pkl')
