# Importing Libraries and Setting Up Environment

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix, classification_report, roc_curve, auc, silhouette_score

from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram

import pickle

import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
np.random.seed(42)

In [None]:
# Load the raw dataset; the path is resolved relative to the current working directory.
heart_disease_df = pd.read_csv('../data/heart_disease.csv')

# Exploratory Data Analysis (EDA)

In [None]:
# (rows, columns) of the raw frame.
heart_disease_df.shape

In [None]:
# Column dtypes and non-null counts.
heart_disease_df.info()

In [None]:
# Print the first rows of the frame as plain text.
print(heart_disease_df.head())

In [None]:
# Summary statistics, rounded to two decimals for readability.
heart_disease_df.describe().round(2)

## Data Quality Checks

In [None]:
# Missing-value count per column, before any cleaning.
heart_disease_df.isna().sum()

In [None]:
# Remove the 'id' and 'dataset' columns from the working frame.
heart_disease_df.drop(columns=['id', 'dataset'], inplace=True)

# Collapse the target to binary: values above 0 become 1, everything else 0.
heart_disease_df['num'] = (heart_disease_df['num'] > 0).astype('int64')

In [None]:
# Rows missing both 'thal' and 'slope' are dropped rather than imputed.
missing_both = heart_disease_df['thal'].isnull() & heart_disease_df['slope'].isnull()
heart_disease_df.drop(heart_disease_df[missing_both].index, inplace=True)

# These columns are filled with their column mean.
mean_cols = ['trestbps', 'chol', 'thalch', 'oldpeak']
heart_disease_df[mean_cols] = heart_disease_df[mean_cols].fillna(heart_disease_df[mean_cols].mean())

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form operates on a temporary object and does
# not update the frame when pandas Copy-on-Write is active.
heart_disease_df['ca'] = heart_disease_df['ca'].fillna(heart_disease_df['ca'].median())

# These columns are filled with their most frequent value.
mode_cols = ['fbs', 'restecg', 'exang', 'slope', 'thal']
for col in mode_cols:
    heart_disease_df[col] = heart_disease_df[col].fillna(heart_disease_df[col].mode().iloc[0])

In [None]:
# Re-check missing-value counts after the drop/imputation step.
heart_disease_df.isna().sum()

In [None]:
# Drop every row that falls outside the 1.5 * IQR fences in at least one of
# the listed columns.
# NOTE(review): 'ca' was median-imputed just before this step; if its IQR ends
# up being 0, every value different from the quartile is flagged — confirm
# this is intended.
outlier_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
candidates = heart_disease_df[outlier_cols]

first_quartile = candidates.quantile(0.25)
third_quartile = candidates.quantile(0.75)
spread = third_quartile - first_quartile
upper_fence = third_quartile + 1.5 * spread
lower_fence = first_quartile - 1.5 * spread

# Column-wise comparison against the per-column fences, then OR across columns.
outlier = ((candidates > upper_fence) | (candidates < lower_fence)).any(axis=1)

heart_disease_df = heart_disease_df[~outlier]

## Data Visualization

In [None]:
# 2x2 grid of histograms; ax.flat walks the panels row by row, matching the
# order of the column list.
hist_cols = ['age', 'trestbps', 'oldpeak', 'thalch']
fig, ax = plt.subplots(2, 2, figsize=(9, 7))

for panel, col in zip(ax.flat, hist_cols):
    sns.histplot(x=heart_disease_df[col], ax=panel)
    panel.set_ylabel('')  # hide the y-axis label on every panel

plt.tight_layout()
plt.show()

In [None]:
# Age vs. cholesterol, coloured by sex.
sns.scatterplot(data=heart_disease_df, x='age', y='chol', hue='sex')
plt.show()

In [None]:
# Pairwise relationships of the remaining features, coloured by the target.
pairplot_data = heart_disease_df.drop(columns=['fbs', 'exang', 'ca'])
pairplot_data['num'] = pairplot_data['num'].astype('category')

# pairplot creates and sizes its own figure (via `height`) and draws the hue
# legend itself, so no separate plt.figure()/plt.legend() call is made: the
# former only produced an extra empty figure, the latter targeted a single
# subplot. The title is attached to the grid's figure and lifted above it.
pair_grid = sns.pairplot(
    pairplot_data,
    hue='num',
    diag_kind='kde',
    plot_kws={'alpha': 0.7, 's': 50},
    height=2,
)
pair_grid.figure.suptitle(
    'Analysis of Feature Relationships by Disease Severity',
    y=1.02,
    fontsize=18,
    fontweight='bold',
)
plt.show()

In [None]:
# Correlation between the listed numeric columns and the binary target.
corr_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'num']
corr = heart_disease_df[corr_cols].corr()

heatmap_ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title(' Correlation Matrix of Heart Disease Features', y=1.04, fontsize=17, fontweight='bold')
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.show()

## Data Preprocessing

### Encoding

In [None]:
# One-hot encode the categorical columns as 0/1 integer indicator columns.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
heart_disease_df = pd.get_dummies(heart_disease_df, columns=categorical_cols, dtype=int)

### Scaling

In [None]:
# Standardise the listed columns on a copy, leaving the encoded frame untouched.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split, so test-set statistics influence the transform.
scale_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
scaler = StandardScaler()

heart_disease_df_scaled = heart_disease_df.copy()
heart_disease_df_scaled[scale_cols] = scaler.fit_transform(heart_disease_df[scale_cols])

### Split X & Y

In [None]:
# Target is the binary 'num' column; every other column is a feature.
y = heart_disease_df_scaled['num']
X = heart_disease_df_scaled.drop(columns='num')

### Dimension Reduction (PCA)

In [None]:
# A float n_components asks PCA to keep enough components to explain that
# fraction (95%) of the variance.
# NOTE(review): PCA is fit on all rows, before the train/test split.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)

# Number of components actually retained.
pca.n_components_

In [None]:
# First two principal components, coloured by the binary target.
first_pc = X_pca[:, 0]
second_pc = X_pca[:, 1]

plt.scatter(first_pc, second_pc, c=heart_disease_df['num'], cmap='viridis', alpha=0.7, edgecolor='k')
plt.title('PCA: First Two Components')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.show()

### Data Splitting

In [None]:
# Stratified hold-out split of the PCA features: 25% of rows go to the test
# set and random_state pins the shuffle.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.25, random_state=42, stratify=y)

# Supervised Learning - Classification

## Logistic Regression

In [None]:
# Baseline logistic regression with default hyper-parameters.
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

y_pred_log = log_model.predict(X_test)
y_pred_prob_log = log_model.predict_proba(X_test)

# Hold-out metrics, printed as "<label> <value>".
log_metrics = (
    ("accuracy_score", accuracy_score),
    ('f1_score', f1_score),
    ('balanced_accuracy_score', balanced_accuracy_score),
    ('precision_score', precision_score),
    ('recall_score', recall_score),
)
for metric_label, metric_fn in log_metrics:
    print(metric_label, metric_fn(y_test, y_pred_log))

print(classification_report(y_test, y_pred_log))
cm = confusion_matrix(y_test, y_pred_log)
cm

In [None]:
# Confusion-matrix cells are integer counts; fmt='d' prints them as plain
# integers (the default annotation format switches to scientific notation,
# e.g. 1e+02, once a count reaches three digits).
sns.heatmap(cm, cmap='Greens', annot=True, fmt='d')
plt.title('Confusion Matrix (logistic)')
plt.xlabel('Predict')
plt.ylabel('Actual')
plt.show()

In [None]:
# ROC curve from the predicted probability of the positive class (column 1).
positive_scores = y_pred_prob_log[:, 1]
fpr, tpr, _ = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
roc_ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.0])
roc_ax.set_title('ROC curve (logistic)')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
roc_ax.grid(True)
plt.show()

## Decision Tree

In [None]:
# Baseline decision tree (Gini impurity, fixed seed).
DT_model = DecisionTreeClassifier(criterion='gini', random_state=42)
DT_model.fit(X_train, y_train)

y_pred_DT = DT_model.predict(X_test)
y_pred_prob_DT = DT_model.predict_proba(X_test)

# Hold-out metrics, printed as "<label> <value>".
dt_metrics = (
    ("accuracy_score", accuracy_score),
    ('f1_score', f1_score),
    ('balanced_accuracy_score', balanced_accuracy_score),
    ('precision_score', precision_score),
    ('recall_score', recall_score),
)
for metric_label, metric_fn in dt_metrics:
    print(metric_label, metric_fn(y_test, y_pred_DT))

print(classification_report(y_test, y_pred_DT))
cm = confusion_matrix(y_test, y_pred_DT)
cm

In [None]:
# Confusion-matrix cells are integer counts, so annotate them as integers
# (fmt='d') rather than as two-decimal floats.
sns.heatmap(cm, cmap='Greens', annot=True, fmt='d')
plt.title('Confusion Matrix (Decision Tree)')
plt.xlabel('Predict')
plt.ylabel('Actual')
plt.show()

In [None]:
# ROC curve from the predicted probability of the positive class (column 1).
positive_scores = y_pred_prob_DT[:, 1]
fpr, tpr, _ = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
roc_ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.0])
roc_ax.set_title('ROC curve (Decision Tree)')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
roc_ax.grid(True)
plt.show()

## Random Forest

In [None]:
# Baseline random forest with default hyper-parameters and a fixed seed.
RF_model = RandomForestClassifier(random_state=42)
RF_model.fit(X_train, y_train)

y_pred_RF = RF_model.predict(X_test)
y_pred_prob_RF = RF_model.predict_proba(X_test)

# Hold-out metrics, printed as "<label> <value>".
rf_metrics = (
    ("accuracy_score", accuracy_score),
    ('f1_score', f1_score),
    ('balanced_accuracy_score', balanced_accuracy_score),
    ('precision_score', precision_score),
    ('recall_score', recall_score),
)
for metric_label, metric_fn in rf_metrics:
    print(metric_label, metric_fn(y_test, y_pred_RF))

print(classification_report(y_test, y_pred_RF))
cm = confusion_matrix(y_test, y_pred_RF)
cm

In [None]:
# Confusion-matrix cells are integer counts, so annotate them as integers
# (fmt='d') rather than as two-decimal floats.
sns.heatmap(cm, cmap='Greens', annot=True, fmt='d')
plt.title('Confusion Matrix (Random Forest)')
plt.xlabel('Predict')
plt.ylabel('Actual')
plt.show()

In [None]:
# ROC curve from the predicted probability of the positive class (column 1).
positive_scores = y_pred_prob_RF[:, 1]
fpr, tpr, _ = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
roc_ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.0])
roc_ax.set_title('ROC curve (Random Forest)')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
roc_ax.grid(True)
plt.show()

## Support Vector Machine (SVM)

In [None]:
# Baseline linear-kernel SVM. No predict_proba here: the model is built
# without probability estimates, so only hard predictions are scored.
# NOTE(review): per the scikit-learn SVC docs `degree` applies to the 'poly'
# kernel only, so degree=5 is presumably a no-op with kernel='linear' — confirm.
SVM_model = SVC(kernel='linear', degree=5, random_state=42)
SVM_model.fit(X_train, y_train)

y_pred_SVM = SVM_model.predict(X_test)

# Hold-out metrics, printed as "<label> <value>".
svm_metrics = (
    ("accuracy_score", accuracy_score),
    ('f1_score', f1_score),
    ('balanced_accuracy_score', balanced_accuracy_score),
    ('precision_score', precision_score),
    ('recall_score', recall_score),
)
for metric_label, metric_fn in svm_metrics:
    print(metric_label, metric_fn(y_test, y_pred_SVM))

print(classification_report(y_test, y_pred_SVM))
cm = confusion_matrix(y_test, y_pred_SVM)
cm

In [None]:
# Confusion-matrix cells are integer counts, so annotate them as integers
# (fmt='d') rather than as two-decimal floats.
sns.heatmap(cm, cmap='Greens', annot=True, fmt='d')
plt.title('Confusion Matrix (SVM)')
plt.xlabel('Predict')
plt.ylabel('Actual')
plt.show()

## XGBoost

In [None]:
# Gradient-boosted trees with hand-picked hyper-parameters.
xgb_settings = {
    'subsample': 0.2,
    'n_estimators': 250,
    'max_depth': 3,
    'learning_rate': 0.01,
    'colsample_bytree': 0.8,
}
xgb_model = XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
y_pred_prob = xgb_model.predict_proba(X_test)

# Hold-out metrics, printed as "<label> <value>".
xgb_metrics = (
    ("accuracy_score", accuracy_score),
    ('f1_score', f1_score),
    ('balanced_accuracy_score', balanced_accuracy_score),
    ('precision_score', precision_score),
    ('recall_score', recall_score),
)
for metric_label, metric_fn in xgb_metrics:
    print(metric_label, metric_fn(y_test, y_pred_xgb))

print(classification_report(y_test, y_pred_xgb))
cm = confusion_matrix(y_test, y_pred_xgb)
cm

In [None]:
# Recompute the matrix here so the plot does not depend on whichever model's
# `cm` was evaluated last.
cm = confusion_matrix(y_test, y_pred_xgb)

cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix - XGBoost')

plt.tight_layout()
plt.show()

## Voting Classifier

In [None]:
# Majority-vote ensemble over the logistic, random-forest and XGBoost models.
ensemble_members = [
    ('log_model', log_model),
    ('RF_model', RF_model),
    ('xgb_model', xgb_model),
]
voting_model = VotingClassifier(estimators=ensemble_members, voting='hard')

voting_model.fit(X_train, y_train)
y_pred_voting = voting_model.predict(X_test)
# Only hard predictions are taken from the ensemble (voting='hard'), so no
# probability output is computed here.

# Hold-out metrics, printed as "<label> <value>".
voting_metrics = (
    ("accuracy_score", accuracy_score),
    ('f1_score', f1_score),
    ('balanced_accuracy_score', balanced_accuracy_score),
    ('precision_score', precision_score),
    ('recall_score', recall_score),
)
for metric_label, metric_fn in voting_metrics:
    print(metric_label, metric_fn(y_test, y_pred_voting))

print(classification_report(y_test, y_pred_voting))
cm = confusion_matrix(y_test, y_pred_voting)
cm

## Hyperparameter Tuning

In [None]:
# Search spaces for the tuning cells below.

# Logistic regression: each solver supports only some penalties, and
# 'elasticnet' additionally needs an l1_ratio. A single flat dict would cross
# every solver with every penalty and produce many candidates that cannot be
# fitted, so the space is expressed as a list of valid sub-grids (accepted by
# both GridSearchCV and RandomizedSearchCV).
_log_C = [0.001, 0.01, 0.1, 1, 10, 100]
_log_max_iter = [100, 500, 1000]

param_grid_log = [
    {'solver': ['lbfgs'], 'penalty': ['l2', None],
     'C': _log_C, 'max_iter': _log_max_iter},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'],
     'C': _log_C, 'max_iter': _log_max_iter},
    {'solver': ['saga'], 'penalty': ['l1', 'l2', None],
     'C': _log_C, 'max_iter': _log_max_iter},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.25, 0.5, 0.75],
     'C': _log_C, 'max_iter': _log_max_iter},
]

param_grid_DT = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 20, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8]
}

param_dist_RF = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

param_dist_SVM = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 5],
    'gamma': ['scale', 'auto', 0.1, 1, 10]
}

param_dist_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1]
}

# Voting ensemble: parameters are addressed as '<estimator name>__<param>'.
# The logistic part reuses the valid sub-grids above; the tree-model part is
# shared by every sub-grid.
_voting_tree_params = {
    'RF_model__n_estimators': [100, 200, 300],
    'RF_model__max_depth': [None, 10, 20, 30],
    'RF_model__min_samples_split': [2, 5, 10],
    'RF_model__min_samples_leaf': [1, 2, 4],
    'RF_model__bootstrap': [True, False],

    'xgb_model__n_estimators': [100, 200, 300],
    'xgb_model__max_depth': [3, 5, 7],
    'xgb_model__learning_rate': [0.01, 0.1, 0.2]
}

param_dist_voting = [
    {
        **{f'log_model__{param}': values for param, values in log_grid.items()},
        **_voting_tree_params,
    }
    for log_grid in param_grid_log
]


In [None]:
# Exhaustive 5-fold search over the logistic-regression grid, scored by accuracy.
grid_log = GridSearchCV(
    log_model,
    param_grid=param_grid_log,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_log.fit(X_train, y_train)

for caption, value in (('Best Score log:', grid_log.best_score_),
                       ('Best Params log:', grid_log.best_params_)):
    print(caption, value)

In [None]:
# Exhaustive 5-fold search over the decision-tree grid, scored by accuracy.
grid_DT = GridSearchCV(
    DT_model,
    param_grid=param_grid_DT,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_DT.fit(X_train, y_train)

for caption, value in (('Best Score DT:', grid_DT.best_score_),
                       ('Best Params DT:', grid_DT.best_params_)):
    print(caption, value)

In [None]:
# Randomised 3-fold search over the random-forest space, scored by accuracy.
# random_state pins which candidates are sampled; without it the draw depends
# on global RNG state and therefore on cell execution order.
grid_RF = RandomizedSearchCV(
    RF_model,
    param_distributions=param_dist_RF,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
grid_RF.fit(X_train, y_train)
print('Best Score RF:', grid_RF.best_score_)
print('Best Params RF:', grid_RF.best_params_)

In [None]:
# Randomised 5-fold search over the SVM space, scored by accuracy.
# random_state pins which candidates are sampled; without it the draw depends
# on global RNG state and therefore on cell execution order.
grid_SVM = RandomizedSearchCV(
    SVM_model,
    param_distributions=param_dist_SVM,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
grid_SVM.fit(X_train, y_train)
print('Best Score SVM:', grid_SVM.best_score_)
print('Best Params SVM:', grid_SVM.best_params_)

In [None]:
# Randomised 3-fold search (20 candidates) over the XGBoost space, scored by
# accuracy. random_state pins which candidates are sampled; without it the
# draw depends on global RNG state and therefore on cell execution order.
grid_xgb = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist_xgb,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
grid_xgb.fit(X_train, y_train)
print('Best Score xgb:', grid_xgb.best_score_)
print('Best Params xgb:', grid_xgb.best_params_)

In [None]:
# Randomised 3-fold search (20 candidates) over the ensemble's nested
# parameters, scored by accuracy. random_state pins which candidates are
# sampled; without it the draw depends on global RNG state and therefore on
# cell execution order.
grid_voting = RandomizedSearchCV(
    voting_model,
    param_distributions=param_dist_voting,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
grid_voting.fit(X_train, y_train)
print('Best Score voting:', grid_voting.best_score_)
print('Best Params voting:', grid_voting.best_params_)

## Model Evaluation

In [None]:
# Hold-out predictions of each baseline model, keyed by display name.
# NOTE: these are the untuned models fitted above, not the search results.
test_predictions = {
    'Logistic Regression': y_pred_log,
    'Decision Tree': y_pred_DT,
    'Random Forest': y_pred_RF,
    'Support Vector Machine (SVM)': y_pred_SVM,
    'XGBoost': y_pred_xgb,
    'Voting Classifier': y_pred_voting,
}

all_models = {
    'Model': list(test_predictions),
    'Accuracy': [accuracy_score(y_test, pred) for pred in test_predictions.values()],
    'F1-Score': [f1_score(y_test, pred) for pred in test_predictions.values()],
}

# One row per model, best accuracy first.
performance_df = pd.DataFrame(all_models)
performance_df = performance_df.set_index('Model')
performance_df = performance_df.sort_values(by='Accuracy', ascending=False)

print('Final Model Evaluation:')
print((performance_df * 100).round(2))

plt.figure(figsize=(10, 8))
accuracy_pct = performance_df['Accuracy'] * 100
sns.barplot(x=accuracy_pct, y=performance_df.index, palette='viridis')
plt.title('Final Model Evaluation', fontsize=16)
plt.xlabel("Accuracy (%)", fontsize=12)
plt.ylabel('Model', fontsize=12)
plt.show()

# Unsupervised Learning - Clustering

## KMeans

In [None]:
# Cluster the PCA-projected samples into three groups.
kmeans = KMeans(n_clusters=3, n_init='auto', random_state=42)
kmeans.fit(X_pca)

labels = kmeans.labels_
centers = kmeans.cluster_centers_

# Score the clustering in the same space it was fitted in (X_pca), which is
# also the space the K sweep in this notebook scores against.
print("Silhouette Score:", silhouette_score(X_pca, labels))

In [None]:
# Samples on the first two principal components, coloured by cluster label,
# with the cluster centres overlaid as red crosses.
first_pc, second_pc = X_pca[:, 0], X_pca[:, 1]
center_x, center_y = centers[:, 0], centers[:, 1]

plt.scatter(first_pc, second_pc, c=labels, cmap='viridis', alpha=0.7, edgecolor='k')
plt.scatter(center_x, center_y, c='red', marker='x', s=200, label='Centroids')
plt.title('K-Means Clustering')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.legend()
plt.show()

In [None]:
# Sweep K and record inertia (WCSS) and silhouette score for each value.
k_values = range(2, 15)
wcss = []
silhouette_scores = []

for k in k_values:
    # Sweep-local names keep the notebook-level `kmeans` / `labels` (the model
    # and labels used by the cluster scatter plot) from being overwritten.
    sweep_model = KMeans(n_clusters=k, n_init=10, random_state=42)
    sweep_labels = sweep_model.fit_predict(X_pca)
    wcss.append(sweep_model.inertia_)
    silhouette_scores.append(silhouette_score(X_pca, sweep_labels))

plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
plt.plot(k_values, wcss, marker='o')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('WCSS')
plt.title('Elbow Method')

plt.subplot(1,2,2)
plt.plot(k_values, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Method')

plt.tight_layout()
plt.show()

## Hierarchical Clustering

In [None]:
# Agglomerative clustering of the PCA features (Ward linkage, Euclidean distance).
linkage_data = linkage(X_pca, method='ward', metric='euclidean')

fig, dendro_ax = plt.subplots(figsize=(10, 7))
dendrogram(linkage_data, ax=dendro_ax)
dendro_ax.set_title('Hierarchical Clustering Dendrogram')
dendro_ax.set_xlabel('Data points')
dendro_ax.set_ylabel('Distance')
plt.show()

# Export Final Model

In [None]:
import os

# Fitted estimators to persist, keyed by output file stem.
models = {
    'final_model_log': log_model,
    'DT_model': DT_model,
    'RF_model': RF_model,
    'SVM_model': SVM_model,
    'XGBoost_model': xgb_model,
    'Voting_model': voting_model,
}

# NOTE(review): the models were trained on scaled + PCA-projected features,
# but the fitted `scaler` and `pca` are not exported here; a consumer of these
# pickles needs them to prepare new inputs.

# Create the output directory up front so open(..., 'wb') does not fail on a
# fresh checkout.
os.makedirs('models', exist_ok=True)

for name, model in models.items():
    with open(f'models/{name}.pkl', 'wb') as file:
        pickle.dump(model, file)

In [None]:
# Reload the exported logistic-regression model from disk.
# (pickle must only be used on files this notebook produced itself.)
with open('models/final_model_log.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Check the reloaded model by scoring it on the same hold-out split.
y_pred = loaded_model.predict(X_test)
print("accuracy_score",accuracy_score(y_test, y_pred))