# Loading Dataset

In [None]:
from google.colab import files
import pandas as pd

# Prompt for a file upload; Colab returns a dict keyed by uploaded filename.
uploaded = files.upload()

# Fail early with a clear message instead of leaving `df` undefined, which
# would otherwise surface as a NameError in the first analysis cell.
if not uploaded:
    raise ValueError("No file was uploaded; upload the dataset workbook to continue.")

# A single workbook is expected. If several were uploaded, the last one is
# used, which matches the outcome of looping over every upload and
# overwriting `df` each time.
fn = list(uploaded.keys())[-1]
df = pd.read_excel(fn)


Saving heart_disease_uci.xlsx to heart_disease_uci.xlsx


# Exploratory Data Analysis (EDA)
***This section has six steps (EDA.01–EDA.06).***

**EDA.01**

In [None]:
# --- 1. Dimensions of the frame ---
n_rows, n_cols = df.shape
print("🔹 Dataset Shape:")
print(f"Rows: {n_rows}, columns: {n_cols}\n")

# --- 2. Schema: one dtype per column ---
print("🔹 Column Names and Data Types:")
print(df.dtypes)
print("\n")

# --- 3. Null counts, restricted to the columns that actually have gaps ---
print("🔹 Missing Values per Column:")
null_counts = df.isna().sum()
null_counts = null_counts.loc[null_counts.gt(0)]
if null_counts.empty:
    print("No missing values found.")
else:
    print(null_counts)
print("\n")

# --- 4. Headline statistics (one row per numerical feature) ---
print("🔹 Summary Statistics for Numerical Features:")
stat_names = ['mean', 'std', 'min', 'max']
print(df.describe().loc[stat_names].T)

🔹 Dataset Shape:
Rows: 920, columns: 16

🔹 Column Names and Data Types:
id            int64
age           int64
sex          object
dataset      object
cp           object
trestbps    float64
chol        float64
fbs         float64
restecg      object
thalch      float64
exang       float64
oldpeak     float64
slope        object
ca          float64
thal         object
num           int64
dtype: object


🔹 Missing Values per Column:
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
dtype: int64


🔹 Summary Statistics for Numerical Features:
                mean         std   min    max
id        460.500000  265.725422   1.0  920.0
age        53.510870    9.424685  28.0   77.0
trestbps  132.132404   19.066070   0.0  200.0
chol      199.130337  110.780810   0.0  603.0
fbs         0.166265    0.372543   0.0    1.0
thalch    137.545665   25.926276  60.0  202.0
exang       0.389595 

**EDA.02**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Name of the label column inspected in this step
target = 'num'

# Absolute class frequencies
class_counts = df[target].value_counts()
print("🔹 Target Value Counts:")
print(class_counts)

# Class shares (fractions summing to 1)
class_shares = df[target].value_counts(normalize=True)
print("\n🔹 Target Class Proportions:")
print(class_shares)

# Bar chart of the class frequencies, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x=target, palette='coolwarm', ax=ax)
ax.set_title("Distribution of Target Variable: Heart Disease Severity")
ax.set_xlabel("Heart Disease Class (num)")
ax.set_ylabel("Count")
plt.xticks(rotation=0)
fig.tight_layout()
plt.show()

**EDA.03**

In [None]:
# Identify Feature Types ####################

import matplotlib.pyplot as plt
import seaborn as sns

# Everything except the target column is treated as a candidate feature.
features = df.drop(columns=['num'])

# Split the features by dtype: int/float columns vs. object (string) columns.
numerical_cols = features.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = features.select_dtypes(include='object').columns

print("🔹 Numerical Features:", list(numerical_cols))
print("🔹 Categorical Features:", list(categorical_cols))


# Plot Numerical Features ####################

# Histograms (pandas lays out one panel per column)
features[numerical_cols].hist(figsize=(15, 10), bins=20, color='skyblue', edgecolor='black')
plt.suptitle("Histograms of Numerical Features", fontsize=16)
plt.tight_layout()
plt.show()

# Boxplots: derive the grid from the number of columns rather than assuming
# a fixed 3x5 layout, so the cell keeps working if the numeric column count
# changes and no empty rows are reserved.
BOXPLOTS_PER_ROW = 5
n_box_rows = max(1, (len(numerical_cols) + BOXPLOTS_PER_ROW - 1) // BOXPLOTS_PER_ROW)

plt.figure(figsize=(15, 4 * n_box_rows))
for i, col in enumerate(numerical_cols, 1):
    plt.subplot(n_box_rows, BOXPLOTS_PER_ROW, i)
    sns.boxplot(y=features[col], color='lightgreen')
    plt.title(col)
plt.tight_layout()
plt.show()


# Plot Categorical Features ####################

# One frequency table and one bar chart per categorical column.
for col in categorical_cols:
    print(f"\n🔹 Frequency Table for {col}")
    print(features[col].value_counts())

    plt.figure(figsize=(6, 4))
    sns.countplot(data=features, x=col, palette='pastel')
    plt.title(f"Bar Plot of {col}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


**EDA.04**

In [None]:
# Identify Feature Types ####################
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns

numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop('num')
categorical_cols = df.select_dtypes(include='object').columns


# Numerical vs Target – Boxplots & T-tests ####################
for col in numerical_cols:
    plt.figure(figsize=(6, 4))
    sns.boxplot(data=df, x='num', y=col, palette='coolwarm')
    plt.title(f"{col} vs num")
    plt.tight_layout()
    plt.show()

    # Two-sample t-test: rows with num == 0 against all rows with num != 0.
    # NaNs are dropped by the test itself (nan_policy='omit').
    group0 = df.loc[df['num'] == 0, col]
    group1 = df.loc[df['num'] != 0, col]
    t_stat, p_val = ttest_ind(group0, group1, nan_policy='omit')
    print(f"T-test for {col}: t-stat = {t_stat:.2f}, p-value = {p_val:.4f}")


# Categorical vs Target – Stacked Bar Plots ####################
# This loop runs once, after the numerical loop, so each categorical feature
# is plotted exactly once, and the title/label/show calls are applied to
# every plot rather than only the last one.
for col in categorical_cols:
    # Row-normalised crosstab: each category's bar shows the share of every
    # `num` class within that category.
    ct = pd.crosstab(df[col], df['num'], normalize='index')
    ct.plot(kind='bar', stacked=True, figsize=(6, 4), colormap='Set2')
    plt.title(f"{col} vs num")
    plt.ylabel("Proportion")
    plt.tight_layout()
    plt.show()

**EDA.05**

In [None]:
# Correlation Matrix ####################
import matplotlib.pyplot as plt
import seaborn as sns

# Select numerical columns including target
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Correlation matrix (pairwise correlation of every numerical column)
plt.figure(figsize=(12, 10))
corr = df[numerical_cols].corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title("Correlation Matrix of Numerical Features")
plt.tight_layout()
plt.show()



# Pairplot (Optional but Insightful) ####################
# Take the first five numerical columns in column order (not a ranking) and
# append the target once. The target is filtered out before slicing so it can
# never appear twice in `selected`, which would produce duplicate column
# labels in the frame passed to pairplot.
PAIRPLOT_FEATURES = 5
selected = [c for c in numerical_cols if c != 'num'][:PAIRPLOT_FEATURES] + ['num']
sns.pairplot(df[selected], hue='num', palette='husl')
plt.suptitle("Pairplot of Selected Features", y=1.02)
plt.show()



**EDA.06**

In [None]:
# Skewness of Numerical Features ####################
from scipy.stats import skew

numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop('num')

print("🔹 Skewness of Numerical Features:")
for col in numerical_cols:
    # Missing values are removed first so they do not affect the statistic.
    sk = skew(df[col].dropna())
    print(f"{col}: skewness = {sk:.2f}")


# Outlier Detection Using IQR ####################
# A value counts as an outlier when it lies more than 1.5 * IQR below Q1 or
# above Q3. NaNs compare False on both sides and are therefore not counted.
print("\n🔹 Outlier Counts (IQR Method):")
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
    print(f"{col}: {len(outliers)} outliers")


# Spot Invalid Values ####################
# The summary statistics report a minimum of 0.0 for both 'chol' and
# 'trestbps', so zeros are counted alongside negatives: a negatives-only
# check would miss them.
print("\n🔹 Invalid Value Checks:")
for col in ('chol', 'trestbps'):
    print(f"Negative values in '{col}':", (df[col] < 0).sum())
    print(f"Zero values in '{col}':", (df[col] == 0).sum())




# Preprocessing, Modeling, and Evaluation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score

# 1. Load dataset
df = pd.read_excel("heart_disease_uci.xlsx")
print("Columns:", df.columns)

# 2. Transform target into binary classification (0 = no disease, 1 = disease)
df["target"] = (df["num"] > 0).astype(int)
df = df.drop(columns=["num"])  # drop original target

# 3. Identify numerical and categorical columns
# `id` is excluded from the features: the summary statistics show it running
# from 1 to 920 over 920 rows, i.e. it behaves as a row identifier, and a
# model should not be able to learn from row position.
# errors="ignore" keeps this working if the column is absent.
NON_FEATURE_COLS = ["target", "id"]
numeric_cols = (
    df.select_dtypes(include=["int64", "float64"])
    .columns.drop(NON_FEATURE_COLS, errors="ignore")
)
categorical_cols = df.select_dtypes(include="object").columns

# 4. Define preprocessing pipelines
# Numerical features: fill gaps with the column median, then standardise.
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical features: fill gaps with the most frequent category, then
# one-hot encode. Categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numeric_cols),
        ("cat", cat_transformer, categorical_cols)
    ]
)

# 5. Define models to compare
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(probability=True),  # probability=True enables predict_proba
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# 6. Split dataset (stratified to preserve class balance)
X = df.drop(columns=NON_FEATURE_COLS, errors="ignore")
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 7. Cross-validation for model comparison
# Every candidate is scored on the same five shuffled, stratified folds of
# the training split; the mean of each metric across folds is recorded.
scoring_metrics = ["accuracy", "precision", "recall", "f1", "roc_auc"]
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = {}

for name, model in models.items():
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", model)])
    scores = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring_metrics)
    fold_means = {}
    for metric in scoring_metrics:
        fold_means[metric] = np.mean(scores[f"test_{metric}"])
    cv_results[name] = fold_means

# One row per model, one column per metric, best ROC-AUC first.
results_df = pd.DataFrame.from_dict(cv_results, orient="index")
results_df = results_df.sort_values(by="roc_auc", ascending=False)
print("\n📊 Cross-validation results (mean scores across folds):")
print(results_df.round(3))

# 8. Test set evaluation for each model
test_results = {}
# ROC points are collected here and drawn after the loop. With the inline
# notebook backend, each plt.show() inside the loop renders and closes every
# open figure, so a ROC figure created beforehand would be shown early and
# be gone by the time the final plt.show() runs.
roc_curves = {}

for name, model in models.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]  # probability of the positive class

    # ROC curve (computed once, reused for the metric table and the plot)
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    roc_curves[name] = (fpr, tpr, roc_auc)

    # Store test metrics
    test_results[name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc
    }

    # Print classification report
    print(f"\n🔎 Classification Report for {name}:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    ConfusionMatrixDisplay(cm).plot(cmap="Blues")
    plt.title(f"Confusion Matrix - {name}")
    plt.show()

# Final ROC curve plot for all models
roc_fig, roc_ax = plt.subplots(figsize=(7, 6))
for name, (fpr, tpr, roc_auc) in roc_curves.items():
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")
roc_ax.plot([0, 1], [0, 1], linestyle="--", color="grey")  # chance-level diagonal
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curves for All Models")
roc_ax.legend()
plt.show()

# 9. Summary table of test-set metrics
test_results_df = pd.DataFrame(test_results).T.sort_values(by="roc_auc", ascending=False)
print("\n📊 Test set evaluation results:")
print(test_results_df.round(3))

# 10. Identify and recommend best model
# The winner is chosen by cross-validated ROC-AUC on the training split; the
# test-set row is only reported. Choosing the winner by its test score would
# use the held-out data for model selection and make the reported test
# performance optimistic.
best_name = results_df["roc_auc"].idxmax()
best_cv_auc = results_df.loc[best_name, "roc_auc"]
best_model = test_results_df.loc[best_name]
print("\n✅ Best model:", best_name)
print(best_model.round(3))
print(f"\n👉 Recommendation: {best_name} performed the best in cross-validation (mean ROC-AUC = {best_cv_auc:.3f}) and reached ROC-AUC = {best_model['roc_auc']:.3f} on the held-out test set, making it the most suitable model for this dataset.")
