In [None]:

import pandas as pd
import numpy as np
from pathlib import Path

# CSV cache read and written by ensure_data(); the relative path is resolved
# against the current working directory of the running kernel.
data_path = Path("../data/heart_disease.csv")

def ensure_data(path=None):
    """Return the heart-disease table, fetching or synthesising it if no cache is usable.

    Sources are tried in this order:

    1. A non-empty CSV at ``path`` that parses to at least one row.
    2. Dataset id 45 fetched through ``ucimlrepo.fetch_ucirepo``. The first
       target column is binarised (values ``> 0`` become 1) and named ``target``.
    3. A seeded random placeholder (120 rows, integer values in [0, 100) and a
       0/1 ``target``). This only keeps the notebook runnable; it carries no
       real signal, so a warning is emitted whenever it is used.

    Whatever was obtained in step 2 or 3 is written back to ``path`` on a
    best-effort basis.

    Parameters
    ----------
    path : str or pathlib.Path, optional
        CSV cache location. Defaults to the module-level ``data_path``.

    Returns
    -------
    pandas.DataFrame
        Feature columns plus a ``target`` column (steps 2 and 3), or the cached
        file's contents as-is (step 1).

    Warns
    -----
    UserWarning
        If the cache is unreadable, the download fails, the synthetic fallback
        is used, or the cache cannot be written.
    """
    import warnings

    path = data_path if path is None else Path(path)

    def _cache(frame):
        # Caching is a convenience: a failed write must neither crash the
        # notebook nor change which data is returned.
        try:
            path.parent.mkdir(parents=True, exist_ok=True)
            frame.to_csv(path, index=False)
        except OSError as exc:
            warnings.warn(f"Could not cache data at {path}: {exc!r}", stacklevel=3)
        return frame

    # 1) Reuse the cached CSV when it exists and actually contains rows.
    if path.exists() and path.stat().st_size > 0:
        try:
            cached = pd.read_csv(path)
        except Exception as exc:
            warnings.warn(f"Ignoring unreadable cache {path}: {exc!r}", stacklevel=2)
        else:
            if len(cached) > 0:
                return cached

    # 2) Download. Only the fetch/assembly is guarded, so a cache-write problem
    #    can no longer masquerade as a download failure.
    try:
        from ucimlrepo import fetch_ucirepo
        heart = fetch_ucirepo(id=45)
        features = heart.data.features.copy()
        labels = heart.data.targets.copy()
        labels = (labels.iloc[:, 0] > 0).astype(int).rename("target")
        frame = pd.concat([features, labels], axis=1)
    except Exception as exc:
        warnings.warn(
            f"Download failed ({exc!r}); using SYNTHETIC random data instead. "
            "Results computed from it are meaningless.",
            stacklevel=2,
        )
        # 3) Deterministic placeholder with the expected column names.
        cols = ["age","sex","cp","trestbps","chol","fbs","restecg","thalach",
                "exang","oldpeak","slope","ca","thal","target"]
        rng = np.random.default_rng(42)
        frame = pd.DataFrame(rng.integers(0, 100, size=(120, len(cols))), columns=cols)
        frame["target"] = rng.integers(0, 2, size=120)

    return _cache(frame)

# Load the dataset via ensure_data() and preview the first rows as the cell output.
df = ensure_data()
df.head()


In [None]:

import numpy as np
# Treat the "?" placeholder as a missing value. Rebinding `df` (instead of
# `inplace=True`) keeps this cell safe to re-run and avoids mutating a frame
# that an earlier cell already displayed.
df = df.replace("?", np.nan)

# A column that contained "?" was parsed as strings and stays non-numeric after
# the replacement, which would make later steps treat it as categorical.
# Convert such a column only when every remaining value parses as a number.
for _col in df.columns:
    if pd.api.types.is_numeric_dtype(df[_col]):
        continue
    try:
        df[_col] = pd.to_numeric(df[_col])
    except (ValueError, TypeError):
        pass  # genuinely non-numeric column: leave it untouched

# Missing-value count per column, followed by the dtype/memory summary.
display(df.isna().sum())
df.info()


In [None]:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Split the frame into the integer label and the predictor columns.
y = df["target"].astype(int)
X = df.drop(columns=["target"])

# Float/int columns go down the numeric branch; every other column is
# handled as categorical.
num = list(X.select_dtypes(include=[float, int]).columns)
cat = X.columns[~X.columns.isin(num)].tolist()

# Numeric branch: median imputation, then standardisation.
numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation, then one-hot encoding that
# ignores categories not seen during fit.
categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

pre = ColumnTransformer(transformers=[
    ("num", numeric_steps, num),
    ("cat", categorical_steps, cat),
])

# Fit on the full predictor table and show the resulting matrix shape.
X_pre = pre.fit_transform(X)
X_pre.shape


In [None]:

# Histograms for numeric columns
import matplotlib.pyplot as plt
# Every float/int column except the label gets its own histogram panel.
num_cols = [
    name
    for name in df.select_dtypes(include=[float, int]).columns
    if name != "target"
]

# Grid geometry: fixed width of 4 panels, as many rows as needed (ceiling division).
cols = 4
rows = -(-len(num_cols) // cols)

hist_fig = plt.figure(figsize=(4 * cols, 3 * rows))
for position, name in enumerate(num_cols, start=1):
    panel = hist_fig.add_subplot(rows, cols, position)
    panel.hist(df[name].dropna(), bins=30)
    panel.set_title(name)
hist_fig.tight_layout()
plt.show()


In [None]:

import numpy as np
import matplotlib.pyplot as plt

# Pairwise correlation of all float/int columns, drawn as a heatmap.
num_df = df.select_dtypes(include=[float, int]).copy()
corr = num_df.corr()

fig, ax = plt.subplots(figsize=(8,6))
cax = ax.imshow(corr, interpolation="nearest")

# One tick per column on both axes, labelled with the column name.
tick_names = num_df.columns
tick_positions = range(len(tick_names))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(tick_names, rotation=90)
ax.set_yticklabels(tick_names)

fig.colorbar(cax)
fig.tight_layout()
plt.show()


In [None]:

# Boxplots by target
import matplotlib.pyplot as plt
# Features to compare between the two target classes; names that are not
# present in `df` are skipped.
key = [c for c in ["age","trestbps","chol","thalach","oldpeak"] if c in df.columns]
if key:
    # One panel per feature on a single row. squeeze=False keeps `axes`
    # two-dimensional even when only one feature is available.
    fig, axes = plt.subplots(1, len(key), figsize=(4*len(key), 4), squeeze=False)
    for ax, c in zip(axes[0], key):
        grp0 = df.loc[df["target"]==0, c].dropna()
        grp1 = df.loc[df["target"]==1, c].dropna()
        ax.boxplot([grp0, grp1])
        # Label the two boxes through the axis API: the `labels=` keyword of
        # boxplot() is deprecated since Matplotlib 3.9 (renamed `tick_labels`),
        # whereas set_xticklabels works on both old and new versions.
        ax.set_xticklabels(["0 (no disease)","1 (disease)"])
        ax.set_title(f"{c} by target")
    fig.tight_layout()
    plt.show()
