**Reset workspace and clone repo cleanly for modeling**
Reset the environment, recloned my repo for a clean modeling run, scrubbed the remote, confirmed folder structure, and verified git status.

In [59]:
# ==== SAME GITHUB INFO ====
GITHUB_USER = "muhammadhussainqureshi"
REPO_NAME   = "heart-disease-ml"
TOKEN       = "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"

# ---- Hard reset → clean clone ----
%cd /
%cd /content
!pwd

import os, shutil
REPO_PATH = f"/content/{REPO_NAME}"
shutil.rmtree(REPO_PATH, ignore_errors=True)

repo_url = f"https://{GITHUB_USER}:{TOKEN}@github.com/{GITHUB_USER}/{REPO_NAME}.git"
!git clone "{repo_url}" "{REPO_PATH}"
%cd "{REPO_PATH}"

# scrub token from remote immediately
!git remote set-url origin "https://github.com/{GITHUB_USER}/{REPO_NAME}.git"

# ensure folders (idempotent)
for p in ["data/raw","data/processed","notebooks","reports","src"]:
    os.makedirs(p, exist_ok=True)

!git config user.name "{GITHUB_USER}"
!git config user.email "{GITHUB_USER}@users.noreply.github.com"
!git status

/
/content
/content
Cloning into '/content/heart-disease-ml'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 7 (delta 1), reused 7 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (7/7), 24.22 KiB | 6.06 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/heart-disease-ml
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


**Load processed (or raw) CSV from the repo and normalize label**

Loaded the CSV from the repo (preferring the processed snapshot) and normalized the target so modeling is consistent with EDA.

In [60]:
import os, pandas as pd, numpy as np

RAW  = "data/raw/heart.csv"
SNAP = "data/processed/heart_day1_clean.csv"
LOAD = SNAP if os.path.exists(SNAP) else RAW
assert os.path.exists(LOAD), f"Missing CSV. Expected {SNAP} or {RAW}."

df = pd.read_csv(LOAD)
print("Loaded:", LOAD, "| shape:", df.shape)

# Normalize target column exactly like in EDA
target_col = "target" if "target" in df.columns else ("num" if "num" in df.columns else None)
assert target_col, f"No target column found in columns: {df.columns.tolist()}"
if target_col == "num" and df[target_col].max() > 1:
    df[target_col] = (df[target_col] >= 1).astype(int)
df[target_col] = df[target_col].astype(int)

print("Target:", target_col)
print("Class balance (%):")
print((df[target_col].value_counts(normalize=True)*100).round(2))

Loaded: data/processed/heart_day1_clean.csv | shape: (920, 16)
Target: num
Class balance (%):
num
1    55.33
0    44.67
Name: proportion, dtype: float64


**Define preprocessing (impute→one-hot→scale) and split data**

Split the dataset into features/labels, inferred numeric vs categorical columns, built a robust preprocessing pipeline (impute + encode + scale), and performed a stratified train/test split.

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Split features/target
X_raw = df.drop(columns=[target_col])
y     = df[target_col].astype(int)

# Infer numeric vs categorical columns
num_features = X_raw.select_dtypes(include=[np.number, bool]).columns.tolist()
cat_features = X_raw.select_dtypes(exclude=[np.number, bool]).columns.tolist()

print("Numeric features:", num_features)
print("Categorical features:", cat_features)

# Numeric: median impute + scale; Categorical: most_frequent impute + one-hot
numeric_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler()),
])
categorical_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot",  OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_features),
        ("cat", categorical_tf, cat_features),
    ],
    remainder="drop",
)

# Stratified split to preserve class balance
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.30, random_state=42, stratify=y
)

X_train.shape, X_test.shape

Numeric features: ['id', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
Categorical features: ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']


((644, 15), (276, 15))

**Train Logistic Regression & Random Forest (same preprocessing)**

Trained two baselines—Logistic Regression (interpretable) and Random Forest (non-linear)—using the same preprocessing pipeline. Reported Accuracy, ROC-AUC, and classification reports. Kept confusion matrices in memory for plotting.

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# LR pipeline
clf_lr = Pipeline([
    ("prep", preprocess),
    ("model", LogisticRegression(max_iter=1000))
])

# RF pipeline
clf_rf = Pipeline([
    ("prep", preprocess),
    ("model", RandomForestClassifier(n_estimators=300, random_state=42))
])

# Fit & predict
clf_lr.fit(X_train, y_train)
pred_lr  = clf_lr.predict(X_test)
proba_lr = clf_lr.predict_proba(X_test)[:, 1]

clf_rf.fit(X_train, y_train)
pred_rf  = clf_rf.predict(X_test)
proba_rf = clf_rf.predict_proba(X_test)[:, 1]

# Metrics
acc_lr = accuracy_score(y_test, pred_lr); auc_lr = roc_auc_score(y_test, proba_lr)
acc_rf = accuracy_score(y_test, pred_rf); auc_rf = roc_auc_score(y_test, proba_rf)

print("LR  | Acc:", round(acc_lr,3), "AUC:", round(auc_lr,3))
print("RF  | Acc:", round(acc_rf,3), "AUC:", round(auc_rf,3))
print("\n=== LR Classification Report ===\n", classification_report(y_test, pred_lr))
print("\n=== RF Classification Report ===\n", classification_report(y_test, pred_rf))

cm_lr = confusion_matrix(y_test, pred_lr)
cm_rf = confusion_matrix(y_test, pred_rf)
(cm_lr, cm_rf)

LR  | Acc: 0.833 AUC: 0.923
RF  | Acc: 0.899 AUC: 0.954

=== LR Classification Report ===
               precision    recall  f1-score   support

           0       0.81      0.81      0.81       123
           1       0.85      0.85      0.85       153

    accuracy                           0.83       276
   macro avg       0.83      0.83      0.83       276
weighted avg       0.83      0.83      0.83       276


=== RF Classification Report ===
               precision    recall  f1-score   support

           0       0.91      0.85      0.88       123
           1       0.89      0.93      0.91       153

    accuracy                           0.90       276
   macro avg       0.90      0.89      0.90       276
weighted avg       0.90      0.90      0.90       276



(array([[100,  23],
        [ 23, 130]]),
 array([[105,  18],
        [ 10, 143]]))

**Save metrics & plots (CSV + PNG) into the repo’s reports/**

Wrote a compact `reports/metrics.csv` for grading and exported three figures—two confusion matrices and one ROC curve—into the repo’s reports/ folder.

In [63]:
import os, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from sklearn.metrics import roc_curve

os.makedirs("reports", exist_ok=True)

# 1) Metrics CSV
metrics = pd.DataFrame([
    {"model":"LogisticRegression","accuracy":acc_lr,"roc_auc":auc_lr},
    {"model":"RandomForest","accuracy":acc_rf,"roc_auc":auc_rf},
])
metrics.to_csv("reports/metrics.csv", index=False)
print("Saved → reports/metrics.csv")
display(metrics)

# 2) Confusion matrices → PNG
def save_cm(cm, title, fname):
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d", cbar=False)
    plt.xlabel("Predicted"); plt.ylabel("Actual"); plt.title(title)
    plt.savefig(fname, dpi=150, bbox_inches="tight"); plt.close()

save_cm(cm_lr, "LR — Confusion Matrix", "reports/cm_lr.png")
save_cm(cm_rf, "RF — Confusion Matrix", "reports/cm_rf.png")

# 3) ROC curve → PNG
fpr_lr, tpr_lr, _ = roc_curve(y_test, proba_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, proba_rf)

plt.figure()
plt.plot(fpr_lr, tpr_lr, label=f"LR (AUC={auc_lr:.3f})")
plt.plot(fpr_rf, tpr_rf, label=f"RF (AUC={auc_rf:.3f})")
plt.plot([0,1],[0,1],'--')
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend()
plt.savefig("reports/roc.png", dpi=150, bbox_inches="tight"); plt.close()

print("Saved plots: reports/cm_lr.png, reports/cm_rf.png, reports/roc.png")

Saved → reports/metrics.csv


Unnamed: 0,model,accuracy,roc_auc
0,LogisticRegression,0.833333,0.923003
1,RandomForest,0.898551,0.953664


Saved plots: reports/cm_lr.png, reports/cm_rf.png, reports/roc.png


**Save this notebook file, commit, push, verify remotes**

Saved `02_Modeling.ipynb` into the repo, committed the modeling artifacts, pushed to GitHub using my PAT for this push, and then reset the remote back to a safe tokenless URL. Printed remotes and the last commit for verification.

In [64]:
# ---- Save THIS notebook into the repo ----
import glob, shutil, os
cands = sorted(glob.glob("/content/*.ipynb"))
if cands:
    shutil.copy(cands[-1], "notebooks/02_Modeling.ipynb")
    print("Saved notebook → notebooks/02_Modeling.ipynb")

# ---- Temporarily set remote with PAT (private push), then scrub ----
!git remote set-url origin "https://{GITHUB_USER}:{TOKEN}@github.com/{GITHUB_USER}/{REPO_NAME}.git"

# Commit only if there are changes (avoid "nothing to commit" errors)
changed = !git status --porcelain
if changed:
    !git add -A
    !git commit -m "02_Modeling: pipelines, metrics.csv, plots, notebook"
else:
    print("No changes to commit.")

# Detect current branch and push
branch = !git rev-parse --abbrev-ref HEAD
branch = branch[0] if branch else "main"
print("Pushing branch:", branch)
!git push origin HEAD:{branch}

# Scrub token from remote again
!git remote set-url origin "https://github.com/{GITHUB_USER}/{REPO_NAME}.git"
print("\nRemotes after scrub:")
!git remote -v

# Show last commit as a quick sanity check
print("\nLast commit:")
!git --no-pager log -1 --oneline

[main fad0e52] 02_Modeling: pipelines, metrics.csv, plots, notebook
 4 files changed, 3 insertions(+)
 create mode 100644 reports/cm_lr.png
 create mode 100644 reports/cm_rf.png
 create mode 100644 reports/metrics.csv
 create mode 100644 reports/roc.png
Pushing branch: main
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 2 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (7/7), 70.09 KiB | 7.79 MiB/s, done.
Total 7 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/muhammadhussainqureshi/heart-disease-ml.git
   5c06400..fad0e52  HEAD -> main

Remotes after scrub:
origin	https://github.com/muhammadhussainqureshi/heart-disease-ml.git (fetch)
origin	https://github.com/muhammadhussainqureshi/heart-disease-ml.git (push)

Last commit:
[33mfad0e52[m[33m ([m[1;36mHEAD -> [m[1;32mmain[m[33m, [m[1;31morigin/main[m[33m, [m[1;31morigin/HEAD[m[33m)[m 02_Modeling: pipelines, metrics.csv, plots, note