In [None]:
import os
import sys
# Set paths relative to this notebook (in notebooks/)
# We assume structure: .../readmit30/notebooks (cwd)
#                      .../readmit30/scripts/data
def set_default_paths(base_dir, env=None):
    """Fill in TRAIN_PATH / DEV_PATH / TEST_PATH / OUT_PATH for local execution.

    Keys that are already present in ``env`` are left untouched, so paths
    supplied from outside the notebook are not overwritten; only missing keys
    receive the local defaults.

    Parameters
    ----------
    base_dir : str
        Directory that contains ``scripts/data/public``.
    env : MutableMapping[str, str], optional
        Mapping to update; defaults to ``os.environ``.

    Returns
    -------
    dict
        The effective value of each of the four keys after the update.
    """
    if env is None:
        env = os.environ
    public_dir = os.path.join(base_dir, 'scripts', 'data', 'public')
    defaults = {
        'TRAIN_PATH': os.path.join(public_dir, 'train.csv'),
        'DEV_PATH': os.path.join(public_dir, 'dev.csv'),
        'TEST_PATH': os.path.join(public_dir, 'public_test.csv'),
        # Relative on purpose: predictions are written next to the notebook.
        'OUT_PATH': 'predictions.csv',
    }
    for key, value in defaults.items():
        env.setdefault(key, value)
    return {key: env[key] for key in defaults}


base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
print(f'Base dir (readmit30): {base_dir}')
set_default_paths(base_dir)

print('Environment variables set for local execution.')


# Assignment 1 — Colab Workflow (GitHub + Pre-commit + Submission Validation)

This notebook teaches the standard workflow used throughout the course:

1. Clone your team repo
2. Install dependencies
3. Install **pre-commit** and enable a hook to strip notebook outputs
4. Run this notebook end-to-end
5. Validate `predictions.csv`
6. Commit + push + tag


In [None]:
# (Colab) report the Python interpreter version and the host platform
import platform
import sys

for detail in (sys.version, platform.platform()):
    print(detail)


## 1) Clone Repo

Log in to your personal GitHub account and create a fork of: https://github.com/TLKline/AIHC-5010-Winter-2026

Follow setup directions for working with a PAT in GitHub (30-second guide):

* Go to GitHub → Settings
* Developer settings
* Personal access tokens
* Choose:
  * Fine-Grained

You can clone using HTTPS.

Repo HTTPS URL (e.g., `https://github.com/TLKline/AIHC-5010-Winter-2026.git`)

## 2) Install dependencies

This installs whatever is in `requirements.txt`.


In [None]:
!pip -q install -r ../requirements.txt

#MAINSTART

# 4) Submission Notebook (Template)

Replace the baseline model with your team’s approach.

In [None]:
import os
from pathlib import Path

# Resolve input/output locations: an environment variable wins when it is set,
# otherwise the relative default below is used.
_DEFAULT_PATHS = {
    "TRAIN_PATH": "../scripts/data/public/train.csv",
    "DEV_PATH": "../scripts/data/public/dev.csv",
    "TEST_PATH": "../scripts/data/public/public_test.csv",
    "OUT_PATH": "predictions.csv",
}

TRAIN_PATH = os.environ.get("TRAIN_PATH", _DEFAULT_PATHS["TRAIN_PATH"])
DEV_PATH = os.environ.get("DEV_PATH", _DEFAULT_PATHS["DEV_PATH"])
TEST_PATH = os.environ.get("TEST_PATH", _DEFAULT_PATHS["TEST_PATH"])
OUT_PATH = os.environ.get("OUT_PATH", _DEFAULT_PATHS["OUT_PATH"])

# Echo the resolved values so the run log shows which files were used.
for label, value in (
    ("TRAIN_PATH:", TRAIN_PATH),
    ("DEV_PATH:", DEV_PATH),
    ("TEST_PATH:", TEST_PATH),
    ("OUT_PATH:", OUT_PATH),
):
    print(label, value)

In [None]:
import numpy as np
import pandas as pd
# Fix NumPy's global seed before anything stochastic runs.
np.random.seed(42)

# Load the training and test tables from the configured locations.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Fail fast if the columns the rest of the notebook relies on are absent.
assert {"row_id", "readmit30"}.issubset(train.columns)
assert "row_id" in test.columns

# Features are everything except the target; the target is cast to int.
y_train = train["readmit30"].astype(int)
X_train = train.drop(columns=["readmit30"])

### Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics for the training frame
train_summary = train.describe()
display(train_summary)

In [None]:
# Class balance of the outcome: raw counts, then the share of positive rows.
target_values = train['readmit30']
print("Target variable (readmit30) counts:")
print(target_values.value_counts())
print(f"\nReadmission rate: {target_values.mean():.2%}")

In [None]:
# Inspect categorical features
from pandas.api.types import is_numeric_dtype

# A feature counts as categorical when its dtype is not numeric.
cat_cols = list(filter(lambda name: not is_numeric_dtype(X_train[name]), X_train.columns))

# Showing first 10 for brevity, with the five most frequent values of each.
for column in cat_cols[:10]:
    print(f"\nValue counts for column: {column}")
    most_common = train[column].value_counts().head(5)
    print(most_common)

In [None]:
# === Required EDA Tasks (Missingness + Data Quality) ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown

# Work on a copy so the placeholder clean-up below never alters `train`.
target_col = "readmit30"
df = train.copy()

# Treat common placeholder missing values as NaN
obj_cols = df.select_dtypes(include=["object"]).columns
df[obj_cols] = df[obj_cols].replace("?", np.nan)

display(Markdown("## EDA Tasks (Missingness + Data Quality)"))

# 1) Basic dataset snapshot
n_rows, n_cols = df.shape
feature_dtypes = df.drop(columns=[target_col]).dtypes
# Features are split into numeric vs. everything else, based on dtype alone.
num_count = sum(pd.api.types.is_numeric_dtype(dtype) for dtype in feature_dtypes)
cat_count = len(feature_dtypes) - num_count
readmit_rate = df[target_col].mean()

snapshot_lines = (
    f"Rows x Columns: {n_rows} x {n_cols}",
    f"Outcome column: {target_col}",
    f"Readmission rate: {readmit_rate:.2%}",
    f"Numeric features: {num_count} | Categorical features: {cat_count}",
)
for snapshot_line in snapshot_lines:
    print(snapshot_line)
display(df.head())

# 2) Missingness audit
# One row per column: number of NaNs and the same figure as a percentage of rows,
# ordered from most to least missing.
missing_count = df.isna().sum()
missing_tbl = pd.DataFrame(
    {
        "missing_count": missing_count,
        "missing_pct": missing_count / len(df) * 100,
    }
).sort_values("missing_pct", ascending=False)
display(Markdown("### Missingness table (sorted)"))
display(missing_tbl)

# Reverse the top 15 so the column with the most missing values is drawn at the top.
top15 = missing_tbl.head(15).iloc[::-1]
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(top15.index, top15["missing_pct"], color="#4C78A8")
ax.set_title("Top 15 Columns by % Missing")
ax.set_xlabel("% Missing")
fig.tight_layout()
plt.show()

# Up to three examples on each side of the thresholds (table is already sorted).
acceptable_cols = missing_tbl.loc[missing_tbl["missing_pct"].lt(5)].head(3).index.tolist()
problem_cols = missing_tbl.loc[missing_tbl["missing_pct"].gt(30)].head(3)

display(Markdown("### Missingness recommendations"))
print("Acceptable missingness (<5%):")
for name in acceptable_cols:
    print(f"- {name}")

print("\nProblematic missingness (>30%) and action:")
for name, pct in problem_cols["missing_pct"].items():
    # Above 50% missing the suggestion is to drop; otherwise impute/investigate.
    if pct > 50:
        action = "drop"
    else:
        action = "impute or investigate collection"
    print(f"- {name}: {pct:.1f}% missing → {action}")

# 3) Is missingness related to the outcome?
display(Markdown("### Missingness vs Outcome (3 columns)"))
# Candidate columns, in decreasing order of preference:
#   a) columns with more than 15% missing values,
#   b) otherwise any column with at least one missing value,
#   c) otherwise simply the first columns of the (sorted) missingness table.
missing_candidates = missing_tbl[missing_tbl["missing_pct"] > 15].index.tolist()
missing_candidates = [c for c in missing_candidates if c != target_col]
if len(missing_candidates) < 3:
    missing_candidates = [c for c in missing_tbl.index if c != target_col and missing_tbl.loc[c, "missing_pct"] > 0][:3]
if len(missing_candidates) < 3:
    missing_candidates = [c for c in missing_tbl.index if c != target_col][:3]

# Each entry is (column, verdict text, absolute difference in readmit rate).
missing_assoc_notes = []
for col in missing_candidates[:3]:
    is_missing = df[col].isna()
    # reindex guarantees both groups are present; a group with no rows gets NaN.
    rates = df.groupby(is_missing)[target_col].mean().reindex([False, True])
    rates.index = ["Not Missing", "Missing"]
    table = rates.to_frame("readmit_rate")
    display(Markdown(f"**{col}**"))
    display(table)

    plt.figure(figsize=(4, 3))
    plt.bar(table.index, table["readmit_rate"], color=["#72B7B2", "#F58518"])
    plt.title(f"Readmit Rate by Missingness: {col}")
    plt.ylabel("Readmit Rate")
    plt.tight_layout()
    plt.show()

    diff = abs(table.loc["Missing", "readmit_rate"] - table.loc["Not Missing", "readmit_rate"])
    if pd.isna(diff):
        # One of the two groups is empty (e.g. the column has no missing values),
        # so there is nothing to compare; do not report this as "not associated".
        note = "not assessable (one group has no rows)"
        missing_assoc_notes.append((col, note, diff))
        print(f"Interpretation: Missingness vs outcome cannot be compared for {col} (one group has no rows).\n")
        continue

    # A gap of at least 3 percentage points is treated as an association.
    note = "associated" if diff >= 0.03 else "not strongly associated"
    missing_assoc_notes.append((col, note, diff))
    print(f"Interpretation: Missingness appears {note} with outcome (Δ ≈ {diff:.3f}).\n")

# 4) Minimal data quality checks
import re

display(Markdown("### Data quality checks"))
dup_count = df.duplicated().sum()
print(f"Duplicate rows: {dup_count}")


def _is_id_like(name: str) -> bool:
    """Return True when a column name looks like a patient/encounter identifier.

    "patient" and "encounter" match anywhere in the name. "id" must be a whole
    token of the name (as in "discharge_disposition_id"), so a name that merely
    contains the letters "id" inside a longer word is not flagged.
    """
    lowered = name.lower()
    if "patient" in lowered or "encounter" in lowered:
        return True
    # Split on anything that is not a letter or digit and look for a bare "id".
    return "id" in re.split(r"[^a-z0-9]+", lowered)


# row_id is the submission key, not a patient/encounter identifier, so skip it.
id_cols = [c for c in df.columns if c != "row_id" and _is_id_like(c)]
if id_cols:
    print("Top repeated IDs (first 2 ID-like columns):")
    for col in id_cols[:2]:
        print(f"\n{col}:")
        print(df[col].value_counts().head(5))
else:
    print("No obvious patient/encounter ID columns detected beyond row_id.")

# Outliers / validity for 3 numeric columns
# Identifier columns (row_id and the ID-like columns found above) are numeric
# but carry no measurement, so they are excluded before picking the columns.
excluded_cols = {target_col, "row_id", *id_cols}
num_cols_all = [
    c for c in df.select_dtypes(include=[np.number]).columns.tolist()
    if c not in excluded_cols
]
num_pick = num_cols_all[:3]
if len(num_pick) < 3:
    print("Not enough numeric columns for outlier check.")
else:
    summary_rows = []
    for col in num_pick:
        # 1st/99th percentiles give a robust view of the plausible value range.
        p1, p99 = df[col].quantile([0.01, 0.99])
        summary_rows.append({
            "column": col,
            "min": df[col].min(),
            "median": df[col].median(),
            "max": df[col].max(),
            "p1": p1,
            "p99": p99,
        })
    summary_df = pd.DataFrame(summary_rows)
    display(Markdown("#### Numeric validity summary"))
    display(summary_df)

    # One boxplot per selected column, side by side.
    fig, axes = plt.subplots(1, len(num_pick), figsize=(12, 3))
    for ax, col in zip(axes, num_pick):
        sns.boxplot(x=df[col], ax=ax, color="#A1C9F4")
        ax.set_title(col)
    plt.tight_layout()
    plt.show()

# Leakage screen (heuristic)
# Flag up to two columns whose lower-cased name contains one of these substrings.
leak_keywords = ["readmit", "discharge", "death", "mort", "post", "after", "length", "los", "stay"]
leakage_cols = [
    name
    for name in df.columns
    if name != target_col and any(keyword in name.lower() for keyword in leak_keywords)
][:2]

# If fewer than two names matched, pad the list with the first remaining columns
# so that two candidates are always reported.
shortfall = 2 - len(leakage_cols)
if shortfall > 0:
    fillers = [name for name in df.columns if name != target_col and name not in leakage_cols]
    leakage_cols.extend(fillers[:shortfall])

def leakage_reason(col: str) -> str:
    """Explain why a column name is a leakage candidate.

    Keyword groups are checked in order against the lower-cased name; the first
    group with a matching substring decides the explanation. A generic message
    is returned when no keyword matches.
    """
    name = col.lower()
    rules = (
        (("readmit",), "directly references the outcome"),
        (("discharge",), "may include post-discharge information"),
        (("death", "mort"), "post-outcome indicator"),
        (("length", "los", "stay"), "likely reflects length of stay, possibly post-admission"),
        (("post", "after"), "post-event timing indicator"),
    )
    for keywords, reason in rules:
        if any(keyword in name for keyword in keywords):
            return reason
    return "could encode post-encounter information"

display(Markdown("### Candidate leakage columns"))
for candidate in leakage_cols:
    print(f"- {candidate}: {leakage_reason(candidate)}")

# Final summary bullets (8–12)
# Prefer columns that actually have missing values; fall back to the table head.
top5 = missing_tbl.loc[missing_tbl["missing_pct"] > 0].head(5)
if top5.empty:
    top5 = missing_tbl.head(5)
missing_str = ", ".join(f"{name} ({pct:.1f}%)" for name, pct in top5["missing_pct"].items())

# Only the first missingness-vs-outcome result is quoted in the summary.
if missing_assoc_notes:
    first_col, first_note, first_diff = missing_assoc_notes[0]
    assoc_note = f"{first_col} is {first_note} (Δ ≈ {first_diff:.3f})."
else:
    assoc_note = "None found"

summary_bullets = [
    f"Top 5 missing columns: {missing_str}",
    f"Overall readmission rate: {readmit_rate:.2%}",
    f"Missingness-outcome check: {assoc_note}",
    "Drop or review columns with >30% missingness; prioritize imputation for moderate missingness.",
    "Use consistent encoding for categorical features and monitor high-cardinality columns.",
    "Check duplicate rows and repeated IDs prior to modeling.",
    "Investigate outliers using 1st/99th percentiles before scaling or winsorizing.",
    f"Leakage candidates to review: {', '.join(leakage_cols)}.",
    "Next steps: run cross-validation and tune model hyperparameters.",
    "Next steps: calibrate probabilities if using decision thresholds.",
]

display(Markdown("### Final Summary (8–12 bullets)"))
summary_md = "\n".join(f"- {bullet}" for bullet in summary_bullets)
display(Markdown(summary_md))

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# TODO: Add any new imports for your own method here
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder
from pandas.api.types import is_numeric_dtype

# Which model to train:
#   1 = logistic regression
#   2 = logistic regression with class_weight='balanced'
#   3 = SVC (scaled inputs, probability estimates enabled)
#   4 = HistGradientBoostingClassifier on ordinal-encoded categories
method = 4

# Define features to drop (high missingness, low value, or leakage/IDs)
features_to_drop = [
    "weight",
    "payer_code",
    "medical_specialty",
    "encounter_id",
    "patient_nbr",
    "discharge_disposition_id",
    "row_id",
    "readmit30",
]

# Drop them from X_train explicitly. errors="ignore" keeps the cell re-runnable
# and tolerates names that are already absent.
X_train = X_train.drop(columns=features_to_drop, errors="ignore")

# Split the remaining features by dtype; each group gets its own preprocessing.
cat_cols = [c for c in X_train.columns if not is_numeric_dtype(X_train[c])]
num_cols = [c for c in X_train.columns if is_numeric_dtype(X_train[c])]

# Shared preprocessing for methods 1-3: median-impute numerics, mode-impute and
# one-hot encode categoricals (categories unseen at fit time are ignored).
preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median"))]), num_cols),
        ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                          ("onehot", OneHotEncoder(handle_unknown="ignore"))]), cat_cols),
    ],
)

if method == 1:
    # Use logistic regression model
    clf = Pipeline([
        ("preprocess", preprocess),
        ("model", LogisticRegression(max_iter=200)),
    ])
elif method == 2:
    # Use logistic regression model with balanced class weights
    clf = Pipeline([
        ("preprocess", preprocess),
        ("model", LogisticRegression(max_iter=200, class_weight='balanced')),
    ])
elif method == 3:
    # Use SVC (i.e. SVM model)
    clf = Pipeline(
        [
            ("preprocess", preprocess),
            # with_mean=False: scale without centering, which is the mode
            # StandardScaler supports for sparse (one-hot) input.
            ("scaler", StandardScaler(with_mean=False)),
            ("model", SVC(gamma="auto", max_iter=1000, probability=True)),
        ]
    )
elif method == 4:
    # Preprocess for HGB: ordinal-encode categories (HGB needs numeric inputs)
    preprocess_hgb = ColumnTransformer(
        transformers=[
            ("num", Pipeline([
                ("imputer", SimpleImputer(strategy="median")),
            ]), num_cols),
            ("cat", Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                # Categories unseen during fit are encoded as -1.
                ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
            ]), cat_cols),
        ],
        remainder="drop",
    )

    clf = Pipeline([
        ("preprocess", preprocess_hgb),
        ("model", HistGradientBoostingClassifier(
            max_depth=6,
            learning_rate=0.05,
            max_iter=300,
            l2_regularization=1.0,
            early_stopping=True,
            random_state=42,
            class_weight='balanced',
        )),
    ])
else:
    # Without this branch an unsupported value would leave `clf` undefined (or
    # stale from an earlier run) and only fail later at fit time.
    raise ValueError(f"Unknown method {method!r}; expected 1, 2, 3, or 4.")

clf.fit(X_train, y_train)

In [None]:
# Score the test set and keep the second column of predict_proba's output.
test_proba = clf.predict_proba(test)
p_test = test_proba[:, 1]

# Submission table: one row per test row_id with its predicted probability.
pred = pd.DataFrame(
    {
        "row_id": test["row_id"].astype(int),
        "prob_readmit30": p_test.astype(float),
    }
)
pred.to_csv(OUT_PATH, index=False)
pred.head()

In [None]:
# Validate output format (required for students before tagging)
!python ../scripts/validate_submission.py --pred {OUT_PATH} --test {TEST_PATH}


In [None]:
# Calculate metrics for the dev set
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    average_precision_score,
    brier_score_loss,
    confusion_matrix,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)

dev = pd.read_csv(DEV_PATH)
print('DEV_PATH:', DEV_PATH)
print(dev['readmit30'].value_counts())

X_dev = dev.drop(columns=["readmit30"])
y_dev = dev["readmit30"].astype(int)

# Calculate metrics
y_true = y_dev  # already cast to int above
y_pred = clf.predict_proba(X_dev)[:, 1]

auroc = roc_auc_score(y_true, y_pred)
auprc = average_precision_score(y_true, y_pred)
brier = brier_score_loss(y_true, y_pred)

print(f'AUROC: {auroc:.4f}')
print(f'AUPRC: {auprc:.4f}')
print(f'Brier Score: {brier:.4f}')

# Create figures
plt.figure(figsize=(10, 6))

# Histogram of predicted probabilities
plt.hist(y_pred, bins=20, alpha=0.7, label='Predicted Probabilities')
plt.title('Histogram of Predicted Probabilities')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Scatter plot of true vs predicted
plt.figure(figsize=(10, 6))
plt.scatter(y_true, y_pred, alpha=0.5, label='True vs Predicted')
plt.title('True vs Predicted Probabilities')
plt.xlabel('True Labels')
plt.ylabel('Predicted Probabilities')
plt.legend()
plt.show()

# Create ROC Curve
fpr, tpr, _ = roc_curve(y_true, y_pred)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f'AUROC = {auroc:.4f}')
# Diagonal reference line drawn from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Create Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_true, y_pred)
plt.figure(figsize=(10, 6))
plt.plot(recall, precision, label=f'AUPRC = {auprc:.4f}')
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

# Create Confusion Matrix Heatmap
threshold = 0.5  # Default threshold for binary classification
y_pred_binary = (y_pred >= threshold).astype(int)
# labels=[0, 1] keeps the matrix 2x2 even when only one class shows up in the
# true or thresholded labels, so it always lines up with the tick labels below.
cm = confusion_matrix(y_true, y_pred_binary, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Readmit', 'Readmit'], yticklabels=['No Readmit', 'Readmit'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

#MAINEND

## 5) Validate the predictions file format

This checks:
- required columns
- probabilities in [0, 1]
- row_ids match the test file

It assumes the submission notebook wrote `predictions.csv` to the current working directory (the `notebooks/` folder when this notebook is run in place).


In [None]:
from pathlib import Path
pred_path = Path("predictions.csv")
test_path = Path("../scripts/data/public/public_test.csv")

if not pred_path.exists():
    print("predictions.csv not found. Run notebooks/submission.ipynb first.")
else:
    !python ../scripts/validate_submission.py --pred predictions.csv --test ../scripts/data/public/public_test.csv


## 6) Commit + push + tag

You will:
- add changes
- commit (pre-commit hook runs here)
- push
- tag a milestone (example: `milestone_wk3`) and push tags



You will need a Personal Access Token (PAT) for the following step. See instructions above.

## Done ✅

If you hit issues:
- Make sure you pulled the latest course template (missing files).
- Make sure `scripts/data/public/*` exists in your repo (or your instructor provided it separately).
