In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Locations of the raw input and the cleaned output, both under DATA_DIR.
DATA_DIR = Path("../data")
RAW_FILE = DATA_DIR.joinpath("hospital_readmissions.csv")  # TODO: confirm the raw file name
CLEAN_FILE = DATA_DIR.joinpath("cleaned_readmissions.csv")

# Render up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100

# Cell output: the raw file path and whether it exists on disk.
(RAW_FILE, RAW_FILE.exists())


In [None]:
# Read every column as text so that no value is reinterpreted on load;
# numeric and date columns are converted explicitly further down.
df = pd.read_csv(RAW_FILE, dtype=str)

n_rows, n_cols = df.shape
print((n_rows, n_cols))

# Cell output: the first three rows.
df.iloc[:3]


In [None]:
def standardize_cols(cols):
    """Return normalised versions of the given column labels.

    Each label has surrounding whitespace removed, is lower-cased, and has
    every space, slash and hyphen replaced by an underscore. Consecutive
    separators are not collapsed, so "a  b" becomes "a__b".

    Parameters
    ----------
    cols : iterable of str
        Column labels to normalise.

    Returns
    -------
    list of str
        Normalised labels, in the same order as ``cols``.
    """
    cleaned = []
    for label in cols:
        name = label.strip().lower()
        for separator in (" ", "/", "-"):
            name = name.replace(separator, "_")
        cleaned.append(name)
    return cleaned

# Replace the header with its normalised form, then show the resulting names.
df.columns = standardize_cols(list(df.columns))
list(df.columns)


In [None]:

# Keep only rows whose measure_id mentions READM-30; rows with a missing
# measure_id are excluded. Without a measure_id column every row is kept.
if "measure_id" in df.columns:
    mask = df["measure_id"].str.contains("READM-30", na=False)
else:
    mask = np.full(len(df), True)
df = df[mask].copy()

# Columns of interest, in output order. Any that are absent from the file
# are skipped rather than raising a KeyError.
wanted = [
    "facility_name", "facility_id", "state", "zip_code", "county_name",
    "measure_id", "measure_name", "compared_to_national",
    "score", "denominator", "lower_estimate", "higher_estimate",
    "footnote", "start_date", "end_date",
]
available = [name for name in wanted if name in df.columns]
df = df[available].copy()

# Cell output: the first three rows of the trimmed frame.
df.head(3)


In [None]:
def to_num(s):
    """Convert ``s`` to numeric values, turning unparseable entries into NaN.

    Thin wrapper around ``pd.to_numeric`` with ``errors="coerce"``.
    """
    converted = pd.to_numeric(s, errors="coerce")
    return converted

# Convert the measurement columns to numbers; anything that cannot be parsed
# becomes NaN (see to_num).
for col in ["score", "denominator", "lower_estimate", "higher_estimate"]:
    if col in df.columns:
        df[col] = to_num(df[col])

# Ensure facility_id is a zero-padded 6-character string (ids often carry
# leading zeros). Missing ids must stay missing: a plain astype(str) would
# turn NaN into the text "nan", which zfill then pads to "000nan" and which
# would later be counted as a real hospital.
if "facility_id" in df.columns:
    facility_ids = df["facility_id"]
    df["facility_id"] = (
        facility_ids.astype(str).str.zfill(6).where(facility_ids.notna())
    )


In [None]:
# Trim surrounding whitespace from the descriptive text columns.
# Missing values are preserved as missing: astype(str) alone would convert
# NaN into the literal string "nan", which would then show up as a category
# (for example as an extra "state") in every downstream count.
for col in ["facility_name", "state", "county_name", "measure_id", "measure_name", "compared_to_national"]:
    if col in df.columns:
        original = df[col]
        df[col] = original.astype(str).str.strip().where(original.notna())

# Parse the reporting-period bounds; unparseable values become NaT.
for col in ["start_date", "end_date"]:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")


In [None]:
# Human-readable condition label for each known readmission measure id.
measure_map = {
    "READM-30-AMI-HRRP": "Acute Myocardial Infarction",
    "READM-30-HF-HRRP": "Heart Failure",
    "READM-30-PN-HRRP": "Pneumonia",
    "READM-30-COPD-HRRP": "COPD",
    "READM-30-CABG-HRRP": "CABG",
    "READM-30-HIP-KNEE-HRRP": "Hip/Knee Replacement"
}

# Label precedence: mapped measure_id -> measure_name -> "Other".
# df.get returns the measure_name column when present, else the string "Other".
fallback = df.get("measure_name", "Other")
if "measure_id" in df.columns:
    df["condition"] = df["measure_id"].map(measure_map).fillna(fallback)
else:
    df["condition"] = fallback

# A row can still be unlabeled here when its measure_id is unmapped and its
# measure_name is missing; give it the same "Other" label that is used when
# the measure_name column is absent, so no row has a null condition.
df["condition"] = df["condition"].fillna("Other")

# Cell output: the ten most frequent conditions.
df["condition"].value_counts().head(10)


In [None]:
# Drop exact duplicates first so that repeated rows cannot skew the quartile
# cut points computed below.
df = df.drop_duplicates().reset_index(drop=True)

# Risk band using quartiles of score (rows without a score get no band).
# NOTE(review): quartiles are computed across all conditions together —
# confirm that a pooled ranking is what is intended.
risk_labels = ["Low", "Moderate", "Elevated", "High"]
if "score" in df.columns and df["score"].nunique() >= len(risk_labels):
    try:
        df["risk_band"] = pd.qcut(df["score"], q=len(risk_labels), labels=risk_labels)
    except ValueError:
        # Heavily tied scores can produce repeated quantile edges, which qcut
        # rejects when fixed labels are supplied. Ranking with method="first"
        # makes every non-null value distinct, so four equal-sized bands can
        # always be formed; tied scores on a boundary are split by row order.
        score_rank = df["score"].rank(method="first")
        df["risk_band"] = pd.qcut(score_rank, q=len(risk_labels), labels=risk_labels)
else:
    # No score column, or too few distinct scores to form four bands.
    df["risk_band"] = np.nan

# Cell output: a reproducible spot check that also works on frames with
# fewer than five rows (the seed value itself is arbitrary).
df.sample(n=min(5, len(df)), random_state=42)


In [None]:
# Headline quality metrics for the cleaned frame. A metric whose source
# column is absent is reported as None (or ("N/A", "N/A") for the date range).
summary = {"rows": len(df)}

if "score" in df.columns:
    summary["null_score_pct"] = round(df["score"].isna().mean()*100, 2)
else:
    summary["null_score_pct"] = None

if "start_date" in df.columns and "end_date" in df.columns:
    # Earliest period start through latest period end.
    summary["date_range"] = (df["start_date"].min(), df["end_date"].max())
else:
    summary["date_range"] = ("N/A","N/A")

if "state" in df.columns:
    summary["states"] = df["state"].nunique()
else:
    summary["states"] = None

if "facility_id" in df.columns:
    summary["hospitals"] = df["facility_id"].nunique()
else:
    summary["hospitals"] = None

summary


In [None]:
# Persist the cleaned frame without the index column.
df.to_csv(CLEAN_FILE, index=False)

# Cell output: the path, whether the file now exists, and the shape of a
# three-row read-back as a quick sanity check.
readback = pd.read_csv(CLEAN_FILE, nrows=3)
(CLEAN_FILE, CLEAN_FILE.exists(), readback.shape)
