In [10]:
from pathlib import Path
import pandas as pd
import numpy as np

# 1) find the data folder no matter where the notebook sits
candidate_dirs = [Path("data"), Path("../data"), Path("../../data")]
DATA_DIR = next((d for d in candidate_dirs if d.exists()), None)
if DATA_DIR is None:
    raise FileNotFoundError("Couldn't find a 'data' folder at ./data, ../data, or ../../data")

RAW_FILE = DATA_DIR / "hospital_readmissions.csv"
CLEAN_FILE = DATA_DIR / "cleaned_readmissions.csv"

# 2) prefer RAW; if absent, fall back to CLEAN
if RAW_FILE.exists():
    load_path = RAW_FILE
    source = "raw"
elif CLEAN_FILE.exists():
    load_path = CLEAN_FILE
    source = "cleaned"
else:
    print("CSV files seen:", list(DATA_DIR.glob("*.csv")))
    raise FileNotFoundError("No hospital_readmissions.csv or cleaned_readmissions.csv found in data/")

pd.set_option("display.max_columns", 100)
print("DATA_DIR:", DATA_DIR.resolve())
print("Loading:", load_path.name, f"(source={source})")

# 3) load
df = pd.read_csv(load_path, dtype=str)
print("Shape:", df.shape)
df.head(3)


DATA_DIR: /Users/meghanakurapati/Documents/projects/healthcare-readmission-analytics/data
Loading: hospital_readmissions.csv (source=raw)
Shape: (18510, 12)


Unnamed: 0,Facility Name,Facility ID,State,Measure Name,Number of Discharges,Footnote,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions,Start Date,End Date
0,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-AMI-HRRP,296,,0.9483,13.0146,13.7235,36,07/01/2020,06/30/2023
1,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-CABG-HRRP,151,,0.9509,9.6899,10.1898,13,07/01/2020,06/30/2023
2,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-HF-HRRP,681,,1.0597,21.5645,20.3495,151,07/01/2020,06/30/2023


In [11]:
def standardize_cols(cols):
    return [c.strip().lower().replace(" ", "_").replace("/", "_").replace("-", "_") for c in cols]
df.columns = standardize_cols(df.columns)

# 2) harmonize names you’ll use later
#    your file has 'measure_name' (codes like READM-30-AMI-HRRP) and 'excess_readmission_ratio'
rename_map = {
    "measure_name": "measure_id",                 # put the code into a stable name
    "excess_readmission_ratio": "score",          # main metric
}
df = df.rename(columns=rename_map)

# 3) if no 'score' yet, try predicted rate as a fallback
if "score" not in df.columns:
    if "predicted_readmission_rate" in df.columns:
        df["score"] = pd.to_numeric(df["predicted_readmission_rate"], errors="coerce")
    else:
        raise ValueError("No metric found. Expected 'excess_readmission_ratio' or 'predicted_readmission_rate'.")

# quick peek
df.columns.tolist(), df.head(3)



(['facility_name',
  'facility_id',
  'state',
  'measure_id',
  'number_of_discharges',
  'footnote',
  'score',
  'predicted_readmission_rate',
  'expected_readmission_rate',
  'number_of_readmissions',
  'start_date',
  'end_date'],
                      facility_name facility_id state          measure_id  \
 0  SOUTHEAST HEALTH MEDICAL CENTER       10001    AL   READM-30-AMI-HRRP   
 1  SOUTHEAST HEALTH MEDICAL CENTER       10001    AL  READM-30-CABG-HRRP   
 2  SOUTHEAST HEALTH MEDICAL CENTER       10001    AL    READM-30-HF-HRRP   
 
   number_of_discharges footnote   score predicted_readmission_rate  \
 0                  296      NaN  0.9483                    13.0146   
 1                  151      NaN  0.9509                     9.6899   
 2                  681      NaN  1.0597                    21.5645   
 
   expected_readmission_rate number_of_readmissions  start_date    end_date  
 0                   13.7235                     36  07/01/2020  06/30/2023  
 1          

In [12]:
if "measure_id" in df.columns:
    df = df[df["measure_id"].str.contains("READM-30", na=False)].copy()

# 2) keep sensible columns when present
wanted = [
    "facility_name","facility_id","state","measure_id",
    "score","predicted_readmission_rate","expected_readmission_rate",
    "number_of_discharges","number_of_readmissions",
    "start_date","end_date","footnote"
]
df = df[[c for c in wanted if c in df.columns]].copy()
print("After subset:", df.shape)
df.head(3)

After subset: (18510, 12)


Unnamed: 0,facility_name,facility_id,state,measure_id,score,predicted_readmission_rate,expected_readmission_rate,number_of_discharges,number_of_readmissions,start_date,end_date,footnote
0,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-AMI-HRRP,0.9483,13.0146,13.7235,296,36,07/01/2020,06/30/2023,
1,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-CABG-HRRP,0.9509,9.6899,10.1898,151,13,07/01/2020,06/30/2023,
2,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-HF-HRRP,1.0597,21.5645,20.3495,681,151,07/01/2020,06/30/2023,


In [13]:

for col in ["score","predicted_readmission_rate","expected_readmission_rate",
            "number_of_discharges","number_of_readmissions"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# dates
for col in ["start_date","end_date"]:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")

# facility_id: keep leading zeros (and remove trailing '.0' if came from excel)
if "facility_id" in df.columns:
    df["facility_id"] = (
        df["facility_id"].astype(str)
                          .str.replace(".0","", regex=False)
                          .str.zfill(6)
    )

# optional: create a risk band if enough numeric values exist
if df["score"].notna().sum() >= 4:
    df["risk_band"] = pd.qcut(df["score"], q=4, labels=["Low","Moderate","Elevated","High"])

df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18510 entries, 0 to 18509
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   facility_name               18510 non-null  object        
 1   facility_id                 18510 non-null  object        
 2   state                       18510 non-null  object        
 3   measure_id                  18510 non-null  object        
 4   score                       11927 non-null  float64       
 5   predicted_readmission_rate  11927 non-null  float64       
 6   expected_readmission_rate   11927 non-null  float64       
 7   number_of_discharges        8340 non-null   float64       
 8   number_of_readmissions      8121 non-null   float64       
 9   start_date                  18510 non-null  datetime64[ns]
 10  end_date                    18510 non-null  datetime64[ns]
 11  footnote                    6583 non-null   object    

Unnamed: 0,facility_name,facility_id,state,measure_id,score,predicted_readmission_rate,expected_readmission_rate,number_of_discharges,number_of_readmissions,start_date,end_date,footnote,risk_band
0,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-AMI-HRRP,0.9483,13.0146,13.7235,296.0,36.0,2020-07-01,2023-06-30,,Low
1,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-CABG-HRRP,0.9509,9.6899,10.1898,151.0,13.0,2020-07-01,2023-06-30,,Low
2,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-HF-HRRP,1.0597,21.5645,20.3495,681.0,151.0,2020-07-01,2023-06-30,,High


In [14]:
def to_num(s):
    return pd.to_numeric(s, errors="coerce")

for col in ["score","denominator","lower_estimate","higher_estimate"]:
    if col in df.columns:
        df[col] = to_num(df[col])

# Ensure facility_id is string (often leading zeros)
if "facility_id" in df.columns:
    df["facility_id"] = df["facility_id"].astype(str).str.zfill(6)


In [15]:
for col in ["facility_name","state","county_name","measure_id","measure_name","compared_to_national"]:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()

for col in ["start_date","end_date"]:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")


In [16]:
measure_map = {
    "READM-30-AMI-HRRP": "Acute Myocardial Infarction",
    "READM-30-HF-HRRP": "Heart Failure",
    "READM-30-PN-HRRP": "Pneumonia",
    "READM-30-COPD-HRRP": "COPD",
    "READM-30-CABG-HRRP": "CABG",
    "READM-30-HIP-KNEE-HRRP": "Hip/Knee Replacement"
}

if "measure_id" in df.columns:
    df["condition"] = df["measure_id"].map(measure_map).fillna(df.get("measure_name", "Other"))
else:
    df["condition"] = df.get("measure_name", "Other")

df["condition"].value_counts().head(10)


condition
Acute Myocardial Infarction    3085
CABG                           3085
Heart Failure                  3085
Hip/Knee Replacement           3085
Pneumonia                      3085
COPD                           3085
Name: count, dtype: int64

In [17]:
# Risk band using quantiles (only where score present)
if "score" in df.columns:
    df["risk_band"] = pd.qcut(df["score"], q=4, labels=["Low","Moderate","Elevated","High"])
else:
    df["risk_band"] = np.nan

# Drop exact duplicates just in case
df = df.drop_duplicates().reset_index(drop=True)
df.sample(5)


Unnamed: 0,facility_name,facility_id,state,measure_id,score,predicted_readmission_rate,expected_readmission_rate,number_of_discharges,number_of_readmissions,start_date,end_date,footnote,risk_band,condition
732,HONORHEALTH DEER VALLEY MEDICAL CENTER,30092,AZ,READM-30-HF-HRRP,1.0618,19.9969,18.8331,309.0,66.0,2020-07-01,2023-06-30,,High,Heart Failure
8561,MUNSON MEDICAL CENTER,230097,MI,READM-30-AMI-HRRP,0.8508,11.1354,13.088,517.0,50.0,2020-07-01,2023-06-30,,Low,Acute Myocardial Infarction
16857,MEDICAL COLLEGE OF VIRGINIA HOSPITALS,490032,VA,READM-30-COPD-HRRP,0.9919,18.2329,18.3814,85.0,15.0,2020-07-01,2023-06-30,,Moderate,COPD
6169,COMMUNITY HOSPITAL OF ANDERSON AND MADISON COUNTY,150113,IN,READM-30-HIP-KNEE-HRRP,0.9936,5.314,5.348,,,2020-07-01,2023-06-30,,Moderate,Hip/Knee Replacement
8986,ABBOTT NORTHWESTERN HOSPITAL,240057,MN,READM-30-CABG-HRRP,0.9322,9.0095,9.6652,,,2020-07-01,2023-06-30,,Low,CABG


In [18]:
summary = {
    "rows": len(df),
    "null_score_pct": round(df["score"].isna().mean()*100, 2) if "score" in df.columns else None,
    "date_range": (
        df["start_date"].min(), df["end_date"].max()
    ) if "start_date" in df.columns and "end_date" in df.columns else ("N/A","N/A"),
    "states": df["state"].nunique() if "state" in df.columns else None,
    "hospitals": df["facility_id"].nunique() if "facility_id" in df.columns else None
}
summary


{'rows': 18510,
 'null_score_pct': np.float64(35.56),
 'date_range': (Timestamp('2020-07-01 00:00:00'),
  Timestamp('2023-06-30 00:00:00')),
 'states': 51,
 'hospitals': 3085}

In [19]:
df.to_csv(CLEAN_FILE, index=False)
CLEAN_FILE, CLEAN_FILE.exists(), pd.read_csv(CLEAN_FILE, nrows=3).shape


(PosixPath('data/cleaned_readmissions.csv'), True, (3, 14))

In [21]:
OUTPUT = DATA_DIR / "cleaned_readmissions.csv"
df.to_csv(OUTPUT, index=False)
print("Saved cleaned dataset to:", OUTPUT.resolve())

# sanity check
pd.read_csv(OUTPUT, nrows=3).head(3)


Saved cleaned dataset to: /Users/meghanakurapati/Documents/projects/healthcare-readmission-analytics/data/cleaned_readmissions.csv


Unnamed: 0,facility_name,facility_id,state,measure_id,score,predicted_readmission_rate,expected_readmission_rate,number_of_discharges,number_of_readmissions,start_date,end_date,footnote,risk_band,condition
0,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-AMI-HRRP,0.9483,13.0146,13.7235,296.0,36.0,2020-07-01,2023-06-30,,Low,Acute Myocardial Infarction
1,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-CABG-HRRP,0.9509,9.6899,10.1898,151.0,13.0,2020-07-01,2023-06-30,,Low,CABG
2,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-HF-HRRP,1.0597,21.5645,20.3495,681.0,151.0,2020-07-01,2023-06-30,,High,Heart Failure
