In [8]:
import pandas as pd
import numpy as np

# ============================
# 1. LOAD DATA
# ============================
# Read the raw beneficiary table from the working directory into `df`;
# every later section mutates this frame.
df = pd.read_csv(filepath_or_buffer="Train_Beneficiarydata-1542865627584.csv")

# ============================
# 2. FIX COUNTY MISSING VALUES
# ============================

# A County code of 0 is treated as "missing". Record that as a 0/1 indicator
# column while the raw code is still present in the frame.
df["county_missing"] = df["County"].eq(0).astype(int)

# ============================
# 3. FLAG DEAD OR NOT (DOD)
# ============================

# A beneficiary is flagged as dead (1) only when DOD holds a real value.
# Missing DOD can show up either as NaN or as a zero placeholder (0 / "0"),
# so both are excluded; checking notna() alone would count a "0" as a death.
df["is_dead"] = (
    df["DOD"].notna()
    & ~df["DOD"].astype(str).str.strip().isin(["0", "0.0"])
).astype(int)

# ============================
# 4. FIX BIRTH YEAR AND CREATE AGE
# ============================

# Parse DOB in place; values that cannot be parsed become NaT.
df["DOB"] = pd.to_datetime(df["DOB"], errors="coerce")

# Age = 2016 minus birth year, limited to the range [0, 110].
# Rows whose DOB failed to parse keep a missing Age.
# NOTE(review): age is measured to the fixed year 2016 for every row,
# including rows that have a DOD — confirm this is the intended reference.
df["Age"] = df["DOB"].dt.year.rsub(2016).clip(0, 110)

# ============================
# 5. CLEAN GENDER FIELD
# ============================

# Known valid values = 1 (Male), 2 (Female); a 0 is treated as "not recorded".
df["Gender"] = df["Gender"].replace({0: np.nan})

# Impute missing genders with the most frequent remaining code.
# Series.mode() returns an EMPTY Series when every value is missing, so the
# lookup is guarded: indexing it with [0] would raise a KeyError. In that
# case the column is left as NaN rather than aborting the whole clean-up.
gender_modes = df["Gender"].mode()
if not gender_modes.empty:
    df["Gender"] = df["Gender"].fillna(gender_modes.iloc[0])

# ============================
# 6. CLEAN STATE AND COUNTY
# ============================

# Turn the 0 placeholder into a real missing value in both location codes.
# This has to run after `county_missing` is derived, because that flag
# looks for the raw 0 code in County.
for col in ("State", "County"):
    df[col] = df[col].replace(to_replace=0, value=np.nan)

# ============================
# 7. CHRONIC CONDITIONS → Convert missing to 0/1
# ============================

# Indicator columns that must end up as strict 0/1 integers.
chronic_cols = [
    "RenalDiseaseIndicator", "ChronicCond_Alzheimer", "ChronicCond_Heartfailure",
    "ChronicCond_KidneyDisease", "ChronicCond_Cancer", "ChronicCond_ObstrPulmonary",
    "ChronicCond_Depression", "ChronicCond_Diabetes", "ChronicCond_IschemicHeart",
    "ChronicCond_Osteoporasis", "ChronicCond_rheumatoidarthritis",
    "ChronicCond_stroke"
]

for col in chronic_cols:
    # "Y" → 1, missing → 0, then force an integer dtype (numeric strings such
    # as "0" are converted by the cast).
    flags = df[col].replace({"Y": 1}).fillna(0).astype(int)

    # Collapse a 1/2 coding into 0/1 so every column really is binary;
    # without this step a value of 2 would survive the clean-up unchanged.
    # NOTE(review): assumes 1 = condition present and 2 = condition absent
    # for the ChronicCond_* columns — confirm against the data dictionary.
    df[col] = flags.replace({2: 0})

# ============================
# 8. REORDER COLUMNS (Optional)
# ============================

# Identity, demographic and derived flag columns go first, in this order.
cols_front = [
    "BeneID", "DOB", "DOD", "Age", "Gender", "State", "County",
    "county_missing", "is_dead",
]

# Every remaining column follows, keeping its current relative order.
other_cols = df.columns[~df.columns.isin(cols_front)].tolist()

df = df.loc[:, cols_front + other_cols]

# ============================
# 9. SAVE CLEAN VERSION
# ============================

# Write the cleaned frame next to the input file, without the row index,
# then report completion.
df.to_csv(path_or_buf="Train_Beneficiarydata_CLEAN.csv", index=False)

print("✔ Beneficiary dataset cleaned successfully!")


✔ Beneficiary dataset cleaned successfully!
