In [8]:
import pandas as pd
import numpy as np

# ============================
# 1. LOAD DATA
# ============================
# Raw beneficiary extract; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="Train_Beneficiarydata-1542865627584.csv")

# ============================
# 2. FIX COUNTY MISSING VALUES
# ============================

# County code 0 stands for "missing". Record that as a 0/1 indicator column;
# the County values themselves are left untouched by this step.
df["county_missing"] = df["County"].eq(0).astype(int)

# ============================
# 3. FLAG DEAD OR NOT (DOD)
# ============================

# DOD may be NaN or a zero placeholder (0 / "0") when no death is recorded.
# notna() alone would flag the placeholder rows as deceased, so they are
# excluded explicitly. is_dead is 1 only when DOD holds some other value.
df["is_dead"] = (df["DOD"].notna() & ~df["DOD"].isin([0, "0"])).astype(int)

# ============================
# 4. FIX BIRTH YEAR AND CREATE AGE
# ============================

# Parse DOB in place; values that cannot be parsed become NaT instead of
# raising.
df["DOB"] = pd.to_datetime(df["DOB"], errors="coerce")

# Age is 2016 minus the birth year, bounded to the range [0, 110].
# Rows whose DOB did not parse end up with a missing Age.
# NOTE(review): 2016 is hard-coded as the dataset year — confirm it matches
# the period this extract actually covers.
df["Age"] = (2016 - df["DOB"].dt.year).clip(lower=0, upper=110)

# ============================
# 5. CLEAN GENDER FIELD
# ============================

# Known valid values = 1 (Male), 2 (Female); 0 is treated as missing and
# imputed with the most frequent remaining value.
gender_dtype = df["Gender"].dtype
gender = df["Gender"].replace({0: np.nan})
gender_mode = gender.mode()

# mode() is empty when every value is missing; indexing it with [0] would
# surface as a bare IndexError, so fail with an explicit message instead.
if gender_mode.empty:
    raise ValueError("Gender has no valid values to impute the mode from")

gender = gender.fillna(gender_mode[0])

# Introducing NaN promotes an integer column to float (1 -> 1.0). Once the
# gaps are filled, cast back so the saved dtype does not depend on whether
# any zeros happened to be present.
if pd.api.types.is_integer_dtype(gender_dtype):
    gender = gender.astype(gender_dtype)

df["Gender"] = gender

# ============================
# 6. CLEAN STATE AND COUNTY
# ============================

# Turn the 0 code into NaN in each geography column. The columns are handled
# one at a time so that each keeps its own dtype.
for geo_col in ("State", "County"):
    df[geo_col] = df[geo_col].replace({0: np.nan})

# ============================
# 7. CHRONIC CONDITIONS → Convert missing to 0/1
# ============================

chronic_cols = [
    "RenalDiseaseIndicator",
    "ChronicCond_Alzheimer",
    "ChronicCond_Heartfailure",
    "ChronicCond_KidneyDisease",
    "ChronicCond_Cancer",
    "ChronicCond_ObstrPulmonary",
    "ChronicCond_Depression",
    "ChronicCond_Diabetes",
    "ChronicCond_IschemicHeart",
    "ChronicCond_Osteoporasis",
    "ChronicCond_rheumatoidarthritis",
    "ChronicCond_stroke",
]

# Per column: "Y" becomes 1, missing entries become 0, then the column is
# cast to int. Every other value is handed to astype(int) as it is.
# NOTE(review): if the ChronicCond_* flags are coded 1 = yes / 2 = no, the
# 2s stay 2 here, so the result is not strictly 0/1 — confirm the coding.
for col in chronic_cols:
    df[col] = df[col].replace({"Y": 1}).fillna(0).astype(int)

# ============================
# 8. REORDER COLUMNS (Optional)
# ============================

# These columns lead the frame; all remaining columns follow in their
# existing order.
cols_front = [
    "BeneID", "DOB", "DOD", "Age", "Gender",
    "State", "County", "county_missing", "is_dead",
]

other_cols = list(filter(lambda name: name not in cols_front, df.columns))

df = df.loc[:, cols_front + other_cols]

# ============================
# 9. SAVE CLEAN VERSION
# ============================

# index=False keeps the row index out of the written file.
df.to_csv(path_or_buf="Train_Beneficiarydata_CLEAN.csv", index=False)

print("✔ Beneficiary dataset cleaned successfully!")


✔ Beneficiary dataset cleaned successfully!


In [5]:
import pandas as pd
import numpy as np

# ============================
# 1. LOAD INPATIENT DATA
# ============================
# Raw inpatient claims, read from the data/ folder under the working directory.
df_inpatient = pd.read_csv(
    filepath_or_buffer="data/Train_Inpatientdata-1542865627584.csv"
)

print(f"Original shape: {df_inpatient.shape}")

# ============================
# 2. DROP ClmProcedureCode_6 (100% NULL)
# ============================
# The column is removed whenever it is present. Nothing here re-checks that
# it is actually empty, and a file without the column is accepted as is.
if 'ClmProcedureCode_6' in df_inpatient.columns:
    df_inpatient = df_inpatient.drop(columns=['ClmProcedureCode_6'])

# ============================
# 3. HANDLE MISSING PHYSICIAN VALUES
# ============================
def _fill_blank(values, placeholder):
    """Return ``values`` with NaN and empty-string entries set to ``placeholder``.

    Args:
        values: pandas Series to clean; the input Series is not modified.
        placeholder: Value substituted for every NaN and every "" entry.

    Returns:
        A new Series with both substitutions applied.
    """
    return values.fillna(placeholder).replace("", placeholder)


# NOTE(review): pandas.read_csv treats the literal text "NA" as missing by
# default, so the "NA" placeholder used in sections 3 and 4 reads back as NaN
# from the saved CSV unless the reader passes keep_default_na=False.
physician_cols = ['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']

# Columns absent from the frame are skipped rather than created.
for col in physician_cols:
    if col in df_inpatient.columns:
        df_inpatient[col] = _fill_blank(df_inpatient[col], "NA")

# ============================
# 4. HANDLE MISSING DIAGNOSIS CODES
# ============================
# Same treatment for ClmDiagnosisCode_1 .. ClmDiagnosisCode_10.
diagnosis_cols = [f'ClmDiagnosisCode_{i}' for i in range(1, 11)]

for col in diagnosis_cols:
    if col in df_inpatient.columns:
        df_inpatient[col] = _fill_blank(df_inpatient[col], "NA")

# ============================
# 5. HANDLE ClmAdmitDiagnosisCode
# ============================
# This column uses the placeholder "Missing" (not "NA") and is required:
# a frame without it raises KeyError here.
df_inpatient["ClmAdmitDiagnosisCode"] = _fill_blank(
    df_inpatient["ClmAdmitDiagnosisCode"], "Missing"
)

# ============================
# 6. HANDLE MISSING PROCEDURE CODES
# ============================
# ClmProcedureCode_1 .. ClmProcedureCode_5: a missing code becomes 0, which
# stands for "no procedure code", and the column is then cast to int.
procedure_cols = [f'ClmProcedureCode_{i}' for i in range(1, 6)]

for col in procedure_cols:
    if col not in df_inpatient.columns:
        continue  # tolerate files that lack one of the columns
    df_inpatient[col] = df_inpatient[col].fillna(0).astype(int)

# ============================
# 7. HANDLE DEDUCTIBLE AMOUNT PAID
# ============================
# Per the analysis note for this step, every non-null DeductibleAmtPaid in
# this file equals 1068.0, so missing entries are filled with that amount
# and the column is stored as int.
# NOTE(review): astype(int) would truncate any fractional amount — confirm
# the single-value observation still holds before reusing on other extracts.
df_inpatient["DeductibleAmtPaid"] = (
    df_inpatient["DeductibleAmtPaid"]
    .fillna(1068.0)
    .astype(int)
)

# ============================
# 8. CONVERT DATE COLUMNS TO DATETIME
# ============================
# Parse the claim and admission date columns. Values that cannot be parsed
# become NaT (errors='coerce'); columns missing from the frame are skipped.
date_cols = [
    'ClaimStartDt',
    'ClaimEndDt',
    'AdmissionDt',
    'DischargeDt',
]

for col in date_cols:
    if col not in df_inpatient.columns:
        continue
    df_inpatient[col] = pd.to_datetime(df_inpatient[col], errors='coerce')

# ============================
# 9. CREATE DERIVED FEATURES
# ============================
# Both durations are inclusive day counts (end - start + 1) with a floor of 1,
# so a negative span from bad dates is reported as 1 rather than as a
# negative number. A row with a missing date gets a missing duration.

# Days covered by the claim.
df_inpatient['ClaimDuration'] = (
    (df_inpatient['ClaimEndDt'] - df_inpatient['ClaimStartDt']).dt.days + 1
).clip(lower=1)

# Days between admission and discharge.
df_inpatient['AdmissionDuration'] = (
    (df_inpatient['DischargeDt'] - df_inpatient['AdmissionDt']).dt.days + 1
).clip(lower=1)

# ============================
# 10. VALIDATE DATA QUALITY
# ============================
# Keep only the first row for each ClaimID (keep='first' is the default).
df_inpatient = df_inpatient.drop_duplicates(subset='ClaimID')

# ============================
# 11. SAVE CLEANED DATA
# ============================
# index=False keeps the row index out of the written file.
df_inpatient.to_csv(
    path_or_buf="data/Train_Inpatientdata_CLEAN.csv",
    index=False,
)

print(f"✔ Inpatient dataset cleaned successfully! Rows: {len(df_inpatient)}")
print(f"Final shape: {df_inpatient.shape}")

Original shape: (40474, 30)
✔ Inpatient dataset cleaned successfully! Rows: 40474
Final shape: (40474, 31)
