In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

# 1. Load RAW data
df = pd.read_csv("../data/raw/medical_appointment_data.csv")
print("Initial shape:", df.shape)

# 2. Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# 3. CLEAN TARGET FIRST
print("\nBefore cleaning no_show:")
print(df["no_show"].value_counts(dropna=False))

df["no_show"] = df["no_show"].astype(str).str.strip().str.lower()
df["no_show"] = df["no_show"].replace({"no": 0, "yes": 1})
df["no_show"] = pd.to_numeric(df["no_show"], errors="coerce")

df = df.dropna(subset=["no_show"])
df["no_show"] = df["no_show"].astype(int)

print("\nAfter cleaning no_show:")
print(df["no_show"].value_counts())
print("Shape after target cleaning:", df.shape)

# 4. Feature cleaning
df["age"] = df["age"].fillna(df["age"].median())
df["specialty"] = df["specialty"].fillna("unknown")
df["place"] = df["place"].fillna("unknown")

# 5. Date features
df["appointment_date"] = pd.to_datetime(df["appointment_date_continuous"])
df["weekday"] = df["appointment_date"].dt.weekday
df["month"] = df["appointment_date"].dt.month

# 6. Encode categoricals
cat_cols = df.select_dtypes(include="object").columns
cat_cols = [c for c in cat_cols if c != "no_show"]

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# 7. Save cleaned data
os.makedirs("../data/processed", exist_ok=True)
df.to_csv("../data/processed/cleaned_data.csv", index=False)

print("\n✅ CLEANED DATA SAVED")
print("Final cleaned shape:", df.shape)


Initial shape: (109593, 26)

Before cleaning no_show:
no_show
no     74761
yes    34832
Name: count, dtype: int64

After cleaning no_show:
no_show
0    74761
1    34832
Name: count, dtype: int64
Shape after target cleaning: (109593, 26)


  df["no_show"] = df["no_show"].replace({"no": 0, "yes": 1})



✅ CLEANED DATA SAVED
Final cleaned shape: (109593, 29)


In [6]:
clean_df = pd.read_csv("../data/processed/cleaned_data.csv")

print(clean_df.shape)
print(clean_df["no_show"].value_counts())


(109593, 29)
no_show
0    74761
1    34832
Name: count, dtype: int64
