In [1]:
import pandas as pd
import numpy as np

In [3]:
pd.set_option("display.max_columns", 100)

### Load raw dataset

In [3]:
RAW_PATH = "../data/raw/BDIR61FL.csv"

df_raw = pd.read_csv(RAW_PATH)  # df_raw = OG copy
df = df_raw.copy()              # df = working copy

### Shape 

In [4]:
df.shape


(17842, 1827)

In [5]:
df.head()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17842 entries, 0 to 17841
Columns: 1827 entries, caseid to s541y_3
dtypes: float64(1599), int64(224), object(4)
memory usage: 248.7+ MB


### Null Columns 

In [6]:
null_cols = df.columns[df.isnull().all()]
len(null_cols)


0

### Columns with only ONE unique value

In [7]:
constant_cols = df.columns[df.nunique() <= 1]
len(constant_cols)


390

In [8]:
df.drop(columns=constant_cols, inplace=True)


### Identity Info drop

In [9]:
id_like_cols = [
    "caseid", "hhid", "cluster", "line", "v001", "v002", "v003", "v004", "v005", "v006", "v007","v008","v009"
]

df.drop(columns=id_like_cols, errors="ignore", inplace=True)


### Checkpoint_1

In [11]:
CLEAN_PATH = "../data/processed/01_basic_cleaned.csv"
df.to_csv(CLEAN_PATH, index=False)


KeyboardInterrupt: 

### Sanity check

In [10]:
print("Before:", df_raw.shape)
print("After :", df.shape)


Before: (17842, 1827)
After : (17842, 1428)


## PHASE 2: Feature Decoding and Interpretation


### Inspect column names

In [8]:
df.columns[:50]


Index(['v000', 'v010', 'v011', 'v012', 'v013', 'v014', 'v015', 'v016', 'v017',
       'v018', 'v019', 'v019a', 'v020', 'v021', 'v022', 'v023', 'v024', 'v025',
       'v026', 'v027', 'v028', 'v029', 'v030', 'v031', 'v032', 'v034', 'v040',
       'v042', 'v101', 'v102', 'v106', 'v107', 'v113', 'v115', 'v116', 'v119',
       'v120', 'v121', 'v122', 'v123', 'v124', 'v127', 'v128', 'v129', 'v130',
       'v133', 'v134', 'v135', 'v136', 'v137'],
      dtype='object')

### Search anima variables 

In [10]:
[col for col in df.columns if "hb" in col.lower() or "anem" in col.lower()]


['shb1_1',
 'shb1_2',
 'shb1_3',
 'shb1_4',
 'shb1d_1',
 'shb1d_2',
 'shb1d_3',
 'shb1d_4',
 'shb1m_1',
 'shb1m_2',
 'shb1m_3',
 'shb1m_4',
 'shb1y_1',
 'shb1y_2',
 'shb1y_3',
 'shb1y_4',
 'shb2_1',
 'shb2_2',
 'shb2_3',
 'shb2_4',
 'shb2d_1',
 'shb2d_2',
 'shb2d_3',
 'shb2d_4',
 'shb2m_1',
 'shb2m_2',
 'shb2m_3',
 'shb2m_4',
 'shb2y_1',
 'shb2y_2',
 'shb2y_3',
 'shb2y_4',
 'shb3_1',
 'shb3_2',
 'shb3_3',
 'shb3_4',
 'shb3d_1',
 'shb3d_2',
 'shb3d_3',
 'shb3d_4',
 'shb3m_1',
 'shb3m_2',
 'shb3m_3',
 'shb3m_4',
 'shb3y_1',
 'shb3y_2',
 'shb3y_3',
 'shb3y_4']

### Target Variable


Hemoglobin Level	V453 / HA53	                  
Anemia Status	V457 / HA57	Anemia level                     
we will name 'hb' for level and 'anemia' for binary 

In [11]:
df["hb"] = df["v453"].replace(999, pd.NA)


In [13]:
df["hb"].describe()


count     5727.0
unique     104.0
top        123.0
freq       191.0
Name: hb, dtype: float64

### Create binary anemia label

In [12]:
df["anemia"] = (df["hb"] < 12.0).astype(int) #non-pregnant women


In [15]:
df["anemia"] = np.where(                                #pregnant women       
    (df["pregnant"] == 1) & (df["hb"] < 11.0), 1,
    np.where(df["hb"] < 12.0, 1, 0)
) 

KeyError: 'pregnant'

### compare v and h

In [15]:
df[["hb", "v457"]].dropna().head()


Unnamed: 0,hb,v457
5,11.0,3.0
9,13.0,4.0
17,10.4,3.0
20,13.6,4.0
26,11.8,3.0


### fix g/dl

In [14]:
df["hb"] = df["hb"].replace(994, pd.NA) / 10


## Phase 2.6 to 2.8 Feature Selection  and Encoding  

https://docs.google.com/spreadsheets/d/1OCGH5YABR8Lu_VlW97hbYaG_agCxWnVgf20BlLgSswU/edit?gid=2052048461#gid=2052048461

### Panic button

In [16]:
df_full = df.copy()


### DROP COLUMNS

In [20]:

keep_cols = [
    "hb",          # target source 
    # --- A. Demographic Characteristics ---
    'v012', # Age
    'v501', # Marital Status
    'v025', # Residence (Urban/Rural)
    'v024', # Region
    'v136', # Household Size
    'v511', # Age at 1st Marriage
    'v218', # Number of Living Children
    'v222', # Birth Interval
    'v228', # Terminated pregnancy

    # --- B. Socio-Economic Status ---
    'v106', # Education Level
    'v133', # Years of Schooling
    'v190', # Wealth Index
    'v714', # Employment Status
    'v716', # Occupation
    'v119', # Electricity
    'v127', # Main Floor Material
    'v129', # Main Roof Material
    'v128', # Main Wall Material
    'v121', # Has Television
    'v153', # Has Telephone/Mobile (Check if v169a is better)
    
    # --- C. Lifestyle & Behavioral Factors ---
    'v161', # Cooking Fuel
    
    'v157', # Frequency reading newspaper
    'v158', # Frequency listening to radio
    'v159', # Frequency watching TV
    'v312', # Current Contraceptive Method

    # --- D. Environmental & Sanitation ---
    'v113', # Source of Drinking Water
    'v116', # Type of Toilet Facility
    'v160', # Shared Toilet
    'v115', # Time to get to Water Source
    'v040', # Cluster Altitude (meters)

    # --- E. Health & Nutrition Indicators ---
    'v445', # Body Mass Index (BMI)
    'v213', # Pregnancy Status
    'v404', # Currently Breastfeeding
    'v215', # Time since last menstrual period
]



### Subset df

In [22]:
df = df[keep_cols]


### Sanity Check

In [4]:
df.shape
df.head()
df.isnull().sum().sort_values(ascending=False)


occupation              15149
hb                      12137
birth_interval           1817
shared_toilet             606
age                         0
marital_status              0
household_size              0
age_first_marriage          0
living_children             0
terminated_pregnancy        0
residence                   0
education_level             0
years_schooling             0
wealth_index                0
employment_status           0
has_electricity             0
floor_material              0
roof_material               0
region                      0
wall_material               0
has_tv                      0
cooking_fuel                0
has_telephone               0
freq_radio                  0
freq_tv                     0
contraceptive_method        0
freq_newspaper              0
water_source                0
toilet_type                 0
time_to_water               0
altitude                    0
bmi                         0
is_pregnant                 0
is_breastf

### Rename Col

In [26]:
df.rename(columns={
    # --- A. Demographic Characteristics ---
    "v012": "age",
    "v501": "marital_status",
    "v025": "residence",
    "v024": "region",
    "v136": "household_size",
    "v511": "age_first_marriage",
    "v218": "living_children",
    "v222": "birth_interval",
    "v228": "terminated_pregnancy",

    # --- B. Socio-Economic Status ---
    "v106": "education_level",
    "v133": "years_schooling",
    "v190": "wealth_index",
    "v714": "employment_status",
    "v716": "occupation",
    "v119": "has_electricity",
    "v127": "floor_material",
    "v129": "roof_material",
    "v128": "wall_material",
    "v121": "has_tv",
    "v153": "has_telephone",       # Note: v153 is often landline. Check if v169a (mobile) is available/better.

    # --- C. Lifestyle & Behavioral Factors ---
    "v161": "cooking_fuel",
    "v157": "freq_newspaper",
    "v158": "freq_radio",
    "v159": "freq_tv",
    "v312": "contraceptive_method",

    # --- D. Environmental & Sanitation ---
    "v113": "water_source",
    "v116": "toilet_type",
    "v160": "shared_toilet",
    "v115": "time_to_water",
    "v040": "altitude",

    # --- E. Health & Nutrition Indicators ---
    "v445": "bmi",
    "v213": "is_pregnant",
    "v404": "is_breastfeeding",
    "v215": "time_since_period",
    
    # --- Target Variables (If you haven't renamed them yet) ---
    "v453": "hg",
    
}, inplace=True)

### Checkpoint 2

In [27]:
df.to_csv("../data/processed/feature_reduced.csv", index=False)


In [3]:
PROC_PATH = "../data/processed/feature_reduced.csv"

df = pd.read_csv(PROC_PATH)             # df = working copy

In [5]:
df.shape


(17842, 35)

### Remove Noise or redundancy 

In [6]:
cols_to_drop = [
    "iron_supplementation",
    "birth_interval",
    "terminated_pregnancy",
    "time_since_period",
    "years_schooling"
]


In [7]:
df.drop(columns=cols_to_drop, inplace=True, errors="ignore")


### Sanity check

In [15]:
df.columns
df.shape


(17842, 31)

### Checkpoint 3

In [9]:
df.to_csv("../data/processed/feature_reduced_2.csv", index=False)


### RollBack

In [10]:
df_before_missing = df.copy()


### Count Missing value per Col

In [11]:
missing_counts = df.isna().sum().sort_values(ascending=False)
missing_percent = (missing_counts / len(df)) * 100

missing_summary = pd.DataFrame({
    "missing_count": missing_counts,
    "missing_percent": missing_percent
})

missing_summary


Unnamed: 0,missing_count,missing_percent
occupation,15149,84.906401
hb,12137,68.024885
shared_toilet,606,3.39648
residence,0,0.0
region,0,0.0
household_size,0,0.0
age,0,0.0
age_first_marriage,0,0.0
living_children,0,0.0
wealth_index,0,0.0


In [None]:
cols_to_drop = [
    "occupation",
]

In [12]:
df.drop(columns=cols_to_drop, inplace=True, errors="ignore")


### Drop rows with missing target  

In [16]:
df.columns
df.shape

(17842, 31)

In [17]:
df = df.dropna(subset=["hb"])


In [18]:
df.columns
df.shape

(5705, 31)

### CREATE BINARY LABEL

In [19]:
df["anemia"] = (df["hb"] < 12.0).astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["anemia"] = (df["hb"] < 12.0).astype(int)


### Checkpoint 4

In [20]:
df.to_csv("../data/processed/feature_reduced_3.csv", index=False)


In [None]:
# phase 4 STEP 5.1. (start from here tomorrow)

In [3]:
PROC_PATH = "../data/processed/feature_reduced_3.csv"

df = pd.read_csv(PROC_PATH)             # df = working copy

In [8]:
df.columns
df.shape

(5705, 31)

In [5]:
cols_to_drop = [
    "occupation",
]

In [6]:
df = df.drop(columns=cols_to_drop)

### Checkpoint 4

In [7]:
df.to_csv("../data/processed/processed_4.csv", index=False)


In [2]:
PROC_PATH = "../data/processed/processed_4.csv"

df = pd.read_csv(PROC_PATH)             # df = working copy

In [3]:
df.shape

(5705, 31)

### Baseline

In [4]:
df.dtypes


hb                      float64
age                       int64
marital_status            int64
residence                 int64
region                    int64
household_size            int64
age_first_marriage        int64
living_children           int64
education_level           int64
wealth_index              int64
employment_status         int64
has_electricity           int64
floor_material            int64
roof_material             int64
wall_material             int64
has_tv                    int64
has_telephone             int64
cooking_fuel              int64
freq_newspaper            int64
freq_radio                int64
freq_tv                   int64
contraceptive_method      int64
water_source              int64
toilet_type               int64
shared_toilet           float64
time_to_water             int64
altitude                  int64
bmi                       int64
is_pregnant               int64
is_breastfeeding          int64
anemia                    int64
dtype: o

### FINAL FEATURE CATEGORIZATION

In [21]:

# 1. Numerical Features (Continuous or Counts)
# Strategy: Impute missing with Median/Mean -> StandardScaler
numeric_features = [
    "age",                  # v012
    "household_size",       # v136
    "age_first_marriage",   # v511
    "living_children",      # v218
    "years_schooling",      # v133
    "time_to_water",        # v115 (Ensure 996 is recoded to 0 first)
    "altitude",             # v040
    "bmi"                   # v445
]

# 2. Ordinal Categorical Features (Rank/Order matters)
# Strategy: OrdinalEncoder (Preserve 0<1<2 hierarchy)
ordinal_features = [
    "education_level",      # v106 (No < Pri < Sec < Higher)
    "wealth_index",         # v190 (Poorest < ... < Richest)
    "freq_newspaper",       # v157 (Not at all < ... < Often)
    "freq_radio",           # v158
    "freq_tv"               # v159
]

# 3. Nominal Categorical Features (No inherent order)
# Strategy: OneHotEncoder (get_dummies)
nominal_features = [
    "marital_status",       # v501 (Married, Widowed, Divorced...)
    "residence",            # v025 (Urban, Rural) - Technically binary but safe here
    "region",               # v024
    "floor_material",       # v127
    "roof_material",        # v129
    "wall_material",        # v128
    "cooking_fuel",         # v161
    "water_source",         # v113
    "toilet_type",          # v116
    "contraceptive_method", # v312
]

# 4. Binary Features (True/False or 0/1)
# Strategy: Simple Mapping (No=0, Yes=1) -> Mode Imputation
binary_features = [
    "employment_status",    # v714
    "has_electricity",      # v119
    "has_tv",               # v121
    "has_telephone",        # v153
    "is_pregnant",          # v213
    "is_breastfeeding",     # v404
    "shared_toilet",        # v160
]


### Sanity check ‚Äî did we miss

In [7]:
all_features = (
    numeric_features +
    ordinal_features +
    nominal_features +
    binary_features +
    ["hb"]  # target
)

set(df.columns) - set(all_features)


{'anemia'}

### Define Targets 

In [22]:
target_columns = ["anemia", "hb"]


In [23]:
all_features = (
    numeric_features +
    ordinal_features +
    nominal_features +
    binary_features +
    target_columns
)

set(df.columns) - set(all_features)


set()

### Lock feature types

In [24]:
import json

feature_groups = {
    "numeric": numeric_features,
    "ordinal": ordinal_features,
    "nominal": nominal_features,
    "binary": binary_features,
    "target": target_columns
}

with open("feature_groups.json", "w") as f:
    json.dump(feature_groups, f, indent=4)


### DHS missing codes

In [14]:
# 1. Handle Special DHS Codes FIRST (Valid Data)
df['time_to_water'] = df['time_to_water'].replace(996, 0)

# v215 (Time since period) - Complex! 
# 994=Menopause, 995=Before Last Birth, 996=Never. 
# You might want to keep these as categories or specific numbers, NOT NaNs.

# 2. Define "True Missing" mappings
missing_map = {
    # 1-digit variables (Check carefuly: is 8 a category?)
    'education_level': [9],
    'wealth_index': [], # Usually no missing
    
    # 2-digit variables
    'age_first_marriage': [99, 98],
    'years_schooling': [98, 99],
    
    # 3-digit variables
    'hemoglobin_level': [999], # v453
    'birth_interval': [999],   # v222
    
    # 4-digit variables
    'bmi': [9998, 9999],       # v445
    'altitude': [9999]         # v040
}



In [15]:
# 3. Apply the mapping
for col, missing_codes in missing_map.items():
    if col in df.columns:
        df[col] = df[col].replace(missing_codes, np.nan)

In [19]:
#verify 
df["education_level"].value_counts(dropna=False).head(10)

education_level
2    2058
1    1757
0    1462
3     428
Name: count, dtype: int64

### sanity check

In [25]:
for col in binary_features:
    print(col, df[col].dropna().unique())


employment_status [0 1]
has_electricity [0 1 7]
has_tv [0 1 7]
has_telephone [0 7 1]
is_pregnant [0 1]
is_breastfeeding [0 1]
shared_toilet [0. 7. 1.]


In [26]:
df[binary_features] = df[binary_features].replace(7, np.nan)

In [27]:
for col in binary_features:
    print(col, df[col].dropna().unique())

employment_status [0 1]
has_electricity [0. 1.]
has_tv [0. 1.]
has_telephone [0. 1.]
is_pregnant [0 1]
is_breastfeeding [0 1]
shared_toilet [0. 1.]


### Lock Binary 

In [28]:
df[binary_features] = df[binary_features].astype("float")


### Checkpoint 5

In [29]:
df.to_csv("../data/processed/processed_5.csv", index=False)


### Sanity check  

In [32]:
for col in ordinal_features:
    print(f"--- {col} ---")
    print(df[col].value_counts(dropna=False).sort_index())
    print("\n")

--- education_level ---
education_level
0    1462
1    1757
2    2058
3     428
Name: count, dtype: int64


--- wealth_index ---
wealth_index
1    1010
2    1063
3    1085
4    1208
5    1339
Name: count, dtype: int64


--- freq_newspaper ---
freq_newspaper
0    4722
1     575
2     404
9       4
Name: count, dtype: int64


--- freq_radio ---
freq_radio
0    5154
1     242
2     307
9       2
Name: count, dtype: int64


--- freq_tv ---
freq_tv
0    2175
1     693
2    2835
9       2
Name: count, dtype: int64




In [33]:
print("freq_radio" in df.columns)
print("freq_tv" in df.columns)

True
True


### Ordinal Mapping

In [34]:
# Define the strict logical order
# Any value NOT in these dictionaries will become NaN (automatically fixing your '9' problem)

ordinal_mappings = {
    "education_level": {0: 0, 1: 1, 2: 2, 3: 3},    # Enforces 0-3
    "wealth_index":    {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, # Shifts 1-5 -> 0-4
    
    # Media: 0=None, 1=Less than weekly, 2=Weekly+
    # We map 0,1,2. If your data has 3 (Daily), add 3:3. 
    # The '9' will disappear because it is not in this map.
    "freq_newspaper": {0: 0, 1: 1, 2: 2, 3: 3}, 
    "freq_radio":     {0: 0, 1: 1, 2: 2, 3: 3},
    "freq_tv":        {0: 0, 1: 1, 2: 2, 3: 3}
}


In [35]:

# Apply the mapping safely
for col, mapping in ordinal_mappings.items():
    if col in df.columns:
        print(f"Mapping {col}...")
        df[col] = df[col].map(mapping)
    else:
        print(f"Skipping {col} (not in DataFrame)")



Mapping education_level...
Mapping wealth_index...
Mapping freq_newspaper...
Mapping freq_radio...
Mapping freq_tv...


In [36]:
# Sanity Check
print("\n--- Final Checks ---")
print("Wealth unique values (should be 0-4):", df["wealth_index"].unique())
print("Newspaper unique values (should be 0-2/3, no 9):", df["freq_newspaper"].unique())


--- Final Checks ---
Wealth unique values (should be 0-4): [1 3 2 0 4]
Newspaper unique values (should be 0-2/3, no 9): [ 0.  1.  2. nan]


### Lock Ordinal 

In [38]:
df[ordinal_features] = df[ordinal_features].astype("float")
# Sanity check
print("Ordinal dtypes clean:")
print(df[ordinal_features].dtypes)

Ordinal dtypes clean:
education_level    float64
wealth_index       float64
freq_newspaper     float64
freq_radio         float64
freq_tv            float64
dtype: object


### Checkpoint 6 

In [39]:
df.to_csv("../data/processed/processed_6.csv", index=False)

### clean Numeric

In [40]:
# 1. Fix BMI (Special DHS Logic)
# Replace DHS missing codes for BMI (9998, 9999) with NaN
df["bmi"] = df["bmi"].replace([9998, 9999], np.nan)

# Convert to correct unit (div by 100) -> 2500 becomes 25.0
# Only divide valid values (NaN stays NaN)
df["bmi"] = df["bmi"] / 100.0



In [41]:
# 2. Fix other numeric missing codes
# For variables like age_first_marriage, 97+ often means inconsistent/missing
# We replace common numeric missing codes carefully
numeric_missing_codes = [99, 98, 999, 998]

# Apply ONLY to specific columns where these are definitely errors
# (Don't apply to 'age' unless you are sure no one is 98!)
cols_to_clean = ["age_first_marriage", "altitude"] # adjust based on your list

for col in cols_to_clean:
    if col in df.columns:
        df[col] = df[col].replace(numeric_missing_codes, np.nan)



In [42]:
# 3. Sanity Check
print("BMI Stats (Should be approx 12-60):")
print(df["bmi"].describe())

BMI Stats (Should be approx 12-60):
count    5685.000000
mean       21.502130
std         3.907806
min        13.070000
25%        18.600000
50%        20.870000
75%        23.910000
max        46.470000
Name: bmi, dtype: float64


In [43]:
# 2.2 Fix Time to Water
# 996 = On premises (0 minutes)
# 998/999 = Missing
if "time_to_water" in df.columns:
    df["time_to_water"] = df["time_to_water"].replace(996, 0)
    df["time_to_water"] = df["time_to_water"].replace([998, 999], np.nan)

# 2.3 Fix Altitude
# 9999 = Missing
if "altitude" in df.columns:
    df["altitude"] = df["altitude"].replace(9999, np.nan)



In [48]:
# 1. Force-update the feature lists by checking what actually exists in df
# This guarantees 'years_schooling' cannot be in this list
numeric_features = [c for c in numeric_features if c in df.columns]
ordinal_features = [c for c in ordinal_features if c in df.columns]
nominal_features = [c for c in nominal_features if c in df.columns]

print(f"Active numeric features: {len(numeric_features)}")
if "years_schooling" in numeric_features:
    print("‚ùå ERROR: years_schooling is STILL in the list.")
else:
    print("‚úÖ years_schooling is gone from the list.")

# 2. Re-define the 'other' bucket using the CLEAN list
# We exclude special columns that need specific logic
special_numerics = ["bmi", "time_to_water", "altitude", "age"]
other_numeric = [c for c in numeric_features if c not in special_numerics]


# 3.4 General Numeric Cleaning
# This used to crash because 'other_numeric' had ghost columns. 
# Now it is safe.
if other_numeric:
    print(f"Cleaning general numeric columns: {other_numeric}")
    df[other_numeric] = df[other_numeric].replace([98, 99, 998, 999], np.nan)

# 3.5 Lock all to float
df[numeric_features] = df[numeric_features].astype("float")

print("\nüéâ Numeric cleaning finished successfully.")

Active numeric features: 7
‚úÖ years_schooling is gone from the list.
Cleaning general numeric columns: ['household_size', 'age_first_marriage', 'living_children']

üéâ Numeric cleaning finished successfully.


### Nominal 

In [49]:
# Just a quick peek to ensure '99' didn't survive in a nominal column
print("Cooking fuel unique:", df["cooking_fuel"].unique()) 
# Should see codes like 1, 2, 3... or NaN. NOT 99.

Cooking fuel unique: [ 8 97 11 10  9  2  5  3  4  7  1 96  6]


In [50]:
# 1. Define the specific codes to clean
# 97 = Not a dejure resident (Missing)
# 99 = Missing (We might have missed some in Step 5.2)
nominal_missing_codes = [97, 99]

# 2. Apply ONLY to nominal features
# We keep 96 because "Other" is a valid category!
if nominal_features:
    print(f"Cleaning 97/99 from {len(nominal_features)} nominal columns...")
    df[nominal_features] = df[nominal_features].replace(nominal_missing_codes, np.nan)

# 3. Sanity Check for Cooking Fuel
# Should see: 1, 2, 3... 96 (Other). NO 97.
print("\nCooking fuel unique values (should have 96, but no 97):")
print(df["cooking_fuel"].unique())

Cleaning 97/99 from 10 nominal columns...

Cooking fuel unique values (should have 96, but no 97):
[ 8. nan 11. 10.  9.  2.  5.  3.  4.  7.  1. 96.  6.]


### FINAL SANITY CHECK

In [51]:
print("=== üè• FINAL DATASET SANITY CHECK ===\n")

# 1. Check Dimensions
print(f"Dataset Shape: {df.shape}")
print("-" * 30)

# 2. Check TARGET (Must be clean 0/1)
print(">>> TARGET CHECK: 'anemia'")
if "anemia" in df.columns:
    print(df["anemia"].value_counts(dropna=False).sort_index())
else:
    print("‚ùå ERROR: Target 'anemia' is missing!")
print("-" * 30)

# 3. Check BINARY Features
# Success Rule: Only [0.0, 1.0, nan]. No 7, 9, or other numbers.
print(f">>> BINARY FEATURES ({len(binary_features)} cols)")
# Filter list to what actually exists
safe_binary = [c for c in binary_features if c in df.columns]
for col in safe_binary:
    uniques = sorted(df[col].dropna().unique())
    # Alert if anything other than 0 or 1 exists
    if not set(uniques).issubset({0.0, 1.0}):
        print(f"‚ö†Ô∏è FLAG: {col} has unexpected values: {uniques}")
    else:
        # Optional: Print only if you want to see everything
        # print(f"OK: {col}")
        pass
print("Binary check complete. (If no flags above, all are clean 0/1).")
print("-" * 30)

# 4. Check ORDINAL Features
# Success Rule: 0.0 to 4.0 range + nan. No 9, 99.
print(f">>> ORDINAL FEATURES ({len(ordinal_features)} cols)")
safe_ordinal = [c for c in ordinal_features if c in df.columns]
for col in safe_ordinal:
    print(f"{col}: {sorted(df[col].dropna().unique())}")
print("-" * 30)

# 5. Check NOMINAL Features
# Success Rule: No 97, 98, 99. (96 "Other" is OK).
print(f">>> NOMINAL FEATURES ({len(nominal_features)} cols)")
safe_nominal = [c for c in nominal_features if c in df.columns]
for col in safe_nominal:
    # Check for forbidden codes
    bad_codes = [97, 98, 99]
    values = df[col].unique()
    found_bad = [x for x in values if x in bad_codes]
    if found_bad:
        print(f"‚ùå ERROR: {col} still has codes: {found_bad}")
    else:
        # Just print first 5 categories to verify it looks right
        print(f"{col}: Clean. (Example val: {values[0]})")
print("-" * 30)

# 6. Check NUMERIC Features
# Success Rule: Reasonable min/max. No 9999.
print(f">>> NUMERIC FEATURES ({len(numeric_features)} cols)")
safe_numeric = [c for c in numeric_features if c in df.columns]
# We use describe() to see ranges quickly
print(df[safe_numeric].describe().T[["min", "max", "count"]])
print("-" * 30)

# 7. Check Ghost Columns
# Ensure no dropped columns are lingering in your lists
all_lists = safe_binary + safe_ordinal + safe_nominal + safe_numeric
if "years_schooling" in all_lists:
    print("‚ùå ERROR: 'years_schooling' is still in a feature list!")
if "hb" in all_lists:
    print("‚ùå ERROR: 'hb' (target source) is in a feature list!")
print("Ghost column check complete.")

=== üè• FINAL DATASET SANITY CHECK ===

Dataset Shape: (5705, 31)
------------------------------
>>> TARGET CHECK: 'anemia'
anemia
0    3230
1    2475
Name: count, dtype: int64
------------------------------
>>> BINARY FEATURES (7 cols)
Binary check complete. (If no flags above, all are clean 0/1).
------------------------------
>>> ORDINAL FEATURES (5 cols)
education_level: [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0)]
wealth_index: [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0)]
freq_newspaper: [np.float64(0.0), np.float64(1.0), np.float64(2.0)]
freq_radio: [np.float64(0.0), np.float64(1.0), np.float64(2.0)]
freq_tv: [np.float64(0.0), np.float64(1.0), np.float64(2.0)]
------------------------------
>>> NOMINAL FEATURES (10 cols)
marital_status: Clean. (Example val: 1)
residence: Clean. (Example val: 2)
region: Clean. (Example val: 1)
floor_material: Clean. (Example val: 11.0)
roof_material: Clean. (Example val: 31.0)
wall_

### Checkpoint 7

In [52]:
df.to_csv("../data/processed/processed_7.csv", index=False)

## Filling Missing values 


#### fill the NaN values we created. using the standard research approach:
Numeric Features: Fill with Median (Robust to outliers).
Categorical (Binary/Nominal/Ordinal): Fill with Mode (Most frequent value).

In [53]:
from sklearn.impute import SimpleImputer

# 1. Define Imputers
# Numeric -> Median (Safe for skewed data like income/BMI)
num_imputer = SimpleImputer(strategy="median")

# Categorical -> Mode (Most Frequent)
# We treat Binary and Ordinal as categorical for imputation purposes because
# we want integers (e.g., 1.0 or 0.0), not decimals (0.45).
cat_imputer = SimpleImputer(strategy="most_frequent")

# 2. Impute Numeric Features
# Filter list to what actually exists in df to avoid crashes
numeric_features = [c for c in numeric_features if c in df.columns]
print(f"Imputing {len(numeric_features)} numeric features...")
df[numeric_features] = num_imputer.fit_transform(df[numeric_features])

# 3. Impute Categorical Features (Nominal + Ordinal + Binary)
# We combine them for this step to save code lines
categorical_group = nominal_features + ordinal_features + binary_features
# Filter to ensure we only touch columns that exist
categorical_group = [c for c in categorical_group if c in df.columns]

print(f"Imputing {len(categorical_group)} categorical features...")
df[categorical_group] = cat_imputer.fit_transform(df[categorical_group])

# 4. Final Missingness Check
total_missing = df.isnull().sum().sum()
print("-" * 30)
if total_missing == 0:
    print("‚úÖ SUCCESS: Dataset contains 0 missing values.")
else:
    print(f"‚ùå ERROR: Still found {total_missing} missing values!")
    # Optional: Print which columns are failing
    print(df.columns[df.isnull().any()])

Imputing 7 numeric features...
Imputing 22 categorical features...
------------------------------
‚úÖ SUCCESS: Dataset contains 0 missing values.


# DATASET PREPROCESSING COMPLETE  

### Checkpoint 8

In [54]:
df.to_csv("../data/processed/processed_8.csv", index=False)

In [55]:
df["is_pregnant"].unique()


array([0., 1.])

### change target for pregnant 

In [56]:
df["anemia"] = np.where(
    (df["is_pregnant"] == 1) & (df["hb"] < 11.0), 1,
    np.where(
        (df["is_pregnant"] == 0) & (df["hb"] < 12.0), 1,
        0
    )
)


In [57]:
df = df.dropna(subset=["anemia"])
df["anemia"] = df["anemia"].astype(int)


### checkpoint 9

In [58]:
df.to_csv("../data/processed/processed_9.csv", index=False)