# WHO Data Cleaning Pipeline

In [2]:
import pandas as pd
import numpy as np

### Clean BMI data

In [6]:
# Load the raw semicolon-delimited BMI percentile tables, boys first, then girls.
boys_df, girls_df = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (
        "data/bmi-boys-perc-who2007-exp.csv",
        "data/bmi-girls-perc-who2007-exp.csv",
    )
)

In [7]:
def clean_who_bmi(df, gender):
    """Return the numeric LMS rows of a raw WHO BMI table, tagged with a sex code.

    Header labels are stripped of surrounding whitespace, then the ``Month``,
    ``L``, ``M`` and ``S`` columns are parsed: values are treated as text,
    decimal commas become decimal points, spaces are removed, and anything
    that still fails to parse becomes NaN. Rows with NaN in any of the four
    columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. Must contain ``Month``, ``L``, ``M`` and ``S`` columns
        (after header stripping). The input frame is not modified.
    gender : str
        ``"M"`` is coded as ``Sex = 1``; every other value is coded as
        ``Sex = 2``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sex, Month, L, M, S``; surviving rows keep their original
        row labels.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    lms_columns = ["Month", "L", "M", "S"]

    table = df.copy()
    # Normalise padded headers (e.g. " Month") so the column lookup below
    # behaves the same way as in clean_who_lms.
    table.columns = [
        col.strip() if isinstance(col, str) else col for col in table.columns
    ]

    # Only the LMS columns are returned, so only those are parsed. The explicit
    # copy keeps the per-column assignments off a slice of `table`.
    table = table[lms_columns].copy()
    for col in lms_columns:
        as_text = (
            table[col]
            .astype(str)
            .str.replace(",", ".", regex=False)  # decimal comma -> decimal point
            .str.replace(" ", "", regex=False)
            .replace("", np.nan)
        )
        table[col] = pd.to_numeric(as_text, errors="coerce")

    table = table.dropna().assign(Sex=1 if gender == "M" else 2)
    return table[["Sex"] + lms_columns]

In [8]:
# Clean each sex separately, then stack them into one table ordered by Sex and Month.
boys_clean, girls_clean = clean_who_bmi(boys_df, "M"), clean_who_bmi(girls_df, "F")
who_bmi_df = (
    pd.concat([boys_clean, girls_clean], ignore_index=True)
    .sort_values(by=["Sex", "Month"])
    .reset_index(drop=True)
)

In [9]:
# Write the combined BMI table to disk; index=False leaves the row index out of the CSV.
who_bmi_df.to_csv("data/who_bmi_clean.csv", index=False)

In [10]:
# Preview the first rows; a bare last expression uses the notebook's rich table display.
who_bmi_df.head()

   Sex  Month       L        M        S
0    1     61 -0.7387  15.2641  0.08390
1    1     62 -0.7621  15.2616  0.08414
2    1     63 -0.7856  15.2604  0.08439
3    1     64 -0.8089  15.2605  0.08464
4    1     65 -0.8322  15.2619  0.08490


### Clean height-for-age (HFA) and weight-for-age (WFA) data

In [3]:
# Load the raw semicolon-delimited HFA tables, boys first, then girls.
hfa_boys, hfa_girl = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (
        "./data/hfa-boys-perc-who2007-exp.csv",
        "./data/hfa-girls-perc-who2007-exp.csv",
    )
)

# Same for the WFA tables.
wfa_boys, wfa_girls = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (
        "./data/wfa-boys-perc-who2007.csv",
        "./data/wfa-girls-perc-who2007.csv",
    )
)

In [4]:
def clean_who_lms(df, gender, measure_type):
    """Build a tidy LMS table from a raw WHO reference table.

    Works on a copy of ``df``. Header labels are stripped, every column is
    re-read as text (decimal commas turned into points, spaces removed, empty
    strings treated as missing) and converted to numbers, with unparseable
    values becoming NaN. Rows missing any of ``Month``, ``L``, ``M`` or ``S``
    are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with string column labels.
    gender : str
        ``"M"`` yields ``Sex = 1``; anything else yields ``Sex = 2``.
    measure_type : str
        Label stored, upper-cased, in the ``Type`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Type, Sex, Month, L, M, S``.
    """
    lms_columns = ["Month", "L", "M", "S"]

    table = df.copy()
    table.columns = [label.strip() for label in table.columns]

    for label in table.columns:
        as_text = table[label].astype(str)
        as_text = as_text.str.replace(",", ".", regex=False)
        as_text = as_text.str.replace(" ", "", regex=False)
        as_text = as_text.replace("", np.nan)
        table[label] = pd.to_numeric(as_text, errors="coerce")

    if gender == "M":
        sex_code = 1
    else:
        sex_code = 2

    complete_rows = table[lms_columns].dropna()
    tagged = complete_rows.assign(Sex=sex_code, Type=measure_type.upper())
    return tagged[["Type", "Sex"] + lms_columns]

In [7]:
# Clean the HFA tables (boys -> "M", girls -> "F").
hfa_boys_clean, hfa_girls_clean = (
    clean_who_lms(hfa_boys, "M", "HFA"),
    clean_who_lms(hfa_girl, "F", "HFA"),
)

# Clean the WFA tables the same way.
wfa_boys_clean, wfa_girls_clean = (
    clean_who_lms(wfa_boys, "M", "WFA"),
    clean_who_lms(wfa_girls, "F", "WFA"),
)

In [8]:
# Stack the four cleaned tables and order them by Type, Sex and Month.
# The index is rebuilt after sorting so row labels follow the sorted order,
# the same way who_bmi_df is built.
who_lms_all = (
    pd.concat(
        [hfa_boys_clean, hfa_girls_clean, wfa_boys_clean, wfa_girls_clean],
        ignore_index=True,
    )
    .sort_values(by=["Type", "Sex", "Month"])
    .reset_index(drop=True)
)

In [9]:
# Write the combined HFA/WFA table to disk; index=False leaves the row index out of the CSV.
who_lms_all.to_csv("data/who_lms_all_clean.csv", index=False)

In [10]:
# Preview the first rows of the combined HFA/WFA table.
who_lms_all.head()

Unnamed: 0,Type,Sex,Month,L,M,S
0,HFA,1,61,1.0,110.2647,0.04164
1,HFA,1,62,1.0,110.8006,0.04172
2,HFA,1,63,1.0,111.3338,0.0418
3,HFA,1,64,1.0,111.8636,0.04187
4,HFA,1,65,1.0,112.3895,0.04195
