### **Import Libraries**

In [None]:
import pandas as pd
import numpy as np

### **Merging Datasets**

In [None]:
# Load the two raw source tables that are merged below: df_A comes from the
# stunting/wasting file, df_B from the "balita" file.
df_A, df_B = (
    pd.read_csv(csv_name)
    for csv_name in ('stunting_wasting_dataset.csv', 'data_balita.csv')
)

In [None]:
# Preview the first five rows of dataset A to see its columns and label style.
df_A.head(n=5)

Unnamed: 0,Jenis Kelamin,Umur (bulan),Tinggi Badan (cm),Berat Badan (kg),Stunting,Wasting
0,Laki-laki,19,91.6,13.3,Tall,Risk of Overweight
1,Laki-laki,20,77.7,8.5,Stunted,Underweight
2,Laki-laki,10,79.0,10.3,Normal,Risk of Overweight
3,Perempuan,2,50.3,8.3,Severely Stunted,Risk of Overweight
4,Perempuan,5,56.4,10.9,Severely Stunted,Risk of Overweight


In [None]:
# Preview the first five rows of dataset B to compare its layout with dataset A.
df_B.head(n=5)

Unnamed: 0,Umur (bulan),Jenis Kelamin,Tinggi Badan (cm),Status Gizi
0,0,laki-laki,44.591973,stunted
1,0,laki-laki,56.705203,tinggi
2,0,laki-laki,46.863358,normal
3,0,laki-laki,47.508026,normal
4,0,laki-laki,42.743494,severely stunted


In [None]:
# Print dataset A's schema summary: row count, column dtypes and non-null counts.
df_A.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Jenis Kelamin      100000 non-null  object 
 1   Umur (bulan)       100000 non-null  int64  
 2   Tinggi Badan (cm)  100000 non-null  float64
 3   Berat Badan (kg)   100000 non-null  float64
 4   Stunting           100000 non-null  object 
 5   Wasting            100000 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 4.6+ MB


In [None]:
# Print dataset B's schema summary: row count, column dtypes and non-null counts.
df_B.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120999 entries, 0 to 120998
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Umur (bulan)       120999 non-null  int64  
 1   Jenis Kelamin      120999 non-null  object 
 2   Tinggi Badan (cm)  120999 non-null  float64
 3   Status Gizi        120999 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 3.7+ MB


In [None]:
# Reduce dataset A to the columns it has in common with dataset B by removing
# the weight measurement and the wasting label.
df_A = df_A.drop(['Berat Badan (kg)', 'Wasting'], axis=1)
df_A.head(n=5)

Unnamed: 0,Jenis Kelamin,Umur (bulan),Tinggi Badan (cm),Stunting
0,Laki-laki,19,91.6,Tall
1,Laki-laki,20,77.7,Stunted
2,Laki-laki,10,79.0,Normal
3,Perempuan,2,50.3,Severely Stunted
4,Perempuan,5,56.4,Severely Stunted


In [None]:
# Align dataset A with dataset B's schema: the stunting label takes the shared
# column name 'Status Gizi', and both text columns are lower-cased and trimmed.
df_A = (
    df_A
    .rename(columns={'Stunting': 'Status Gizi'})
    .assign(**{
        'Jenis Kelamin': lambda frame: frame['Jenis Kelamin'].str.lower().str.strip(),
        'Status Gizi': lambda frame: frame['Status Gizi'].str.lower().str.strip(),
    })
)
df_A.head(n=5)

Unnamed: 0,Jenis Kelamin,Umur (bulan),Tinggi Badan (cm),Status Gizi
0,laki-laki,19,91.6,tall
1,laki-laki,20,77.7,stunted
2,laki-laki,10,79.0,normal
3,perempuan,2,50.3,severely stunted
4,perempuan,5,56.4,severely stunted


In [None]:
# Bring dataset B's labels into the same vocabulary as dataset A.
# Translation table from dataset B's label spelling to the shared English labels.
mapping = {
    "tinggi": "tall",
    "normal": "normal",
    "stunted": "stunted",
    "severely stunted": "severely stunted"
}
# Apply the same lower-case + trim normalisation that dataset A receives, so
# that case or whitespace variants (e.g. "Tinggi", " tinggi") are still
# translated instead of slipping through the mapping untouched.
df_B["Jenis Kelamin"] = df_B["Jenis Kelamin"].str.lower().str.strip()
df_B["Status Gizi"] = df_B["Status Gizi"].str.lower().str.strip()
# Labels that are not keys of `mapping` keep their (normalised) value:
# map() yields NaN for them and fillna() restores the existing label.
df_B["Status Gizi"] = df_B["Status Gizi"].map(mapping).fillna(df_B["Status Gizi"])
df_B.head()

Unnamed: 0,Umur (bulan),Jenis Kelamin,Tinggi Badan (cm),Status Gizi
0,0,laki-laki,44.591973,stunted
1,0,laki-laki,56.705203,tall
2,0,laki-laki,46.863358,normal
3,0,laki-laki,47.508026,normal
4,0,laki-laki,42.743494,severely stunted


In [None]:
# Stack dataset A on top of dataset B into one frame with a fresh 0..n-1 index.
df = pd.concat((df_A, df_B), axis=0, ignore_index=True)
df.head(n=5)

Unnamed: 0,Jenis Kelamin,Umur (bulan),Tinggi Badan (cm),Status Gizi
0,laki-laki,19,91.6,tall
1,laki-laki,20,77.7,stunted
2,laki-laki,10,79.0,normal
3,perempuan,2,50.3,severely stunted
4,perempuan,5,56.4,severely stunted


In [None]:
# Label distribution of the merged data (before de-duplication).
print(df.loc[:, 'Status Gizi'].value_counts())

Status Gizi
normal              140067
stunted              29975
severely stunted     25688
tall                 25269
Name: count, dtype: int64


In [None]:
# Number of rows per age in months in the merged data (before de-duplication).
print(df.loc[:, 'Umur (bulan)'].value_counts())

Umur (bulan)
12    6148
4     6115
5     6093
22    6075
15    6060
      ... 
56    2000
57    2000
58    2000
59    2000
60    2000
Name: count, Length: 61, dtype: int64


In [None]:
# Count missing values per column of the merged frame.
df.isna().sum()

Unnamed: 0,0
Jenis Kelamin,0
Umur (bulan),0
Tinggi Badan (cm),0
Status Gizi,0


In [None]:
# Count rows that repeat an earlier row in every column.
df.duplicated(keep='first').sum()

np.int64(180244)

In [None]:
# Keep only the first occurrence of each fully identical row, then confirm
# that no duplicates remain.
# NOTE(review): rows carry only sex, age, height and label, so identical rows
# may be distinct children rather than repeated records — confirm that
# removing them is intended, since it changes the class and age distribution.
df = df.drop_duplicates(keep='first')
df.duplicated(keep='first').sum()

np.int64(0)

In [None]:
# Label distribution after duplicate rows were removed.
print(df.loc[:, 'Status Gizi'].value_counts())

Status Gizi
normal              22374
tall                 7052
severely stunted     6599
stunted              4730
Name: count, dtype: int64


In [None]:
# Number of rows per age in months after duplicate rows were removed.
print(df.loc[:, 'Umur (bulan)'].value_counts())

Umur (bulan)
2     2284
1     2276
0     2266
59     691
57     682
      ... 
11     493
13     492
15     483
12     471
9      427
Name: count, Length: 61, dtype: int64


In [None]:
# Order rows by sex, then by age (both ascending), and renumber the index
# from zero.
df = df.sort_values(["Jenis Kelamin", "Umur (bulan)"], ignore_index=True)
df.head(n=5)

Unnamed: 0,Jenis Kelamin,Umur (bulan),Tinggi Badan (cm),Status Gizi
0,laki-laki,0,46.3,normal
1,laki-laki,0,50.5,normal
2,laki-laki,0,45.8,stunted
3,laki-laki,0,49.1,normal
4,laki-laki,0,55.0,normal


### **Data Validation**

In [None]:
# Reference tables used for the z-score computation, one per sex and age band.
# Presumably the WHO length/height-for-age LMS tables, judging by the file
# names — confirm the source and version of these spreadsheets.
(
    who_boys_zero_to_two,
    who_boys_two_to_five,
    who_girls_zero_to_two,
    who_girls_two_to_five,
) = map(
    pd.read_excel,
    (
        "lhfa_boys_0-to-2-years_zscores.xlsx",
        "lhfa_boys_2-to-5-years_zscores.xlsx",
        "lhfa_girls_0-to-2-years_zscores.xlsx",
        "lhfa_girls_2-to-5-years_zscores.xlsx",
    ),
)

In [None]:
def zscore_who(height, age_months, sex):
    """Compute a height-for-age z-score from the reference tables (LMS method).

    Parameters
    ----------
    height : float
        Measured length/height, in the same unit as the tables' ``M`` column
        (presumably centimetres — confirm against the reference files).
    age_months : int
        Age in months. Ages up to and including 24 are looked up in the
        0-to-2-years table, all other ages in the 2-to-5-years table.
    sex : str
        ``'laki-laki'`` (boys' tables) or ``'perempuan'`` (girls' tables).

    Returns
    -------
    float
        The z-score of ``height`` for the given age and sex.

    Raises
    ------
    ValueError
        If ``sex`` is not one of the two recognised values, or if the selected
        table has no row whose ``Month`` equals ``age_months``.
    """
    use_zero_to_two = age_months <= 24
    if sex == 'laki-laki':
        table = who_boys_zero_to_two if use_zero_to_two else who_boys_two_to_five
    elif sex == 'perempuan':
        table = who_girls_zero_to_two if use_zero_to_two else who_girls_two_to_five
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unknown sex {sex!r}; expected 'laki-laki' or 'perempuan'"
        )

    matches = table[table['Month'] == age_months]
    if matches.empty:
        # Covers out-of-range, fractional and NaN ages.
        raise ValueError(
            f"No reference row for age {age_months!r} months (sex={sex!r})"
        )
    row = matches.iloc[0]

    L, M, S = row['L'], row['M'], row['S']

    # LMS formula: z = ((height / M) ** L - 1) / (L * S),
    # with the logarithmic form used when L is (numerically) zero.
    if np.isclose(L, 0):
        z = np.log(height / M) / S
    else:
        z = (((height / M) ** L) - 1) / (L * S)
    return z

# Compute one z-score per row by passing height, age and sex to zscore_who.
df['Z-score WHO'] = list(map(
    zscore_who,
    df['Tinggi Badan (cm)'],
    df['Umur (bulan)'],
    df['Jenis Kelamin'],
))

def classify_stunting(z):
    """Map a height-for-age z-score to a nutritional-status label.

    Parameters
    ----------
    z : float
        Height-for-age z-score.

    Returns
    -------
    str or float
        ``'severely stunted'`` for ``z < -3``, ``'stunted'`` for
        ``-3 <= z < -2``, ``'tall'`` for ``z > 3`` and ``'normal'`` otherwise.
        A missing z-score returns ``np.nan`` so it stays visible as missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing z-score would fall through to the 'normal' branch.
    if pd.isna(z):
        return np.nan
    if z < -3:
        return 'severely stunted'
    if z < -2:
        return 'stunted'
    if z > 3:
        return 'tall'
    return 'normal'

In [None]:
# Derive the label from each row's z-score using classify_stunting.
df['Status Gizi WHO'] = df['Z-score WHO'].map(classify_stunting)
df.head(n=5)

Unnamed: 0,Jenis Kelamin,Umur (bulan),Tinggi Badan (cm),Status Gizi,Z-score WHO,Status Gizi WHO
0,laki-laki,0,46.3,normal,-1.893291,normal
1,laki-laki,0,50.5,normal,0.325286,normal
2,laki-laki,0,45.8,stunted,-2.157408,stunted
3,laki-laki,0,49.1,normal,-0.41424,normal
4,laki-laki,0,55.0,normal,2.702332,normal


In [None]:
# Flag rows whose original label agrees with the label derived from the z-score.
df["label_match"] = df["Status Gizi"].eq(df["Status Gizi WHO"])

# Collect the disagreeing rows and report how many there are; the printed
# message reads "Number of rows that do not match the WHO standard".
inconsistent = df.loc[~df["label_match"]]
print("Jumlah data yang tidak sesuai dengan standar WHO:", len(inconsistent))
inconsistent.head(n=5)

Jumlah data yang tidak sesuai dengan standar WHO: 353


Unnamed: 0,Jenis Kelamin,Umur (bulan),Tinggi Badan (cm),Status Gizi,Z-score WHO,Status Gizi WHO,label_match
12,laki-laki,0,44.2,stunted,-3.00258,severely stunted,False
98,laki-laki,0,55.6,normal,3.019272,tall,False
159,laki-laki,0,55.583024,normal,3.010305,tall,False
315,laki-laki,0,55.586373,normal,3.012074,tall,False
1096,laki-laki,0,55.59028,normal,3.014138,tall,False


In [None]:
# Discard the original label and the comparison flag, leaving the z-score
# derived label as the only status column.
df = df.drop(['Status Gizi', 'label_match'], axis=1)
df.head(n=5)

Unnamed: 0,Jenis Kelamin,Umur (bulan),Tinggi Badan (cm),Z-score WHO,Status Gizi WHO
0,laki-laki,0,46.3,-1.893291,normal
1,laki-laki,0,50.5,0.325286,normal
2,laki-laki,0,45.8,-2.157408,stunted
3,laki-laki,0,49.1,-0.41424,normal
4,laki-laki,0,55.0,2.702332,normal


In [None]:
# Write the final frame to CSV without the row index.
df.to_csv(path_or_buf="dataset_stunting.csv", index=False)