In [387]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import shap
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [386]:
# Read the raw Stata file from the project's data folder.
data = pd.read_stata(filepath_or_buffer="data/vietnam_finaldataset.dta")

In [334]:
# Wrap the loaded table in a DataFrame bound to the working name `df`.
df = pd.DataFrame(data=data)

In [335]:
# Preview the first five rows (df.info() / df.describe() are alternative views).
df.head(n=5)

Unnamed: 0,CHILDCODE,agemon1,aniany1,bcg1,bmi1,careage1,carecantread1,caredu1,caregiver_is_female1,caregiver_is_parent1,...,z_maths_raw_r5,z_read_raw_r5,STNPRSR5_r5,katz_norms_r5,z_selfefficacy_r5,z_agency_r5,z_selfsteem_r5,z_peersr5,z_pride_r5,z_relationparents_r5
0,10001,7,1,yes,14.922146,30,no,Grade 12,1.0,1.0,...,1.180432,-0.567188,8,0.501903,-0.902831,0.405941,-0.593782,-0.197115,1.0155,-1.140962
1,10002,14,1,yes,15.547168,25,no,"Post-secondary, vocational",1.0,1.0,...,0.547116,0.837693,7,-0.262727,-0.666351,-0.233022,0.613296,0.430333,-1.787409,-1.616398
2,10003,11,1,yes,15.734185,30,no,Grade 9,1.0,1.0,...,-0.561189,0.837693,4,0.254078,0.862961,1.068327,-0.134186,0.713549,0.29241,-0.155111
3,10004,12,1,yes,15.627442,36,no,Grade 6,1.0,1.0,...,0.388786,1.239088,6,-1.138824,-1.244954,1.017076,-0.926815,-0.699221,-1.034115,0.147575
4,10005,9,1,yes,17.101952,34,no,Grade 9,1.0,1.0,...,-1.402517,1.003388,5,-0.848556,-0.918531,-0.071417,0.376933,-0.179738,-1.097467,-0.820417


In [74]:
# for column in df.columns:
#     print(column)

# Clean data

In [336]:
# Remove rows that are exact duplicates, keeping the first occurrence of each.
df = df.drop_duplicates(keep="first")

In [337]:
# Column groups used to build the modelling frames.
# NOTE(review): the names read inverted relative to common usage — `dependent_vars`
# is what the notebook later assigns to X and `independent_vars` to y. The names are
# kept because other cells reference them.
dependent_vars = [
    "agemon1", "female1", "stunting1", "underweight1",
    "bcg1", "measles1", "tetanus1", "has_longterm_disease_r1",
    "bmi1", "careage1", "carecantread1", "caredu1",
    "caregiver_is_female1", "caregiver_is_parent1",
    "dadage1", "dadedu1", "dadlive1",
    "momage1", "momedu1", "momlive1",
    "numante1", "hhsize1", "headage1", "headedu1", "headsex1",
    "ownlandhse1", "typesite1",
    "cookingq1", "drwaterq1", "elecq1", "toiletq1",
    "aniany1", "sv1",
    "injury_child_may_die_r1", "sees_dad_daily_r1", "sees_mom_daily_r1",
    "health_worse_than_others_r1",
]
independent_vars = [
    'bmi5', 'stunting5', 'thinness5', 'chhealth5',
    'z_selfefficacy_r5', 'z_agency_r5', 'z_selfsteem_r5',
    'z_peersr5', 'z_pride_r5', 'z_relationparents_r5',
]

In [338]:
# Keep only the columns named in the two lists, in that order.
cols_to_keep = [*dependent_vars, *independent_vars]
df_filterd = df.loc[:, cols_to_keep]

In [339]:
# Keep complete rows only; df_filterd.isna().sum() shows per-column gaps beforehand.
df_filterd = df_filterd.dropna(axis=0, how="any")

In [340]:
# Split the filtered frame into two separate frames that later cells edit column by column.
X = df_filterd[dependent_vars].copy()
y = df_filterd[independent_vars].copy()

In [385]:
# Preview the first five rows of y.
y.head(n=5)

Unnamed: 0,bmi5,stunting5,thinness5,chhealth5,z_selfefficacy_r5,z_agency_r5,z_selfsteem_r5,z_peersr5,z_pride_r5,z_relationparents_r5
0,14.497697,moderately stunted,severely thin,average,-0.902831,0.405941,-0.593782,-0.197115,1.0155,-1.140962
1,19.965398,not stunted,not thin,average,-0.666351,-0.233022,0.613296,0.430333,-1.787409,-1.616398
2,17.469445,not stunted,not thin,average,0.862961,1.068327,-0.134186,0.713549,0.29241,-0.155111
3,19.086428,moderately stunted,not thin,average,-1.244954,1.017076,-0.926815,-0.699221,-1.034115,0.147575
6,21.771661,not stunted,not thin,average,0.383515,0.405941,0.299607,-0.197115,-0.407526,0.421651


In [384]:
# X.head()
# X.info()

In [381]:
# for i in X.columns:
#     print(i)

# Clean features

In [342]:
# Ordinal codes for the stunting1 labels: position in the tuple is the code.
stunting_mapping = {
    label: code
    for code, label in enumerate(
        ("not stunted", "moderately stunted", "severely stunted")
    )
}
X["stunting1"] = X["stunting1"].map(stunting_mapping)

# Show the distinct codes present after mapping.
X["stunting1"].unique()

[0, 1, 2]
Categories (3, int64): [0 < 1 < 2]

In [343]:
# Ordinal codes for the underweight1 labels: position in the tuple is the code.
underweight_mapping = {
    label: code
    for code, label in enumerate(
        ("not underweight", "moderately underweight", "severely underweight")
    )
}
X["underweight1"] = X["underweight1"].map(underweight_mapping)

# Show the distinct codes present after mapping.
X["underweight1"].unique()

[0, 1, 2]
Categories (3, int64): [0 < 1 < 2]

In [344]:
 # Binary encoding of bcg1: "no" -> 0, "yes" -> 1.
 bcg_mapping = dict(no=0, yes=1)
 X["bcg1"] = X["bcg1"].map(bcg_mapping)

 # Show the distinct codes present after mapping.
 X["bcg1"].unique()

[1, 0]
Categories (2, int64): [0 < 1]

In [345]:
 # Binary encoding of measles1: "no" -> 0, "yes" -> 1 (the mapping name is reused from the bcg1 cell).
 bcg_mapping = dict(no=0, yes=1)
 X["measles1"] = X["measles1"].map(bcg_mapping)

 # Show the distinct codes present after mapping.
 X["measles1"].unique()

[0, 1]
Categories (2, int64): [0 < 1]

In [346]:
 # Binary encoding of tetanus1: "no" -> 0, "yes" -> 1.
 bcg_mapping = dict(no=0, yes=1)
 X["tetanus1"] = X["tetanus1"].map(bcg_mapping)

 # Show the distinct codes present after mapping.
 X["tetanus1"].unique()

[1, 0]
Categories (2, int64): [0 < 1]

In [347]:
 # Binary encoding of has_longterm_disease_r1. The negative level is keyed by the
 # integer 0 (not a string) and the positive level by "Yes".
 # NOTE(review): presumably this mirrors how the source file labels the column — confirm.
 bcg_mapping = dict([(0, 0), ("Yes", 1)])
 X["has_longterm_disease_r1"] = X["has_longterm_disease_r1"].map(bcg_mapping)

 # Show the distinct codes present after mapping.
 X["has_longterm_disease_r1"].unique()

[0, 1]
Categories (2, int64): [0 < 1]

In [348]:
 # Binary encoding of carecantread1: "no" -> 0, "yes" -> 1.
 bcg_mapping = dict(no=0, yes=1)
 X["carecantread1"] = X["carecantread1"].map(bcg_mapping)

 # Show the distinct codes present after mapping.
 X["carecantread1"].unique()

[0, 1]
Categories (2, int64): [0 < 1]

In [349]:
# Ordinal codes for the caredu1 education labels:
# 'None' -> 0, 'Grade k' -> k for k = 1..12, then 13 and 14 for the two remaining labels.
bcg_mapping = {
    'None': 0,
    **{f'Grade {grade}': grade for grade in range(1, 13)},
    'Post-secondary, vocational': 13,
    'University': 14,
}
X['caredu1'] = X['caredu1'].map(bcg_mapping)

# Show the distinct codes present after mapping.
X["caredu1"].unique()


[12, 13, 9, 6, 4, ..., 10, 3, 2, 14, 1]
Length: 15
Categories (15, int64): [0 < 1 < 2 < 3 ... 11 < 12 < 13 < 14]

In [350]:
# Ordinal codes for the dadedu1 education labels:
# 'None' -> 0, 'Grade k' -> k for k = 1..12, then 13 and 14 for the two remaining labels.
bcg_mapping = {
    'None': 0,
    **{f'Grade {grade}': grade for grade in range(1, 13)},
    'Post-secondary, vocational': 13,
    'University': 14,
}
X['dadedu1'] = X['dadedu1'].map(bcg_mapping)

# Show the distinct codes present after mapping.
X["dadedu1"].unique()

[11, 12, 7, 6, 9, ..., 10, 5, 2, 1, 14]
Length: 15
Categories (15, int64): [0 < 1 < 2 < 3 ... 11 < 12 < 13 < 14]

In [351]:
# Integer codes for dadlive1: position in the tuple is the code
# (0 = not in household, 1 = in household, 2 = has died).
bcg_mapping = dict(
    zip(
        ('Does not live in household', 'Lives in the household', 'Has died'),
        range(3),
    )
)
X["dadlive1"] = X['dadlive1'].map(bcg_mapping)

# Show the distinct codes present after mapping.
X["dadlive1"].unique()

[1]
Categories (3, int64): [1 < 0 < 2]

In [352]:
# Ordinal codes for the momedu1 education labels:
# 'None' -> 0, 'Grade k' -> k for k = 1..12, then 13 and 14 for the two remaining labels.
bcg_mapping = {
    'None': 0,
    **{f'Grade {grade}': grade for grade in range(1, 13)},
    'Post-secondary, vocational': 13,
    'University': 14,
}
X['momedu1'] = X['momedu1'].map(bcg_mapping)

# Show the distinct codes present after mapping.
X["momedu1"].unique()

[12, 13, 9, 6, 4, ..., 10, 3, 2, 14, 1]
Length: 15
Categories (15, int64): [0 < 1 < 2 < 3 ... 11 < 12 < 13 < 14]

In [353]:
# Integer codes for momlive1: position in the tuple is the code
# (0 = not in household, 1 = in household, 2 = has died).
bcg_mapping = dict(
    zip(
        ('Does not live in household', 'Lives in the household', 'Has died'),
        range(3),
    )
)
X["momlive1"] = X['momlive1'].map(bcg_mapping)

# Show the distinct codes present after mapping.
X["momlive1"].unique()

[1]
Categories (2, int64): [1 < 0]

In [354]:
# Ordinal codes for the headedu1 education labels:
# 'None' -> 0, 'Grade k' -> k for k = 1..12, then 13 and 14 for the two remaining labels.
bcg_mapping = {
    'None': 0,
    **{f'Grade {grade}': grade for grade in range(1, 13)},
    'Post-secondary, vocational': 13,
    'University': 14,
}
X['headedu1'] = X['headedu1'].map(bcg_mapping)

# Show the distinct codes present after mapping.
X["headedu1"].unique()

[11, 2, 7, 6, 9, ..., 4, 14, 10, 13, 1]
Length: 15
Categories (15, int64): [0 < 1 < 2 < 3 ... 11 < 12 < 13 < 14]

In [355]:
# Binary encoding of headsex1: 'male' -> 1, 'female' -> 0.
bcg_mapping = dict(male=1, female=0)
X['headsex1'] = X['headsex1'].map(bcg_mapping)

# Show the distinct codes present after mapping.
X["headsex1"].unique()

[1, 0]
Categories (2, int64): [1 < 0]

In [356]:
 # Binary encoding of ownlandhse1: "no" -> 0, "yes" -> 1.
 bcg_mapping = dict(no=0, yes=1)
 X['ownlandhse1'] = X['ownlandhse1'].map(bcg_mapping)

 # Show the distinct codes present after mapping.
 X["ownlandhse1"].unique()

[1, 0]
Categories (2, int64): [0 < 1]

In [357]:
 # Binary encoding of typesite1: "rural" -> 0, "urban" -> 1.
 bcg_mapping = dict(rural=0, urban=1)
 X['typesite1'] = X['typesite1'].map(bcg_mapping)

 # Show the distinct codes present after mapping.
 X["typesite1"].unique()

[0, 1]
Categories (2, int64): [1 < 0]

In [358]:
 # Binary encoding of cookingq1: "No" -> 0, "Yes" -> 1 (capitalised labels in this column).
 bcg_mapping = dict(No=0, Yes=1)
 X['cookingq1'] = X['cookingq1'].map(bcg_mapping)

 # Show the distinct codes present after mapping.
 X["cookingq1"].unique()

[0, 1]
Categories (2, int64): [0 < 1]

In [359]:
 # Binary encoding of drwaterq1: "No" -> 0, "Yes" -> 1.
 bcg_mapping = dict(No=0, Yes=1)
 X['drwaterq1'] = X['drwaterq1'].map(bcg_mapping)

 # Show the distinct codes present after mapping.
 X["drwaterq1"].unique()

[0, 1]
Categories (2, int64): [0 < 1]

In [360]:
 # Binary encoding of elecq1: "No" -> 0, "Yes" -> 1.
 bcg_mapping = dict(No=0, Yes=1)
 X['elecq1'] = X['elecq1'].map(bcg_mapping)

 # Show the distinct codes present after mapping.
 X["elecq1"].unique()

[1, 0]
Categories (2, int64): [0 < 1]

In [361]:
 # Binary encoding of toiletq1: "No" -> 0, "Yes" -> 1.
 bcg_mapping = dict(No=0, Yes=1)
 X['toiletq1'] = X['toiletq1'].map(bcg_mapping)

 # Show the distinct codes present after mapping.
 X["toiletq1"].unique()

[1, 0]
Categories (2, int64): [0 < 1]

In [None]:
# Column names noted for inspection/encoding (presumably a to-do list — confirm).
# Kept as comments: written as bare expressions these are undefined variables,
# so the cell raised NameError when the notebook was run top to bottom.
#   injury_child_may_die_r1
#   sees_dad_daily_r1
#   sees_mom_daily_r1
#   health_worse_than_others_r1

In [362]:
 # Binary encoding of injury_child_may_die_r1. The negative level is keyed by the
 # integer 0 (not a string) and the positive level by "Yes".
 bcg_mapping = dict([(0, 0), ("Yes", 1)])
 X["injury_child_may_die_r1"] = X["injury_child_may_die_r1"].map(bcg_mapping)

 # Show the distinct codes present after mapping. The same check can be run on
 # sees_dad_daily_r1, sees_mom_daily_r1 and health_worse_than_others_r1.
 X["injury_child_may_die_r1"].unique()

[0, 1]
Categories (2, int64): [0 < 1]

In [388]:
# Report which columns of X still carry the pandas 'category' dtype.
category_columns = X.dtypes.loc[X.dtypes == 'category']
print("Columns dtype 'category':", category_columns, sep="\n")

Columns dtype 'category':
Series([], dtype: object)


In [377]:
# X['measles1'] = X['measles1'].astype(int)
# X['tetanus1'] = X['tetanus1'].astype(int)
# X['has_longterm_disease_r1'] = X['has_longterm_disease_r1'].astype(int)
# X['carecantread1'] = X['carecantread1'].astype(int)
# X['caredu1'] = X['caredu1'].astype(int)
# X['dadage1'] = X['dadage1'].astype(int)
# X['dadedu1'] = X['dadedu1'].astype(int)
# X['dadlive1'] = X['dadlive1'].astype(int)
# X['momage1'] = X['momage1'].astype(int)
# X['momedu1'] = X['momedu1'].astype(int)
# X['measles1'] = X['measles1'].astype(int)
# X['momlive1'] = X['momlive1'].astype(int)

In [380]:
# X['numante1'] = X['numante1'].astype(int)
# X['headedu1'] = X['headedu1'].astype(int)
# X['headsex1'] = X['headsex1'].astype(int)
# X['ownlandhse1'] = X['ownlandhse1'].astype(int)
# X['typesite1'] = X['typesite1'].astype(int)
# X['cookingq1'] = X['cookingq1'].astype(int)
# X['drwaterq1'] = X['drwaterq1'].astype(int)
# X['elecq1'] = X['elecq1'].astype(int)
# X['toiletq1'] = X['toiletq1'].astype(int)
# X['injury_child_may_die_r1'] = X['injury_child_may_die_r1'].astype(int)


# Clean round 5 columns

In [392]:
# Ordinal codes for the stunting5 labels: position in the tuple is the code.
stunting_mapping = {
    label: code
    for code, label in enumerate(
        ("not stunted", "moderately stunted", "severely stunted")
    )
}
y["stunting5"] = y["stunting5"].map(stunting_mapping)

# Show the distinct codes present after mapping.
y["stunting5"].unique()

[1, 0, 2]
Categories (3, int64): [0 < 1 < 2]

In [398]:
# Ordinal codes for the chhealth5 labels, from "very poor" (0) to "very good" (4).
# NOTE(review): the variable is still called `stunting_mapping` although it now
# holds health labels; the name is kept because it is a notebook-level variable.
stunting_mapping = {
    label: code
    for code, label in enumerate(
        ("very poor", "poor", "average", "good", "very good")
    )
}
y["chhealth5"] = y["chhealth5"].map(stunting_mapping)

# Show the distinct codes present after mapping.
y["chhealth5"].unique()

[2, 3, 1, 4, 0]
Categories (5, int64): [0 < 1 < 2 < 3 < 4]

In [402]:
# Report which columns of y still carry the pandas 'category' dtype.
category_columns = y.dtypes.loc[y.dtypes == 'category']
print("Columns dtype 'category':", category_columns, sep="\n")

Columns dtype 'category':
Series([], dtype: object)


In [401]:
# Cast the encoded target columns to plain integers.
#
# No earlier cell encodes thinness5, and the preview of y shows it holding text
# labels, so casting it directly to int fails on a fresh run. It is therefore
# encoded here first, using the same ordinal scheme as the stunting columns.
# NOTE(review): only "not thin" and "severely thin" are visible in the preview;
# "moderately thin" is assumed by analogy with the stunting labels — confirm.
thinness_mapping = {
    "not thin": 0,
    "moderately thin": 1,
    "severely thin": 2,
}

# Encode only while text labels are still present, so re-running the cell does
# not map already-encoded integers to NaN.
if y["thinness5"].isin(list(thinness_mapping)).any():
    unmapped_labels = set(y["thinness5"].dropna().unique()) - set(thinness_mapping)
    if unmapped_labels:
        # Fail loudly instead of letting .map() silently turn unknown labels into NaN.
        raise ValueError(
            f"Unmapped thinness5 labels: {sorted(map(str, unmapped_labels))}"
        )
    y["thinness5"] = y["thinness5"].map(thinness_mapping)

y['stunting5'] = y['stunting5'].astype(int)
y['thinness5'] = y['thinness5'].astype(int)
y['chhealth5'] = y['chhealth5'].astype(int)

In [406]:
# Write the cleaned X frame to CSV without the index column.
X.to_csv(path_or_buf='data/X_cleaned.csv', index=False)

In [407]:
# Write the cleaned y frame to CSV without the index column.
y.to_csv(path_or_buf='data/y_cleaned.csv', index=False)