# Data Preprocessing - DHIS2 Data - Child Health


## Packages

In [215]:
import os
import glob
import numpy as np
import pandas as pd
import collections

## Main

In [216]:
os.getcwd()
os.chdir('/Users/edinhamzic/Symphony/wb_bangladesh/')
os.getcwd()

'/Users/edinhamzic/Symphony/wb_bangladesh'

In [217]:
def list_files(path, pattern):
    path_all = os.path.join(path, pattern)
    file_names = os.listdir(path)
    abs_file_names = glob.glob(path_all, recursive=True)
    file_names = [os.path.split(glob.glob(file_path)[0])[1] for file_path in abs_file_names]
    data_collection = [file_name.split("_")[0] for file_name in file_names]
    year = [file_name.split("_")[1] for file_name in file_names]
    geo_level = [file_name.split("_")[2] for file_name in file_names]
    file_type = [file_name.split("_")[3][:-4] for file_name in file_names]
    files_df = pd.DataFrame.from_dict({'file_names': file_names, 
                                       'data_collection': data_collection,
                                       'year': year,
                                       'geo_level': geo_level,
                                       'file_type': file_type ,
                                       'abs_file_names': abs_file_names
                                      })

    return files_df

def subset_paths(input_df, years, geo_levels, file_types):
    df = input_df.copy(deep=True)
    years_subset = [str(year) in years for year in df['year']]
    geos_subset = [str(geo) in geo_levels for geo in df['geo_level']]
    filetype_subset = [str(file_type) in file_types for file_type in df['file_type']]
    df = df.loc[np.array(years_subset) & np.array(geos_subset) & np.array(filetype_subset)] 
    return df

def read_transform(input_df, year_var, file):
    df = input_df.copy(deep=True)
    data_dict = {}
    for i in df.index:
        tmp = pd.read_csv(df[file][i])
        tmp = tmp.pivot_table(values='Value',index='Organisation unit', columns='Data')
        data_dict[df[year_var][i]] = tmp
        print(f"################################# {df[year_var][i]} ###################################")
        print(round(tmp.isna().sum() / tmp.shape[0]*100,2))
        print(np.mean(round(tmp.isna().sum() / tmp.shape[0]*100,2)))
        print(sum(tmp.isna().sum()) / (tmp.shape[0]*tmp.shape[1])*100,2)    
    return data_dict

def select_variables(input_data_dict, pattern):
    data_dict = input_data_dict.copy()
    for key, value in data_dict.items():
        print([variable for variable in data_dict[key].columns if pattern in variable])
        data_dict[key] = data_dict[key][[variable for variable in data_dict[key].columns if pattern in variable]]
    return data_dict

In [218]:
childhealth = list_files(path='data/dhis2/health_indicators/', pattern='02ChildHealth_*')
childhealth = subset_paths(input_df=childhealth, years=[str(year) for year in range(2009, 2019)], geo_levels=['District'], file_types=['NAME'])
childhealth = read_transform(input_df=childhealth, year_var='year', file='abs_file_names')


################################# 2018 ###################################
Data
% of breast feeding initiated within 1 hour by CSBA    0.00
% of neonatal death reported individually              3.12
IMCI Stunting (%)                                      0.00
IMCI Total Child                                       0.00
IMCI Underweight (%)                                   0.00
IMCI Wasting (%)                                       0.00
Neonatal Case fatality rate (EmOC)                     0.00
Percentage of diarrhea reported at facility            0.00
Percentage of pneumonia reported at facility           0.00
dtype: float64
0.3466666666666667
0.3472222222222222 2
################################# 2010 ###################################
Data
% of breast feeding initiated within 1 hour by CSBA    0.00
% of neonatal death reported individually              4.69
IMCI Stunting (%)                                      0.00
IMCI Total Child                                       0.00
IMCI 

In [225]:
childhealth = select_variables(input_data_dict=childhealth, pattern='IMCI')
childhealth['2018']['IMCI Stunting (%)'].mean()

['IMCI Stunting (%)', 'IMCI Total Child', 'IMCI Underweight (%)', 'IMCI Wasting (%)']
['IMCI Stunting (%)', 'IMCI Total Child', 'IMCI Underweight (%)', 'IMCI Wasting (%)']
['IMCI Stunting (%)', 'IMCI Total Child', 'IMCI Underweight (%)', 'IMCI Wasting (%)']
['IMCI Stunting (%)', 'IMCI Total Child', 'IMCI Underweight (%)', 'IMCI Wasting (%)']
['IMCI Stunting (%)', 'IMCI Total Child', 'IMCI Underweight (%)', 'IMCI Wasting (%)']
['IMCI Stunting (%)', 'IMCI Total Child', 'IMCI Underweight (%)', 'IMCI Wasting (%)']
['IMCI Stunting (%)', 'IMCI Total Child', 'IMCI Underweight (%)', 'IMCI Wasting (%)']
['IMCI Stunting (%)', 'IMCI Total Child', 'IMCI Underweight (%)', 'IMCI Wasting (%)']
['IMCI Stunting (%)', 'IMCI Total Child', 'IMCI Underweight (%)', 'IMCI Wasting (%)']
['IMCI Stunting (%)', 'IMCI Total Child', 'IMCI Underweight (%)', 'IMCI Wasting (%)']


1.7557812500000007

In [227]:
for year in range(2009,2019):
    print(f"####################### {year} #################################")
    print(childhealth[str(year)]['IMCI Total Child'].sum())
    print(childhealth[str(year)]['IMCI Underweight (%)'].mean())
    print(childhealth[str(year)]['IMCI Wasting (%)'].mean())

####################### 2009 #################################
6555935.0
3.1193750000000007
1.1990625
####################### 2010 #################################
6565822.0
3.1203125000000003
1.20046875
####################### 2011 #################################
6834903.0
3.0075000000000007
1.1653125
####################### 2012 #################################
11096955.0
1.9642187500000001
0.7559375000000002
####################### 2013 #################################
11529608.0
1.8726562500000001
0.7181250000000002
####################### 2014 #################################
11959548.0
2.5771875
0.8470312500000002
####################### 2015 #################################
12299909.0
2.5132812499999995
0.8503125000000001
####################### 2016 #################################
12659779.0
2.869375
0.9248437500000001
####################### 2017 #################################
13769386.0
3.01328125
1.1085937500000003
####################### 2018 ###################