# NOTEBOOK

## Data sources:
- DHIS2 health indicators: 2011 and 2016
- DGFP health indicators: 2011 and 2016
- DHS raw variables: 2011 and 2014
- SVRS raw variables: 2012 and 2015
- CES indicators: 2011 and 2016

## Time points:
![Timpoints](timepoints.png)

In [176]:
import os
import re
import numpy as np
import pandas as pd
from collections import Counter
from fuzzywuzzy import fuzz

In [29]:
def intersect_dfs(input_df1, input_df2):
    df1 = input_df1.copy(deep=True)
    df2 = input_df2.copy(deep=True)
    subset_var = list(set(list(df1.columns)).intersection(set(list(df2.columns))))
    return df1[subset_var], df2[subset_var]

def read_ces(files_list, common=True):
    data_dict = {}
    for file in files_list:
        data_dict[file] = pd.read_csv(file, encoding='cp850')
        data_dict[file].rename(columns={'Survey.Units"':'geo'}, inplace=True)
        subset = [not bool(re.search(r"Division|Launch District|CC|KCC|RCC|DCC|SCC|CCC|BCC|Urban|Rural|CC Slum| Slum|National", geo)) for geo in data_dict[file]['Survey.Units']]
        print(Counter([not bool(re.search(r"Division|Launch District|CC|KCC|RCC|DCC|SCC|CCC|BCC|Urban|Rural|CC Slum| Slum|National", geo)) for geo in data_dict[file]['Survey.Units']]))
        data_dict[file] = data_dict[file].loc[subset,:]
        print(data_dict[file].shape)
    return data_dict

def match_districts(ref_df, ref_match, input_df, input_match):
    out = pd.DataFrame()
    for key, code in enumerate(input_df[input_match]):
        code_match = {}
        code_match['FuzzRatio'] = [fuzz.ratio(ref_code, code)  for ref_code in ref_df[ref_match]]
        code_match['Geo'] = code
        code_match['DistrictGeo'] = [value[0] for value in ref_df.values]
        code_match['DistrictName'] = [value[1]  for value in ref_df.values]
        code_match = pd.DataFrame.from_dict(code_match)
        out = out.append(code_match.sort_values('FuzzRatio', ascending=False).iloc[0,:])
    for var in list(out.columns):
        if out[var].dtype.kind == 'f':
            out[var] = out[var].astype(int)
            out[var] = out[var].astype(str)
            code_length = max([len(char) for char in out[var]])
            out[var] = out[var].str.pad(width=code_length, side='left', fillchar='0') 
    return out

## GEOS

```
DGFP_GEO = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/geos/dgfp_geo.csv'
dgfp_geo = pd.read_csv(DGFP_GEO)
dgfp_geo['zila'] = dgfp_geo['zila'].str.replace('Panchagarh', 'Panchagarh Zila')
dgfp_geo['zila_short'] = dgfp_geo['zila'].str.replace("Zila", "")
dgfp_geo['division_geo'] = dgfp_geo['division_geo'].astype(str).str.pad(width=2, side='left', fillchar='0')
dgfp_geo['zila_geo'] = dgfp_geo['zila_geo'].astype(str).str.pad(width=2, side='left', fillchar='0')
dgfp_geo['DistrictGeo'] = dgfp_geo['division_geo'].str.cat(dgfp_geo['zila_geo'], sep="")
geo =  dgfp_geo[['DistrictGeo', 'zila', 'zila_short']]
geo = geo.drop_duplicates()
geo.shape
```

## CES Data

In [163]:
CES2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2011.csv'
CES2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2016.csv'
ces_list = [CES2011, CES2016]
ces = read_ces(files_list=ces_list, common=True)
ces.keys()

Counter({True: 64, False: 20})
(64, 44)
Counter({True: 64, False: 24})
(64, 52)


dict_keys(['/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2011.csv', '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2016.csv'])

In [164]:
ces_2011, ces_2016 = intersect_dfs(input_df1=ces[ces_list[0]], input_df2=ces[ces_list[1]])
print(ces_2011.shape)
print(ces_2016.shape)

(64, 32)
(64, 32)


In [165]:
ces_2011['DistrictCode'] = ces_2011['DistrictCode'].astype(str).str.pad(width=2, side='left', fillchar='0')
ces_2011['DivisionCode'] = ces_2011['DivisionCode'].astype(str).str.pad(width=2, side='left', fillchar='0')
ces_2011['DistrictGeo'] = ces_2011['DivisionCode'].str.cat(ces_2011['DistrictCode'], sep="")
ces_2011 = ces_2011.drop(['DivisionName', 'DivisionCode', 'Geo', 'DistrictCode', 'Year', 'FuzzRatio', 'Survey.Units'], axis=1)
ces_2011.head()

Unnamed: 0,PENTA1_Children23M,PENTA2_Children23M,Fully_Children23M,TT1_Mother0-11MChildren,DistrictName,Measles_Children23M,VitACoverage_Children12-59M,PENTA3_Children12M,BCG_Children23M,TT2_Mother0-11MChildren,...,TT4_Mother0-11MChildren,OPV1_Children23M,Measles_Children12M,TT3_Mother0-11MChildren,Fully_Children12M,PENTA3_Children23M,OPV3_Children12M,PENTA1_Children12M,OPV2_Children12M,DistrictGeo
0,99.0,98.3,75.0,96.7,Brahmanbaria,84.5,88.1,83.3,99.0,95.7,...,71.9,99.0,81.3,86.7,72.6,84.1,92.0,98.3,97.5,2012
1,98.1,96.0,79.7,90.0,Bagerhat,88.1,96.7,85.6,98.1,86.2,...,52.9,98.1,86.7,71.4,78.3,85.6,93.3,98.1,96.0,4001
2,94.3,90.2,76.2,89.0,Bandarban,82.3,83.3,83.8,94.3,85.2,...,54.3,93.8,79.2,72.4,73.1,84.4,88.8,94.3,90.8,2003
3,100.0,98.6,79.9,99.0,Barguna,88.1,96.7,87.3,100.0,98.6,...,47.6,99.5,82.6,72.4,74.4,87.9,94.7,100.0,98.6,1004
4,98.6,98.1,77.4,97.1,Barisal,86.8,79.0,86.9,98.6,94.8,...,52.4,98.6,82.6,77.1,73.4,86.9,94.5,98.6,98.1,1006


In [166]:
ces_2016['DistrictCode'] = ces_2016['DistrictCode'].astype(str).str.pad(width=2, side='left', fillchar='0')
ces_2016['DivisionCode'] = ces_2016['DivisionCode'].astype(str).str.pad(width=2, side='left', fillchar='0')
ces_2016['DistrictGeo'] = ces_2016['DivisionCode'].str.cat(ces_2016['DistrictCode'], sep="")
ces_2016 = ces_2016.drop(['DivisionName', 'DivisionCode', 'Geo', 'DistrictCode', 'Year', 'FuzzRatio', 'Survey.Units'], axis=1)
ces_2016.head()

Unnamed: 0,PENTA1_Children23M,PENTA2_Children23M,Fully_Children23M,TT1_Mother0-11MChildren,DistrictName,Measles_Children23M,VitACoverage_Children12-59M,PENTA3_Children12M,BCG_Children23M,TT2_Mother0-11MChildren,...,TT4_Mother0-11MChildren,OPV1_Children23M,Measles_Children12M,TT3_Mother0-11MChildren,Fully_Children12M,PENTA3_Children23M,OPV3_Children12M,PENTA1_Children12M,OPV2_Children12M,DistrictGeo
0,97.4,97.1,88.3,98.1,Bagerhat,92.2,82.0,90.8,98.2,97.8,...,61.9,97.4,90.3,85.3,86.8,91.0,90.8,97.4,96.8,4001
1,96.3,94.8,83.9,94.8,Bandarban,89.8,84.1,87.8,99.0,92.2,...,79.6,96.3,86.3,86.1,80.9,87.8,87.8,96.3,94.8,2003
2,98.8,98.3,91.1,98.8,Barguna,94.9,96.8,93.0,99.7,97.4,...,64.4,98.8,91.0,88.4,87.6,93.6,93.0,98.8,97.9,1004
3,99.1,99.3,94.6,100.0,Barisal,97.1,100.0,95.5,99.7,100.0,...,79.3,99.1,93.1,96.7,91.0,96.0,95.5,99.1,99.3,1006
6,99.8,99.8,91.9,100.0,Bhola,96.6,98.4,94.5,99.8,100.0,...,79.0,99.8,95.4,94.1,91.3,94.5,94.5,99.8,99.8,1009


In [252]:
check_list = []
for var1, var2 in zip(sorted(geo2011['DistrictName']), sorted(geo2016['DistrictName'])):
    check_list.append(var1==var2)
print(all(check_list))
geo = ces_2011[['DistrictGeo', 'DistrictName']]
geo.head()

True


Unnamed: 0,DistrictGeo,DistrictName
0,2012,Brahmanbaria
1,4001,Bagerhat
2,2003,Bandarban
3,1004,Barguna
4,1006,Barisal


## SVRS Data 

In [219]:
SVRS2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/data_svrs_zila_2012.csv'
SVRS2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/data_svrs_zila_2015.csv'

In [220]:
svrs_2011 = pd.read_csv(SVRS2011)
svrs_2011['district'] = svrs_2011['district'].str.replace(' Zila', '')
display(svrs_2011.head())
svrs_2016 = pd.read_csv(SVRS2016)
display(svrs_2016.head())
svrs_2011, svrs_2016 = intersect_dfs(input_df1=svrs_2011, input_df2=svrs_2016)
print(svrs_2011.columns)
print(svrs_2016.columns)

Unnamed: 0,district,no_births,no_live_births,no_registered_births,prop_registered_births,no_deaths,no_deaths_rural,prop_deaths_rural,no_deaths_.5y,no_deaths_1.4y,...,no_married_..15y,prop_married_..15y,rate_live_births,rate_fertility,rate_death,rate_child_death,rate_under5y_mortality,rate_infant_mortality,rate_maternal_mortality,year
0,Bagerhat,145,143,10,0.07,49,38,0.78,4,1,...,6348,0.66,10.85,40.26,3.66,1.14,27.97,20.98,27.97,2012
1,Bandarban,216,213,27,0.12,39,22,0.56,12,1,...,5084,0.71,19.26,74.69,3.48,0.98,56.34,51.64,4.69,2012
2,Barguna,285,283,20,0.07,80,60,0.75,11,5,...,7165,0.75,20.55,78.02,5.77,5.03,38.87,21.2,3.53,2012
3,Barisal,383,379,52,0.14,118,80,0.68,20,1,...,9130,0.66,19.36,75.6,5.97,0.76,52.77,50.13,2.64,2012
4,Bhola,305,302,156,0.51,86,63,0.73,20,2,...,7723,0.69,18.19,73.89,5.13,1.52,66.23,59.6,9.93,2012


Unnamed: 0,district,no_births,no_live_births,no_registered_births,prop_registered_births,prop_attendant_delivery,no_deaths,no_deaths_rural,prop_deaths_rural,no_deaths_.5y,...,no_married_..15y,prop_married_..15y,rate_child_death,rate_death,year,rate_live_births,rate_fertility,rate_under5y_mortality,rate_infant_mortality,rate_maternal_mortality
0,Bagerhat,174,169,13,0.07,0.54,64,50,0.78,3,...,4840,0.73,0.0,6.9,2015,18.76,69.1,17.75,17.75,0.0
1,Bandarban,71,70,1,0.01,0.15,21,15,0.71,2,...,1622,0.67,0.0,5.79,2015,19.56,74.5,28.57,28.57,0.0
2,Barguna,127,126,4,0.03,0.5,45,36,0.8,4,...,4240,0.76,1.96,5.79,2015,16.34,59.07,31.75,23.81,7.94
3,Barisal,747,732,15,0.02,0.68,234,79,0.34,20,...,22857,0.71,2.01,5.15,2015,16.45,57.85,27.32,19.13,2.73
4,Bhola,362,353,63,0.17,0.29,64,60,0.94,11,...,7730,0.73,3.28,4.0,2015,22.63,89.43,31.16,19.83,0.0


Index(['pop_.15y', 'child_1.4y', 'no_deaths_1.4y', 'rate_maternal_mortality',
       'pop_15.19y', 'total_pop', 'year', 'no_deaths_.5y',
       'prop_registered_births', 'no_married_..15y', 'rate_child_death',
       'pop_.35y', 'prop_married_..15y', 'prop_pop_women', 'no_deaths_rural',
       'no_registered_births', 'rate_fertility', 'no_births',
       'women_15.45_men_..15y', 'rate_infant_mortality',
       'rate_under5y_mortality', 'child_0.5y', 'women_15.49y',
       'prop_deaths_rural', 'no_live_births', 'women_15.45y', 'no_deaths',
       'child_.5y', 'no_deaths_.1y', 'men_.15y', 'no_maternal_deaths',
       'rate_live_births', 'women_15.19y', 'rate_death', 'prop_pop_rural',
       'district'],
      dtype='object')
Index(['pop_.15y', 'child_1.4y', 'no_deaths_1.4y', 'rate_maternal_mortality',
       'pop_15.19y', 'total_pop', 'year', 'no_deaths_.5y',
       'prop_registered_births', 'no_married_..15y', 'rate_child_death',
       'pop_.35y', 'prop_married_..15y', 'prop_pop_women'

In [221]:
geo_svrs_2011 = match_districts(ref_df=geo, ref_match='DistrictName', input_df=svrs_2011, input_match='district')
geo_svrs_2016 = match_districts(ref_df=geo, ref_match='DistrictName', input_df=svrs_2016, input_match='district')

In [222]:
print(svrs_2011.shape)
print(svrs_2016.shape)
svrs_2011 = svrs_2011.merge(geo_svrs_2011, how='left', left_on='district', right_on='Geo')
svrs_2016 = svrs_2016.merge(geo_svrs_2016, how='left', left_on='district', right_on='Geo')
svrs_2011 = svrs_2011.drop(['FuzzRatio','Geo', 'district', 'year'], axis=1)
svrs_2016 = svrs_2016.drop(['FuzzRatio','Geo', 'district', 'year'], axis=1)
print(svrs_2011.shape)
print(svrs_2016.shape)

(64, 36)
(64, 36)
(64, 36)
(64, 36)


## DHIS2 Data

In [236]:
DHIS2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhis2/health_indicators/District_2011_NAME.csv'
DHIS2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhis2/health_indicators/District_2016_NAME.csv'

In [237]:
dhis_2011 = pd.read_csv(DHIS2011)
dhis_2011['Geo'] = dhis_2011['True'].str.replace(" District", "")
display(dhis_2011.head())
dhis_2016 = pd.read_csv(DHIS2016)
dhis_2016['Geo'] = dhis_2016['True'].str.replace(" District", "")
display(dhis_2016.head())
dhis_2011, dhis_2016 = intersect_dfs(input_df1=dhis_2011, input_df2=dhis_2016)
print(dhis_2011.shape)
print(dhis_2016.shape)

Unnamed: 0,True,07Vaccine&LogisticsstockofUpazilaMunCC: Differences between Pentavalent doses and vial uses,07Vaccine&LogisticsstockofUpazilaMunCC: Penta vial Opening + Receive,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI Form need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI Investigation form need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI form E36 - E39 need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI line listing form need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI BCG diluent need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI BCG need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI CC supply book need with buffer,...,02ChildHealth: IMCI Wasting (%),02ChildHealth: Neonatal Case fatality rate (EmOC),02ChildHealth: Percentage of diarrhea reported at facility,02ChildHealth: Percentage of pneumonia reported at facility,05Logistics: Percentage of functional ambulance,05Logistics: Percentage of functional x-ray,AntenatalCare(ANC): 1st Visit ANC,AntenatalCare(ANC): 2nd Visit ANC,AntenatalCare(ANC): 3rd & more ANC,Geo
0,Bagerhat District,358.0,128774.0,643.0,395.0,398.0,509.0,26253.0,26373.0,93.0,...,1.8,0.2009,11.35,3.7,68.85,37.025,,,,Bagerhat
1,Bandarban District,-667.0,62055.0,115.0,112.0,110.0,120.0,8212.5,8362.5,4.0,...,0.16,0.1165,8.7,8.39,65.24,61.538,,,,Bandarban
2,Barguna District,0.0,109797.0,3856.0,33.0,36.0,38.0,15523.5,15523.5,14.0,...,1.4,1.1355,7.1,4.07,71.36,46.073,,,,Barguna
3,Barishal District,492.0,297323.0,1827.0,552.0,551.0,552.0,40608.0,40608.0,27.0,...,1.0,0.3973,11.74,5.68,81.82,51.923,,,,Barishal
4,Bhola District,10.0,198050.0,3499.0,95.0,92.0,94.0,29293.5,29430.0,41.0,...,8.3,0.0774,13.54,3.74,76.34,67.742,,,,Bhola


Unnamed: 0,True,04Newborn: % of Nurse training on ETAT at SCANU,04Newborn: % of female baby admitted in SCANU reported individually,04Newborn: % of female baby admitted in SCANU reported monthly,04Newborn: % of functional Radiant warmer,04Newborn: % of male baby admitted in SCANU reported individually,04Newborn: % of male baby admitted in SCANU reported monthly,04Newborn: % of non functional Photo therapy unit,04Newborn: % of non-functioning Table Resuscitator with Radiant warmer,04Newborn: % of nurse allocated in SCANU among all nurses in the facility,...,02ChildHealth: IMCI Total Child,02ChildHealth: IMCI Underweight (%),02ChildHealth: IMCI Wasting (%),02ChildHealth: Neonatal Case fatality rate (EmOC),02ChildHealth: Percentage of diarrhea reported at facility,02ChildHealth: Percentage of pneumonia reported at facility,AntenatalCare(ANC): 1st Visit ANC,AntenatalCare(ANC): 2nd Visit ANC,AntenatalCare(ANC): 3rd & more ANC,Geo
0,Bagerhat District,245.0,41.5,39.4,42.6,58.5,60.6,49.3,35.3,38.1,...,215355.0,4.3,1.7,0.187,11.57,3.32,,,,Bagerhat
1,Bandarban District,,48.3,42.9,0.0,51.7,57.1,0.0,0.0,0.0,...,59810.0,1.6,0.35,0.3076,9.28,8.57,,,,Bandarban
2,Barguna District,,0.0,,,100.0,,,,,...,127829.0,2.8,1.1,1.2882,7.47,4.38,,,,Barguna
3,Barishal District,,100.0,,,0.0,,,,,...,209694.0,4.2,0.88,0.0354,13.06,6.35,,,,Barishal
4,Bhola District,379.3,37.0,36.4,23.1,63.0,63.6,12.6,0.0,13.7,...,171269.0,7.7,5.3,0.0986,13.35,4.6,,,,Bhola


(64, 350)
(64, 350)


In [238]:
geo_dhis_2011 = match_districts(ref_df=geo, ref_match='DistrictName', input_df=dhis_2011, input_match='Geo')
geo_dhis_2016 = match_districts(ref_df=geo, ref_match='DistrictName', input_df=dhis_2016, input_match='Geo')

In [239]:
print(dhis_2011.shape)
print(dhis_2016.shape)
dhis_2011 = dhis_2011.merge(geo_dhis_2011, how='left', left_on='Geo', right_on='Geo')
dhis_2016 = dhis_2016.merge(geo_dhis_2016, how='left', left_on='Geo', right_on='Geo')
dhis_2011 = dhis_2011.drop(['FuzzRatio','Geo',], axis=1)
dhis_2016 = dhis_2016.drop(['FuzzRatio','Geo',], axis=1)
print(dhis_2011.shape)
print(dhis_2016.shape)

(64, 350)
(64, 350)
(64, 351)
(64, 351)


In [254]:
for ces, svrs, dhis in zip(sorted(ces_2011['DistrictName']), sorted(svrs_2011['DistrictName']), sorted(dhis_2011['DistrictName'])):
    print("\n -------------------")
    print(f"CES vs SVRS: {ces == svrs}")
    print(f"CES vs DHIS: {ces == dhis}")
    print(f"DHIS vs SVRS: {dhis == svrs}")


 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

 -------------------
CES vs SVRS: True
CES vs DHIS: True
DHIS vs SVRS: True

## DGFP Data

In [49]:
DGFP2011a = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_ngothanaprocess_2011.csv'
DGFP2011b = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_thanaprocess_2011.csv'
DGFP2011c = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_district_monthprocess_2011.csv'
DGFP2011d = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_ngodistrict_monthprocess_2011.csv'
DGFP2016a = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_ngothanaprocess_2016.csv'
DGFP2016b = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_thanaprocess_2016.csv'
DGFP2016c = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_district_monthprocess_2016.csv'
DGFP2016d = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_ngodistrict_monthprocess_2016.csv'

## DHS Data

In [48]:
DHS2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/data/data_dhs_2011.csv' 
DHS2014 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/data/data_dhs_2014.csv' 