# NOTEBOOK

## Data sources:

- **DHIS2 health indicators:**
    - 2011, 2014 and 2017
- **DGFP health indicators:**
    - 2011, 2014 and 2017
- **DHS raw variables:**
    - 2011, 2014, 2017
- **SVRS raw variables:**
    - 2012, 2014, 2017
- **CES indicators:**
    - 2011, 2014, 2016

## Time points:
![Timpoints](timepoints_version2.png)

In [1]:
import os
import re
import numpy as np
import pandas as pd
from collections import Counter
from fuzzywuzzy import fuzz

In [28]:
def intersect_dfs(list_dfs):
    subset_var = []
    for key, value in list_dfs.items():
        if len(subset_var) == 0:
            subset_var = list(value.columns)
        else:
            subset_var = list(set(subset_var).intersection(set(list(value.columns))))
    for key, value in list_dfs.items():
        value = value[subset_var]
        list_dfs[key] = value
    del subset_var
    return list_dfs

def read_ces(files_list, common=True):
    data_dict = {}
    for file in files_list:
        data_dict[file] = pd.read_csv(file, encoding='cp850')
        data_dict[file].rename(columns={'Survey.Units"':'geo'}, inplace=True)
        subset = [not bool(re.search(r"Division|Launch District|CC|KCC|RCC|DCC|SCC|CCC|BCC|Urban|Rural|CC Slum| Slum|National", geo)) for geo in data_dict[file]['Survey.Units']]
        print(Counter([not bool(re.search(r"Division|Launch District|CC|KCC|RCC|DCC|SCC|CCC|BCC|Urban|Rural|CC Slum| Slum|National", geo)) for geo in data_dict[file]['Survey.Units']]))
        data_dict[file] = data_dict[file].loc[subset,:]
        print(data_dict[file].shape)
    return data_dict

def match_districts(ref_df, ref_match, input_df, input_match):
    out = pd.DataFrame()
    for key, code in enumerate(input_df[input_match]):
        code_match = {}
        code_match['FuzzRatio'] = [fuzz.ratio(ref_code, code)  for ref_code in ref_df[ref_match]]
        code_match['Geo'] = code
        code_match['DistrictGeo'] = [value[0] for value in ref_df.values]
        code_match['DistrictName'] = [value[1]  for value in ref_df.values]
        code_match = pd.DataFrame.from_dict(code_match)
        out = out.append(code_match.sort_values('FuzzRatio', ascending=False).iloc[0,:])
    for var in list(out.columns):
        if out[var].dtype.kind == 'f':
            out[var] = out[var].astype(int)
            out[var] = out[var].astype(str)
            code_length = max([len(char) for char in out[var]])
            out[var] = out[var].str.pad(width=code_length, side='left', fillchar='0') 
    return out

def read_dgfp(files_list):
    data_dict = {}
    for file in files_list:
        data_dict[file] = pd.read_csv(file)
    return data_dict

def distrGO_rates(input_df, pattern, denominator_male, denominator_female):
    df = input_df.copy(deep=True)
    vars_rates = [var for var in df.columns if pattern in var]
    for var in vars_rates:
        if "_male" in var:
            df[var] = np.round(df[var]/(denominator_male/1000),4)
        else: 
            df[var] = np.round(df[var]/(denominator_female/1000), 4)
    return df, vars_rates

## GEOS

## CES Data

In [33]:
CES2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2011.csv'
CES2014 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2014.csv'
CES2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2016.csv'
ces_list = [CES2011, CES2014, CES2016]
ces = read_ces(files_list=ces_list, common=True)
ces.keys()

Counter({True: 64, False: 20})
(64, 44)
Counter({True: 64, False: 23})
(64, 50)
Counter({True: 64, False: 24})
(64, 52)


dict_keys(['/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2011.csv', '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2014.csv', '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2016.csv'])

In [34]:
ces = intersect_dfs(list_dfs=ces)
ces_2011 = ces['/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2011.csv']
ces_2014 = ces['/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2014.csv']
ces_2016 = ces['/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2016.csv']

In [35]:
ces_2011['DistrictCode'] = ces_2011['DistrictCode'].astype(str).str.pad(width=2, side='left', fillchar='0')
ces_2011['DivisionCode'] = ces_2011['DivisionCode'].astype(str).str.pad(width=2, side='left', fillchar='0')
ces_2011['DistrictGeo'] = ces_2011['DivisionCode'].str.cat(ces_2011['DistrictCode'], sep="")
ces_2011 = ces_2011.drop(['DivisionName', 'DivisionCode', 'Geo', 'DistrictCode', 'Year', 'FuzzRatio', 'Survey.Units'], axis=1)
ces_2011.head()

Unnamed: 0,Fully_Children23M,PENTA1_Children23M,PENTA3_Children12M,PENTA3_Children23M,TT2_Mother0-11MChildren,TT5_Mother0-11MChildren,PENTA1_Children12M,PENTA2_Children12M,Fully_Children12M,BCG_Children23M,...,VitACoverage_Children12-59M,OPV1_Children12M,DistrictName,Measles_Children12M,PENTA2_Children23M,OPV3_Children23M,OPV1_Children23M,OPV2_Children12M,BCG_Children12M,DistrictGeo
0,75.0,99.0,83.3,84.1,95.7,53.8,98.3,97.5,72.6,99.0,...,88.1,98.3,Brahmanbaria,81.3,98.3,93.5,99.0,97.5,98.3,2012
1,79.7,98.1,85.6,85.6,86.2,31.9,98.1,96.0,78.3,98.1,...,96.7,98.1,Bagerhat,86.7,96.0,93.3,98.1,96.0,98.1,4001
2,76.2,94.3,83.8,84.4,85.2,38.6,94.3,90.2,73.1,94.3,...,83.3,93.8,Bandarban,79.2,90.2,89.4,93.8,90.8,94.3,2003
3,79.9,100.0,87.3,87.9,98.6,27.6,100.0,98.6,74.4,100.0,...,96.7,99.5,Barguna,82.6,98.6,95.4,99.5,98.6,100.0,1004
4,77.4,98.6,86.9,86.9,94.8,31.0,98.6,98.1,73.4,98.6,...,79.0,98.6,Barisal,82.6,98.1,94.5,98.6,98.1,98.6,1006


In [36]:
ces_2014['DistrictCode'] = ces_2014['DistrictCode'].astype(str).str.pad(width=2, side='left', fillchar='0')
ces_2014['DivisionCode'] = ces_2014['DivisionCode'].astype(str).str.pad(width=2, side='left', fillchar='0')
ces_2014['DistrictGeo'] = ces_2014['DivisionCode'].str.cat(ces_2014['DistrictCode'], sep="")
ces_2014 = ces_2014.drop(['DivisionName', 'DivisionCode', 'Geo', 'DistrictCode', 'Year', 'FuzzRatio', 'Survey.Units'], axis=1)
ces_2014.head()

Unnamed: 0,Fully_Children23M,PENTA1_Children23M,PENTA3_Children12M,PENTA3_Children23M,TT2_Mother0-11MChildren,TT5_Mother0-11MChildren,PENTA1_Children12M,PENTA2_Children12M,Fully_Children12M,BCG_Children23M,...,VitACoverage_Children12-59M,OPV1_Children12M,DistrictName,Measles_Children12M,PENTA2_Children23M,OPV3_Children23M,OPV1_Children23M,OPV2_Children12M,BCG_Children12M,DistrictGeo
0,87.7,95.9,91.4,91.4,96.2,56.7,95.9,96.4,82.4,100.0,...,94.8,97.4,Bagerhat,88.3,96.4,95.0,97.4,96.9,100.0,4001
1,84.8,90.3,90.7,91.2,85.7,42.9,90.3,89.5,79.6,98.6,...,82.9,91.4,Bandarban,81.3,89.5,90.7,91.4,90.6,98.6,2003
2,88.6,93.9,94.1,94.1,97.6,41.9,93.9,93.5,86.6,100.0,...,97.1,96.4,Barguna,91.8,93.5,95.4,96.4,95.4,100.0,1004
3,91.7,96.1,98.1,98.1,99.5,58.6,96.1,98.1,88.8,99.5,...,96.2,99.5,Barisal,94.1,98.1,98.6,99.5,99.5,99.5,1006
6,83.1,95.1,94.5,94.5,96.7,38.6,95.1,95.3,78.7,100.0,...,77.1,98.4,Bhola,83.1,95.3,93.4,98.4,97.5,100.0,1009


In [37]:
ces_2016['DistrictCode'] = ces_2016['DistrictCode'].astype(str).str.pad(width=2, side='left', fillchar='0')
ces_2016['DivisionCode'] = ces_2016['DivisionCode'].astype(str).str.pad(width=2, side='left', fillchar='0')
ces_2016['DistrictGeo'] = ces_2016['DivisionCode'].str.cat(ces_2016['DistrictCode'], sep="")
ces_2016 = ces_2016.drop(['DivisionName', 'DivisionCode', 'Geo', 'DistrictCode', 'Year', 'FuzzRatio', 'Survey.Units'], axis=1)
ces_2016.head()

Unnamed: 0,Fully_Children23M,PENTA1_Children23M,PENTA3_Children12M,PENTA3_Children23M,TT2_Mother0-11MChildren,TT5_Mother0-11MChildren,PENTA1_Children12M,PENTA2_Children12M,Fully_Children12M,BCG_Children23M,...,VitACoverage_Children12-59M,OPV1_Children12M,DistrictName,Measles_Children12M,PENTA2_Children23M,OPV3_Children23M,OPV1_Children23M,OPV2_Children12M,BCG_Children12M,DistrictGeo
0,88.3,97.4,90.8,91.0,97.8,36.5,97.4,96.8,86.8,98.2,...,82.0,97.4,Bagerhat,90.3,97.1,91.0,97.4,96.8,98.2,4001
1,83.9,96.3,87.8,87.8,92.2,65.3,96.3,94.8,80.9,99.0,...,84.1,96.3,Bandarban,86.3,94.8,87.8,96.3,94.8,99.0,2003
2,91.1,98.8,93.0,93.6,97.4,36.3,98.8,97.9,87.6,99.7,...,96.8,98.8,Barguna,91.0,98.3,93.6,98.8,97.9,99.7,1004
3,94.6,99.1,95.5,96.0,100.0,60.0,99.1,99.3,91.0,99.7,...,100.0,99.1,Barisal,93.1,99.3,96.0,99.1,99.3,99.7,1006
6,91.9,99.8,94.5,94.5,100.0,56.1,99.8,99.8,91.3,99.8,...,98.4,99.8,Bhola,95.4,99.8,94.5,99.8,99.8,99.8,1009


In [42]:
check_list = []
for var1, var2, var3 in zip(sorted(ces_2011['DistrictName']), sorted(ces_2014['DistrictName']), sorted(ces_2016['DistrictName'])):
    check_list.append(var1==var2)
    check_list.append(var1==var3)
    check_list.append(var2==var3)
    print(var1==var2)
    print(var1==var3)
    print(var2==var3)
print("#"*100)
print(all(check_list))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
########################################

In [43]:
geo = ces_2011[['DistrictGeo', 'DistrictName']]
geo.head()

Unnamed: 0,DistrictGeo,DistrictName
0,2012,Brahmanbaria
1,4001,Bagerhat
2,2003,Bandarban
3,1004,Barguna
4,1006,Barisal


## SVRS Data 

In [76]:
SVRS2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/data_svrs_zila_2012_clean.csv'
SVRS2014 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/data_svrs_zila_2014_clean.csv'
SVRS2017 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/data_svrs_zila_2017_clean.csv'

In [77]:
svrs_2011 = pd.read_csv(SVRS2011)
svrs_2011['district'] = svrs_2011['district'].str.replace(' Zila', '')
display(svrs_2011.head())
print(svrs_2011.columns)

Unnamed: 0,district,prop_live_births,prop_registered_births,prop_deaths_rural,sex_ratio,dependency_ratio,prop_pop_rural,prop_pop_women,prop_pop_rural_women,prop_women_15.45y_overwomen,prop_married_women_15.45y,prop_married_..15y,rate_live_births,rate_fertility,rate_death,rate_child_death,rate_under5y_mortality,rate_infant_mortality,rate_maternal_mortality,year
0,Bagerhat,98.62,7,78,103.13,51.4,73.67,49.23,36.03,50.65,73.58,66.24,10.85,43.49,3.66,1.14,27.97,20.98,1398.6,2012
1,Bandarban,98.61,12,56,107.07,63.84,45.42,48.29,22.1,49.68,73.23,70.54,19.26,80.3,3.48,0.98,56.34,51.64,469.48,2012
2,Barguna,99.3,7,75,100.62,56.56,64.75,49.85,32.28,49.91,82.21,74.9,20.55,82.58,5.77,5.03,38.87,21.2,0.0,2012
3,Barisal,98.96,14,68,105.58,54.58,63.4,48.64,30.64,48.66,69.59,65.86,19.36,81.8,5.97,0.76,52.77,50.13,0.0,2012
4,Bhola,99.02,51,73,110.73,61.07,71.88,47.45,34.15,49.2,73.6,68.84,18.19,77.89,5.13,1.52,66.23,59.6,993.38,2012


Index(['district', 'prop_live_births', 'prop_registered_births',
       'prop_deaths_rural', 'sex_ratio', 'dependency_ratio', 'prop_pop_rural',
       'prop_pop_women', 'prop_pop_rural_women', 'prop_women_15.45y_overwomen',
       'prop_married_women_15.45y', 'prop_married_..15y', 'rate_live_births',
       'rate_fertility', 'rate_death', 'rate_child_death',
       'rate_under5y_mortality', 'rate_infant_mortality',
       'rate_maternal_mortality', 'year'],
      dtype='object')


```
svrs_2014 = pd.read_csv(SVRS2014)
display(svrs_2014.head())
print(svrs_2014.columns)
```

In [None]:
svrs_2016 = pd.read_csv(SVRS2016)
display(svrs_2016.head())
svrs_2011, svrs_2016 = intersect_dfs(input_df1=svrs_2011, input_df2=svrs_2016)
print(svrs_2016.columns)

In [14]:
geo_svrs_2011 = match_districts(ref_df=geo, ref_match='DistrictName', input_df=svrs_2011, input_match='district')
geo_svrs_2016 = match_districts(ref_df=geo, ref_match='DistrictName', input_df=svrs_2016, input_match='district')

In [15]:
print(svrs_2011.shape)
print(svrs_2016.shape)
svrs_2011 = svrs_2011.merge(geo_svrs_2011, how='left', left_on='district', right_on='Geo')
svrs_2016 = svrs_2016.merge(geo_svrs_2016, how='left', left_on='district', right_on='Geo')
svrs_2011 = svrs_2011.drop(['FuzzRatio','Geo', 'district', 'year'], axis=1)
svrs_2016 = svrs_2016.drop(['FuzzRatio','Geo', 'district', 'year'], axis=1)
print(svrs_2011.shape)
print(svrs_2016.shape)

(64, 20)
(64, 20)
(64, 20)
(64, 20)


In [16]:
print(svrs_2011['rate_maternal_mortality'].mean())
print(svrs_2016['rate_maternal_mortality'].mean())

228.3010937500001
2.65078125


## DHIS2 Data

In [72]:
DHIS2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhis2/health_indicators/District_2011_NAME.csv'
DHIS2014 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhis2/health_indicators/District_2014_NAME.csv'
DHIS2017 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhis2/health_indicators/District_2017_NAME.csv'

In [73]:
dhis_2011 = pd.read_csv(DHIS2011)
dhis_2011['Geo'] = dhis_2011['True'].str.replace(" District", "")
display(dhis_2011.head())

Unnamed: 0,True,07Vaccine&LogisticsstockofUpazilaMunCC: Differences between Pentavalent doses and vial uses,07Vaccine&LogisticsstockofUpazilaMunCC: Penta vial Opening + Receive,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI Form need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI Investigation form need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI form E36 - E39 need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI line listing form need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI BCG diluent need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI BCG need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI CC supply book need with buffer,...,02ChildHealth: IMCI Wasting (%),02ChildHealth: Neonatal Case fatality rate (EmOC),02ChildHealth: Percentage of diarrhea reported at facility,02ChildHealth: Percentage of pneumonia reported at facility,05Logistics: Percentage of functional ambulance,05Logistics: Percentage of functional x-ray,AntenatalCare(ANC): 1st Visit ANC,AntenatalCare(ANC): 2nd Visit ANC,AntenatalCare(ANC): 3rd & more ANC,Geo
0,Bagerhat District,358.0,128774.0,643.0,395.0,398.0,509.0,26253.0,26373.0,93.0,...,1.8,0.2009,11.35,3.7,68.85,37.025,,,,Bagerhat
1,Bandarban District,-667.0,62055.0,115.0,112.0,110.0,120.0,8212.5,8362.5,4.0,...,0.16,0.1165,8.7,8.39,65.24,61.538,,,,Bandarban
2,Barguna District,0.0,109797.0,3856.0,33.0,36.0,38.0,15523.5,15523.5,14.0,...,1.4,1.1355,7.1,4.07,71.36,46.073,,,,Barguna
3,Barishal District,492.0,297323.0,1827.0,552.0,551.0,552.0,40608.0,40608.0,27.0,...,1.0,0.3973,11.74,5.68,81.82,51.923,,,,Barishal
4,Bhola District,10.0,198050.0,3499.0,95.0,92.0,94.0,29293.5,29430.0,41.0,...,8.3,0.0774,13.54,3.74,76.34,67.742,,,,Bhola


In [74]:
dhis_2014 = pd.read_csv(DHIS2014)
dhis_2014['Geo'] = dhis_2014['True'].str.replace(" District", "")
display(dhis_2014.head())

Unnamed: 0,True,07Vaccine&LogisticsstockofUpazilaMunCC: Differences between Pentavalent doses and vial uses,07Vaccine&LogisticsstockofUpazilaMunCC: Penta vial Opening + Receive,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI Form need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI Investigation form need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI form E36 - E39 need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI AEFI line listing form need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI BCG diluent need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI BCG need with buffer,07Vaccine&LogisticsstockofUpazilaMunCC: Upazila EPI CC supply book need with buffer,...,02ChildHealth: IMCI Wasting (%),02ChildHealth: Neonatal Case fatality rate (EmOC),02ChildHealth: Percentage of diarrhea reported at facility,02ChildHealth: Percentage of pneumonia reported at facility,05Logistics: Percentage of functional ambulance,05Logistics: Percentage of functional x-ray,AntenatalCare(ANC): 1st Visit ANC,AntenatalCare(ANC): 2nd Visit ANC,AntenatalCare(ANC): 3rd & more ANC,Geo
0,Bagerhat District,-17303.0,249362.0,643.0,395.0,398.0,509.0,26253.0,26373.0,93.0,...,1.2,0.1486,10.72,3.74,66.9,38.402,,,,Bagerhat
1,Bandarban District,12213.0,75120.0,115.0,112.0,110.0,120.0,8212.5,8362.5,4.0,...,0.11,0.0491,8.59,8.07,65.17,58.861,,,,Bandarban
2,Barguna District,-15554.0,217151.0,3856.0,33.0,36.0,38.0,15523.5,15523.5,14.0,...,0.85,1.0348,6.63,5.13,68.21,34.722,,,,Barguna
3,Barishal District,29932.0,378314.0,1827.0,552.0,551.0,552.0,40608.0,40608.0,27.0,...,0.58,0.0657,11.6,5.92,81.23,54.373,,,,Barishal
4,Bhola District,-18086.0,433679.0,3499.0,95.0,92.0,94.0,29293.5,29430.0,41.0,...,6.2,0.2078,14.24,4.99,72.43,56.122,,,,Bhola


In [56]:
dhis_2017 = pd.read_csv(DHIS2017)
dhis_2017['Geo'] = dhis_2017['True'].str.replace(" District", "")
display(dhis_2017.head())

Unnamed: 0,True,04Newborn: % of Nurse training on ETAT at SCANU,04Newborn: % of female baby admitted in SCANU reported individually,04Newborn: % of female baby admitted in SCANU reported monthly,04Newborn: % of functional Radiant warmer,04Newborn: % of male baby admitted in SCANU reported individually,04Newborn: % of male baby admitted in SCANU reported monthly,04Newborn: % of non functional Photo therapy unit,04Newborn: % of non-functioning Table Resuscitator with Radiant warmer,04Newborn: % of nurse allocated in SCANU among all nurses in the facility,...,03Immunization: Total Pentavalent Given,03Immunization: Total TT Given to 15 - 49 Y,03Immunization: Total TT Given to Pregnant Women,03Immunization: Total fIPV Given,03Immunization: fIPV1 Crude Coverage %,03Immunization: fIPV2 Crude Coverage %,AntenatalCare(ANC): 1st Visit ANC,AntenatalCare(ANC): 2nd Visit ANC,AntenatalCare(ANC): 3rd & more ANC,Geo
0,Bagerhat District,,41.5,39.1,40.4,58.5,60.9,38.8,24.1,0.0,...,172226.0,140738.0,29415.0,64545.0,57.7,53.6,,,,Bagerhat
1,Bandarban District,,48.3,46.1,0.0,51.7,53.9,0.0,0.0,0.0,...,67484.0,73615.0,9341.0,17521.0,42.6,32.2,,,,Bandarban
2,Barguna District,,0.0,,,100.0,,,,,...,123124.0,137826.0,38831.0,35770.0,45.9,38.8,,,,Barguna
3,Barishal District,,100.0,,,0.0,,,,,...,364881.0,332370.0,100269.0,120074.0,55.6,42.6,,,,Barishal
4,Bhola District,368.4,37.0,35.8,38.5,63.0,64.2,19.8,1.0,12.6,...,306078.0,337464.0,88113.0,100339.0,55.5,38.4,,,,Bhola


In [57]:
dhis2 = intersect_dfs(list_dfs={'DHIS2_2011': dhis_2011, 'DHIS2_2014':dhis_2014, 'DHIS2_2017':dhis_2017})
for key, value in dhis2.items():
    print(key)
    print(value.shape)

DHIS2_2011
(64, 350)
DHIS2_2014
(64, 350)
DHIS2_2017
(64, 350)


In [None]:
dhis_2011, dhis_2014, dhis_2017 = dhis2['DHIS2_2011'], dhis2['DHIS2_2014'], dhis2['DHIS2_2017']

In [19]:
geo_dhis_2011 = match_districts(ref_df=geo, ref_match='DistrictName', input_df=dhis_2011, input_match='Geo')
geo_dhis_2014 = match_districts(ref_df=geo, ref_match='DistrictName', input_df=dhis_2014, input_match='Geo')
geo_dhis_2017 = match_districts(ref_df=geo, ref_match='DistrictName', input_df=dhis_2017, input_match='Geo')

In [20]:
print(dhis_2011.shape)
print(dhis_2014.shape)
print(dhis_2017.shape)
dhis_2011 = dhis_2011.merge(geo_dhis_2011, how='left', left_on='Geo', right_on='Geo')
dhis_2014 = dhis_2014.merge(geo_dhis_2014, how='left', left_on='Geo', right_on='Geo')
dhis_2016 = dhis_2017.merge(geo_dhis_2017, how='left', left_on='Geo', right_on='Geo')
dhis_2011 = dhis_2011.drop(['FuzzRatio','Geo',], axis=1)
dhis_2014 = dhis_2014.drop(['FuzzRatio','Geo',], axis=1)
dhis_2017 = dhis_2017.drop(['FuzzRatio','Geo',], axis=1)
print(dhis_2011.shape)
print(dhis_2014.shape)
print(dhis_2017.shape)

(64, 350)
(64, 350)
(64, 351)
(64, 351)


## DGFP Data

In [59]:
DGFP2011a = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_ngothanaprocess_2011.csv'
DGFP2011b = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_thanaprocess_2011.csv'
DGFP2011c = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_district_monthprocess_2011.csv'
DGFP2011d = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_ngodistrict_monthprocess_2011.csv'
DGFP2011e = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_distributionGO_distmonthProcess_2011.csv'
DGFP2011 = [DGFP2011a, DGFP2011b, DGFP2011c, DGFP2011d, DGFP2011e]


DGFP2014a = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_ngothanaprocess_2014.csv'
DGFP2014b = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_thanaprocess_2014.csv'
DGFP2014c = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_district_monthprocess_2014.csv'
DGFP2014d = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_ngodistrict_monthprocess_2014.csv'
DGFP2014e = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_distributionGO_distmonthProcess_2014.csv'
DGFP2014 = [DGFP2014a, DGFP2014b, DGFP2014c, DGFP2014d, DGFP2014e]


DGFP2017a = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_ngothanaprocess_2017.csv'
DGFP2017b = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_thanaprocess_2017.csv'
DGFP2017c = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_district_monthprocess_2017.csv'
DGFP2017d = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_ngodistrict_monthprocess_2017.csv'
DGFP2017e = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_distributionGO_distmonthProcess_2017.csv'
DGFP2017 = [DGFP2017a, DGFP2017b, DGFP2017c, DGFP2017d, DGFP2017e]

In [61]:
dgfp2011 = read_dgfp(files_list=DGFP2011)
dgfp2014 = read_dgfp(files_list=DGFP2014)
dgfp2017 = read_dgfp(files_list=DGFP2017)

In [64]:
dgfp2011[DGFP2011e]['distr_GOdistr_GOfemalepn'].sum()


0

In [65]:
dgfp2011df = pd.concat([dgfp2011[DGFP2011a], 
                        dgfp2011[DGFP2011b].drop('geo', axis=1), 
                        dgfp2011[DGFP2011c].drop('geo', axis=1),
                        dgfp2011[DGFP2011d].drop('geo', axis=1),
                        dgfp2011[DGFP2011e].drop('geo', axis=1)], axis=1)
print(dgfp2011df.shape)
print(dgfp2011df.columns)

(64, 59)
Index(['imp11subdistr_ngothanaprocessNGO_Percent_Pill',
       'imp11subdistr_ngothanaprocessNGO_Percent_Condom',
       'imp11subdistr_ngothanaprocessNGO_Percent_Injectable',
       'imp11subdistr_ngothanaprocessNGO_Percent_IUD',
       'imp11subdistr_ngothanaprocessNGO_Percent_Implant',
       'imp11subdistr_ngothanaprocessNGO_Percent_PerMale',
       'imp11subdistr_ngothanaprocessNGO_Percent_PerFemale',
       'imp11subdistr_ngothanaprocessNGO_CAR', 'geo',
       'imp11subdistr_thanaprocessPercent_Pill',
       'imp11subdistr_thanaprocessPercent_Condom',
       'imp11subdistr_thanaprocessPercent_Injectable',
       'imp11subdistr_thanaprocessPercent_IUD',
       'imp11subdistr_thanaprocessPercent_Implant',
       'imp11subdistr_thanaprocessPercent_PerMale',
       'imp11subdistr_thanaprocessPercent_PerFemale',
       'imp11subdistr_thanaprocessCAR',
       'imp12distr_monthprocessImp12DistrMonthThana_Percent_Pill',
       'imp12distr_monthprocessImp12DistrMonthThana_Percent

In [66]:
dgfp2014df = pd.concat([dgfp2014[DGFP2014a], 
                        dgfp2014[DGFP2014b].drop('geo', axis=1), 
                        dgfp2014[DGFP2014c].drop('geo', axis=1),
                        dgfp2014[DGFP2014d].drop('geo', axis=1),
                        dgfp2014[DGFP2014e].drop('geo', axis=1)], axis=1)
print(dgfp2014df.shape)
print(dgfp2014df.columns)

(64, 59)
Index(['imp11subdistr_ngothanaprocessNGO_Percent_Pill',
       'imp11subdistr_ngothanaprocessNGO_Percent_Condom',
       'imp11subdistr_ngothanaprocessNGO_Percent_Injectable',
       'imp11subdistr_ngothanaprocessNGO_Percent_IUD',
       'imp11subdistr_ngothanaprocessNGO_Percent_Implant',
       'imp11subdistr_ngothanaprocessNGO_Percent_PerMale',
       'imp11subdistr_ngothanaprocessNGO_Percent_PerFemale',
       'imp11subdistr_ngothanaprocessNGO_CAR', 'geo',
       'imp11subdistr_thanaprocessPercent_Pill',
       'imp11subdistr_thanaprocessPercent_Condom',
       'imp11subdistr_thanaprocessPercent_Injectable',
       'imp11subdistr_thanaprocessPercent_IUD',
       'imp11subdistr_thanaprocessPercent_Implant',
       'imp11subdistr_thanaprocessPercent_PerMale',
       'imp11subdistr_thanaprocessPercent_PerFemale',
       'imp11subdistr_thanaprocessCAR',
       'imp12distr_monthprocessImp12DistrMonthThana_Percent_Pill',
       'imp12distr_monthprocessImp12DistrMonthThana_Percent

In [67]:
dgfp2017df = pd.concat([dgfp2017[DGFP2017a], 
                        dgfp2017[DGFP2017b].drop('geo', axis=1), 
                        dgfp2017[DGFP2017c].drop('geo', axis=1), 
                        dgfp2017[DGFP2017d].drop('geo', axis=1),
                        dgfp2017[DGFP2017e].drop('geo', axis=1)], axis=1)
print(dgfp2017df.shape)
print(dgfp2017df.columns)

(64, 59)
Index(['imp11subdistr_ngothanaprocessNGO_Percent_Pill',
       'imp11subdistr_ngothanaprocessNGO_Percent_Condom',
       'imp11subdistr_ngothanaprocessNGO_Percent_Injectable',
       'imp11subdistr_ngothanaprocessNGO_Percent_IUD',
       'imp11subdistr_ngothanaprocessNGO_Percent_Implant',
       'imp11subdistr_ngothanaprocessNGO_Percent_PerMale',
       'imp11subdistr_ngothanaprocessNGO_Percent_PerFemale',
       'imp11subdistr_ngothanaprocessNGO_CAR', 'geo',
       'imp11subdistr_thanaprocessPercent_Pill',
       'imp11subdistr_thanaprocessPercent_Condom',
       'imp11subdistr_thanaprocessPercent_Injectable',
       'imp11subdistr_thanaprocessPercent_IUD',
       'imp11subdistr_thanaprocessPercent_Implant',
       'imp11subdistr_thanaprocessPercent_PerMale',
       'imp11subdistr_thanaprocessPercent_PerFemale',
       'imp11subdistr_thanaprocessCAR',
       'imp12distr_monthprocessImp12DistrMonthThana_Percent_Pill',
       'imp12distr_monthprocessImp12DistrMonthThana_Percent

In [68]:
dgfp_data = intersect_dfs({'DGFP2011': dgfp2011df, 'DGFP2014': dgfp2014df, 'DGFP2017': dgfp2017df})
dgfp2011df = dgfp_data['DGFP2011']
dgfp2014df = dgfp_data['DGFP2014']
dgfp2017df = dgfp_data['DGFP2017']

In [26]:
dgfp2011df["geo"] = dgfp2011df["geo"].astype(str)
print(dgfp2011df.shape)
dgfp2011df = dgfp2011df.merge(geo, how='left', left_on="geo", right_on="DistrictGeo")
print(dgfp2011df.shape)
dgfp2011df = dgfp2011df.drop('geo', axis=1)

(64, 59)
(64, 61)


In [69]:
dgfp2014df["geo"] = dgfp2014df["geo"].astype(str)
print(dgfp2014df.shape)
dgfp2014df = dgfp2014df.merge(geo, how='left', left_on="geo", right_on="DistrictGeo")
print(dgfp2014df.shape)
dgfp2014df = dgfp2014df.drop('geo', axis=1)

(64, 59)
(64, 61)


In [71]:
dgfp2017df["geo"] = dgfp2017df["geo"].astype(str)
print(dgfp2017df.shape)
dgfp2017df = dgfp2017df.merge(geo, how='left', left_on="geo", right_on="DistrictGeo")
print(dgfp2017df.shape)
dgfp2017df = dgfp2017df.drop('geo', axis=1)

(64, 59)
(64, 61)


## Demographics

In [29]:
woman15_45_2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/data_svrs_zila_2012.csv'
woman15_45_2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/data_svrs_zila_2015.csv'
TOTAL_POP2011 = 144043697
TOTAL_POP2016 = 162951560
data_svrs2011 = pd.read_csv(woman15_45_2011)
data_svrs2011['district'] = data_svrs2011['district'].str.replace(' Zila', '')
data_svrs2016 = pd.read_csv(woman15_45_2016)
data_svrs2011['total_bang_svrs'] = data_svrs2011['total_pop'].sum()
data_svrs2016['total_bang_svrs'] = data_svrs2016['total_pop'].sum()
data_svrs2011['total_bang'] = TOTAL_POP2011
data_svrs2016['total_bang'] = TOTAL_POP2016

data_svrs2011['total_pop_percent'] = data_svrs2011['total_pop']/data_svrs2011['total_pop'].sum()
data_svrs2016['total_pop_percent'] = data_svrs2016['total_pop']/data_svrs2016['total_pop'].sum()
data_svrs2011['total_pop_abs'] = data_svrs2011['total_pop_percent']*TOTAL_POP2011
data_svrs2016['total_pop_abs'] = data_svrs2016['total_pop_percent']*TOTAL_POP2016
data_svrs2011['woman15_45_abs'] = np.round(data_svrs2011['total_pop_abs'] * data_svrs2011['women_15.45y']/data_svrs2011['total_pop'])
data_svrs2016['woman15_45_abs'] = np.round(data_svrs2016['total_pop_abs'] * data_svrs2016['women_15.45y']/data_svrs2016['total_pop'])
display(data_svrs2011[['total_bang_svrs', 'total_pop', 'total_pop_percent', 'women_15.45y',
                      'total_pop_abs', 'woman15_45_abs', 'total_bang']].head())
display(data_svrs2016[['total_bang_svrs', 'total_pop', 'total_pop_percent', 'women_15.45y',
                      'total_pop_abs', 'woman15_45_abs', 'total_bang']].head())

Unnamed: 0,total_bang_svrs,total_pop,total_pop_percent,women_15.45y,total_pop_abs,woman15_45_abs,total_bang
0,1116845,13370,0.011971,3334,1724379.0,429999.0,144043697
1,1116845,11213,0.01004,2690,1446183.0,346939.0,144043697
2,1116845,13871,0.01242,3451,1788995.0,445088.0,144043697
3,1116845,19779,0.01771,4682,2550972.0,603855.0,144043697
4,1116845,16772,0.015017,3916,2163148.0,505061.0,144043697


Unnamed: 0,total_bang_svrs,total_pop,total_pop_percent,women_15.45y,total_pop_abs,woman15_45_abs,total_bang
0,939530,9274,0.009871,2330,1608477.0,404114.0,162951560
1,939530,3629,0.003863,895,629411.7,155228.0,162951560
2,939530,7773,0.008273,1981,1348145.0,343584.0,162951560
3,939530,45404,0.048326,11903,7874844.0,2064450.0,162951560
4,939530,15999,0.017029,3833,2774858.0,664793.0,162951560


In [30]:
geo_svrs_2011s = match_districts(ref_df=geo, ref_match='DistrictName', input_df=data_svrs2011, input_match='district')
geo_svrs_2016s = match_districts(ref_df=geo, ref_match='DistrictName', input_df=data_svrs2016, input_match='district')
print(data_svrs2011.shape)
print(data_svrs2016.shape)
data_svrs2011 = data_svrs2011.merge(geo_svrs_2011s, how='left', left_on='district', right_on='Geo')
data_svrs2016 = data_svrs2016.merge(geo_svrs_2016s, how='left', left_on='district', right_on='Geo')
data_svrs2011 = data_svrs2011.drop(['FuzzRatio','Geo', 'district', 'year'], axis=1)
data_svrs2016 = data_svrs2016.drop(['FuzzRatio','Geo', 'district', 'year'], axis=1)
print(data_svrs2011.shape)
print(data_svrs2016.shape)

(64, 47)
(64, 48)
(64, 47)
(64, 48)


## Recalculating indicators

In [31]:
print(sorted(data_svrs2011['DistrictName']) == sorted(dgfp2011df['DistrictName']))
print(sorted(data_svrs2016['DistrictName']) == sorted(dgfp2016df['DistrictName']))

True
True


In [32]:
dgfp2011df = dgfp2011df.sort_values("DistrictName")
data_svrs2011 = data_svrs2011.sort_values("DistrictName")
dgfp2011df, variables = distrGO_rates(input_df=dgfp2011df, pattern="distr_GO", 
              denominator_female=data_svrs2011['woman15_45_abs'], 
              denominator_male=data_svrs2011['total_bang'])
display(dgfp2011df[variables].head())

Unnamed: 0,distr_GOdistr_GOShukhi,distr_GOdistr_GOCondom,distr_GOdistr_GOPermanent_method,distr_GOdistr_GOecp,distr_GOdistr_GOPill_total,distr_GOdistr_GOIUD_remove,distr_GOdistr_GOmnp_saset,distr_GOdistr_GOImp_total,distr_GOdistr_GOIUD_partum,distr_GOdistr_GOApon,...,distr_GOdistr_GOper_male,distr_GOdistr_GOfemalepn,distr_GOdistr_GOfemalenormal,distr_GOdistr_GOInj_siringe,distr_GOdistr_GOImp_normal,distr_GOdistr_GOfemale_total,distr_GOdistr_GOmrm_pack,distr_GOdistr_GOImp_Jadel,distr_GOdistr_GOIUD_normal,distr_GOdistr_GOsanitary_pad
2,3657.7126,4532.7463,7.6412,2.7163,3657.7126,0.0,0.0,8.4006,0.0,0.0,...,0.0141,0.0,3.0646,0.0,8.4006,3.0646,0.0,0.0,8.3871,0.0
63,655.7759,886.7231,1.9211,1.2523,655.7759,0.0,0.0,1.7435,0.0,0.0,...,0.0041,0.0,0.5443,0.0,1.7435,0.5443,0.0,0.0,2.6429,0.0
11,1302.0635,771.2989,5.8866,0.8026,1302.0635,0.0,0.0,9.565,0.0,0.0,...,0.0172,0.0,0.4057,0.0,9.565,0.4057,0.0,0.0,4.3501,0.0
61,2935.3205,2969.484,9.1668,2.1889,2935.3205,0.0,0.0,7.5193,0.0,0.0,...,0.0194,0.0,3.673,0.0,7.5193,3.673,0.0,0.0,12.7257,0.0
51,2403.5347,3192.5034,5.3922,2.6037,2403.5347,0.0,0.0,7.4587,0.0,0.0,...,0.013,0.0,0.7171,0.0,7.4587,0.7171,0.0,0.0,6.844,0.0


In [33]:
dgfp2016df = dgfp2016df.sort_values(by ="DistrictName")
data_svrs2016 = data_svrs2016.sort_values("DistrictName")
dgfp2016df, variables = distrGO_rates(input_df=dgfp2016df, pattern="distr_GO", 
              denominator_female=data_svrs2016['woman15_45_abs'], 
              denominator_male=data_svrs2016['total_bang'])
display(dgfp2016df[variables].head())

Unnamed: 0,distr_GOdistr_GOShukhi,distr_GOdistr_GOCondom,distr_GOdistr_GOPermanent_method,distr_GOdistr_GOecp,distr_GOdistr_GOPill_total,distr_GOdistr_GOIUD_remove,distr_GOdistr_GOmnp_saset,distr_GOdistr_GOImp_total,distr_GOdistr_GOIUD_partum,distr_GOdistr_GOApon,...,distr_GOdistr_GOper_male,distr_GOdistr_GOfemalepn,distr_GOdistr_GOfemalenormal,distr_GOdistr_GOInj_siringe,distr_GOdistr_GOImp_normal,distr_GOdistr_GOfemale_total,distr_GOdistr_GOmrm_pack,distr_GOdistr_GOImp_Jadel,distr_GOdistr_GOIUD_normal,distr_GOdistr_GOsanitary_pad
2,4134.4882,6076.5373,4.4763,0.0,4209.646,1.5513,0.0,10.1809,0.0728,75.1577,...,0.0045,0.6258,1.7114,421.923,9.6017,2.3371,0.1048,0.5792,9.398,1.2632
63,653.2773,872.6982,1.5088,0.0,671.4688,0.3837,0.0,2.8958,0.0518,18.1915,...,0.003,0.0311,0.1996,108.9366,2.8595,0.2255,0.0985,0.0363,3.1421,0.0674
11,1175.802,745.9456,2.7186,0.0,1196.4034,0.3347,0.0,8.5741,0.0,20.6014,...,0.0057,0.5082,0.3266,228.126,8.1495,0.8348,0.1102,0.4245,4.0778,0.4123
61,602.7928,709.7482,1.1744,0.0,611.6951,0.4192,0.0,2.8073,0.0299,8.9024,...,0.008,0.2462,0.3437,116.021,2.6675,0.5894,0.0103,0.1399,1.5042,0.0697
51,2015.356,3057.8958,3.5129,0.0,2050.1934,0.241,1264.5546,13.331,0.9263,34.8374,...,0.0088,0.3184,0.0398,977.2401,13.0502,0.3316,95.9963,0.2808,8.1887,0.8268


In [34]:
dgfp2011df.head()

Unnamed: 0,imp12distr_monthprocessImp12DistrMonthThana_Percent_PerMale,imp12distr_monthprocessImp12DistrMonthThana_Percent_Injectable,distr_GOdistr_GOShukhi,distr_GOdistr_GOCondom,distr_GOdistr_GOPermanent_method,distr_GOdistr_GOecp,distr_GOdistr_GOPill_total,imp11subdistr_thanaprocessPercent_IUD,imp11subdistr_ngothanaprocessNGO_Percent_PerMale,imp11subdistr_thanaprocessPercent_PerFemale,...,imp12distr_monthprocessImp12DistrMonthThana_Percent_PerFemale,distr_GOdistr_GOImp_Jadel,imp11subdistr_ngothanaprocessNGO_Percent_Implant,imp11subdistr_thanaprocessPercent_PerMale,distr_GOdistr_GOIUD_normal,Imp12DistrNGOMonthThana_CAR,imp12distr_monthprocessImp12DistrMonthThana_CAR,distr_GOdistr_GOsanitary_pad,DistrictGeo,DistrictName
2,5.53,19.05,3657.7126,4532.7463,7.6412,2.7163,3657.7126,5.41,3.81,7.76,...,7.76,0.0,2.73,5.53,8.3871,80.02,80.83,0.0,4001,Bagerhat
63,4.76,24.25,655.7759,886.7231,1.9211,1.2523,655.7759,7.68,,6.65,...,6.65,0.0,,4.76,2.6429,,77.01,0.0,2003,Bandarban
11,7.91,28.08,1302.0635,771.2989,5.8866,0.8026,1302.0635,3.64,3.19,6.72,...,6.72,0.0,6.61,7.91,4.3501,69.28,75.47,0.0,1004,Barguna
61,2.46,21.85,2935.3205,2969.484,9.1668,2.1889,2935.3205,4.35,0.55,8.47,...,8.47,0.0,4.01,2.46,12.7257,68.72,72.62,0.0,1006,Barisal
51,2.71,42.8,2403.5347,3192.5034,5.3922,2.6037,2403.5347,2.52,1.48,4.01,...,4.01,0.0,1.45,2.71,6.844,73.41,72.18,0.0,1009,Bhola


In [35]:
dgfp2016df.head()

Unnamed: 0,imp12distr_monthprocessImp12DistrMonthThana_Percent_PerMale,imp12distr_monthprocessImp12DistrMonthThana_Percent_Injectable,distr_GOdistr_GOShukhi,distr_GOdistr_GOCondom,distr_GOdistr_GOPermanent_method,distr_GOdistr_GOecp,distr_GOdistr_GOPill_total,imp11subdistr_thanaprocessPercent_IUD,imp11subdistr_ngothanaprocessNGO_Percent_PerMale,imp11subdistr_thanaprocessPercent_PerFemale,...,imp12distr_monthprocessImp12DistrMonthThana_Percent_PerFemale,distr_GOdistr_GOImp_Jadel,imp11subdistr_ngothanaprocessNGO_Percent_Implant,imp11subdistr_thanaprocessPercent_PerMale,distr_GOdistr_GOIUD_normal,Imp12DistrNGOMonthThana_CAR,imp12distr_monthprocessImp12DistrMonthThana_CAR,distr_GOdistr_GOsanitary_pad,DistrictGeo,DistrictName
2,5.33,18.25,4134.4882,6076.5373,4.4763,0.0,4209.646,4.34,4.69,8.12,...,8.12,0.5792,4.35,5.33,9.398,83.61,81.39,1.2632,4001,Bagerhat
63,9.05,22.16,653.2773,872.6982,1.5088,0.0,671.4688,8.22,,6.3,...,6.3,0.0363,,9.05,3.1421,,77.95,0.0674,2003,Bandarban
11,9.46,28.0,1175.802,745.9456,2.7186,0.0,1196.4034,3.21,3.2,5.74,...,5.74,0.4245,5.77,9.46,4.0778,84.62,76.12,0.4123,1004,Barguna
61,2.98,21.32,602.7928,709.7482,1.1744,0.0,611.6951,4.07,0.97,8.42,...,8.42,0.1399,6.84,2.98,1.5042,76.43,75.77,0.0697,1006,Barisal
51,3.21,40.52,2015.356,3057.8958,3.5129,0.0,2050.1934,3.15,2.31,3.23,...,3.23,0.2808,4.96,3.21,8.1887,80.44,79.61,0.8268,1009,Bhola


## DHS Data

In [36]:
DHS2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/data/data_dhs_2011_clean.csv' 
DHS2014 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/data/data_dhs_2014_clean.csv' 

In [37]:
dhs_2011 = pd.read_csv(DHS2011)
display(dhs_2011.head())
dhs_2014 = pd.read_csv(DHS2014)
display(dhs_2014.head())

Unnamed: 0,district,prop_current_contraceptive,prop_unmet_need_family_planing,prop_antenatal_coverage,prop_antenatal_care4.,prop_institutional_delivery,prop_attendant_delivery,prop_caesarean,sex_ratio,dependency_ratio,...,prop_hypertensive,prop_pop_rural,prop_pop_women,prop_pop_rural_women,prop_women_15.45y_overwomen,prop_married_women_15.45y,prop_married_..15y,prop_female_head,prop_registered_under5,year
0,Bagerhat,65.52,7.89,29.91,26.56,30.66,35.58,15.74,93.31,74.37,...,8.92,11.48,44.85,5.67,46.74,80.33,73.46,11.29,32.84,2011
1,Bandarban,45.45,27.27,30.0,0.0,0.0,0.0,0.0,84.44,107.5,...,0.0,0.0,127.03,0.0,35.56,75.0,69.05,11.29,31.25,2011
2,Barguna,72.36,11.14,37.91,20.97,10.56,16.3,5.21,105.44,70.73,...,19.23,6.48,25.33,2.98,49.26,88.0,77.71,2.97,41.4,2011
3,Barisal,64.33,13.31,35.21,26.53,21.53,26.92,13.06,89.02,75.53,...,19.85,8.22,24.48,4.37,48.17,78.47,71.65,5.1,38.53,2011
4,Bhola,68.4,10.81,25.64,29.51,9.64,11.89,3.75,94.76,80.16,...,18.37,7.4,26.74,3.86,45.49,80.19,74.76,5.65,21.19,2011


Unnamed: 0,district,prop_current_contraceptive,prop_unmet_need_family_planing,prop_antenatal_coverage,prop_antenatal_care4.,prop_institutional_delivery,prop_attendant_delivery,prop_caesarean,sex_ratio,dependency_ratio,prop_pop_rural,prop_pop_women,prop_pop_rural_women,prop_women_15.45y_overwomen,prop_married_women_15.45y,prop_married_..15y,prop_female_head,prop_registered_under5,year
0,Bagerhat,67.82,11.0,48.69,25.13,26.49,36.95,22.38,94.78,56.82,14.38,39.86,7.53,48.65,74.84,72.88,4.17,37.56,2014
1,Bandarban,63.64,13.64,100.0,75.0,75.0,75.0,50.0,84.93,45.16,0.0,77.52,0.0,54.79,57.5,63.46,25.49,40.0,2014
2,Barguna,73.3,8.22,37.97,47.46,27.54,34.43,22.9,87.82,68.24,9.48,28.64,5.13,44.84,86.5,77.54,8.06,25.1,2014
3,Barisal,64.26,10.08,64.01,32.39,46.33,56.91,31.22,94.74,59.68,28.48,30.97,13.91,51.1,77.42,69.16,6.99,23.4,2014
4,Bhola,67.28,10.01,29.44,15.62,11.27,18.42,4.01,101.71,69.35,9.99,26.58,5.06,47.39,82.49,75.2,3.53,17.64,2014


In [38]:
print(geo.shape)
print(dhs_2011.shape)
geo_dhs_2011 = match_districts(ref_df=geo, ref_match='DistrictName', input_df=dhs_2011, input_match='district')
print(geo_dhs_2011.shape)
print("#"*100)
print(geo.shape)
print(dhs_2014.shape)
geo_dhs_2014 = match_districts(ref_df=geo, ref_match='DistrictName', input_df=dhs_2014, input_match='district')
print(geo_dhs_2014.shape)

(64, 2)
(64, 22)
(64, 4)
####################################################################################################
(64, 2)
(64, 19)
(64, 4)


In [39]:
print(dhs_2011.shape)
print(dhs_2014.shape)
dhs_2011 = dhs_2011.merge(geo_dhs_2011, how='left', left_on='district', right_on='Geo')
dhs_2014 = dhs_2014.merge(geo_dhs_2014, how='left', left_on='district', right_on='Geo')
dhs_2011 = dhs_2011.drop(['FuzzRatio','Geo', 'district', 'year'], axis=1)
dhs_2014 = dhs_2014.drop(['FuzzRatio','Geo', 'district', 'year'], axis=1)
print(dhs_2011.shape)
print(dhs_2014.shape)

(64, 22)
(64, 19)
(64, 22)
(64, 19)


### Renaming DHS data sets from 2014 to 2016


In [40]:
dhs_2011, dhs_2016 = intersect_dfs(input_df1= dhs_2011, input_df2= dhs_2014)

In [41]:
dhs_2011.equals(dhs_2016)

False

## Combine data

In [42]:
for ces, svrs, dhis, dgfp, dhs in zip(sorted(ces_2011['DistrictName']), sorted(svrs_2011['DistrictName']), 
                                 sorted(dhis_2011['DistrictName']), sorted(dgfp2011df['DistrictName']),
                                 sorted(dhs_2011['DistrictName'])):
    print("\n -------------------")
    print(ces, svrs, dhis, dgfp, dhs)
    print(f"CES vs SVRS: {ces == svrs}")
    print(f"CES vs DHIS: {ces == dhis}")
    print(f"CES vs DGFP: {ces == dgfp}")
    print(f"CES vs DHS: {ces == dhs}")
    print(f"SVRS vs DHIS: {svrs == dhis}")
    print(f"SVRS vs DGFP: {svrs == dgfp}")
    print(f"SVRS vs DHS: {svrs == dhs}")
    print(f"DHIS vs DGFP: {dhis == dgfp}")
    print(f"DHIS vs DHS: {dhis == dhs}")
    print(f"DHS vs DGFP: {dhs == dgfp}")


 -------------------
Bagerhat  Bagerhat  Bagerhat  Bagerhat  Bagerhat 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: True
DHS vs DGFP: True

 -------------------
Bandarban  Bandarban  Bandarban  Bandarban  Bandarban 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: True
DHS vs DGFP: True

 -------------------
Barguna  Barguna  Barguna  Barguna  Barguna 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: True
DHS vs DGFP: True

 -------------------
Barisal  Barisal  Barisal  Barisal  Barisal 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: True
DHS vs 

In [43]:
for ces, svrs, dhis, dgfp, dhs in zip(sorted(ces_2016['DistrictName']), sorted(svrs_2016['DistrictName']), 
                                 sorted(dhis_2016['DistrictName']), sorted(dgfp2016df['DistrictName']),
                                 sorted(dhs_2016['DistrictName'])):
    print("\n -------------------")
    print(ces, svrs, dhis, dgfp, dhs)
    print(f"CES vs SVRS: {ces == svrs}")
    print(f"CES vs DHIS: {ces == dhis}")
    print(f"CES vs DGFP: {ces == dgfp}")
    print(f"CES vs DHS: {ces == dhs}")
    print(f"SVRS vs DHIS: {svrs == dhis}")
    print(f"SVRS vs DGFP: {svrs == dgfp}")
    print(f"SVRS vs DHS: {svrs == dhs}")
    print(f"DHIS vs DGFP: {dhis == dgfp}")
    print(f"DHIS vs DHS: {dhis == dhs}")
    print(f"DHS vs DGFP: {dhs == dgfp}")


 -------------------
Bagerhat  Bagerhat  Bagerhat  Bagerhat  Bagerhat 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: True
DHS vs DGFP: True

 -------------------
Bandarban  Bandarban  Bandarban  Bandarban  Bandarban 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: True
DHS vs DGFP: True

 -------------------
Barguna  Barguna  Barguna  Barguna  Barguna 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: True
DHS vs DGFP: True

 -------------------
Barisal  Barisal  Barisal  Barisal  Barisal 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: True
DHS vs 


 -------------------
Natore  Natore  Natore  Natore  Natore 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: True
DHS vs DGFP: True

 -------------------
Netrakona  Netrakona  Netrakona  Netrakona  Netrakona 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: True
DHS vs DGFP: True

 -------------------
Nilphamari  Nilphamari  Nilphamari  Nilphamari  Nilphamari 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: True
DHS vs DGFP: True

 -------------------
Noakhali  Noakhali  Noakhali  Noakhali  Noakhali 
CES vs SVRS: True
CES vs DHIS: True
CES vs DGFP: True
CES vs DHS: True
SVRS vs DHIS: True
SVRS vs DGFP: True
SVRS vs DHS: True
DHIS vs DGFP: True
DHIS vs DHS: Tr

## Combining data

In [44]:
for a,b,c,d in zip(dgfp2011[DGFP2011a]['geo'], dgfp2011[DGFP2011b]['geo'], 
                   dgfp2011[DGFP2011c]['geo'], dgfp2011[DGFP2011d]['geo']):
    print(a,b,c,d)
    print(a == b)
    print(a == c)
    print(a == d)
    print(b == c)
    print(b == d)
    print(c == d)
    print("\n -----------")

5577 5577 5577 5577
True
True
True
True
True
True

 -----------
4044 4044 4044 4044
True
True
True
True
True
True

 -----------
4001 4001 4001 4001
True
True
True
True
True
True

 -----------
3061 3061 3061 3061
True
True
True
True
True
True

 -----------
1079 1079 1079 1079
True
True
True
True
True
True

 -----------
5527 5527 5527 5527
True
True
True
True
True
True

 -----------
3026 3026 3026 3026
True
True
True
True
True
True

 -----------
2022 2022 2022 2022
True
True
True
True
True
True

 -----------
3029 3029 3029 3029
True
True
True
True
True
True

 -----------
5010 5010 5010 5010
True
True
True
True
True
True

 -----------
4087 4087 4087 4087
True
True
True
True
True
True

 -----------
1004 1004 1004 1004
True
True
True
True
True
True

 -----------
4018 4018 4018 4018
True
True
True
True
True
True

 -----------
2046 2046 2046 2046
True
True
True
True
True
True

 -----------
6036 6036 6036 6036
True
True
True
True
True
True

 -----------
1042 1042 1042 1042
True
True
True
True


### Preparing 2011

In [45]:
drop_vars = ['DistrictName', 'DistrictGeo']
ces_2011 = ces_2011.sort_values(by='DistrictName').reset_index(drop=True)
svrs_2011 = svrs_2011.sort_values(by='DistrictName').reset_index(drop=True)
dhis_2011 = dhis_2011.sort_values(by='DistrictName').reset_index(drop=True)
dgfp2011df = dgfp2011df.sort_values(by='DistrictName').reset_index(drop=True)
dhs_2011 = dhs_2011.sort_values(by='DistrictName').reset_index(drop=True)

d2011 = [ces_2011, svrs_2011.drop(drop_vars,axis=1), dhis_2011.drop(drop_vars,axis=1),
         dgfp2011df.drop(drop_vars,axis=1), dhs_2011.drop(drop_vars, axis=1)]
df2011 = pd.concat(d2011, axis=1)
df2011.shape

(64, 468)

### Preparing 2016

In [46]:
ces_2016 = ces_2016.sort_values(by='DistrictName').reset_index(drop=True)
svrs_2016 = svrs_2016.sort_values(by='DistrictName').reset_index(drop=True)
dhis_2016 = dhis_2016.sort_values(by='DistrictName').reset_index(drop=True)
dgfp2016df = dgfp2016df.sort_values(by='DistrictName').reset_index(drop=True)
d2016 = [ces_2016, svrs_2016.drop(drop_vars,axis=1),
         dhis_2016.drop(drop_vars,axis=1), dgfp2016df.drop(drop_vars,axis=1),
         dhs_2016.drop(drop_vars, axis=1)]
df2016 = pd.concat(d2016, axis=1)
df2016.shape

(64, 468)

## Determining outcome variables 

In [47]:
print(df2011['rate_maternal_mortality'].mean())
print(df2016['rate_maternal_mortality'].mean())

228.3010937500001
2.65078125


In [48]:
print(df2011['rate_under5y_mortality'].mean())
print(df2016['rate_under5y_mortality'].mean())

42.829375
39.446562500000006


In [49]:
print(df2011['prop_antenatal_coverage'].mean())
print(df2016['prop_antenatal_coverage'].mean())

37.97062499999999
56.92234374999999


In [50]:
print(df2011['prop_unmet_need_family_planing'].mean())
print(df2016['prop_unmet_need_family_planing'].mean())

13.562187500000004
11.942968750000002


```
print(df2011['ORS_RHF_ORT'].mean())
print(df2016['ORS_RHF_ORT'].mean())
```

In [57]:
df2011.shape
468-3

465

In [55]:
list(df2011.columns)

['OPV2_Children12M',
 'Fully_Children12M',
 'Measles_Children23M',
 'TT3_Mother0-11MChildren',
 'VitACoverage_Children12-59M',
 'Fully_Children23M',
 'OPV3_Children12M',
 'OPV1_Children23M',
 'OPV3_Children23M',
 'OPV1_Children12M',
 'PENTA1_Children12M',
 'PENTA1_Children23M',
 'BCG_Children12M',
 'OPV2_Children23M',
 'PENTA3_Children12M',
 'TT4_Mother0-11MChildren',
 'PENTA2_Children12M',
 'DistrictName',
 'TT5_Mother0-11MChildren',
 'Measles_Children12M',
 'PENTA3_Children23M',
 'BCG_Children23M',
 'PENTA2_Children23M',
 'TT2_Mother0-11MChildren',
 'TT1_Mother0-11MChildren',
 'DistrictGeo',
 'prop_women_15.45y_overwomen',
 'rate_death',
 'prop_pop_rural_women',
 'rate_fertility',
 'rate_live_births',
 'rate_maternal_mortality',
 'prop_registered_births',
 'rate_under5y_mortality',
 'dependency_ratio',
 'prop_married_..15y',
 'sex_ratio',
 'prop_pop_rural',
 'prop_live_births',
 'prop_married_women_15.45y',
 'prop_deaths_rural',
 'rate_infant_mortality',
 'prop_pop_women',
 'rate_chi

In [47]:
df2011.to_csv('/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2011.csv', index=False, index_label=False)
df2016.to_csv('/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2016.csv', index=False, index_label=False)

In [48]:
df2011['year'] = 2011
df2016['year'] = 2016
tmp = pd.concat([df2011, df2016], axis=0)
tmp.to_csv('/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all.csv')
print(tmp.shape)

(128, 469)
