# NOTEBOOK

## Data sources:
- DHIS2 health indicators: 2011 and 2016
- DGFP health indicators: 2011 and 2016
- DHS raw variables: 2011 and 2014
- SVRS raw variables: 2012 and 2015
- CES indicators: 2011 and 2016

## Time points:
![Timepoints](timepoints.png)

In [None]:
import os
import re
import numpy as np
import pandas as pd
from collections import Counter
from fuzzywuzzy import fuzz

In [None]:
def intersect_dfs(input_df1, input_df2):
    """Restrict two data frames to the columns they have in common.

    The shared columns are returned in the order in which they first appear
    in ``input_df1``. Deriving the order from a ``set`` (as was done before)
    makes the column order depend on string hashing, so it could change from
    one run to the next.

    Parameters
    ----------
    input_df1, input_df2 : pandas.DataFrame
        Frames to compare. Neither frame is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        Independent copies of ``input_df1`` and ``input_df2`` that contain
        only the shared columns, in the same column order.
    """
    columns_in_second = set(input_df2.columns)
    # dict.fromkeys removes repeated labels while keeping first-seen order.
    shared = [column for column in dict.fromkeys(input_df1.columns)
              if column in columns_in_second]
    # Explicit copies so that later column assignments on the results never
    # write through to (or warn about) the caller's frames.
    return input_df1[shared].copy(), input_df2[shared].copy()

def read_ces(files_list, common=True):
    """Load CES files and keep the rows whose survey unit is not an aggregate.

    Each file is read as cp850-encoded CSV. A row is dropped when its
    'Survey.Units' label matches any of the aggregate keywords in the pattern
    below (divisions, city corporations, urban/rural, slum, national, ...).
    For every file the keep/drop counts and the resulting shape are printed.

    Parameters
    ----------
    files_list : iterable of str
        Paths of the CSV files to read.
    common : bool
        Accepted for compatibility; the function body does not use it.

    Returns
    -------
    dict
        Maps each path in ``files_list`` to its filtered DataFrame.
    """
    aggregate_unit = re.compile(r"Division|Launch District|CC|KCC|RCC|DCC|SCC|CCC|BCC|Urban|Rural|CC Slum| Slum|National")
    data_dict = {}
    for file in files_list:
        frame = pd.read_csv(file, encoding='cp850')
        # NOTE(review): the key below ends in a stray double quote, so this
        # rename only fires if a column is literally named 'Survey.Units"'.
        # The filter that follows reads 'Survey.Units' - confirm the intent.
        frame.rename(columns={'Survey.Units"':'geo'}, inplace=True)
        # True = keep the row (label matches none of the aggregate keywords).
        keep_row = [aggregate_unit.search(geo) is None for geo in frame['Survey.Units']]
        print(Counter(keep_row))
        frame = frame.loc[keep_row, :]
        print(frame.shape)
        data_dict[file] = frame
    return data_dict

def match_districts(ref_df, ref_match, input_df, input_match, scorer=None):
    """Pair every input district name with its best-scoring reference row.

    For each value of ``input_df[input_match]`` the similarity to every value
    of ``ref_df[ref_match]`` is computed and the highest-scoring reference row
    is kept (the first one when several rows tie).

    The result is assembled in one step instead of with ``DataFrame.append``,
    which was removed in pandas 2.0.

    Parameters
    ----------
    ref_df : pandas.DataFrame
        Reference table. Its FIRST column is reported as 'DistrictGeo' and its
        SECOND column as 'DistrictName', regardless of their labels.
    ref_match : str
        Column of ``ref_df`` whose values are compared with the input names.
    input_df : pandas.DataFrame
        Table holding the names to be matched.
    input_match : str
        Column of ``input_df`` with the names to be matched.
    scorer : callable, optional
        ``scorer(reference_name, input_name) -> number``; higher means more
        similar. Defaults to ``fuzz.ratio``.

    Returns
    -------
    pandas.DataFrame
        One row per input name with columns 'FuzzRatio', 'Geo' (the input
        name), 'DistrictGeo' and 'DistrictName'. The row label is the position
        of the matched row inside ``ref_df``. Numeric columns (in practice
        'FuzzRatio', and 'DistrictGeo' when it is numeric) are returned as
        zero-padded strings, as callers received them before.

    Raises
    ------
    ValueError
        If there are names to match but ``ref_df`` has no rows.
    """
    if scorer is None:
        scorer = fuzz.ratio
    columns = ['FuzzRatio', 'Geo', 'DistrictGeo', 'DistrictName']
    ref_names = list(ref_df[ref_match])
    ref_geos = list(ref_df.iloc[:, 0])
    ref_labels = list(ref_df.iloc[:, 1])
    input_names = list(input_df[input_match])
    if input_names and not ref_names:
        raise ValueError("ref_df has no rows to match against")

    rows = []
    best_positions = []
    for code in input_names:
        scores = [scorer(ref_code, code) for ref_code in ref_names]
        # max() returns the first maximum, so ties resolve to the earliest row.
        best = max(range(len(scores)), key=scores.__getitem__)
        best_positions.append(best)
        rows.append({'FuzzRatio': scores[best],
                     'Geo': code,
                     'DistrictGeo': ref_geos[best],
                     'DistrictName': ref_labels[best]})
    out = pd.DataFrame(rows, columns=columns, index=best_positions)

    # Row-wise append used to upcast numbers to float, after which they were
    # rendered as equal-width, zero-padded integer strings. Integer and float
    # columns are both treated that way here so the output stays the same.
    for var in list(out.columns):
        if out[var].dtype.kind in 'iuf':
            as_text = out[var].astype(int).astype(str)
            code_length = int(as_text.str.len().max())
            out[var] = as_text.str.pad(width=code_length, side='left', fillchar='0')
    return out

def read_dgfp(files_list):
    """Read every CSV in ``files_list`` with default pandas settings.

    Returns a dict that maps each path to the DataFrame loaded from it.
    """
    return {file: pd.read_csv(file) for file in files_list}

def distrGO_rates(input_df, pattern, denominator_male, denominator_female):
    """Convert the columns whose name contains ``pattern`` into per-1000 rates.

    Columns with "_male" in their name are divided by
    ``denominator_male / 1000``; every other selected column is divided by
    ``denominator_female / 1000``. Results are rounded to 4 decimals.

    Denominators are applied by ROW POSITION. Dividing by a pandas Series
    aligns on index labels instead, which silently pairs the wrong rows when
    the two frames were sorted the same way but still carry different
    indexes.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source table; it is not modified.
    pattern : str
        Substring that selects the columns to convert.
    denominator_male, denominator_female : scalar or array-like
        Either a single number or one value per row of ``input_df``, in the
        same row order as ``input_df``.

    Returns
    -------
    tuple
        ``(df, vars_rates)``: a copy of ``input_df`` with the converted
        columns, and the list of converted column names.

    Raises
    ------
    ValueError
        If an array-like denominator does not have one value per row.
    """
    df = input_df.copy(deep=True)
    vars_rates = [var for var in df.columns if pattern in var]
    # Plain arrays drop the index labels, which forces positional division.
    male_thousands = np.asarray(denominator_male) / 1000
    female_thousands = np.asarray(denominator_female) / 1000
    for var in vars_rates:
        per_thousand = male_thousands if "_male" in var else female_thousands
        df[var] = np.round(df[var] / per_thousand, 4)
    return df, vars_rates

## GEOS

## CES Data

In [None]:
# CES extracts: the 2011 file first, the 2016 file second.
ces_list = [
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2011.csv',
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata/gdata_unicef_ces_2016.csv',
]
CES2011, CES2016 = ces_list
# read_ces returns a dict keyed by file path; the keys are shown as cell output.
ces = read_ces(ces_list, common=True)
ces.keys()

In [None]:
# Keep only the columns that both CES files share, then report both shapes.
ces_2011, ces_2016 = intersect_dfs(ces[ces_list[0]], ces[ces_list[1]])
for ces_frame in (ces_2011, ces_2016):
    print(ces_frame.shape)

In [None]:
# Left-pad both codes with '0' to at least two characters, then concatenate
# division code + district code into the 'DistrictGeo' key.
for code_column in ('DistrictCode', 'DivisionCode'):
    ces_2011[code_column] = ces_2011[code_column].astype(str).str.rjust(2, fillchar='0')
ces_2011['DistrictGeo'] = ces_2011['DivisionCode'].str.cat(ces_2011['DistrictCode'], sep="")
# Remove the columns that are no longer needed once the key exists.
ces_2011 = ces_2011.drop(
    columns=['DivisionName', 'DivisionCode', 'Geo', 'DistrictCode', 'Year', 'FuzzRatio', 'Survey.Units'])
ces_2011.head()

In [None]:
# Left-pad both codes with '0' to at least two characters, then concatenate
# division code + district code into the 'DistrictGeo' key.
for code_column in ('DistrictCode', 'DivisionCode'):
    ces_2016[code_column] = ces_2016[code_column].astype(str).str.rjust(2, fillchar='0')
ces_2016['DistrictGeo'] = ces_2016['DivisionCode'].str.cat(ces_2016['DistrictCode'], sep="")
# Remove the columns that are no longer needed once the key exists.
ces_2016 = ces_2016.drop(
    columns=['DivisionName', 'DivisionCode', 'Geo', 'DistrictCode', 'Year', 'FuzzRatio', 'Survey.Units'])
ces_2016.head()

In [None]:
# Confirm that the two CES years list exactly the same district names.
# The previous version compared ces_2011 with itself, so it always printed
# True; comparing the sorted lists also catches a differing number of rows.
print(sorted(ces_2011['DistrictName']) == sorted(ces_2016['DistrictName']))
# Reference table for district matching. Column order matters because
# match_districts reads the first column as the geo code and the second as
# the district name.
geo = ces_2011[['DistrictGeo', 'DistrictName']]
geo.head()

## SVRS Data 

In [None]:
# The "2011"/"2016" names point at the 2012 and 2015 SVRS files respectively.
SVRS2011, SVRS2016 = (
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/data_svrs_zila_2012_clean.csv',
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/data_svrs_zila_2015_clean.csv',
)

In [None]:
# Load the first SVRS file and strip the ' Zila' text from its district names.
# NOTE(review): the second file is not stripped - presumably its names do not
# carry ' Zila'; confirm.
svrs_2011 = pd.read_csv(SVRS2011).assign(
    district=lambda frame: frame['district'].str.replace(' Zila', ''))
display(svrs_2011.head())
svrs_2016 = pd.read_csv(SVRS2016)
display(svrs_2016.head())
# Keep only the columns both files share and list them.
svrs_2011, svrs_2016 = intersect_dfs(svrs_2011, svrs_2016)
for svrs_frame in (svrs_2011, svrs_2016):
    print(svrs_frame.columns)

In [None]:
# Fuzzy-match the SVRS district names of both files against the reference
# table `geo`.
geo_svrs_2011, geo_svrs_2016 = [
    match_districts(ref_df=geo, ref_match='DistrictName', input_df=svrs_frame, input_match='district')
    for svrs_frame in (svrs_2011, svrs_2016)
]

In [None]:
# Shapes before attaching the matched district identifiers.
for svrs_frame in (svrs_2011, svrs_2016):
    print(svrs_frame.shape)
# Left-join the match table on the raw district name, then drop the helper
# columns together with 'district' and 'year'.
svrs_helper_columns = ['FuzzRatio', 'Geo', 'district', 'year']
svrs_2011 = (svrs_2011
             .merge(geo_svrs_2011, how='left', left_on='district', right_on='Geo')
             .drop(columns=svrs_helper_columns))
svrs_2016 = (svrs_2016
             .merge(geo_svrs_2016, how='left', left_on='district', right_on='Geo')
             .drop(columns=svrs_helper_columns))
# Shapes afterwards; a changed row count means the join duplicated rows.
for svrs_frame in (svrs_2011, svrs_2016):
    print(svrs_frame.shape)

In [None]:
# Mean of 'rate_maternal_mortality' in each SVRS frame.
for svrs_frame in (svrs_2011, svrs_2016):
    print(svrs_frame['rate_maternal_mortality'].mean())

## DHIS2 Data

In [None]:
DHIS2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhis2/health_indicators/District_2011_NAME.csv'
DHIS2014 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhis2/health_indicators/District_2014_NAME.csv'
DHIS2017 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhis2/health_indicators/District_2017_NAME.csv'
# The loading cell reads `DHIS2016`, which was never defined (NameError on a
# fresh kernel). Other sources in this notebook store the nearest available
# year under a "2016" name, so the same is done here.
# NOTE(review): the 2017 file is assumed to be the intended "2016" time
# point - confirm (the alternative is DHIS2014).
DHIS2016 = DHIS2017

In [None]:
# Load each DHIS2 file and derive 'Geo' from the column named 'True' by
# removing the " District" text.
dhis_2011 = pd.read_csv(DHIS2011).assign(
    Geo=lambda frame: frame['True'].str.replace(" District", ""))
display(dhis_2011.head())
dhis_2016 = pd.read_csv(DHIS2016).assign(
    Geo=lambda frame: frame['True'].str.replace(" District", ""))
display(dhis_2016.head())
# Keep only the columns both files share and report the shapes.
dhis_2011, dhis_2016 = intersect_dfs(dhis_2011, dhis_2016)
for dhis_frame in (dhis_2011, dhis_2016):
    print(dhis_frame.shape)

In [None]:
# Fuzzy-match the DHIS2 'Geo' names of both files against the reference
# table `geo`.
geo_dhis_2011, geo_dhis_2016 = [
    match_districts(ref_df=geo, ref_match='DistrictName', input_df=dhis_frame, input_match='Geo')
    for dhis_frame in (dhis_2011, dhis_2016)
]

In [None]:
# Shapes before attaching the matched district identifiers.
for dhis_frame in (dhis_2011, dhis_2016):
    print(dhis_frame.shape)
# Both sides use the same key name, so a plain on='Geo' left join is enough;
# the helper columns are dropped straight afterwards.
dhis_2011 = dhis_2011.merge(geo_dhis_2011, how='left', on='Geo').drop(columns=['FuzzRatio', 'Geo'])
dhis_2016 = dhis_2016.merge(geo_dhis_2016, how='left', on='Geo').drop(columns=['FuzzRatio', 'Geo'])
# Shapes afterwards; a changed row count means the join duplicated rows.
for dhis_frame in (dhis_2011, dhis_2016):
    print(dhis_frame.shape)

## DGFP Data

In [None]:
# The five DGFP extracts for 2011; the single-letter names follow list order.
DGFP2011 = [
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_ngothanaprocess_2011.csv',
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_thanaprocess_2011.csv',
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_district_monthprocess_2011.csv',
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_ngodistrict_monthprocess_2011.csv',
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_distributionGO_distmonthProcess_2011.csv',
]
DGFP2011a, DGFP2011b, DGFP2011c, DGFP2011d, DGFP2011e = DGFP2011

# The same five extracts for 2016.
DGFP2016 = [
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_ngothanaprocess_2016.csv',
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp11subdistr_thanaprocess_2016.csv',
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_district_monthprocess_2016.csv',
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_ngodistrict_monthprocess_2016.csv',
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/data_dgfp_imp12distr_distributionGO_distmonthProcess_2016.csv',
]
DGFP2016a, DGFP2016b, DGFP2016c, DGFP2016d, DGFP2016e = DGFP2016

In [None]:
# One dict of DataFrames per year, keyed by file path.
dgfp2011, dgfp2016 = read_dgfp(DGFP2011), read_dgfp(DGFP2016)

In [None]:
# Column total of 'distr_GOdistr_GOfemalepn' in the 2011 frame loaded from
# DGFP2011e; the value is shown as the cell output.
dgfp2011[DGFP2011e]['distr_GOdistr_GOfemalepn'].sum()

In [None]:
# Put the five DGFP frames of each year side by side. Only the first frame
# keeps its 'geo' column. concat(axis=1) pairs rows by index label, so this
# relies on every file listing its rows in the same order.
dgfp2011df = pd.concat(
    [dgfp2011[DGFP2011a]]
    + [dgfp2011[path].drop(columns='geo')
       for path in (DGFP2011b, DGFP2011c, DGFP2011d, DGFP2011e)],
    axis=1)
print(dgfp2011df.shape)
print(dgfp2011df.columns)
dgfp2016df = pd.concat(
    [dgfp2016[DGFP2016a]]
    + [dgfp2016[path].drop(columns='geo')
       for path in (DGFP2016b, DGFP2016c, DGFP2016d, DGFP2016e)],
    axis=1)
print(dgfp2016df.shape)
print(dgfp2016df.columns)

In [None]:
# Keep only the DGFP columns that exist in both years.
dgfp2011df, dgfp2016df = intersect_dfs(dgfp2011df, dgfp2016df)

In [None]:
# 'geo' is cast to text so that it can be joined to the string key
# 'DistrictGeo' of the reference table.
dgfp2011df = dgfp2011df.assign(geo=dgfp2011df["geo"].astype(str))
print(dgfp2011df.shape)
dgfp2011df = dgfp2011df.merge(geo, how='left', left_on="geo", right_on="DistrictGeo")
# Same row count as above means the left join did not duplicate rows.
print(dgfp2011df.shape)
dgfp2011df = dgfp2011df.drop(columns='geo')

In [None]:
# 'geo' is cast to text so that it can be joined to the string key
# 'DistrictGeo' of the reference table.
dgfp2016df = dgfp2016df.assign(geo=dgfp2016df["geo"].astype(str))
print(dgfp2016df.shape)
dgfp2016df = dgfp2016df.merge(geo, how='left', left_on="geo", right_on="DistrictGeo")
# Same row count as above means the left join did not duplicate rows.
print(dgfp2016df.shape)
dgfp2016df = dgfp2016df.drop(columns='geo')

In [None]:
# Show the column labels of the merged 2011 DGFP frame as the cell output.
dgfp2011df.columns

## Demographics

In [None]:
woman15_45_2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/data_svrs_zila_2012.csv'
woman15_45_2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/data_svrs_zila_2015.csv'
# Totals used to scale the district shares to absolute head counts.
# NOTE(review): presumably national population figures - source not shown.
TOTAL_POP2011 = 144043697
TOTAL_POP2016 = 162951560
data_svrs2011 = pd.read_csv(woman15_45_2011)
data_svrs2011['district'] = data_svrs2011['district'].str.replace(' Zila', '')
data_svrs2016 = pd.read_csv(woman15_45_2016)

# The same derivation is applied to both years, modifying the frames in place.
for svrs_frame, scaling_total in ((data_svrs2011, TOTAL_POP2011),
                                  (data_svrs2016, TOTAL_POP2016)):
    svrs_frame['total_bang_svrs'] = svrs_frame['total_pop'].sum()
    svrs_frame['total_bang'] = scaling_total
    # District share of the 'total_pop' column total ...
    svrs_frame['total_pop_percent'] = svrs_frame['total_pop']/svrs_frame['total_pop'].sum()
    # ... scaled to the fixed total ...
    svrs_frame['total_pop_abs'] = svrs_frame['total_pop_percent']*scaling_total
    # ... and multiplied by the 'women_15.45y' / 'total_pop' ratio of the district.
    svrs_frame['woman15_45_abs'] = np.round(
        svrs_frame['total_pop_abs'] * svrs_frame['women_15.45y']/svrs_frame['total_pop'])

overview_columns = ['total_bang_svrs', 'total_pop', 'total_pop_percent', 'women_15.45y',
                    'total_pop_abs', 'woman15_45_abs', 'total_bang']
display(data_svrs2011[overview_columns].head())
display(data_svrs2016[overview_columns].head())

In [None]:
# Fuzzy-match the district names of both demographic frames against `geo`.
geo_svrs_2011s, geo_svrs_2016s = [
    match_districts(ref_df=geo, ref_match='DistrictName', input_df=svrs_frame, input_match='district')
    for svrs_frame in (data_svrs2011, data_svrs2016)
]
for svrs_frame in (data_svrs2011, data_svrs2016):
    print(svrs_frame.shape)
# Attach the matched identifiers, then drop the helper columns together with
# 'district' and 'year'.
demographic_helper_columns = ['FuzzRatio', 'Geo', 'district', 'year']
data_svrs2011 = (data_svrs2011
                 .merge(geo_svrs_2011s, how='left', left_on='district', right_on='Geo')
                 .drop(columns=demographic_helper_columns))
data_svrs2016 = (data_svrs2016
                 .merge(geo_svrs_2016s, how='left', left_on='district', right_on='Geo')
                 .drop(columns=demographic_helper_columns))
for svrs_frame in (data_svrs2011, data_svrs2016):
    print(svrs_frame.shape)

## Recalculating indicators

In [None]:
# For each year: do the demographic frame and the DGFP frame hold the same
# district names (compared as sorted lists)?
for svrs_frame, dgfp_frame in ((data_svrs2011, dgfp2011df), (data_svrs2016, dgfp2016df)):
    print(sorted(svrs_frame['DistrictName']) == sorted(dgfp_frame['DistrictName']))

In [None]:
# Sort both frames by district AND renumber their rows 0..n-1. Sorting alone
# is not enough: the denominators are pandas Series, and dividing by a Series
# pairs rows by index label, so the pre-sort labels would otherwise decide
# which district is divided by which population.
dgfp2011df = dgfp2011df.sort_values("DistrictName").reset_index(drop=True)
data_svrs2011 = data_svrs2011.sort_values("DistrictName").reset_index(drop=True)
# NOTE(review): the male denominator is 'total_bang', the same fixed total on
# every row, not a district-level figure - confirm this is intended.
dgfp2011df, variables = distrGO_rates(input_df=dgfp2011df, pattern="distr_GO", 
              denominator_female=data_svrs2011['woman15_45_abs'], 
              denominator_male=data_svrs2011['total_bang'])
display(dgfp2011df[variables].head())

In [None]:
# Sort both frames by district AND renumber their rows 0..n-1. Sorting alone
# is not enough: the denominators are pandas Series, and dividing by a Series
# pairs rows by index label, so the pre-sort labels would otherwise decide
# which district is divided by which population.
dgfp2016df = dgfp2016df.sort_values(by="DistrictName").reset_index(drop=True)
data_svrs2016 = data_svrs2016.sort_values("DistrictName").reset_index(drop=True)
# NOTE(review): the male denominator is 'total_bang', the same fixed total on
# every row, not a district-level figure - confirm this is intended.
dgfp2016df, variables = distrGO_rates(input_df=dgfp2016df, pattern="distr_GO", 
              denominator_female=data_svrs2016['woman15_45_abs'], 
              denominator_male=data_svrs2016['total_bang'])
display(dgfp2016df[variables].head())

In [None]:
# Preview the first rows of the 2011 DGFP frame after the rate conversion.
dgfp2011df.head()

In [None]:
# Preview the first rows of the 2016 DGFP frame after the rate conversion.
dgfp2016df.head()

## DHS Data

In [None]:
# Cleaned DHS extracts for the 2011 and 2014 rounds.
DHS2011, DHS2014 = (
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/data/data_dhs_2011_clean.csv',
    '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/data/data_dhs_2014_clean.csv',
)

In [None]:
# Load both DHS files, then preview each of them.
dhs_2011, dhs_2014 = pd.read_csv(DHS2011), pd.read_csv(DHS2014)
for dhs_frame in (dhs_2011, dhs_2014):
    display(dhs_frame.head())

In [None]:
def _report_dhs_match(dhs_frame):
    """Print the reference and input shapes, match districts, print the result shape."""
    print(geo.shape)
    print(dhs_frame.shape)
    matched = match_districts(ref_df=geo, ref_match='DistrictName', input_df=dhs_frame, input_match='district')
    print(matched.shape)
    return matched


geo_dhs_2011 = _report_dhs_match(dhs_2011)
print("#"*100)
geo_dhs_2014 = _report_dhs_match(dhs_2014)

In [None]:
# Shapes before attaching the matched district identifiers.
for dhs_frame in (dhs_2011, dhs_2014):
    print(dhs_frame.shape)
# Left-join the match table on the raw district name, then drop the helper
# columns together with 'district' and 'year'.
dhs_helper_columns = ['FuzzRatio', 'Geo', 'district', 'year']
dhs_2011 = (dhs_2011
            .merge(geo_dhs_2011, how='left', left_on='district', right_on='Geo')
            .drop(columns=dhs_helper_columns))
dhs_2014 = (dhs_2014
            .merge(geo_dhs_2014, how='left', left_on='district', right_on='Geo')
            .drop(columns=dhs_helper_columns))
# Shapes afterwards; a changed row count means the join duplicated rows.
for dhs_frame in (dhs_2011, dhs_2014):
    print(dhs_frame.shape)

### Renaming DHS data sets from 2014 to 2016


In [None]:
# Keep the columns both DHS rounds share; from here on the 2014 round is
# stored under the name dhs_2016.
dhs_2011, dhs_2016 = intersect_dfs(dhs_2011, dhs_2014)

In [None]:
# Shows True only if the two DHS frames are identical (DataFrame.equals).
dhs_2011.equals(dhs_2016)

## Combine data

In [None]:
# Walk through the sorted district names of the five 2011 sources side by
# side and print every pairwise comparison. zip stops at the shortest list.
# The loop variables carry a `_name` suffix on purpose: a loop variable
# called `ces` would overwrite the `ces` dict created when the CES files were
# loaded and break a re-run of the cells that read from it.
for ces_name, svrs_name, dhis_name, dgfp_name, dhs_name in zip(
        sorted(ces_2011['DistrictName']), sorted(svrs_2011['DistrictName']),
        sorted(dhis_2011['DistrictName']), sorted(dgfp2011df['DistrictName']),
        sorted(dhs_2011['DistrictName'])):
    print("\n -------------------")
    print(ces_name, svrs_name, dhis_name, dgfp_name, dhs_name)
    print(f"CES vs SVRS: {ces_name == svrs_name}")
    print(f"CES vs DHIS: {ces_name == dhis_name}")
    print(f"CES vs DGFP: {ces_name == dgfp_name}")
    print(f"CES vs DHS: {ces_name == dhs_name}")
    print(f"SVRS vs DHIS: {svrs_name == dhis_name}")
    print(f"SVRS vs DGFP: {svrs_name == dgfp_name}")
    print(f"SVRS vs DHS: {svrs_name == dhs_name}")
    print(f"DHIS vs DGFP: {dhis_name == dgfp_name}")
    print(f"DHIS vs DHS: {dhis_name == dhs_name}")
    print(f"DHS vs DGFP: {dhs_name == dgfp_name}")

In [None]:
# Walk through the sorted district names of the five 2016 sources side by
# side and print every pairwise comparison. zip stops at the shortest list.
# The loop variables carry a `_name` suffix on purpose: a loop variable
# called `ces` would overwrite the `ces` dict created when the CES files were
# loaded and break a re-run of the cells that read from it.
for ces_name, svrs_name, dhis_name, dgfp_name, dhs_name in zip(
        sorted(ces_2016['DistrictName']), sorted(svrs_2016['DistrictName']),
        sorted(dhis_2016['DistrictName']), sorted(dgfp2016df['DistrictName']),
        sorted(dhs_2016['DistrictName'])):
    print("\n -------------------")
    print(ces_name, svrs_name, dhis_name, dgfp_name, dhs_name)
    print(f"CES vs SVRS: {ces_name == svrs_name}")
    print(f"CES vs DHIS: {ces_name == dhis_name}")
    print(f"CES vs DGFP: {ces_name == dgfp_name}")
    print(f"CES vs DHS: {ces_name == dhs_name}")
    print(f"SVRS vs DHIS: {svrs_name == dhis_name}")
    print(f"SVRS vs DGFP: {svrs_name == dgfp_name}")
    print(f"SVRS vs DHS: {svrs_name == dhs_name}")
    print(f"DHIS vs DGFP: {dhis_name == dgfp_name}")
    print(f"DHIS vs DHS: {dhis_name == dhs_name}")
    print(f"DHS vs DGFP: {dhs_name == dgfp_name}")

## Combining data

In [None]:
# Row-by-row check that the 2011 DGFP files list the same 'geo' code in the
# same position. All FIVE files are covered: the fifth (DGFP2011e) is
# concatenated column-wise with the others as well, so it has to line up too.
# zip stops at the shortest file, so a differing row count is not reported.
dgfp_geo_columns = [dgfp2011[path]['geo']
                    for path in (DGFP2011a, DGFP2011b, DGFP2011c, DGFP2011d, DGFP2011e)]
for geo_codes in zip(*dgfp_geo_columns):
    print(*geo_codes)
    # Every unordered pair once: a==b, a==c, ..., d==e.
    for position, left_code in enumerate(geo_codes):
        for right_code in geo_codes[position + 1:]:
            print(left_code == right_code)
    print("\n -----------")

### Preparing 2011

In [None]:
# Identifier columns that only the first (CES) frame keeps in the combined table.
drop_vars = ['DistrictName', 'DistrictGeo']
# Same district order and a fresh 0..n-1 index for every source, so that the
# column-wise concat below pairs the rows by position.
ces_2011, svrs_2011, dhis_2011, dgfp2011df, dhs_2011 = [
    source.sort_values(by='DistrictName').reset_index(drop=True)
    for source in (ces_2011, svrs_2011, dhis_2011, dgfp2011df, dhs_2011)
]

d2011 = [ces_2011] + [source.drop(columns=drop_vars)
                      for source in (svrs_2011, dhis_2011, dgfp2011df, dhs_2011)]
df2011 = pd.concat(d2011, axis=1)
df2011.shape

### Preparing 2016

In [None]:
# Same district order and a fresh 0..n-1 index for every source, so that the
# column-wise concat below pairs the rows by position.
ces_2016 = ces_2016.sort_values(by='DistrictName').reset_index(drop=True)
svrs_2016 = svrs_2016.sort_values(by='DistrictName').reset_index(drop=True)
dhis_2016 = dhis_2016.sort_values(by='DistrictName').reset_index(drop=True)
dgfp2016df = dgfp2016df.sort_values(by='DistrictName').reset_index(drop=True)
# dhs_2016 was previously left in file order while the other four frames were
# sorted, which attached DHS values to the wrong districts in the concat.
dhs_2016 = dhs_2016.sort_values(by='DistrictName').reset_index(drop=True)
# `drop_vars` is the identifier-column list defined in the 2011 preparation cell.
d2016 = [ces_2016, svrs_2016.drop(drop_vars,axis=1),
         dhis_2016.drop(drop_vars,axis=1), dgfp2016df.drop(drop_vars,axis=1),
         dhs_2016.drop(drop_vars, axis=1)]
df2016 = pd.concat(d2016, axis=1)
df2016.shape

## Determining outcome variables 

In [None]:
# Mean of 'rate_maternal_mortality' in the combined 2011 and 2016 tables.
for combined in (df2011, df2016):
    print(combined['rate_maternal_mortality'].mean())

In [None]:
# Mean of 'rate_under5y_mortality' in the combined 2011 and 2016 tables.
for combined in (df2011, df2016):
    print(combined['rate_under5y_mortality'].mean())

In [None]:
# Mean of 'prop_antenatal_coverage' in the combined 2011 and 2016 tables.
for combined in (df2011, df2016):
    print(combined['prop_antenatal_coverage'].mean())

In [None]:
# Mean of 'prop_unmet_need_family_planing' in the combined 2011 and 2016 tables.
for combined in (df2011, df2016):
    print(combined['prop_unmet_need_family_planing'].mean())

```
print(df2011['ORS_RHF_ORT'].mean())
print(df2016['ORS_RHF_ORT'].mean())
```

In [None]:
# Shape of the combined 2011 table (not displayed: only the last expression
# of the cell is shown).
df2011.shape
# Evaluates to 465, which is what the cell displays.
# NOTE(review): presumably the column count minus three non-feature
# columns - confirm what the 3 stands for.
468-3

In [None]:
# Show every column label of the combined 2011 table as a plain list.
list(df2011.columns)

In [None]:
# Write one CSV per year, without the row index.
for combined, target_path in (
        (df2011, '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2011.csv'),
        (df2016, '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2016.csv')):
    combined.to_csv(target_path, index=False, index_label=False)

In [None]:
# Tag each table with its year and stack them into one long table.
df2011['year'] = 2011
df2016['year'] = 2016
# ignore_index gives the stacked table one continuous row index instead of
# two repeated 0..n-1 runs.
tmp = pd.concat([df2011, df2016], axis=0, ignore_index=True)
# index=False matches the per-year exports and keeps the row index from
# being written as an unnamed first column.
tmp.to_csv('/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all.csv', index=False)
print(tmp.shape)