# Data Preprocessing - DHIS2 Data - Health Indicators

## Summary

- **Input data**: Data downloaded via API from DHIS2 System. This data is stored in *data/dhis2/data* directories
- **Processing**:
    - Getting Upazila and District level data
    - Reading and processing and transforming data in the following form:
        - Rows: *Geographical levels (upazila and districts)*
        - Columns: *Variables/health indicators*
        - Datasets: *Years [2009 - 2018]*
- **Output data**: Processed data is stored in *data/output/dhis2/data

## Packages

In [74]:
import os
import glob
import pandas as pd
import collections

## Custom Functions

In [4]:
def list_files(path):
    """
    Function gets the list of all file from the target directory,
    takes absolute path, splits file names to extract:
    - File name
    - Data element
    - File type
    - Year
    - Geographical level/group
    - Absolute path of the file name
    Returns all this as a dataframe
    """
    path_all = os.path.join(path, '*')
    abs_file_names = glob.glob(path_all, recursive=True)
    file_names = [os.path.split(glob.glob(file_path)[0])[1] for file_path in abs_file_names]
    data_collection = [file_name.split("_")[0] for file_name in file_names]
    year = [file_name.split("_")[1] for file_name in file_names]
    geo_level = [file_name.split("_")[2] for file_name in file_names]
    file_type = [file_name[-8:-4] for file_name in file_names]
    files_df = pd.DataFrame.from_dict({'file_names': file_names, 
                                       'data_collection': data_collection,
                                       'year': year,
                                       'geo_level': geo_level,
                                       'file_type': file_type,
                                       'abs_file_names': abs_file_names
                                      })

    return files_df

def create_sets(input_df, geo_levels, years, file_types):
    """
    input_df: Output from function list_files()
    geo_levels: Defining geographic levels:
                If as a list than it is a subset of geo levels from input_df
                Otherwise it should ba string that specifies the column name
    years: Defining years:
           If as a list than it is a subset of geo levels from input_df
           Otherwise it should ba string that specifies the column name
    file_types: Either NAME or CODE. This is defined in the rawfiles.
                NAME: Descriptive name of geographical unit as key ID variable
                CODE: Geo-level code of geogrpahical unit as key ID variable
    return: Dictionary by geo-level, year and file type
    """
    df = input_df.copy(deep=True)
    datasets = {}
    if ((type(geo_levels) == list) & (type(years) == list)):
        for geo_level in geo_levels:
            for year in years:
                for file_type in file_types:
                    name = geo_level + '_' + year + '_' + file_type
                    datasets[name] = df[(df['geo_level']==geo_level) &
                                        (df['year']==year) & 
                                        (df['file_type']==file_type)] 
    else:
        for geo_level in df[geo_levels].unique():
            for year in df[years].unique():
                for file_type in file_types:
                    name = geo_level + '_' + year + '_' + file_type
                    datasets[name] = df[(df['geo_level']==geo_level) & 
                                        (df['year']==year) & 
                                        (df['file_type']==file_type)]
    return datasets

def select_sets(datasets):
    """
    datasets: Output from function create_sets
    returns
        - Data dictionary with loaded datasets by geo-level and year
        # - Data frame with two columns:
            # - List of file names with absolute paths
            # - Logical column whether the loaded file is empty or not
    """
    data_dict = {}
    # summary_df = {'filenames': list(), 'emptyness': list()}
    for key, item in datasets.items():
        data_dict[key] = dict()
        for i in item.index:
            tmp = pd.read_csv(item['abs_file_names'][i])
            # summary_df['filenames'].append(os.path.split(item['abs_file_names'][i])[1])
            # summary_df['emptyness'].append(tmp.empty)
            data_dict[key][item['data_collection'][i]] = tmp
    return data_dict # pd.DataFrame(summary_df)


def extract_variables(data_dict):
    """
    data_dict: Output from select_sets function
    returns: Dataframe that summarizes all loaded files and includes following columns:
        - variable_name: variable names if datasets is not empty otherwise it states 'No Variables'
        - data_element: What data element from DHIS2 loaded datasets corresponds to
        - is_empty: Is it empty or not
        - geo_level: for example upazila or zilla
        - year: 2009 - 2018
    """
    output_dict = {'year':[], 'data_element':[], 'geo_level':[], 'var_name':[], 'is_empty':[]}
    for key_dict, item_dict in data_dict.items():
        for key, item in data_dict[key_dict].items():
            if item.empty:
                output_dict['var_name'].append('No Variables')
                output_dict['data_element'].append(key)
                output_dict['is_empty'].append(True)
                output_dict['geo_level'].append(key_dict.split("_")[1])
                output_dict['year'].append(key_dict.split("_")[0])
            else:
                for var in item['Data'].unique():
                    output_dict['var_name'].append(var)
                    output_dict['data_element'].append(key)
                    output_dict['is_empty'].append(item.empty)
                    output_dict['geo_level'].append(key_dict.split("_")[1])
                    output_dict['year'].append(key_dict.split("_")[0])
        return pd.DataFrame.from_dict(output_dict)

    
def transform_data(data_dict):
    """
    data_dict: Output from function select_sets
    returns: Dictionary (by geo-level and year) of pivoted dataframes which are not empty
    """
    tdata_dict = {}
    for key_dict, item_dict in data_dict.items():
        tdata_dict[key_dict] = list()
        for key, item in data_dict[key_dict].items():
            if item.empty:
                print('Provided dataframe is empty and therefore is not processed')
            else:
                tmp = pd.pivot_table(data=item, 
                                     values='Value', 
                                     index='Organisation unit',
                                     columns='Data', 
                                     aggfunc='first')
                tmp.columns = key + ": " + tmp.columns
                tdata_dict[key_dict].append(tmp)
    return tdata_dict


def merge_data(data_dict):
    """
    data_dict: Output from transform_data function
    returns: Dictionary of merged datasets by year and geo-level
    """
    output_dict = {}
    for key, value in data_dict.items():
        tmp_list = sorted(value, key=len, reverse=True)
        output_dict[key] = tmp_list[0].join(tmp_list[1:])
    return output_dict


def write_data(data_dict, path):
    """
    data_dict: Output from merge_data function
    Writes out all dictionary elements as csv files
    """
    for key, value in data_dict.items():
        value.to_csv(os.path.join(path,key + '.csv'), index=True, index_label=True)
        print(f"Writing {key} to {os.path.join(path,key + '.csv')}")


## Main

In [7]:
DATA_PATH = '/Users/edinhamzic/Symphony/wb_bangladesh/data/dhis2/health_indicators'
WD = '/Users/edinhamzic/Symphony/wb_bangladesh/'
OUT = '/Users/edinhamzic/Symphony/wb_bangladesh/output/dhis2/health_indicators'


In [28]:
dhis2_files = list_files(DATA_PATH)
display(dhis2_files.head())
dhis2_geos = ['Upazila', 'District']
dhis2_years = ['2009','2010','2011','2012','2013','2014','2015','2016','2017', '2018']
dhis2_names = ['CODE']

Unnamed: 0,file_names,data_collection,year,geo_level,file_type,abs_file_names
0,AntenatalCare(ANC)_2017_UpazilaHealthComplex_C...,AntenatalCare(ANC),2017,UpazilaHealthComplex,CODE,/Users/edinhamzic/Symphony/wb_bangladesh/data/...
1,02ChildHealth_2012_District_CODE.csv,02ChildHealth,2012,District,CODE,/Users/edinhamzic/Symphony/wb_bangladesh/data/...
2,05Logistics_2011_District_CODE.csv,05Logistics,2011,District,CODE,/Users/edinhamzic/Symphony/wb_bangladesh/data/...
3,AntenatalCare(ANC)_2016_District_CODE.csv,AntenatalCare(ANC),2016,District,CODE,/Users/edinhamzic/Symphony/wb_bangladesh/data/...
4,03Immunization_2010_Upazila_NAME.csv,03Immunization,2010,Upazila,NAME,/Users/edinhamzic/Symphony/wb_bangladesh/data/...


## Creating, selecting/reading, summarizing, transforming and merging datasets

In [29]:
# Creating sets
datasets_dict = create_sets(dhis2_files, geo_levels=dhis2_geos, years=dhis2_years, file_types=dhis2_names)

# Selecting and reading files
datasets= select_sets(datasets_dict)

# Summarizing datasets
data_summary = extract_variables(datasets)

# Transform and merge
data = merge_data(transform_data(datasets))
len(data)

Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed


20

In [30]:
for var in data_summary[data_summary['data_element'] == '02EPIUpazilaStock']['var_name']:
    print(var)

In [31]:
data_summary.data_element.unique()

array(['AntenatalCare(ANC)', '07Vaccine&LogisticsstockofUpazilaMunCC',
       '03Immunization', '05Logistics',
       '06Vaccine&LogisticsstockofDistrict', '04Newborn',
       '01MaternalHealth', '02ChildHealth'], dtype=object)

## Writing output

### Writing data

In [32]:
write_data(data_dict=data, path=OUT)

Writing Upazila_2009_CODE to /Users/edinhamzic/Symphony/wb_bangladesh/output/dhis2/health_indicators/Upazila_2009_CODE.csv
Writing Upazila_2010_CODE to /Users/edinhamzic/Symphony/wb_bangladesh/output/dhis2/health_indicators/Upazila_2010_CODE.csv
Writing Upazila_2011_CODE to /Users/edinhamzic/Symphony/wb_bangladesh/output/dhis2/health_indicators/Upazila_2011_CODE.csv
Writing Upazila_2012_CODE to /Users/edinhamzic/Symphony/wb_bangladesh/output/dhis2/health_indicators/Upazila_2012_CODE.csv
Writing Upazila_2013_CODE to /Users/edinhamzic/Symphony/wb_bangladesh/output/dhis2/health_indicators/Upazila_2013_CODE.csv
Writing Upazila_2014_CODE to /Users/edinhamzic/Symphony/wb_bangladesh/output/dhis2/health_indicators/Upazila_2014_CODE.csv
Writing Upazila_2015_CODE to /Users/edinhamzic/Symphony/wb_bangladesh/output/dhis2/health_indicators/Upazila_2015_CODE.csv
Writing Upazila_2016_CODE to /Users/edinhamzic/Symphony/wb_bangladesh/output/dhis2/health_indicators/Upazila_2016_CODE.csv
Writing Upazila_

### Writing summary

In [33]:
data_summary.to_csv(os.path.join(OUT, 'data_summary.csv'), index=False, index_label=False)

In [77]:
def create_metadata(path, pattern, out_path):
    datasets_dict = {'variable':[], 'year':[], 'geo':[]}
    for path, subfolds, files in os.walk(path):
        for file in files:
            if pattern in file:
                tmp = pd.read_csv(os.path.join(path,file))
                datasets_dict['variable'] = datasets_dict['variable'] + list(tmp.columns)
                datasets_dict['geo'] = datasets_dict['geo'] + [file.split('_')[0]] * tmp.shape[1]
                datasets_dict['year'] = datasets_dict['year'] + [file.split('_')[1]] * tmp.shape[1]
    out = pd.DataFrame.from_dict(datasets_dict)
    out['Health Indicator Group'] = out['variable'].str.split(': ').str[0]
    out['Indicator'] = out['variable'].str.split(': ').str[1]
    out = out[['Indicator', 'Health Indicator Group', 'geo', 'year']]
    display(out)
    out.to_csv(os.path.join(out_path, 'healthindicators_metadata.csv'), index=False, index_label=False) 

In [78]:
OUT = '../../output/dhis2/health_indicators'
create_metadata(path=OUT, pattern='NAME', out_path='../../output/dhis2/health_indicators/')

Unnamed: 0,Indicator,Health Indicator Group,geo,year
0,,True,District,2012
1,% of maternal death reported individually with...,01MaternalHealth,District,2012
2,Institutional C-section rate,01MaternalHealth,District,2012
3,Institutional Normal Delivery Rate (Only EmOC ...,01MaternalHealth,District,2012
4,Maternal Case fatality rate (EmOC),01MaternalHealth,District,2012
5,Percentage of cases with puerperal sepsis amon...,01MaternalHealth,District,2012
6,Percentage of postpartum hemorrhage among admi...,01MaternalHealth,District,2012
7,Proportion of cases with postpartum hemorrhage...,01MaternalHealth,District,2012
8,Proportion of cases with puerperal sepsis amon...,01MaternalHealth,District,2012
9,Total Complicated Mother,01MaternalHealth,District,2012
