# Data Preprocessing - DHIS2 Data

## Summary

- **Input data**: Data downloaded via API from DHIS2 System. This data is stored in *data/dhis2/data* directories
- **Processing**:
    - Getting Upazila and District level data
    - Reading and processing and transforming data in the following form:
        - Rows: *Geographical levels (upazila and districts)*
        - Columns: *Variables/health indicators*
        - Datasets: *Years*
- **Output data**: Processed data is stored in *data/output/dhis2/data

## Packages

In [252]:
import os
import glob
import pandas as pd
import collections

## Custom Functions

In [246]:
def list_files(path):
    """
    Function gets the list of all file from the target directory,
    takes absolute path, splits file names to extract:
    - File name
    - Data element
    - File type
    - Year
    - Geographical level/group
    - Absolute path of the file name
    Returns all this as a dataframe
    """
    path_all = os.path.join(path, '*')
    abs_file_names = glob.glob(path_all, recursive=True)
    file_names = [os.path.split(glob.glob(file_path)[0])[1] for file_path in abs_file_names]
    data_collection = [file_name.split("_")[0] for file_name in file_names]
    year = [file_name.split("_")[1] for file_name in file_names]
    geo_level = [file_name.split("_")[2] for file_name in file_names]
    file_type = [file_name[-8:-4] for file_name in file_names]
    files_df = pd.DataFrame.from_dict({'file_names': file_names, 
                                       'data_collection': data_collection,
                                       'year': year,
                                       'geo_level': geo_level,
                                       'file_type': file_type,
                                       'abs_file_names': abs_file_names
                                      })

    return files_df

def create_sets(input_df, geo_levels, years, file_types):
    """
    input_df: Output from function list_files()
    geo_levels: Defining geographic levels:
                If as a list than it is a subset of geo levels from input_df
                Otherwise it should ba string that specifies the column name
    years: Defining years:
           If as a list than it is a subset of geo levels from input_df
           Otherwise it should ba string that specifies the column name
    file_types: Either NAME or CODE. This is defined in the rawfiles.
                NAME: Descriptive name of geographical unit as key ID variable
                CODE: Geo-level code of geogrpahical unit as key ID variable
    return: Dictionary by geo-level, year and file type
    """
    df = input_df.copy(deep=True)
    datasets = {}
    if ((type(geo_levels) == list) & (type(years) == list)):
        for geo_level in geo_levels:
            for year in years:
                for file_type in file_types:
                    name = geo_level + '_' + year + '_' + file_type
                    datasets[name] = df[(df['geo_level']==geo_level) &
                                        (df['year']==year) & 
                                        (df['file_type']==file_type)] 
    else:
        for geo_level in df[geo_levels].unique():
            for year in df[years].unique():
                for file_type in file_types:
                    name = geo_level + '_' + year + '_' + file_type
                    datasets[name] = df[(df['geo_level']==geo_level) & 
                                        (df['year']==year) & 
                                        (df['file_type']==file_type)]
    return datasets

def select_sets(datasets):
    """
    datasets: Output from function create_sets
    returns
        - Data dictionary with loaded datasets by geo-level and year
        # - Data frame with two columns:
            # - List of file names with absolute paths
            # - Logical column whether the loaded file is empty or not
    """
    data_dict = {}
    # summary_df = {'filenames': list(), 'emptyness': list()}
    for key, item in datasets.items():
        data_dict[key] = dict()
        for i in item.index:
            tmp = pd.read_csv(item['abs_file_names'][i])
            # summary_df['filenames'].append(os.path.split(item['abs_file_names'][i])[1])
            # summary_df['emptyness'].append(tmp.empty)
            data_dict[key][item['data_collection'][i]] = tmp
    return data_dict # pd.DataFrame(summary_df)


def extract_variables(data_dict):
    """
    data_dict: Output from select_sets function
    returns: Dataframe that summarizes all loaded files and includes following columns:
        - variable_name: variable names if datasets is not empty otherwise it states 'No Variables'
        - data_element: What data element from DHIS2 loaded datasets corresponds to
        - is_empty: Is it empty or not
        - geo_level: for example upazila or zilla
        - year: 2009 - 2018
    """
    output_dict = {'year':[], 'data_element':[], 'geo_level':[], 'var_name':[], 'is_empty':[]}
    for key_dict, item_dict in data_dict.items():
        for key, item in data_dict[key_dict].items():
            if item.empty:
                output_dict['var_name'].append('No Variables')
                output_dict['data_element'].append(key)
                output_dict['is_empty'].append(True)
                output_dict['geo_level'].append(key_dict.split("_")[1])
                output_dict['year'].append(key_dict.split("_")[0])
            else:
                for var in item['Data'].unique():
                    output_dict['var_name'].append(var)
                    output_dict['data_element'].append(key)
                    output_dict['is_empty'].append(item.empty)
                    output_dict['geo_level'].append(key_dict.split("_")[1])
                    output_dict['year'].append(key_dict.split("_")[0])
        return pd.DataFrame.from_dict(output_dict)

    
def transform_data(data_dict):
    """
    data_dict: Output from function select_sets
    returns: Dictionary (by geo-level and year) of pivoted dataframes which are not empty
    """
    tdata_dict = {}
    for key_dict, item_dict in data_dict.items():
        tdata_dict[key_dict] = list()
        for key, item in data_dict[key_dict].items():
            if item.empty:
                print('Provided dataframe is empty and therefore is not processed')
            else:
                tmp = pd.pivot_table(data=item, 
                                     values='Value', 
                                     index='Organisation unit',
                                     columns='Data', 
                                     aggfunc='first')
                tmp.columns = key + ": " + tmp.columns
                tdata_dict[key_dict].append(tmp)
    return tdata_dict


def merge_data(data_dict):
    """
    data_dict: Output from transform_data function
    returns: Dictionary of merged datasets by year and geo-level
    """
    output_dict = {}
    for key, value in data_dict.items():
        tmp_list = sorted(value, key=len, reverse=True)
        output_dict[key] = tmp_list[0].join(tmp_list[1:])
    return output_dict


## Main

In [250]:
DATA_PATH = '/Users/edinhamzic/Symphony/wb_bangladesh/data/dhis2/data'
WD = '/Users/edinhamzic/Symphony/wb_bangladesh/'

In [251]:
dhis2_files = list_files(DATA_PATH)
display(dhis2_files.head())
dhis2_geos = ['Upazila', 'District']
dhis2_years = ['2009','2010','2011','2012','2013','2014','2015','2016','2017', '2018']
dhis2_names = ['NAME']
datasets_dict = create_sets(dhis2_files, geo_levels=dhis2_geos, years=dhis2_years, file_types=dhis2_names)
# datasets_dict = create_sets(dhis2_files, geo_levels='geo_level', years='year')
datasets, empty, filenames = select_sets(datasets_dict)
variable_summary = extract_variables(datasets)
data = merge_data(transform_data(datasets))
len(data)

Unnamed: 0,file_names,data_collection,year,geo_level,file_type,abs_file_names
0,04EPIDistrictRequisition_2011_UpazilaandDistri...,04EPIDistrictRequisition,2011,UpazilaandDistrictlevelHF,NAME,/Users/edinhamzic/Symphony/wb_bangladesh/data/...
1,02EPIUpazilaStock_2010_UpazilaHealthComplex_CO...,02EPIUpazilaStock,2010,UpazilaHealthComplex,CODE,/Users/edinhamzic/Symphony/wb_bangladesh/data/...
2,10KMC_2011_DistrictNGO&PrivateTotal_CODE.csv,10KMC,2011,DistrictNGO&PrivateTotal,CODE,/Users/edinhamzic/Symphony/wb_bangladesh/data/...
3,11EMEN(MBFFI)_2012_DistrictNGO&PrivateTotal_CO...,11EMEN(MBFFI),2012,DistrictNGO&PrivateTotal,CODE,/Users/edinhamzic/Symphony/wb_bangladesh/data/...
4,07EmONC_2009_UpazilaandDistrictlevelHF_NAME.csv,07EmONC,2009,UpazilaandDistrictlevelHF,NAME,/Users/edinhamzic/Symphony/wb_bangladesh/data/...


Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not processed
Provided dataframe is empty and therefore is not process

20

## Summarize missingness and data

In [8]:
empty = pd.DataFrame.from_dict({'filennames': filenames, 'emptyness':emptyness})
print(f"Missing data elements (not downloaed):  {empty[emptyness]['filennames'].str.split('_', expand=True)[0].unique()}")
print("\n")
print(f"Years for which above data elements are missing: {[int(year) for year in empty[emptyness]['filennames'].str.split('_', expand=True)[1].unique()]}")
print("\n")
print(f"Geolevels for which above data elements are missing {empty[emptyness]['filennames'].str.split('_', expand=True)[2].unique()}")
print("\n")

Missing data elements (not downloaed):  ['04EPIDistrictRequisition' '05EPIDistrictSupply' '03EPIDistrictStock'
 '10KMC' '06EPIUpazilaSupply' '01EPIReport(Routinevaccination)'
 '09AdolescentHealth' '02EPIUpazilaStock' '11EMEN(MBFFI)']


Years for which above data elements are missing: [2011, 2010, 2012, 2009, 2016, 2015, 2013, 2018, 2014, 2017]


Geolevels for which above data elements are missing ['UpazilaandDistrictlevelHF' 'UpazilaHealthComplex'
 'DistrictNGO&PrivateTotal' 'District' 'DistrictHospital' 'Upazila'
 'UNICEF-MNHIdistrict' 'DistrictandNational' 'UnionandUpazilalevelHF']


