# Importance 12 Districts: district_monthprocess

### Importance 12 Districts: district_monthprocess
- Step 1: List all files and check if empty. Subset based on emptiness
- Step 2: Read file and add file name, split file name to additional columns and add column with path
- Step 3: Define variables and, if necessary, calculate relative values
- Step 4: Determine how to summarize each column (sum or average, where possible) and do so
- Step 5: Break data by years
- Step 6: Summarize data by zilas. Keep numeric variables, division, zila and year
- Step 7: Create file with zilas, division and geo codes (merge with the DGFP geography file)
- Step 8: Write everything to the output files

## Paths

In [20]:
# NOTE: absolute, machine-specific paths; adjust them before running elsewhere.
# Root directory that is walked for the raw input CSV files.
DATA = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/data/dgfp/data/importance12_districts/district_monthprocess/'
# CSV file with the DGFP geography codes that the results are merged onto.
GEO = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/geos/dgfp_geo.csv'
# Directory the per-year output CSV files are written to.
OUT = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dgfp/data/'

## Packages

In [21]:
import os
import re
import sys
import glob
import numpy as np
import pandas as pd

## Custom functions

In [22]:
def list_files(path):
    """Recursively collect the paths of all CSV files below ``path``.

    Args:
        path: Root directory to walk.

    Returns:
        list: Full paths (directory joined with file name) of every file
        whose name ends in ``.csv``, in the order ``os.walk`` yields them.
        Empty when ``path`` contains no such file.
    """
    files_list = []
    # Distinct loop names so the ``path`` argument is not shadowed.
    for root, _subdirs, files in os.walk(path):
        for file_name in files:
            # Literal suffix test: an unanchored regex ``.csv`` would also
            # accept names such as ``notcsv.txt`` or ``a.csv.bak``.
            if file_name.endswith('.csv'):
                files_list.append(os.path.join(root, file_name))
    return files_list
    
def _parse_file_name(file):
    """Return the metadata encoded in a data file's path and name.

    The file name (without ``.csv``) is split on ``-``; fields 0, 1, 3 and 4
    are used as year, month, upazila and division. Field 2 is ignored.
    All values are returned as strings.
    """
    file_name = os.path.split(file)[1]
    name = file_name.replace(".csv", "")
    name_split = name.split("-")
    return {
        'full_name_path': file,
        'file_name1': file_name,
        'file_name2': name,
        'year': name_split[0],
        'month': name_split[1],
        'upazila': name_split[3],
        'division': name_split[4],
        'upazila_full': name_split[4] + name_split[3],
    }


def read_arrange_files(files_list):
    """Read every data file, tag its rows with file metadata and log emptiness.

    Each file is read as tab-separated text with its first line skipped.
    Non-empty tables get ``Month``/``Year`` columns (split from ``MonthYear``
    on a space) plus the file-name metadata columns, and are stored under the
    file name without extension. Every file, empty or not, gets one log row.

    Args:
        files_list: Iterable of file paths to read.

    Returns:
        tuple: ``(data_list, log)`` where ``data_list`` maps file name (no
        extension) to its DataFrame for non-empty files only, and ``log`` is
        a DataFrame with one row per input file (columns ``empty``,
        ``full_name_path``, ``file_name1``, ``file_name2``, ``year``,
        ``month``, ``upazila``, ``division``, ``upazila_full``).

    Raises:
        IndexError: If a file name has fewer than five ``-``-separated fields.
        ValueError: If the year or month field of a non-empty file is not an
            integer.
    """
    log_columns = ['empty', 'full_name_path', 'file_name1', 'file_name2',
                   'year', 'month', 'upazila', 'division', 'upazila_full']
    meta_columns = log_columns[1:]
    data_list = {}
    log_array = []
    for file in files_list:
        try:
            tmp = pd.read_csv(file, sep='\t', skiprows=1)
        except pd.errors.EmptyDataError:
            # Nothing left to parse after the skipped line: log it as empty
            # instead of aborting the whole run.
            tmp = pd.DataFrame()
        info = _parse_file_name(file)
        log_array.append([tmp.empty] + [info[col] for col in meta_columns])
        if tmp.empty:
            # Empty tables are only logged; splitting ``MonthYear`` on them
            # would fail because the expanded result has no columns.
            continue
        month_year = tmp['MonthYear'].str.split(pat=" ", expand=True)
        tmp['Month'] = month_year[0]
        tmp['Year'] = month_year[1]
        for col in meta_columns:
            # Year and month are stored as integers in the data, but kept as
            # strings in the log.
            tmp[col] = int(info[col]) if col in ('year', 'month') else info[col]
        data_list[info['file_name2']] = tmp
    return data_list, pd.DataFrame(log_array, columns=log_columns)

def concat_data(ddict):
    """Stack all DataFrames held in ``ddict`` row-wise into a single frame.

    Args:
        ddict: Mapping whose values are DataFrames; keys are ignored.

    Returns:
        DataFrame: The values concatenated along axis 0 in mapping order.
        Each frame keeps its original index labels.
    """
    frames = list(ddict.values())
    return pd.concat(frames, axis=0)

def break_by_years(input_df, year_var):
    """Split a DataFrame into one frame per distinct value of ``year_var``.

    Args:
        input_df: Source DataFrame; it is deep-copied, not modified.
        year_var: Name of the column to split on.

    Returns:
        dict: ``str(value)`` mapped to the rows whose ``year_var`` equals that
        value, in order of first appearance.
    """
    snapshot = input_df.copy(deep=True)
    year_column = snapshot[year_var]
    return {
        str(label): snapshot[year_column == label]
        for label in year_column.unique()
    }

def summarize_zilas(data_dict):
    """Aggregate each frame in ``data_dict`` to one row per ``upazila``.

    The count columns (``EC``, ``Pill``, ``Con``, ``Inj``, ``IUD``, ``Imp``,
    ``PerM``, ``PerF``, ``TPer``, ``TUser``) are summed within each
    ``upazila``. ``division`` is a code, not a quantity, so the first value
    of each group is kept instead of being summed (summing strings would
    concatenate the codes).

    Args:
        data_dict: Mapping of label to DataFrame; every frame must contain
            ``upazila``, ``division`` and the count columns listed above.

    Returns:
        dict: Same keys, each mapped to a new DataFrame with columns
        ``upazila``, ``division`` and the summed counts, sorted by
        ``upazila``. Input frames are not modified.
    """
    count_vars = ['EC', 'Pill', 'Con', 'Inj', 'IUD',
                  'Imp', 'PerM', 'PerF', 'TPer', 'TUser']
    # Insertion order fixes the output column order: division, then counts.
    aggregations = {'division': 'first'}
    aggregations.update({var: 'sum' for var in count_vars})
    subset_vars = ['upazila'] + list(aggregations)

    out_dict = {}
    for key, value in data_dict.items():
        out_dict[key] = (
            value[subset_vars]
            .groupby('upazila')
            .agg(aggregations)
            .reset_index()
        )
    return out_dict

def calculate_indicators(data_dict):
    """Derive the percentage indicators for every frame in ``data_dict``.

    Each indicator is ``numerator / denominator * 100`` rounded to two
    decimals. The method-share indicators divide by ``TUser``; the CAR
    indicator is ``TUser`` over ``EC``.

    NOTE(review): the notebook's markdown describes the PerMale/PerFemale
    indicators as divided by "Tpermanent", but this code divides them by
    ``TUser`` - confirm which is intended.

    Args:
        data_dict: Mapping of label to DataFrame holding ``upazila`` and the
            count columns referenced below.

    Returns:
        dict: Same keys, each mapped to a new DataFrame with ``upazila``
        followed by the eight indicator columns. Inputs are not modified.
    """
    # (output column, numerator column, denominator column)
    ratio_specs = [
        ('Imp12DistrMonthThana_Percent_Pill', 'Pill', 'TUser'),
        ('Imp12DistrMonthThana_Percent_Condom', 'Con', 'TUser'),
        ('Imp12DistrMonthThana_Percent_Injectable', 'Inj', 'TUser'),
        ('Imp12DistrMonthThana_Percent_IUD', 'IUD', 'TUser'),
        ('Imp12DistrMonthThana_Percent_Implant', 'Imp', 'TUser'),
        ('Imp12DistrMonthThana_Percent_PerMale', 'PerM', 'TUser'),
        ('Imp12DistrMonthThana_Percent_PerFemale', 'PerF', 'TUser'),
        ('Imp12DistrMonthThana_CAR', 'TUser', 'EC'),
    ]
    keep = ['upazila'] + [name for name, _, _ in ratio_specs]

    result = {}
    for label, frame in data_dict.items():
        work = frame.copy(deep=True)
        for name, numerator, denominator in ratio_specs:
            work[name] = np.round(work[numerator] / work[denominator] * 100, 2)
        result[label] = work[keep]
    return result


def update_geos(datad, geo_df):
    """Left-join every frame in ``datad`` onto the geography table.

    Args:
        datad: Mapping of label to DataFrame with an ``upazila`` column.
        geo_df: Geography DataFrame with an ``upazila_dgfp`` column.

    Returns:
        dict: Same keys, each mapped to ``geo_df`` left-merged with the
        frame on ``upazila_dgfp == upazila``. Every geography row is kept;
        rows without matching data get missing values.
    """
    return {
        label: geo_df.merge(frame, how='left',
                            left_on='upazila_dgfp', right_on='upazila')
        for label, frame in datad.items()
    }


def clean_up(data_dict):
    """Add the combined ``geo`` code and drop the helper geography columns.

    ``geo`` is ``division_geo`` immediately followed by ``zila_geo`` (no
    separator). The input frames are left untouched: the new column is added
    on a copy rather than written into the caller's DataFrame.

    Args:
        data_dict: Mapping of label to DataFrame containing the columns
            listed in ``drop_cols`` below.

    Returns:
        dict: Same keys, each mapped to a new DataFrame holding the remaining
        columns followed by ``geo``.

    Raises:
        KeyError: If a frame lacks any of the columns to drop.
    """
    drop_cols = ['upazila_dgfp', 'division_dgfp', 'upazila_full_dgfp', 'division_geo',
                 'division', 'zila_geo', 'zila', 'upazila']
    out = {}
    for key, value in data_dict.items():
        geo = value['division_geo'].str.cat(value['zila_geo'], sep="")
        # ``assign`` returns a new frame, so ``value`` is not mutated.
        out[key] = value.assign(geo=geo).drop(drop_cols, axis=1)
    return out

## Data reading and processing 

- Variables:
    - Dist_Name: District
    - MonthYear: Month & Year
    - EC: Eligible Couple_Eligible Couple
    - Pill: Oral Pill_Oral Pill
    - Con: Condom_Condom
    - Inj: Injectable_Injectable
    - IUD: Intra-uterine device
    - Imp: Implant_Implant
    - PerM: Permanent Method_Male
    - PerF: Permanent Method_Female
    - TPer: Permanent Method_Total
    - TUser: Total Acceptors_Total Acceptors
    - CAR: CAR(%)_CAR(%)
    - Unnamed: 13:	Unnamed
    - full_name_path: Full path name
    - file_name1: File name with extension
    - file_name2: File name without extension
    - year: Year
    - month: Month
    - upazila: Upazila
    - division: Division
    - upazila_full: Upazila full geo code
- Indicators:
    - Imp12DistrMonthThana_Percent_Pill = Pill/TUser
    - Imp12DistrMonthThana_Percent_Condom = Con/TUser
    - Imp12DistrMonthThana_Percent_Injectable = Inj/TUser
    - Imp12DistrMonthThana_Percent_IUD = IUD/TUser
    - Imp12DistrMonthThana_Percent_Implant = Imp/TUser
    - Imp12DistrMonthThana_Percent_PerMale = PerM/Tpermanent
    - Imp12DistrMonthThana_Percent_PerFemale = PerF/Tpermanent
    - Imp12DistrMonthThana_CAR = TUser/EC

In [23]:
files = list_files(path=DATA) 
data, log = read_arrange_files(files_list=files)

In [24]:
print(data['2018-12-28-0305-03'].columns)
data['2018-12-28-0305-03'].head()

Index(['Dist_Name', 'MonthYear', 'EC', 'Pill', 'Con', 'Inj', 'IUD', 'Imp',
       'PerM', 'PerF', 'TPer', 'TUser', 'CAR', 'Unnamed: 13', 'Month', 'Year',
       'full_name_path', 'file_name1', 'file_name2', 'year', 'month',
       'upazila', 'division', 'upazila_full'],
      dtype='object')


Unnamed: 0,Dist_Name,MonthYear,EC,Pill,Con,Inj,IUD,Imp,PerM,PerF,...,Month,Year,full_name_path,file_name1,file_name2,year,month,upazila,division,upazila_full
0,Bhola,January 2007,2222,4,5,10,3,7,4,4,...,January,2007,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,2018-12-28-0305-03.csv,2018-12-28-0305-03,2018,12,305,3,30305
1,Bhola,July 2007,317570,116039,8577,51228,7486,4885,3877,10861,...,July,2007,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,2018-12-28-0305-03.csv,2018-12-28-0305-03,2018,12,305,3,30305
2,Bhola,August 2007,318611,115697,8083,54104,6476,4883,3900,10891,...,August,2007,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,2018-12-28-0305-03.csv,2018-12-28-0305-03,2018,12,305,3,30305
3,Bhola,September 2007,316764,113577,8170,55810,6428,4861,4001,10917,...,September,2007,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,2018-12-28-0305-03.csv,2018-12-28-0305-03,2018,12,305,3,30305
4,Bhola,October 2007,319189,113131,7611,57210,6362,4841,4040,10945,...,October,2007,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,2018-12-28-0305-03.csv,2018-12-28-0305-03,2018,12,305,3,30305


### Log data

In [25]:
log['empty'].value_counts()
log_true = log[log['empty'] == True]
display(log_true.head())

Unnamed: 0,empty,full_name_path,file_name1,file_name2,year,month,upazila,division,upazila_full


## Missing data by years

In [26]:
print(log_true['year'].value_counts())

Series([], Name: year, dtype: int64)


## Missing data by months

In [27]:
print(log_true['month'].value_counts())

Series([], Name: month, dtype: int64)


## Missing data by divisions

In [28]:
print(log_true['division'].value_counts())

Series([], Name: division, dtype: int64)


In [29]:
data_df = concat_data(ddict=data)
print(data_df.shape)

(8698, 24)


In [30]:
display(data_df.head())
data_df.columns

Unnamed: 0,Dist_Name,MonthYear,EC,Pill,Con,Inj,IUD,Imp,PerM,PerF,...,Month,Year,full_name_path,file_name1,file_name2,year,month,upazila,division,upazila_full
0,Bhola,January 2007,2222,4,5,10,3,7,4,4,...,January,2007,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,2018-12-28-0305-03.csv,2018-12-28-0305-03,2018,12,305,3,30305
1,Bhola,July 2007,317570,116039,8577,51228,7486,4885,3877,10861,...,July,2007,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,2018-12-28-0305-03.csv,2018-12-28-0305-03,2018,12,305,3,30305
2,Bhola,August 2007,318611,115697,8083,54104,6476,4883,3900,10891,...,August,2007,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,2018-12-28-0305-03.csv,2018-12-28-0305-03,2018,12,305,3,30305
3,Bhola,September 2007,316764,113577,8170,55810,6428,4861,4001,10917,...,September,2007,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,2018-12-28-0305-03.csv,2018-12-28-0305-03,2018,12,305,3,30305
4,Bhola,October 2007,319189,113131,7611,57210,6362,4841,4040,10945,...,October,2007,/Users/edinhamzic/Symphony/wb_bangladesh/Bangl...,2018-12-28-0305-03.csv,2018-12-28-0305-03,2018,12,305,3,30305


Index(['Dist_Name', 'MonthYear', 'EC', 'Pill', 'Con', 'Inj', 'IUD', 'Imp',
       'PerM', 'PerF', 'TPer', 'TUser', 'CAR', 'Unnamed: 13', 'Month', 'Year',
       'full_name_path', 'file_name1', 'file_name2', 'year', 'month',
       'upazila', 'division', 'upazila_full'],
      dtype='object')

## Break data by years

In [31]:
datad = break_by_years(input_df=data_df, year_var='Year')
datad.keys()

dict_keys(['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'])

## Summarize data 

In [32]:
data = summarize_zilas(data_dict=datad)

In [33]:
data = calculate_indicators(data_dict=data)
for key, value in data.items():
    print(key)
    print(value.shape)

2007
(64, 9)
2008
(64, 9)
2009
(64, 9)
2010
(64, 9)
2011
(64, 9)
2012
(64, 9)
2013
(64, 9)
2014
(64, 9)
2015
(64, 9)
2016
(64, 9)
2017
(64, 9)
2018
(64, 9)


## Read geographies

In [34]:
# Load the geography lookup table from the GEO path.
dgfp_geo = pd.read_csv(GEO)
# Convert each code column to a string and left-pad it with zeros to a fixed
# width (upazila 4, division 2, full upazila code 6, geo division/zila 2).
dgfp_geo['upazila_dgfp'] = dgfp_geo['upazila_dgfp'].astype(str).str.pad(width=4, side='left', fillchar='0')
dgfp_geo['division_dgfp'] = dgfp_geo['division_dgfp'].astype(str).str.pad(width=2, side='left', fillchar='0')
dgfp_geo['upazila_full_dgfp'] = dgfp_geo['upazila_full_dgfp'].astype(str).str.pad(width=6, side='left', fillchar='0')
dgfp_geo['division_geo'] = dgfp_geo['division_geo'].astype(str).str.pad(width=2, side='left', fillchar='0')
dgfp_geo['zila_geo'] = dgfp_geo['zila_geo'].astype(str).str.pad(width=2, side='left', fillchar='0')
# Drop the two columns that are not used further in this notebook.
dgfp_geo = dgfp_geo.drop(['thana_dgfp', 'Unnamed: 8'], axis=1)
print(dgfp_geo.shape)
# NOTE: the result of head() is discarded here (it is not assigned or printed).
dgfp_geo.head()
# Collapse fully identical rows; the shape is printed before and after.
dgfp_geo = dgfp_geo.drop_duplicates()
print(dgfp_geo.shape)

(29260, 7)
(64, 7)


## Update geographies

In [35]:
data = update_geos(datad=data, geo_df=dgfp_geo)

## Clean up before writing data

In [36]:
data = clean_up(data_dict=data)

In [37]:
data['2007'].keys()

Index(['Imp12DistrMonthThana_Percent_Pill',
       'Imp12DistrMonthThana_Percent_Condom',
       'Imp12DistrMonthThana_Percent_Injectable',
       'Imp12DistrMonthThana_Percent_IUD',
       'Imp12DistrMonthThana_Percent_Implant',
       'Imp12DistrMonthThana_Percent_PerMale',
       'Imp12DistrMonthThana_Percent_PerFemale', 'Imp12DistrMonthThana_CAR',
       'geo'],
      dtype='object')

## Writing data

In [38]:
def write_out(data_dict, out_dir):
    """Write each frame in ``data_dict`` to its own CSV file in ``out_dir``.

    Every column except ``geo`` is prefixed with ``imp12distr_monthprocess``
    (joined directly, with no separator). The renaming is applied to a copy,
    so the caller's frames keep their original column names and the function
    can be called repeatedly without stacking prefixes.

    Args:
        data_dict: Mapping of label to DataFrame; the label becomes the
            suffix of the output file name.
        out_dir: Existing directory the files are written to.

    Returns:
        None. Files are named
        ``data_dgfp_imp12distr_district_monthprocess_<label>.csv`` and are
        written without the index.
    """
    for key, value in data_dict.items():
        renamed = value.rename(
            columns=lambda var: "imp12distr_monthprocess" + var if var != 'geo' else var
        )
        renamed.to_csv(
            path_or_buf=os.path.join(out_dir, "data_dgfp_imp12distr_district_monthprocess_" + str(key) + ".csv"),
            index=False, index_label=False)
write_out(data_dict=data, out_dir=OUT)