## DGFP: Processing GEO files

In [1]:
import collections
import glob
import logging
import os

import pandas as pd

## To do list: 
- **DONE: Extract all names from datasets**
- **DONE: Define variable names for dataset 1**
- **DONE: Define variable names for dataset 2**
- **DONE: Define variable names for dataset 3**
- **DONE: Define variable names for dataset 4**
- **DONE: Define variable names for dataset 5**
- **DONE: Define variable names for dataset 6**
- **DONE: Define variable names for dataset 7**
- **DONE: Define variable names for dataset 8**
- **DONE: Define variable names for dataset 9**
- **Extract subdistrict codes and names and align**
- Import BBS codes from BBS data
- Align DGFP codes with BBS codes from union list
- Write out data

In [155]:
def read_data(path, ext):
    """Load every ``*.<ext>`` file in ``path`` into one concatenated DataFrame.

    Each file is parsed as tab-separated text. The file name (without its
    extension) is split on ``-``; components 0, 1, 3 and 4 are stored in the
    ``Year``, ``Month``, ``District`` and ``Division`` columns, and the name of
    the folder that contains the file is stored in ``program``. Files that
    pandas reports as empty are announced on stdout and skipped.

    Parameters
    ----------
    path : str
        Folder to scan.
    ext : str
        File extension without the leading dot, e.g. ``'csv'``.

    Returns
    -------
    pandas.DataFrame
        Rows of all parsable files, stacked in sorted file-path order.

    Raises
    ------
    ValueError
        If no file in ``path`` could be parsed.
    IndexError
        If a parsable file's name has fewer than five ``-`` separated parts.
    """
    frames = []
    # Sorted so the row order does not depend on the filesystem listing order.
    for file in sorted(glob.glob(os.path.join(path, '*.' + ext))):
        # splitext rather than a fixed [:-4] slice: correct for any extension length.
        file_name = os.path.splitext(os.path.basename(file))[0]
        # Name of the containing folder, independent of the OS path separator.
        program = os.path.basename(os.path.dirname(file))
        try:
            # NOTE(review): comment=' ' makes pandas drop everything after the first
            # space on a line; kept unchanged here -- confirm this is intentional.
            tmp = pd.read_csv(file, sep='\t', comment=' ')
        except pd.errors.EmptyDataError:
            print(f"{file} has no columns to parse. Moving on")
            continue
        file_name_comp = file_name.split('-')
        tmp['Year'] = file_name_comp[0]
        tmp['Month'] = file_name_comp[1]
        tmp['District'] = file_name_comp[3]
        tmp['Division'] = file_name_comp[4]
        tmp['program'] = program
        frames.append(tmp)
    if not frames:
        # Same exception type pd.concat would raise, but with an actionable message.
        raise ValueError(f"No parsable '*.{ext}' files found in {path}")
    return pd.concat(frames, axis=0)

def extract_var_info(files_paths, ext, out):
    """List the column names found in each dataset folder and save them as CSV.

    Parameters
    ----------
    files_paths : str
        Text file with one dataset folder per line (read without a header).
    ext : str
        Extension of the data files inside each folder, passed to ``read_data``.
    out : str
        Destination CSV path for the inventory.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``dir`` (the folder) and ``names`` (one column name per
        row). Empty when ``files_paths`` lists no folders.
    """
    names_dict = {'dir': [], 'names': []}
    dirs = pd.read_csv(files_paths, header=None, names=['dirs'])
    # Bound before the loop so an empty folder list returns an empty frame
    # instead of failing on an unbound name at the return statement.
    output = pd.DataFrame.from_dict(names_dict)
    for folder in dirs['dirs']:
        print(folder)
        tmp = read_data(path=folder, ext=ext)
        print(tmp.shape)
        display(tmp.head())
        columns = list(tmp.columns)
        names_dict['names'].extend(columns)
        names_dict['dir'].extend([folder] * len(columns))
        output = pd.DataFrame.from_dict(names_dict)
        # Rewritten after every folder so partial results survive a later failure.
        # Writes to the `out` argument; the original ignored it and used the global OUT.
        output.to_csv(out, index=False)
    return output

def extract_geo_info(files_paths, ext, vars_to_select, year, min_years=3):
    """Collect the selected columns of every dataset folder that spans enough years.

    Parameters
    ----------
    files_paths : str
        Text file with one dataset folder per line (read without a header).
    ext : str
        Extension of the data files inside each folder, passed to ``read_data``.
    vars_to_select : list of str
        Column names to keep; names absent from a dataset are ignored.
    year : str
        Name of the column whose distinct values are counted.
    min_years : int, default 3
        Minimum number of distinct values in the ``year`` column required for a
        folder to be included. The default reproduces the former hard-coded
        ``> 2`` test.

    Returns
    -------
    dict
        Maps each qualifying folder path to its DataFrame restricted to the
        selected columns (in the dataset's own column order). Folders below the
        threshold are omitted.
    """
    dirs = pd.read_csv(files_paths, header=None, names=['dirs'])
    out = {}
    for folder in dirs['dirs']:
        tmp = read_data(path=folder, ext=ext)
        if len(tmp[year].unique()) < min_years:
            continue
        available = list(tmp.columns)
        subset = [var for var in available if var in vars_to_select]
        print(available)
        print(subset)
        tmp = tmp[subset]
        print(tmp.shape)
        out[folder] = tmp
    return out

def import_process(in_file, out_file):
    """Keep the digit-leading lines of ``in_file`` and write them to ``out_file``.

    Every line whose first character is a digit is split into at most five
    comma-separated fields; commas inside the fifth field are replaced by
    spaces so the line parses as exactly five CSV columns. Lines that do not
    start with a digit (headers, blank lines) are dropped.

    Parameters
    ----------
    in_file : str
        Path of the text file to read.
    out_file : str
        Path of the cleaned file to write (overwritten if it exists).

    Returns
    -------
    list of str
        The lines written to ``out_file``, line endings included.
    """
    out = []
    with open(in_file, 'r') as f:
        for line in f:
            if not line[:1].isdigit():
                continue
            fields = line.split(",", 4)
            # Only a line with a fifth field can contain surplus commas; shorter
            # lines are passed through unchanged instead of raising IndexError.
            if len(fields) == 5:
                fields[4] = fields[4].replace(",", " ")
            out.append(",".join(fields))
    with open(out_file, 'w') as w:
        w.writelines(out)
    return out


In [112]:
# Text file listing the dataset folders to process; passed as files_paths to the extract_* helpers.
PATHS = '../../output/dgfp/selected_list_dir.txt'
# CSV that receives the per-folder inventory of column names from extract_var_info.
OUT = '../../output/dgfp/dataset_vars.csv'

In [24]:
# Inventory of the column names in every dataset folder (also written to OUT).
# Bound to its own name: it was previously assigned to `output` and then
# immediately overwritten by the extract_geo_info result below.
var_info = extract_var_info(files_paths=PATHS, ext='csv', out=OUT)

# Columns to keep from each dataset; names missing from a dataset are ignored.
cols = ['thana', 'thana_Name', 'Dist_Name', 'monthid', 'Year', 'Month', 'District', 'Division', 'program']
print(cols)

# Dict keyed by dataset folder, holding the selected columns of each dataset.
output = extract_geo_info(files_paths=PATHS, ext='csv', vars_to_select=cols, year='Year')


['thana', 'thana_Name', 'Dist_Name', 'monthid', 'Year', 'Month', 'District', 'Division', 'program']
['thana', 'ECouple', 'Pill', 'Condom', 'Injectable', 'IUD', 'Implant', 'PerMale', 'PerFemale', 'Tpermanent', 'GrandTotal', 'CAR', 'Unnamed: 12', 'Year', 'Month', 'District', 'Division', 'program']
['thana', 'Year', 'Month', 'District', 'Division', 'program']
(29317, 6)
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-10-28-0507-05.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0102-07.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0405-04.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0305-03.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0114-01.csv has no columns to parse. Moving on
../../data

../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0414-08.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0108-07.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0411-04.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0112-01.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-10-28-0414-08.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0501-05.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0201-02.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-10-28-0104-07.csv has no columns to parse. Moving on
../../da

../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0417-08.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0107-07.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0603-06.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-10-28-0202-02.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0412-04.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0111-01.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-10-28-0210-02.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0415-08.csv has no columns to parse. Moving on
../../da

In [161]:
# Re-key the per-folder frames in `output` as {program name: {year string: rows of that year}}.
masters_dict = {}
for folder, frame in output.items():
    print(folder)
    years_dict = {}
    ordered_years = sorted(frame['Year'].unique())
    for position, year_value in enumerate(ordered_years):
        print(position, year_value)
        is_this_year = frame['Year'] == year_value
        years_dict[str(year_value)] = frame.loc[is_this_year]
        print(years_dict.keys())
    # The first distinct value of 'program' names the dataset.
    program_name = str(frame['program'].unique()[0])
    masters_dict[program_name] = years_dict

# Summary: which years ended up under each program.
for program_name, per_year in masters_dict.items():
    print(program_name)
    print(per_year.keys())

../../data/dgfp/data/importance11_subdistricts/ngothana_process
0 2008
dict_keys(['2008'])
1 2009
dict_keys(['2008', '2009'])
2 2010
dict_keys(['2008', '2009', '2010'])
3 2011
dict_keys(['2008', '2009', '2010', '2011'])
4 2012
dict_keys(['2008', '2009', '2010', '2011', '2012'])
5 2013
dict_keys(['2008', '2009', '2010', '2011', '2012', '2013'])
6 2014
dict_keys(['2008', '2009', '2010', '2011', '2012', '2013', '2014'])
7 2015
dict_keys(['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015'])
8 2016
dict_keys(['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016'])
9 2017
dict_keys(['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017'])
10 2018
dict_keys(['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'])
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess
0 2016
dict_keys(['2016'])
1 2017
dict_keys(['2016', '2017'])
2 2018
dict_keys(['2016', '2017', '2018'])
../../data/dgfp/data/import

In [None]:
    # NOTE(review): these lines are indented although this cell has no enclosing block,
    # and `data_dict` appears in this notebook only as a parameter of check_geos; the
    # cell has no execution count. Presumably this was meant to live inside check_geos
    # -- TODO confirm before running.
    # Send INFO-level records to check_geos.log and note which datasets are being checked.
    logging.basicConfig(filename='check_geos.log',level=logging.INFO)
    logging.info("Checking all geo variables by years available in the following datasets: ")
    logging.info(f"{data_dict.keys()}")

In [219]:
def check_geos(data_dict, start_year, end_year):
    """Gather, for each year in a range, the thana identifiers of every dataset.

    Parameters
    ----------
    data_dict : dict
        Maps a dataset name to a mapping of year string -> table for that year.
    start_year, end_year : int
        Inclusive bounds of the years to inspect.

    Returns
    -------
    dict
        ``{year string: {dataset name: identifiers}}``. The identifiers are the
        table's ``'thana'`` entry, or its ``'thana_Name'`` entry when ``'thana'``
        is missing. Datasets without data for a year are left out of that year.
    """
    def _thana_column(table):
        # Prefer 'thana'; fall back to 'thana_Name' when it is absent.
        try:
            return table['thana']
        except KeyError:
            return table['thana_Name']

    return {
        label: {
            name: _thana_column(by_year[label])
            for name, by_year in data_dict.items()
            if label in by_year.keys()
        }
        for label in map(str, range(start_year, end_year + 1))
    }

In [248]:
# For each year 2008-2018, collect the thana identifier column of every program in masters_dict.
test = check_geos(data_dict=masters_dict, start_year=2008, end_year=2018)
# Last expression of the cell: shows the year keys that were produced.
test.keys()

dict_keys(['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018'])

In [247]:
# Per year, print each program followed by its number of distinct thana identifiers.
separator = "#" * 100
for year_label, thanas_by_program in test.items():
    print(separator)
    print(f"Year: {year_label}")
    for program_name, thana_values in thanas_by_program.items():
        print(program_name)
        distinct_values = thana_values.unique()
        print(len(distinct_values))

####################################################################################################
Year: 2008
ngothana_process
242
thana_process
467
####################################################################################################
Year: 2009
ngothana_process
257
thana_process
468
####################################################################################################
Year: 2010
ngothana_process
258
thana_process
468
####################################################################################################
Year: 2011
ngothana_process
249
thana_process
469
####################################################################################################
Year: 2012
ngothana_process
250
thana_process
471
####################################################################################################
Year: 2013
ngothana_process
242
thana_process
471
##############################################################################################

In [245]:
# Overlap between the two programs' 2008 thana identifiers: size of the union,
# then of the intersection, each printed in both operand orders.
thana_2008 = set(test['2008']['thana_process'])
ngo_2008 = set(test['2008']['ngothana_process'])
print(len(thana_2008 | ngo_2008))
print(len(ngo_2008 | thana_2008))
print(len(thana_2008 & ngo_2008))
print(len(ngo_2008 & thana_2008))

467
467


In [250]:
# Overlap between the 2018 thana_process identifiers and the 2008 ngothana_process
# identifiers: size of the union, then of the intersection, in both operand orders.
# NOTE(review): the two operands come from different years (2018 vs 2008) --
# confirm this cross-year comparison is intended.
thana_2018 = set(test['2018']['thana_process'])
ngo_2008 = set(test['2008']['ngothana_process'])
print(len(thana_2018 | ngo_2008))
print(len(ngo_2008 | thana_2018))
print(len(thana_2018 & ngo_2008))
print(len(ngo_2008 & thana_2018))

471
471
242
242


## Importing original BBS list of geographies

In [196]:
# Clean the raw union list: keep only the digit-leading rows and remove surplus
# commas from the name field so every row has exactly five columns.
union_lines = import_process(in_file='../../data/geo_files/unionList.csv',out_file='../../data/geo_files/unionList_updated.csv')

# All codes are read as strings ('object') rather than being parsed as numbers.
unionList = pd.read_csv('../../data/geo_files/unionList_updated.csv',
                        header=None,
                        names=['Division', 'Zila', 'Upazila', 'Union', 'Name'],
                        dtype = {'Division':'object', 'Zila':'object', 'Upazila':'object', 'Union':'object', 'Name':'object'}
                       )

# header=1: the column names are taken from the second row of the sheet.
geo = pd.read_excel('../../data/geo_files/geo_.xlsx', header=1)

display(unionList.head())
# Previously display(c.head()); `c` is not defined anywhere in the notebook, and the
# rendered table shows the columns that are selected from `geo` in the next cell.
display(geo.head())

Unnamed: 0,Division,Zila,Upazila,Union,Name
0,10,,,,BARISAL
1,10,4.0,,,BARGUNA
2,10,4.0,9.0,,AMTALI
3,10,4.0,9.0,13.0,AMTALI
4,10,4.0,9.0,15.0,ARPANGASHIA


Unnamed: 0,Division Code,Division name,District Code,District Name,Upazilla code,Upazilla,Union code,Union,Population Total,Unnamed: 9
0,40,Khulna,1,Bagerhat Zila,8,Bagerhat Sadar Upazila,1,Ward No-01,5339,
1,40,Khulna,1,Bagerhat Zila,8,Bagerhat Sadar Upazila,2,Ward No-02,5406,
2,40,Khulna,1,Bagerhat Zila,8,Bagerhat Sadar Upazila,3,Ward No-03,7688,
3,40,Khulna,1,Bagerhat Zila,8,Bagerhat Sadar Upazila,4,Ward No-04,4530,
4,40,Khulna,1,Bagerhat Zila,8,Bagerhat Sadar Upazila,5,Ward No-05,4297,


In [204]:
# Reduce geo to its division / district / upazila columns and drop repeated rows,
# printing the shape before and after de-duplication.
geo = geo[['Division Code', 'Division name', 'District Code','District Name', 'Upazilla code', 'Upazilla']]
print(geo.shape)
geo = geo.drop_duplicates()
print(geo.shape)
# Written to its own file. The original saved over '../../data/geo_files/geo_.xlsx',
# the very file this notebook reads with header=1, so a second run would read a
# workbook with a different layout than the first.
geo.to_excel('../../data/geo_files/geo_upazila.xlsx')

(544, 6)
(544, 6)


In [183]:
# Tally the string length of every non-missing Division code in unionList.
# Built with a comprehension so the loop no longer overwrites the notebook-level
# name `t`; the list is named for the column it actually summarises.
division_code_lengths = [
    len(str(code))
    for code in unionList['Division']
    if str(code) != 'nan'
]
collections.Counter(division_code_lengths)

Counter({2: 5152})

In [185]:
# Number of unionList rows carrying each Division code.
unionList['Division'].value_counts()

30    1409
20    1066
40     651
50     640
55     604
10     401
60     381
Name: Division, dtype: int64

In [198]:
# Distinct Upazilla names in geo. The rendered values end with a space
# (e.g. 'Mongla Upazila '); presumably they need stripping before being matched
# against other name lists -- TODO confirm.
geo['Upazilla'].unique()

array(['Bagerhat Sadar Upazila ', 'Mongla Upazila ',
       'Morrelganj Upazila ', 'Bandarban Sadar Upazila ', 'Lama Upazila ',
       'Amtali Upazila ', 'Barguna Sadar Upazila ', 'Betagi Upazila ',
       'Patharghata Upazila ', 'Bakerganj Upazila ',
       'Banari Para Upazila ', 'Gaurnadi Upazila ',
       'Barisal Sadar Upazila ', 'Mehendiganj Upazila ',
       'Muladi Upazila ', 'Bhola Sadar Upazila ', 'Burhanuddin Upazila ',
       'Char Fasson Upazila ', 'Daulat Khan Upazila ',
       'Lalmohan Upazila ', 'Adamdighi Upazila ', 'Bogra Sadar Upazila ',
       'Dhunat Upazila ', 'Dhupchanchia Upazila ', 'Gabtali Upazila ',
       'Kahaloo Upazila ', 'Nandigram Upazila ', 'Sariakandi Upazila ',
       'Shajahanpur Upazila ', 'Sherpur Upazila ', 'Shibganj Upazila ',
       'Sonatola Upazila ', 'Akhaura Upazila ',
       'Brahmanbaria Sadar Upazila ', 'Kasba Upazila ',
       'Nabinagar Upazila ', 'Chandpur Sadar Upazila ',
       'Faridganj Upazila ', 'Hajiganj Upazila ', 'Kachua Upa