## DGFP: Processing GEO files

In [1]:
import os
import glob
import pandas as pd

In [10]:
def read_data(path: str, ext: str) -> pd.DataFrame:
    """Load every ``*.<ext>`` file in ``path`` and stack them into one DataFrame.

    Each file is parsed as tab-separated text. The file name (without its
    extension) is split on ``-`` and its components are attached to every row
    of that file: component 0 as ``Year``, 1 as ``Month``, 3 as ``District``
    and 4 as ``Division`` (component 2 is not used). The name of the directory
    holding the file is stored in ``program``. These added values are strings.

    Files that pandas reports as empty are announced on stdout and skipped.

    Args:
        path: Directory containing the data files.
        ext: File extension to match, without the leading dot.

    Returns:
        The row-wise concatenation of all parsed files, or an empty DataFrame
        when no file could be read.

    Raises:
        IndexError: If a file name has fewer than five ``-`` separated parts.
    """
    frames = []
    # Sorted so the row order of the result does not depend on glob's ordering.
    for file in sorted(glob.glob(path + '/*.' + ext)):
        # splitext removes the extension whatever its length.
        file_name = os.path.splitext(os.path.basename(file))[0]
        # Parent directory name, independent of the OS path separator.
        program = os.path.basename(os.path.dirname(file))
        try:
            tmp = pd.read_csv(file, sep='\t', comment=' ')
        except pd.errors.EmptyDataError:
            print(f"{file} has no columns to parse. Moving on")
            continue
        file_name_comp = file_name.split('-')
        tmp['Year'] = file_name_comp[0]
        tmp['Month'] = file_name_comp[1]
        tmp['District'] = file_name_comp[3]
        tmp['Division'] = file_name_comp[4]
        tmp['program'] = program
        frames.append(tmp)
    if not frames:
        # pd.concat raises ValueError on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

def extract_var_info(files_paths, ext, out):
    """Collect the column names of every dataset directory into one table.

    ``files_paths`` is read as a single-column, header-less CSV whose values
    are directories. Each directory is loaded with ``read_data``; its path,
    shape and first rows are shown, and its column names are appended to a
    running table with one row per (directory, column name) pair. The table is
    rewritten to ``out`` after every directory, so a partial result is on disk
    if a later directory fails.

    Args:
        files_paths: Path to the text file listing the directories to scan.
        ext: File extension passed through to ``read_data``.
        out: Destination path of the CSV holding the ``dir``/``names`` table.

    Returns:
        DataFrame with columns ``dir`` and ``names``. It is empty (and nothing
        is written) when ``files_paths`` lists no directories.
    """
    names_dict = {'dir': [], 'names': []}
    dirs = pd.read_csv(files_paths, header=None, names=['dirs'])
    # Defined up front so the return value exists even when the list is empty.
    output = pd.DataFrame.from_dict(names_dict)
    for folder in dirs['dirs']:
        print(folder)
        tmp = read_data(path=folder, ext=ext)
        print(tmp.shape)
        # NOTE(review): `display` is not imported in the visible code; presumably
        # it is the IPython/Jupyter builtin - confirm before running as a script.
        display(tmp.head())
        columns = list(tmp.columns)
        names_dict['names'].extend(columns)
        names_dict['dir'].extend([folder] * len(columns))
        output = pd.DataFrame.from_dict(names_dict)
        # Write to the caller-supplied destination, not a module-level global.
        output.to_csv(out, index=False, index_label=False)
    return output

In [11]:
# Text file listing the dataset directories to scan.
PATHS = '../../output/dgfp/selected_list_dir.txt'
# CSV that receives the directory / column-name table.
OUT = '../../output/dgfp/dataset_vars.csv'
# Pass the PATHS constant defined above; the lowercase name `paths` is not
# defined in this cell.
output = extract_var_info(files_paths=PATHS, ext='csv', out=OUT)

../../data/dgfp/data/importance12_subdistricts/distributionGO_thanaconsuprocess
(488, 32)


Unnamed: 0,thana_Name,Shukhi,Apon,Pill_total,Condom,Inj_vial,Inj_siringe,IUD_normal,IUD_partum,IUD_total,...,per_male,femalenormal,femalepn,female_total,Permanent_method,Year,Month,District,Division,program
0,Bhola,,,,,,,,,,...,,,,,,2018,12,305,3,distributionGO_thanaconsuprocess
1,Daulatkhan,1170943.0,5931.0,1176874.0,1683632.0,523945.0,305899.0,3635.0,34.0,3669.0,...,186.0,293.0,19.0,310.0,496.0,2018,12,305,3,distributionGO_thanaconsuprocess
2,Tajumuddin,764903.0,3274.0,768177.0,1101356.0,411956.0,239922.0,1946.0,0.0,1946.0,...,859.0,227.0,2.0,229.0,1088.0,2018,12,305,3,distributionGO_thanaconsuprocess
3,Borhanuddin,1363617.0,7640.0,1371257.0,803312.0,367333.0,195240.0,4047.0,84.0,4131.0,...,1136.0,532.0,63.0,590.0,1726.0,2018,12,305,3,distributionGO_thanaconsuprocess
4,Lalmohan,2019211.0,9325.0,2028536.0,2445112.0,804332.0,500964.0,5205.0,243.0,5448.0,...,3548.0,567.0,99.0,672.0,4220.0,2018,12,305,3,distributionGO_thanaconsuprocess


../../data/dgfp/data/importance12_subdistricts/distributionNGO_thanaconsuprocess
(355, 27)


Unnamed: 0,thana_Name,Shukhi,Apon,Condom,Inj_Vial,Inj_Siringe,IUD_Normal,IUD_Partum,IUD_Remove,Implanon,...,IFA_number,Sanitary_Pad,NSV,Tubectomy_Normal,Tubectomy_Partum,Year,Month,District,Division,program
0,Bhola,,,,,,,,,,...,,,,,,2018,12,305,3,distributionNGO_thanaconsuprocess
1,Daulatkhan,616509.0,483.0,1367133.0,185424.0,106876.0,652.0,0.0,2.0,229.0,...,0.0,0.0,82.0,1.0,0.0,2018,12,305,3,distributionNGO_thanaconsuprocess
2,Tajumuddin,319609.0,559.0,807033.0,171031.0,90743.0,693.0,0.0,12.0,19.0,...,0.0,0.0,20.0,1.0,0.0,2018,12,305,3,distributionNGO_thanaconsuprocess
3,Lalmohan,739027.0,2240.0,1466517.0,329384.0,204038.0,918.0,0.0,41.0,499.0,...,96050.0,0.0,51.0,29.0,0.0,2018,12,305,3,distributionNGO_thanaconsuprocess
4,Charfashion,530526.0,0.0,2260833.0,244589.0,209620.0,972.0,0.0,2.0,13.0,...,0.0,0.0,0.0,0.0,0.0,2018,12,305,3,distributionNGO_thanaconsuprocess


../../data/dgfp/data/importance12_districts/mchgo_distmonthprocess
(2112, 77)


Unnamed: 0,monthid,MCH37,MCH38,MCH39,MCH40,MCH41,MCH42,MCH43,MCH44,MCH45,...,MCH103,MCH104,MCH105,MCH106,MCH107,Year,Month,District,Division,program
0,2016-01-28,2768,6689,9457,6479,15936,1544,929,932,369,...,714,0,91,54,134,2018,12,305,3,mchgo_distmonthprocess
1,2016-02-28,3037,6413,9450,8232,17682,1375,971,851,548,...,1828,0,80,78,152,2018,12,305,3,mchgo_distmonthprocess
2,2016-03-28,1329,7615,8944,8294,17238,1281,996,770,617,...,1383,1,97,84,262,2018,12,305,3,mchgo_distmonthprocess
3,2016-04-28,1512,7289,8801,8066,16867,1812,1005,731,572,...,2490,0,95,79,208,2018,12,305,3,mchgo_distmonthprocess
4,2016-05-28,1913,6462,8375,7621,15996,1331,1054,928,663,...,1300,0,61,73,243,2018,12,305,3,mchgo_distmonthprocess


../../data/dgfp/data/importance12_districts/ngodistrict_monthprocess
(7489, 19)


Unnamed: 0,Dist_Name,MonthYear,EC,Pill,Con,Inj,IUD,Imp,PerM,PerF,TPer,TUser,CAR,Unnamed: 13,Year,Month,District,Division,program
0,Bhola,July,,,,,,,,,,,,,2018,12,305,3,ngodistrict_monthprocess
1,Bhola,August,,,,,,,,,,,,,2018,12,305,3,ngodistrict_monthprocess
2,Bhola,September,,,,,,,,,,,,,2018,12,305,3,ngodistrict_monthprocess
3,Bhola,October,,,,,,,,,,,,,2018,12,305,3,ngodistrict_monthprocess
4,Bhola,November,,,,,,,,,,,,,2018,12,305,3,ngodistrict_monthprocess


../../data/dgfp/data/importance12_districts/distributionGO_distmonthProcess
(8672, 32)


Unnamed: 0,monthid,Shukhi,Apon,Pill_total,Condom,Inj_vial,Inj_siringe,IUD_normal,IUD_partum,IUD_total,...,per_male,femalenormal,femalepn,female_total,Permanent_method,Year,Month,District,Division,program
0,"July,2007",91661,0,91661,52581,24197,0,136,0,136,...,47,35,0,35,82,2018,12,305,3,distributionGO_distmonthProcess
1,"August,2007",84221,0,84221,46020,18188,0,87,0,87,...,81,36,0,36,117,2018,12,305,3,distributionGO_distmonthProcess
2,"September,2007",84714,0,84714,56346,20243,0,108,0,108,...,148,46,0,46,194,2018,12,305,3,distributionGO_distmonthProcess
3,"October,2007",81598,0,81598,43630,23969,0,133,0,133,...,73,54,0,54,127,2018,12,305,3,distributionGO_distmonthProcess
4,"November,2007",84763,0,84763,50118,22869,0,434,0,434,...,221,121,0,121,342,2018,12,305,3,distributionGO_distmonthProcess


../../data/dgfp/data/importance12_districts/district_monthprocess
(8698, 19)


Unnamed: 0,Dist_Name,MonthYear,EC,Pill,Con,Inj,IUD,Imp,PerM,PerF,TPer,TUser,CAR,Unnamed: 13,Year,Month,District,Division,program
0,Bhola,January,,,,,,,,,,,,,2018,12,305,3,district_monthprocess
1,Bhola,July,,,,,,,,,,,,,2018,12,305,3,district_monthprocess
2,Bhola,August,,,,,,,,,,,,,2018,12,305,3,district_monthprocess
3,Bhola,September,,,,,,,,,,,,,2018,12,305,3,district_monthprocess
4,Bhola,October,,,,,,,,,,,,,2018,12,305,3,district_monthprocess


../../data/dgfp/data/importance11_subdistricts/ngothana_process
(29317, 18)


Unnamed: 0,thana,ECouple,Pill,Condom,Injectable,IUD,Implant,PerMale,PerFemale,Tpermanent,GrandTotal,CAR,Unnamed: 12,Year,Month,District,Division,program
0,Panchgarh,,,,,,,,,,,,,2012,9,103,7,ngothana_process
1,Atowari,9493.0,3363.0,274.0,1555.0,401.0,266.0,774.0,737.0,1511.0,7370.0,77.6362,,2012,9,103,7,ngothana_process
2,Debiganj,11190.0,3979.0,283.0,3035.0,211.0,95.0,740.0,287.0,1027.0,8630.0,77.1224,,2012,9,103,7,ngothana_process
0,Moheshpur,14440.0,4864.0,930.0,3234.0,652.0,410.0,81.0,1249.0,1330.0,11420.0,79.0859,,2013,1,204,2,ngothana_process
1,Jhenaidaha,,,,,,,,,,,,,2013,1,204,2,ngothana_process


../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-10-28-0507-05.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0102-07.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0405-04.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0305-03.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0114-01.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0409-04.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0209-02.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/

../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0108-07.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0411-04.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0112-01.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-10-28-0414-08.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0501-05.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0201-02.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-10-28-0104-07.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-10-28-0403-04.csv has no columns to parse. Moving on
../../da

../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0417-08.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0107-07.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0603-06.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-10-28-0202-02.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-12-28-0412-04.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0111-01.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-10-28-0210-02.csv has no columns to parse. Moving on
../../data/dgfp/data/importance11_subdistricts/mchgo_thanaprocess/2018-11-28-0415-08.csv has no columns to parse. Moving on
../../da

Unnamed: 0,thana_Name,MCH37,MCH38,MCH39,MCH40,MCH41,MCH42,MCH43,MCH44,MCH45,...,MCH103,MCH104,MCH105,MCH106,MCH107,Year,Month,District,Division,program
0,Mymensingh,,,,,,,,,,...,,,,,,2018,9,415,8,mchgo_thanaprocess
1,Trisal,593.0,1866.0,2459.0,2249.0,4708.0,257.0,245.0,179.0,121.0,...,151.0,0.0,10.0,11.0,63.0,2018,9,415,8,mchgo_thanaprocess
2,Fulbaria,362.0,1091.0,1453.0,1510.0,2963.0,152.0,172.0,174.0,99.0,...,152.0,0.0,4.0,10.0,68.0,2018,9,415,8,mchgo_thanaprocess
3,Bhaluka,251.0,719.0,970.0,1342.0,2312.0,218.0,248.0,201.0,113.0,...,188.0,2.0,9.0,6.0,48.0,2018,9,415,8,mchgo_thanaprocess
4,Muktagacha,434.0,1331.0,1765.0,1906.0,3671.0,431.0,501.0,343.0,316.0,...,170.0,0.0,0.0,0.0,73.0,2018,9,415,8,mchgo_thanaprocess


../../data/dgfp/data/importance11_subdistricts/thana_process
(63891, 18)


Unnamed: 0,thana,ECouple,Pill,Condom,Injectable,IUD,Implant,PerMale,PerFemale,Tpermanent,GrandTotal,CAR,Unnamed: 12,Year,Month,District,Division,program
0,Boda,51742.0,21788.0,873.0,8018.0,840.0,3573.0,3183.0,3480.0,6663.0,41755.0,80.6985,,2012,9,103,7,thana_process
1,Panchgarh,,,,,,,,,,,,,2012,9,103,7,thana_process
2,Tetulia,26033.0,12321.0,848.0,3058.0,1024.0,1291.0,794.0,1605.0,2399.0,20941.0,80.4402,,2012,9,103,7,thana_process
3,Atowari,28599.0,13166.0,622.0,2986.0,653.0,1111.0,2257.0,2119.0,4376.0,22914.0,80.1217,,2012,9,103,7,thana_process
4,Debiganj,47815.0,19301.0,1178.0,10259.0,903.0,1109.0,3362.0,2674.0,6036.0,38786.0,81.1168,,2012,9,103,7,thana_process


## To do list: 
- **DONE: Extract all names from datasets**
- **DONE: Define variable names for dataset 1**
- **DONE: Define variable names for dataset 2**
- **DONE: Define variable names for dataset 3**
- **DONE: Define variable names for dataset 4**
- **DONE: Define variable names for dataset 5**
- **DONE: Define variable names for dataset 6**
- **DONE: Define variable names for dataset 7**
- **DONE: Define variable names for dataset 8**
- **DONE: Define variable names for dataset 9**
- Create geo files from all files and compare

In [None]:
# Columns of interest: the place/period identifier columns seen across the
# datasets, followed by the fields that read_data adds from the file name and
# the containing directory.
cols = [
    # identifier columns present in the raw files
    'thana',
    'thana_Name',
    'Dist_Name',
    'monthid',
    # fields attached while loading
    'Year',
    'Month',
    'District',
    'Division',
    'program',
]
cols

