# fuzzy_match

## TO DO LIST:
- DONE: Import processed UNICEF data
- DONE: Evaluate UNICEF geo variables - All districts, with CC and divisions and other stuff
- Get raw geo xlsx file and process and export to output
- Create function for matching zilas and exporting data

```
!pip install fuzzywuzzy
!pip install python-Levenshtein
```

In [1]:
import os
import re
import pandas as pd
from fuzzywuzzy import fuzz

In [2]:
# Show the current working directory of the notebook kernel.
os.getcwd()

'/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/utils/fuzzy_match'

## Create and clean geo reference

In [3]:
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run unchanged on a machine with the same directory layout.
# Raw geo Excel workbook, passed to process_zilas_upazilas as `path`.
GEO = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/data/geo_files/bbs_geos/geo.xlsx'
# Directory that import_data walks looking for data_unicef_ces* files.
DATA = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/data'
# Directory passed to merge_out as `out_path` for the merged CSV files.
OUT_DIR = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/unicef/gdata'

In [4]:
def process_zilas_upazilas(path, out_path):
    """Build the division/district/upazila reference table from the raw geo workbook.

    Reads the Excel file at ``path`` (column names are taken from the second
    row of the sheet), keeps the six division/district/upazila columns,
    renames them, removes the text ``' Zila'`` from district names and
    ``' Upazila'`` from upazila names, drops duplicate rows and writes the
    result to ``georef_zila_upazila.csv`` inside ``out_path``.

    Args:
        path: Location of the raw geo ``.xlsx`` file.
        out_path: Directory that receives ``georef_zila_upazila.csv``; may be
            given with or without a trailing path separator.

    Returns:
        A ``(frame, columns)`` tuple: the cleaned DataFrame and the list of
        its column names.
    """
    source_columns = ['Division Code', 'Division name', 'District Code',
                      'District Name', 'Upazilla code', 'Upazilla']
    target_columns = ['DivisionCode', 'DivisionName', 'DistrictCode',
                      'DistrictName', 'UpazillaCode', 'UpazillaName']

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the raw sheet (avoids SettingWithCopyWarning).
    tmp = pd.read_excel(path, header=1)[source_columns].copy()
    tmp.columns = target_columns

    # The removed texts are plain literals; regex=False keeps the behaviour
    # the same regardless of the pandas version's default for `regex`.
    tmp['DistrictName'] = tmp['DistrictName'].str.replace(' Zila', '', regex=False)
    tmp['UpazillaName'] = tmp['UpazillaName'].str.replace(' Upazila', '', regex=False)
    tmp = tmp.drop_duplicates()

    # os.path.join avoids the doubled separator that plain string
    # concatenation produced when out_path already ended with one.
    tmp.to_csv(os.path.join(out_path, 'georef_zila_upazila.csv'), index=False)
    return tmp, list(tmp.columns)


def import_data(dir_path):
    """Load every UNICEF CES CSV found under ``dir_path`` into a dict.

    The directory tree is walked recursively. Each file whose name begins
    with ``data_unicef_ces`` is read with pandas and stored under its bare
    file name. A file that raises ``UnicodeDecodeError`` with the default
    encoding is read again as latin-1.

    Args:
        dir_path: Root directory to search.

    Returns:
        Dict mapping file name to the DataFrame read from that file.
    """
    frames = {}
    for root, _dirs, names in os.walk(dir_path):
        for name in names:
            # Skip anything that is not one of the survey extracts.
            if re.search("^data_unicef_ces", name) is None:
                continue
            full_path = os.path.join(root, name)
            try:
                frame = pd.read_csv(full_path)
            except UnicodeDecodeError:
                frame = pd.read_csv(full_path, encoding='latin-1')
            frames[name] = frame
    return frames

def get_input_geos(path):
    """Read the CSV file at ``path`` and return it as a DataFrame."""
    return pd.read_csv(path)

def match_districts(ref_df, ref_match, input_df):
    """Attach the best fuzzy-matching reference row to every input geo label.

    The geo labels are read from the first column of ``input_df`` whose name
    contains ``Survey`` or ``Unit``. Each label is scored with ``fuzz.ratio``
    against every value of ``ref_df[ref_match]``; the first reference row
    with the highest score is kept.

    The rows are collected in a list and turned into a frame once, rather
    than grown with ``DataFrame.append`` (removed in pandas 2.0).

    Args:
        ref_df: Reference table. Its first four columns (by position) are
            used as DivisionCode, DivisionName, DistrictCode, DistrictName.
        ref_match: Name of the ``ref_df`` column to compare labels against.
        input_df: Frame holding the labels to be matched.

    Returns:
        DataFrame with one row per input label and the columns ``FuzzRatio``,
        ``Geo``, ``DivisionCode``, ``DivisionName``, ``DistrictCode`` and
        ``DistrictName``. Numeric columns are returned as strings,
        left-padded with ``0`` to the width of the longest value in the
        column. The index holds the position of the matched row in ``ref_df``.

    Raises:
        ValueError: If ``input_df`` has no column whose name contains
            ``Survey`` or ``Unit``, or if there are labels to match but
            ``ref_df`` has no rows.
    """
    columns = ['FuzzRatio', 'Geo', 'DivisionCode', 'DivisionName',
               'DistrictCode', 'DistrictName']

    print(input_df.columns)
    idvar = [var for var in input_df.columns if bool(re.search(r"Survey|Unit", var))]
    print(idvar)
    if not idvar:
        raise ValueError(
            "input_df has no column whose name contains 'Survey' or 'Unit'"
        )

    # Loop-invariant reference data, extracted once instead of per label.
    ref_names = list(ref_df[ref_match])
    ref_rows = ref_df.values
    labels = input_df[idvar[0]]
    if len(labels) and not ref_names:
        raise ValueError("ref_df has no rows to match against")

    records = []
    positions = []
    for code in labels:
        scores = [fuzz.ratio(ref_name, code) for ref_name in ref_names]
        # max() returns the first index on ties, so the choice is deterministic.
        best = max(range(len(scores)), key=scores.__getitem__)
        row = ref_rows[best]
        records.append({
            'FuzzRatio': scores[best],
            'Geo': code,
            'DivisionCode': row[0],
            'DivisionName': row[1],
            'DistrictCode': row[2],
            'DistrictName': row[3],
        })
        positions.append(best)

    out = pd.DataFrame(records, index=positions, columns=columns)

    # Render numeric columns as fixed-width, zero-padded strings so that the
    # codes can be concatenated as text by callers.
    for var in out.columns:
        if out[var].dtype.kind in 'iuf':
            as_text = out[var].astype(int).astype(str)
            code_length = int(as_text.str.len().max())
            out[var] = as_text.str.pad(width=code_length, side='left', fillchar='0')
    return out

def merge_out(ref_df, ref_match, data_dict, out_path):
    """Geo-code every input frame and export the merged result.

    For each ``(file name, frame)`` pair in ``data_dict`` the geo labels are
    fuzzy-matched with ``match_districts``, the match table is merged back
    onto the frame, the merged frame is written to ``out_path`` as
    ``'g' + file name``, and a ``GeoCode`` column (DivisionCode followed by
    DistrictCode) is added to the in-memory frame.

    Args:
        ref_df: Reference geo table handed to ``match_districts``.
        ref_match: Name of the ``ref_df`` column the labels are matched on.
        data_dict: Dict mapping file name to input DataFrame.
        out_path: Directory that receives the merged CSV files.

    Returns:
        Dict mapping ``'g' + file name`` to the merged DataFrame (including
        the ``GeoCode`` column).
    """
    out_dict = {}
    for key, value in data_dict.items():
        print("#"*80)
        print(f"key shape: {value.shape}")
        # Honour the caller's ref_match instead of a hard-coded column name.
        matched_df = match_districts(ref_df=ref_df, ref_match=ref_match, input_df=value)
        print(f"matched_df shape: {matched_df.shape}")
        id_var = [var for var in value.columns if bool(re.search(r"Survey|Unit", var))]
        print(id_var)
        # One match row per distinct label: repeated labels would otherwise
        # turn the merge into a many-to-many join and multiply the rows.
        matched_df = matched_df.drop_duplicates(subset='Geo')
        out_df = matched_df.merge(right=value, how='inner', left_on='Geo', right_on=id_var[0])
        out_df.to_csv(os.path.join(out_path, 'g' + key), index=False)
        print(f"out_df shape: {out_df.shape}")
        # NOTE(review): GeoCode is added after the CSV is written, so the
        # exported file does not contain it - confirm whether that is intended.
        out_df['GeoCode'] = out_df['DivisionCode'].str.cat(out_df['DistrictCode'], sep='')
        out_dict['g' + key] = out_df
    return out_dict

In [5]:
# Load every data_unicef_ces* file found under DATA, keyed by file name.
data = import_data(dir_path=DATA)
data.keys()

dict_keys(['data_unicef_ces_2016.csv', 'data_unicef_ces_2014.csv', 'data_unicef_ces_2015.csv', 'data_unicef_ces_2011.csv', 'data_unicef_ces_2010.csv', 'data_unicef_ces_2006.csv', 'data_unicef_ces_2013.csv'])

In [6]:
# Build the geo reference table from the raw workbook and export it as CSV.
# NOTE(review): out_path is a hard-coded directory distinct from OUT_DIR.
geo,geo_list = process_zilas_upazilas(path=GEO, out_path='/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/geo_files/data/')
display(geo.head())
geo_list

Unnamed: 0,DivisionCode,DivisionName,DistrictCode,DistrictName,UpazillaCode,UpazillaName
0,40,Khulna,1,Bagerhat,8,Bagerhat Sadar
9,40,Khulna,1,Bagerhat,58,Mongla
17,40,Khulna,1,Bagerhat,60,Morrelganj
26,20,Chittagong,3,Bandarban,14,Bandarban Sadar
35,20,Chittagong,3,Bandarban,51,Lama


['DivisionCode',
 'DivisionName',
 'DistrictCode',
 'DistrictName',
 'UpazillaCode',
 'UpazillaName']

In [7]:
# Fuzzy-match each loaded survey frame against the geo reference and write
# the merged CSVs to OUT_DIR; the merged frames are kept in geo_matched.
geo_matched = merge_out(ref_df=geo, 
                ref_match='DistrictName',
                data_dict=data,
                out_path=OUT_DIR)

################################################################################
key shape: (88, 46)
Index(['Survey.Units', 'BCG_Children23M', 'OPV1_Children23M',
       'PENTA1_Children23M', 'OPV2_Children23M', 'PENTA2_Children23M',
       'OPV3_Children23M', 'PENTA3_Children23M', 'Measles_Children23M',
       'Fully_Children23M', 'BCG_Children12M', 'OPV1_Children12M',
       'PENTA1_Children12M', 'OPV2_Children12M', 'PENTA2_Children12M',
       'OPV3_Children12M', 'PENTA3_Children12M', 'Measles_Children12M',
       'Fully_Children12M', 'DropoutPENTA1-PENTA3_Male23M',
       'DropoutPENTA1-PENTA3_Female23M', 'DropoutPENTA1-PENTA3_Children23M',
       'DropoutPENTA1-Measles_Male23M', 'DropoutPENTA1-Measles_Female23M',
       'DropoutPENTA1-Measles_Children23M', 'InvalidPENTA1_Infant12M',
       'InvalidPENTA2_Infant12M', 'InvalidPENTA3_Infant12M',
       'InvalidMeasles_Infant12M', 'Measles2ndDoseCrude',
       'Measles2ndDose_Children18M', 'Measles2ndDose_Children23M',
       'TT1_Mot

matched_df shape: (98, 6)
['Survey.Units']
out_df shape: (98, 50)
