Import libraries

In [None]:
import pandas as pd
import numpy as np
import recordlinkage

Specify file paths and directories

In [None]:
# Root folder of the project; every file path below is built from it.
# All path fragments are raw strings so the Windows backslashes are never
# interpreted as escape sequences.
root_folder = r'\Development\facility_mapping_old'

# Combined Excel workbook
new_excel_file_path = root_folder + r'\Data\master_file_new.xlsx'

# Zone mappings file
zone_mappings_file_path = root_folder + r'\Output\zone_mappings.xlsx'

# Woreda mappings file
woreda_mappings_file_path = root_folder + r'\Output\woreda_mappings.xlsx'

# HP mappings file
hp_mappings_file_path = root_folder + r'\Output\hp_final_mappings.xlsx'

# HC mappings file (previously a non-raw string relying on the invalid
# escapes '\O' and '\h', which newer Python versions warn about)
hc_mappings_file_path = root_folder + r'\Output\hc_final_mappings.xlsx'

Read in data and rename columns where necessary

In [None]:
# eCHIS master sheet: every column gets an '_echis' suffix so that it cannot
# clash with the DHIS2 column names when the two sources are combined.
df_new_eCHIS_combined = (
    pd.read_excel(new_excel_file_path, sheet_name='echis_master')
    .add_suffix('_echis')
)

# DHIS2 master sheet: rename the hierarchy columns to the '<level>_name_dhis2'
# convention; columns not listed here keep their original names.
df_dhis2 = pd.read_excel(new_excel_file_path, sheet_name='dhis2_master').rename(
    columns={
        'Region': 'region_name_dhis2',
        'Zone': 'zone_name_dhis2',
        'Woreda': 'woreda_name_dhis2',
        'PHCU': 'hc_name_dhis2',
        'Facility Name': 'facility_name_dhis2',
    }
)

# Row counts of both sources as a quick sanity check.
print('# of records in DHIS2:\t\t', df_dhis2.shape[0])

# NOTE(review): the label says 'mfr' but the count is of the eCHIS frame —
# confirm whether the label is a leftover from another notebook.
print('# of records in mfr:\t\t', df_new_eCHIS_combined.shape[0])


Pre-Processing

In [None]:
# Adjustment for Dire Dawa: for rows whose region is 'Dire Dawa', the woreda
# name is used as the zone name; all other rows keep their zone name.
# Done with a vectorised Series.mask instead of a row-wise apply: same values,
# no Python-level loop, and it also behaves correctly on an empty frame.
is_dire_dawa = df_new_eCHIS_combined['region_name_echis'] == 'Dire Dawa'
df_new_eCHIS_combined['zone_name_echis'] = df_new_eCHIS_combined['zone_name_echis'].mask(
    is_dire_dawa, df_new_eCHIS_combined['woreda_name_echis']
)

# NOTE(review): the label says 'mfr' but the count is of the eCHIS frame —
# confirm the intended wording.
print('# of potential health posts in mfr', len(df_new_eCHIS_combined))

Use recordlinkage to link eCHIS and DHIS2 regions

In [None]:
# Distinct, non-null region names from each source. The fresh 0..n-1 index of
# each frame is the record id that recordlinkage reports for a candidate pair.
region_e = df_new_eCHIS_combined[['region_name_echis']].drop_duplicates().dropna().reset_index(drop=True)
region_d = df_dhis2[['region_name_dhis2']].drop_duplicates().dropna().reset_index(drop=True)

print(region_e)
print(region_d)

# Full index: every eCHIS region is paired with every DHIS2 region.
indexer = recordlinkage.Index()
indexer.full()
region_candidate_links = indexer.index(region_e, region_d)

# Score each candidate pair with the Jaro-Winkler string similarity.
compare_cl = recordlinkage.Compare()
compare_cl.string('region_name_echis', 'region_name_dhis2', method='jarowinkler', label='match_score')

regions_linked = compare_cl.compute(region_candidate_links, region_e, region_d)

# Name the two index levels explicitly (level_0 = eCHIS row id, level_1 = DHIS2
# row id) instead of relying on the implicit names pandas gives unnamed levels.
regions_linked = regions_linked.rename_axis(['level_0', 'level_1']).reset_index()

# Attach the actual region names to each scored pair.
regions_linked = regions_linked.merge(
    region_e.reset_index().rename(columns={'index': 'level_0'}), how='left', on='level_0'
)
regions_linked = regions_linked.merge(
    region_d.reset_index().rename(columns={'index': 'level_1'}), how='left', on='level_1'
)

# Keep the best-scoring DHIS2 region for each eCHIS region.
# method='first' breaks ties deterministically: with the default 'average'
# method two tied top candidates would both get rank 1.5 and the eCHIS region
# would silently vanish from the mapping.
regions_linked['ranked'] = regions_linked.groupby('level_0')['match_score'].rank(method='first', ascending=False)
regions_linked = regions_linked[regions_linked['ranked'] == 1]

# Add the mapped regions to the eCHIS dataframe
df_new_eCHIS_combined_with_mappings = df_new_eCHIS_combined.merge(
    regions_linked[['region_name_echis', 'region_name_dhis2']],
    how='left',
    on='region_name_echis',
)

df_new_eCHIS_combined_with_mappings