# Cleaning and exploring data for Macro-Eyes

In [326]:
import os
import pathlib
from pathlib import Path 
import pandas as pd
import numpy as np
import re

To do:
* Remove duplicate rows
* Check for different text referring to same facility eg 'hospital' vs 'chp'

It will be helpful later to display all the columns:

In [327]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 200)

## Read in raw data

In [328]:
df_raw = pd.read_excel('Health Facility Assessment (1).xlsx',index_col=0, usecols='B:GW')
df = df_raw.copy()
df=df.iloc[:50,:]

## Delete empty columns

In [329]:
# the columns whose names end with 'choice_labels' are empty, so we can delete them
choice_label_cols = []
for col in df.columns:
    if len((re.findall('\.choice_labels',col)))!=0:
        choice_label_cols.append(col)

df = df.drop(choice_label_cols,axis=1)


## Shorten column names

Let's start by displaying the column names:

In [330]:
cols = list(df.columns)
print(*cols,sep='\n')

form.health_centre_information.facility_name
form.facility_gps
form.health_centre_information.location_information.region_province
form.health_centre_information.location_information.district
form.health_centre_information.location_information.chiefdom
form.health_centre_information.location_information.facility_location
form.health_centre_information.facility_type
form.health_centre_information.facility_type_other
form.health_centre_information.managing_authority
form.health_centre_information.managing_authority_other
form.health_centre_information.setting
form.health_centre_information.outpatient_only
form.health_centre_information.capacity.number_consultation_rooms
form.health_centre_information.capacity.number_inpatient_beds
form.health_centre_information.capacity.number_maternity_beds
form.health_centre_information.group_number_employed.list_community_health_officer.cho_number_in_post
form.health_centre_information.group_number_employed.list_community_health_officer.cho_number_pre

These column names are painfully long to work with, so I will remove some unnecesary words. 

In [331]:
new_cols = []
for i,name in enumerate(cols):
    name = re.sub('form\.','',name)
    name = re.sub('health_centre_information\.','',name)
    name = re.sub('location_information\.','',name)
    name = re.sub('group_number_employed\.','',name)
    name = re.sub('list_.*\.','',name)
    name = re.sub('ql_information_education_communication\.','',name)
    name = re.sub('ql_human_resources\.','',name)
    name = re.sub('ql_surveillance\.','',name)
    name = re.sub('ql_triage_and_early_recognition\.','',name)
    name = re.sub('ql_chw\.','',name)
    name = re.sub('ql_isolation_physical_distancing\.','',name)
    
    # remove everything from 'grp_infection_prevention_and_control.' to the next '.'
    name = re.sub('grp_infection_prevention_and_control\..*\.','',name) 
    # remove any remaining instances of 'grp_infection_prevention_and_control.'
    name = re.sub('grp_infection_prevention_and_control\.','',name) 
    
    # remove everything before the word 'log'
    name = re.sub('.*\.log_','log_',name)
    
    new_cols.append(name)

    print(name)
df.columns = new_cols

facility_name
facility_gps
region_province
district
chiefdom
facility_location
facility_type
facility_type_other
managing_authority
managing_authority_other
setting
outpatient_only
capacity.number_consultation_rooms
capacity.number_inpatient_beds
capacity.number_maternity_beds
cho_number_in_post
cho_number_present_at_visit
cht_number_in_post
cht_number_present_at_visit
cha_number_in_post
cha_number_present_at_visit
cm_number_in_post
cm_number_present_at_visit
sechn_number_in_post
sechn_number_present_at_visit
lab_tech_number_in_post
lab_tech_number_present_at_visit
mch_aides_number_in_post
mch_aides_number_present_at_visit
cleaner_porter_number_in_post
cleaner_porter_number_present_at_visit
other_number_in_post
other_number_present_at_visit
other_staff_cadre
sections_to_review
consultations.head_count.month_1
consultations.head_count.month_2
consultations.head_count.month_3
consultations.head_count.month_4
consultations.general_outpatient.month_1
consultations.general_outpatient.month_

Abbreviations in the new column names:
* cho: community health officer
* cht: community health technician
* cha: community health assistant
* cm: community midwives
* sechn: state enrolled community health nurse
* mch: maternal and child health
* hr: human resources
* iec: information education communication
* surv: surveillance
* ter: triage and early recognition
* chw: community health worker
* iso: isolation
* ppe: personal protective equipment
* ipc: infection prevention and control
* wcd: waste collection and disposal
* ds: disinfection and sterilization
* log: logistics

## Some preliminary changes to the data

There are a few obvious things we can do right off the bat to make the data easier to work with. 

Firstly, the missing values in the dataset are indicated by '---'. Explicitly saying that the values are missing is more useful, so we replace the '---' values with np.nan. 

In [332]:
# replace --- values with NA
df = df.replace('---', np.nan, regex=True)

We can also split the GPS values into latitude and longitude:

In [333]:
# split GPS values into longitude and latitude
df.insert(loc=1,column = 'facility_gps_lat',value=df['facility_gps'].apply(lambda x: x.split(',')[0]))
df.insert(loc=2,column = 'facility_gps_lon',value=df['facility_gps'].apply(lambda x: x.split(',')[1]))
df = df.drop('facility_gps',axis=1)

  df.insert(loc=1,column = 'facility_gps_lat',value=df['facility_gps'].apply(lambda x: x.split(',')[0]))
  df.insert(loc=2,column = 'facility_gps_lon',value=df['facility_gps'].apply(lambda x: x.split(',')[1]))


We can change yes and no values to true and false values:

In [334]:
# replace 'yes' with True and 'no' with False
df['outpatient_only'] = df['outpatient_only'].apply(lambda x: True if x=='yes' else x)
df['outpatient_only'] = df['outpatient_only'].apply(lambda x: False if x=='no' else x)


## Data types

The dataframe will be easier to use for modeling and to work with in general if the data types for the columns are specified. Let us explore the current data types:

In [335]:
df.dtypes

facility_name                                                                 object
facility_gps_lat                                                              object
facility_gps_lon                                                              object
region_province                                                               object
district                                                                      object
chiefdom                                                                      object
facility_location                                                             object
facility_type                                                                 object
facility_type_other                                                          float64
managing_authority                                                            object
managing_authority_other                                                     float64
setting                                                          

There are lots of 'object' data types. Let's make the dataframe easier to use for modelling by assigning the correct data types. There are a lot of columns, so I assign data types in a bit of a hacky way below to save time. I consider the part of the column name after the last full stop. If it contains a particular keyword, I give it a particular data type. I use 16 bits for the integers because it's very unlikely that any facility will have more than around 32000 of any of the quantities in this database. 

In [336]:
type_dict = {} # make a dictionary specifying data types for columns 
for col in df.columns:
    name_parts = col.split('.')
    
    if len(re.findall('number_', name_parts[-1]))!=0:
        type_dict[col] = 'Int16'
    elif name_parts[-1][:6]=='month_':
        type_dict[col] = 'Int16'
    elif name_parts[-1][:3]=='hr_':
        type_dict[col] = 'string'
    elif name_parts[-1][:4]=='iec_':
        type_dict[col] = 'string'
    elif name_parts[-1][:5]=='surv_':
        type_dict[col] = 'string'
    elif name_parts[-1][:4]=='ter_':
        type_dict[col] = 'string'
    elif name_parts[-1][:4]=='chw_':
        type_dict[col] = 'string'
    elif name_parts[-1][:4]=='iso_':
        type_dict[col] = 'string'
    elif name_parts[-1][:4]=='ipc_':
        type_dict[col] = 'string'
    elif name_parts[-1][:4]=='log_':
        type_dict[col] = 'string'
    elif name_parts[-1][:15]=='monthly_average':
        type_dict[col] = 'float64'
    elif name_parts[-1][:6]=='score_':
        type_dict[col] = 'float64'
type_dict_2 = {'facility_name': 'string', 'facility_gps_lat': 'float64', 'facility_gps_lon':'float64', 'region_province': 'string', 'district': 'string', 'chiefdom': 'string', 'facility_location': 'string', 'facility_type': 'string', 'facility_type_other': 'string', 'managing_authority': 'string', 'managing_authority_other': 'string', 'setting': 'string', 'outpatient_only': 'boolean' }
type_dict = type_dict | type_dict_2 # join two dictionaries

df=df.astype(type_dict)

We can check that the data types are now correct:

In [337]:
df.dtypes

facility_name                                                                 string
facility_gps_lat                                                             float64
facility_gps_lon                                                             float64
region_province                                                               string
district                                                                      string
chiefdom                                                                      string
facility_location                                                             string
facility_type                                                                 string
facility_type_other                                                           string
managing_authority                                                            string
managing_authority_other                                                      string
setting                                                          

## Specifying case

Python and many other languages are case sensitive, so a word like 'Python' is treated differently from 'python'. We can better detect duplicates if we make the case consistent throughout. 

In [338]:
# get the names of all the string columns
str_cols = [i for i in df.columns if df.dtypes[i]=='string']

# make the string columns all lowercase
df[str_cols] = df[str_cols].apply(lambda x: x.str.lower(),axis=1)

# make location names all uppercase
df['facility_name'] = df['facility_name'].str.upper()
df['region_province'] = df['region_province'].str.upper()
df['district'] = df['district'].str.upper()
df['chiefdom'] = df['chiefdom'].str.upper()
df['facility_location'] = df['facility_location'].str.upper()


## Dealing with duplicates

Now it's time to investigate duplicate data. 

Let's look for duplicate facility names:

In [339]:
df_dup_1 = df[df.duplicated(['facility_name'],keep=False)].sort_values('facility_name')
df_dup_1

Unnamed: 0_level_0,facility_name,facility_gps_lat,facility_gps_lon,region_province,district,chiefdom,facility_location,facility_type,facility_type_other,managing_authority,managing_authority_other,setting,outpatient_only,capacity.number_consultation_rooms,capacity.number_inpatient_beds,capacity.number_maternity_beds,cho_number_in_post,cho_number_present_at_visit,cht_number_in_post,cht_number_present_at_visit,cha_number_in_post,cha_number_present_at_visit,cm_number_in_post,cm_number_present_at_visit,sechn_number_in_post,sechn_number_present_at_visit,lab_tech_number_in_post,lab_tech_number_present_at_visit,mch_aides_number_in_post,mch_aides_number_present_at_visit,cleaner_porter_number_in_post,cleaner_porter_number_present_at_visit,other_number_in_post,other_number_present_at_visit,other_staff_cadre,sections_to_review,consultations.head_count.month_1,consultations.head_count.month_2,consultations.head_count.month_3,consultations.head_count.month_4,consultations.general_outpatient.month_1,consultations.general_outpatient.month_2,consultations.general_outpatient.month_3,consultations.general_outpatient.month_4,consultations.deliveries.month_1,consultations.deliveries.month_2,consultations.deliveries.month_3,consultations.deliveries.month_4,consultations.pent_vaccines.month_1,consultations.pent_vaccines.month_2,consultations.pent_vaccines.month_3,consultations.pent_vaccines.month_4,hr_focal_point,hr_staff_received_info,hr_healthcare_provider_training,hr_healthcare_provider_revised_training,hr_daily_staff_list,iec_handwashing_procedure,iec_physical_distancing,iec_covering_nose_mouth,iec_early_symptom_recognition,iec_when_facility_vs_home,iec_rational_ppe_use,iec_helpline_number,surv_procedure_for_notification,surv_official_case_definition,surv_hotline_number,surv_timely_data_reported_to_district,ter_screening_area_set_up,ter_symptom_screening_questionnaires,ter_temperature_measurement_at_triage,ter_physical_distancing_in_waiting,ter_separate_waiting_for_symptomatic,chw_trained_precautions,chw_trained_community_service,chw_drugs,chw_gloves,chw_masks,chw_iec_materials,iso_designated_isolation_for_suspected,iso_distance_between_patients_in_waiting,iso_distance_between_patient_beds,iso_transfer_referral_protocol,ipc_ppe_medical_masks,ipc_ppe_disp_surgical_masks,ipc_ppe_eye_protection,ipc_ppe_examination_gloves,ipc_ppe_surgical_gloves,ipc_ppe_long_cuffed_gloves,ipc_ppe_heavy_duty_gloves,ipc_ppe_long_sleeved_gown,ipc_ppe_waterproof_aprons,ipc_ppe_plan_staff_trained_on_ppe,ipc_ppe_plan_ppe_poster_displayed,ipc_ppe_plan_fit_test_kit,ipc_ppe_plan_contingency_for_shortages,ipc_wcd_colour_coded_bins,ipc_wcd_clinical_waste_bags,ipc_wcd_laundry_receptacles_at_patient_rooms,ipc_wcd_incinerator,ipc_wash_clean_running_water,ipc_wash_hand_soap,ipc_wash_liquid_soap,ipc_wash_disp_hand_towels,ipc_wash_alcohol_based_hand_gel,ipc_ds_protocol_facility_disinfection,ipc_ds_protocol_equipment_sterilisation,ipc_ds_environmental_disinfectant,ipc_ds_cleaning_schedule_in_toilets,ipc_ds_protocol_corpse_handling,log_referral_plan,log_cellphone_landline_swradio,log_tracer_drugs,log_albendazole,log_amoxicillin,log_ampicillin,log_chlorhexidine_5,log_chlorhexidine_7,log_gentamicin,log_folic,log_ferrous_and_folic,log_compound_sodium,log_co_trimoxazole_400,log_co_trimoxazole_200,log_metronidazole_250,log_metronidazole_200,log_methyldopa,log_magnesium_sulphate,log_lidocaine,log_ibuprofen,log_surgical_spirit,log_sodium_chloride,log_povidone,log_paracetamol_500,log_paracetamol_250,log_ors,log_gauze,log_cotton_wool,log_cannula_iv_20,log_cannula_iv_24,log_zinc_sulphate,log_water,log_needle_23,log_needle_21,log_glove_giving,log_glove_surgical,log_glove_gyn,log_glove_exam,log_tape,log_syringe,log_oxytocin,log_diazepam,log_misoprostol,log_glucose,consultations.head_count.monthly_average_head_counts,consultations.general_outpatient.monthly_average_general_outpatient,consultations.deliveries.monthly_average_deliveries,consultations.pent_vaccines.monthly_average_pent_vaccines,score_human_resources,score_max_human_resources,score_information_education_communication,score_max_information_education_communication,score_surveillance,score_max_surveillance,score_triage_and_early_recognition,score_max_triage_and_early_recognition,score_chw,score_max_chw,score_isolation,score_max_isolation,score_infection_prevention_and_control_ppe,score_max_infection_prevention_and_control_ppe,score_infection_prevention_and_control_ppe_plan,score_max_infection_prevention_and_control_ppe_plan,score_infection_prevention_and_control_waste_collection_and_disposal,score_max_infection_prevention_and_control_waste_collection_and_disposal,score_infection_prevention_and_control_water_sanitation_and_hygiene,score_max_infection_prevention_and_control_water_sanitation_and_hygiene,score_infection_prevention_and_control_disinfection_and_sterilization,score_max_infection_prevention_and_control_disinfection_and_sterilization,score_infection_prevention_and_control,score_max_infection_prevention_and_control,question1.score_logistics_patient_and_sample_transfer,question1.score_max_logistics_patient_and_sample_transfer,score_total,score_max_total
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1
41,AMBER CHP,-63.524311,4.30623,NORTH-WESTERN,KARENE,MASSACHUSETTS,PORT CAROLBURGH,mch_post,,government_public,,peri_urban,False,1,2,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,,consultations hr information_education_communi...,481,355,264,223,57,50,40,28,12,11,9,8,6,12,12,12,not_completed,completed,completed,completed,completed,displayed,displayed,not_displayed,displayed,displayed,displayed,not_displayed,not_in_place,fully_operational,fully_operational,fully_operational,not_in_place,not_in_place,not_in_place,partially_operational,not_in_place,adequate,adequate,adequate,adequate,not_in_place,not_in_place,partially_operational,not_in_place,not_in_place,fully_operational,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,not_available,sufficient_supplies,sufficient_supplies,sufficient_supplies,partially_achieved,available_fully_achieved,not_available,not_available,sufficient_supplies,sufficient_supplies,not_available,sufficient_supplies,sufficient_supplies,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,available,not_available,available,not_available,available,not_available,available,not_available,available,available,not_available,,available,not_available,available,not_available,available,available,available,available,available,available,available,available,not_available,available,available,not_available,not_available,not_available,available,available,not_available,not_available,available,available,available,available,not_available,available,available,available,not_available,not_available,330.75,43.75,10.0,10.5,4.0,5.0,5.0,7.0,3.0,4.0,0.5,5.0,3.0,6.0,1.5,4.0,8.0,9.0,1.5,4.0,3.0,4.0,1.0,5.0,0.0,5.0,13.5,27.0,26.0,44.0,56.5,102.0
44,AMBER CHP,-51.206867,-4.695275,SOUTHERN,MOYAMBA,ALABAMA,NEW WALTERPORT,community_health_centre,,government_public,,slum,True,2,5,1,2,2,0,0,0,0,0,0,0,0,0,0,3,3,1,1,1,1,"1 security, 2 TBA, Laboratory Assistant, 1 vo...",consultations hr information_education_communi...,492,537,486,524,396,392,291,254,11,7,10,11,12,13,13,15,completed,completed,completed,completed,completed,displayed,displayed,displayed,displayed,displayed,displayed,displayed,fully_operational,fully_operational,partially_operational,fully_operational,fully_operational,fully_operational,fully_operational,fully_operational,fully_operational,partial,partial,partial,partial,partial,adequate,fully_operational,fully_operational,fully_operational,fully_operational,risk_of_shortage,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,available_fully_achieved,available_fully_achieved,available_fully_achieved,available_fully_achieved,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,risk_of_shortage,not_available,not_available,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,available,partially_available,available,available,available,not_available,available,partially_available,available,,available,available,available,available,available,available,available,available,available,available,available,available,available,available,available,available,not_available,available,available,available,available,available,available,available,available,,available,available,available,available,available,available,not_available,available,509.75,333.25,9.75,13.25,5.0,5.0,7.0,7.0,3.5,4.0,5.0,5.0,3.0,6.0,4.0,4.0,8.5,9.0,4.0,4.0,4.0,4.0,2.5,5.0,4.0,5.0,23.0,27.0,36.0,44.0,86.5,102.0
25,JOANNA CLINIC,15.754015,-153.871776,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,rural,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388,153,180,183,133,30,21,55,18,10,14,14,17,18,12,19,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,partially_operational,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,partial,not_in_place,,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,partially_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,partially_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,0.5,4.0,1.0,5.0,0.5,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,12.0,44.0,22.0,102.0
47,JOANNA CLINIC,60.190934,162.787117,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,peri_urban,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388,153,180,183,133,30,21,55,18,10,14,14,17,18,12,19,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,fully_operational,fully_operational,partially_operational,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,not_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,2.0,4.0,1.5,5.0,0.0,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,11.0,44.0,22.5,102.0


Let's see how many elements are in this dataframe:

In [340]:
len(df_dup_1)

4

It is possible that some duplicates are 'hiding' because the names are spelled slightly differently. Let's see if we can catch any like that by printing out all the facilities that have location matches with other facilities. 

In [341]:
df[df.duplicated(['facility_location'],keep=False)].sort_values('facility_location')

Unnamed: 0_level_0,facility_name,facility_gps_lat,facility_gps_lon,region_province,district,chiefdom,facility_location,facility_type,facility_type_other,managing_authority,managing_authority_other,setting,outpatient_only,capacity.number_consultation_rooms,capacity.number_inpatient_beds,capacity.number_maternity_beds,cho_number_in_post,cho_number_present_at_visit,cht_number_in_post,cht_number_present_at_visit,cha_number_in_post,cha_number_present_at_visit,cm_number_in_post,cm_number_present_at_visit,sechn_number_in_post,sechn_number_present_at_visit,lab_tech_number_in_post,lab_tech_number_present_at_visit,mch_aides_number_in_post,mch_aides_number_present_at_visit,cleaner_porter_number_in_post,cleaner_porter_number_present_at_visit,other_number_in_post,other_number_present_at_visit,other_staff_cadre,sections_to_review,consultations.head_count.month_1,consultations.head_count.month_2,consultations.head_count.month_3,consultations.head_count.month_4,consultations.general_outpatient.month_1,consultations.general_outpatient.month_2,consultations.general_outpatient.month_3,consultations.general_outpatient.month_4,consultations.deliveries.month_1,consultations.deliveries.month_2,consultations.deliveries.month_3,consultations.deliveries.month_4,consultations.pent_vaccines.month_1,consultations.pent_vaccines.month_2,consultations.pent_vaccines.month_3,consultations.pent_vaccines.month_4,hr_focal_point,hr_staff_received_info,hr_healthcare_provider_training,hr_healthcare_provider_revised_training,hr_daily_staff_list,iec_handwashing_procedure,iec_physical_distancing,iec_covering_nose_mouth,iec_early_symptom_recognition,iec_when_facility_vs_home,iec_rational_ppe_use,iec_helpline_number,surv_procedure_for_notification,surv_official_case_definition,surv_hotline_number,surv_timely_data_reported_to_district,ter_screening_area_set_up,ter_symptom_screening_questionnaires,ter_temperature_measurement_at_triage,ter_physical_distancing_in_waiting,ter_separate_waiting_for_symptomatic,chw_trained_precautions,chw_trained_community_service,chw_drugs,chw_gloves,chw_masks,chw_iec_materials,iso_designated_isolation_for_suspected,iso_distance_between_patients_in_waiting,iso_distance_between_patient_beds,iso_transfer_referral_protocol,ipc_ppe_medical_masks,ipc_ppe_disp_surgical_masks,ipc_ppe_eye_protection,ipc_ppe_examination_gloves,ipc_ppe_surgical_gloves,ipc_ppe_long_cuffed_gloves,ipc_ppe_heavy_duty_gloves,ipc_ppe_long_sleeved_gown,ipc_ppe_waterproof_aprons,ipc_ppe_plan_staff_trained_on_ppe,ipc_ppe_plan_ppe_poster_displayed,ipc_ppe_plan_fit_test_kit,ipc_ppe_plan_contingency_for_shortages,ipc_wcd_colour_coded_bins,ipc_wcd_clinical_waste_bags,ipc_wcd_laundry_receptacles_at_patient_rooms,ipc_wcd_incinerator,ipc_wash_clean_running_water,ipc_wash_hand_soap,ipc_wash_liquid_soap,ipc_wash_disp_hand_towels,ipc_wash_alcohol_based_hand_gel,ipc_ds_protocol_facility_disinfection,ipc_ds_protocol_equipment_sterilisation,ipc_ds_environmental_disinfectant,ipc_ds_cleaning_schedule_in_toilets,ipc_ds_protocol_corpse_handling,log_referral_plan,log_cellphone_landline_swradio,log_tracer_drugs,log_albendazole,log_amoxicillin,log_ampicillin,log_chlorhexidine_5,log_chlorhexidine_7,log_gentamicin,log_folic,log_ferrous_and_folic,log_compound_sodium,log_co_trimoxazole_400,log_co_trimoxazole_200,log_metronidazole_250,log_metronidazole_200,log_methyldopa,log_magnesium_sulphate,log_lidocaine,log_ibuprofen,log_surgical_spirit,log_sodium_chloride,log_povidone,log_paracetamol_500,log_paracetamol_250,log_ors,log_gauze,log_cotton_wool,log_cannula_iv_20,log_cannula_iv_24,log_zinc_sulphate,log_water,log_needle_23,log_needle_21,log_glove_giving,log_glove_surgical,log_glove_gyn,log_glove_exam,log_tape,log_syringe,log_oxytocin,log_diazepam,log_misoprostol,log_glucose,consultations.head_count.monthly_average_head_counts,consultations.general_outpatient.monthly_average_general_outpatient,consultations.deliveries.monthly_average_deliveries,consultations.pent_vaccines.monthly_average_pent_vaccines,score_human_resources,score_max_human_resources,score_information_education_communication,score_max_information_education_communication,score_surveillance,score_max_surveillance,score_triage_and_early_recognition,score_max_triage_and_early_recognition,score_chw,score_max_chw,score_isolation,score_max_isolation,score_infection_prevention_and_control_ppe,score_max_infection_prevention_and_control_ppe,score_infection_prevention_and_control_ppe_plan,score_max_infection_prevention_and_control_ppe_plan,score_infection_prevention_and_control_waste_collection_and_disposal,score_max_infection_prevention_and_control_waste_collection_and_disposal,score_infection_prevention_and_control_water_sanitation_and_hygiene,score_max_infection_prevention_and_control_water_sanitation_and_hygiene,score_infection_prevention_and_control_disinfection_and_sterilization,score_max_infection_prevention_and_control_disinfection_and_sterilization,score_infection_prevention_and_control,score_max_infection_prevention_and_control,question1.score_logistics_patient_and_sample_transfer,question1.score_max_logistics_patient_and_sample_transfer,score_total,score_max_total
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1
25,JOANNA CLINIC,15.754015,-153.871776,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,rural,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388.0,153.0,180.0,183.0,133.0,30.0,21.0,55.0,18.0,10.0,14.0,14.0,17.0,18.0,12.0,19.0,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,partially_operational,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,partial,not_in_place,,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,partially_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,partially_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,0.5,4.0,1.0,5.0,0.5,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,12.0,44.0,22.0,102.0
47,JOANNA CLINIC,60.190934,162.787117,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,peri_urban,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388.0,153.0,180.0,183.0,133.0,30.0,21.0,55.0,18.0,10.0,14.0,14.0,17.0,18.0,12.0,19.0,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,fully_operational,fully_operational,partially_operational,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,not_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,2.0,4.0,1.5,5.0,0.0,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,11.0,44.0,22.5,102.0
18,ROBERT CHP,-44.270199,153.380144,WESTERN AREA,WESTERN AREA URBAN,KENTUCKY,COLLEENBURGH,community_health_centre,,government_public,,urban,True,4,5,4,1,0,0,0,0,0,2,1,7,3,0,0,7,3,0,0,2,0,nursing aide,consultations hr information_education_communi...,1148.0,931.0,997.0,874.0,567.0,462.0,504.0,402.0,26.0,32.0,28.0,28.0,37.0,42.0,42.0,33.0,not_completed,partially_completed,not_completed,partially_completed,not_completed,displayed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,partially_operational,partially_operational,fully_operational,fully_operational,partially_operational,partially_operational,fully_operational,fully_operational,partially_operational,adequate,adequate,adequate,partial,partial,not_in_place,not_in_place,partially_operational,partially_operational,fully_operational,risk_of_shortage,risk_of_shortage,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,sufficient_supplies,risk_of_shortage,risk_of_shortage,risk_of_shortage,not_available,risk_of_shortage,risk_of_shortage,risk_of_shortage,risk_of_shortage,risk_of_shortage,risk_of_shortage,not_available,available,not_available,partially_available,not_available,not_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,not_available,partially_available,partially_available,available,not_available,available,not_available,not_available,available,not_available,not_available,available,partially_available,available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,not_available,not_available,987.5,483.75,28.5,38.5,1.0,5.0,1.0,7.0,3.0,4.0,3.5,5.0,3.0,6.0,2.0,4.0,2.0,9.0,0.0,4.0,1.5,4.0,2.0,5.0,2.0,5.0,7.5,27.0,11.0,44.0,32.0,102.0
20,STEVEN TERTIARY HOSPITAL,-9.28385,51.93341,WESTERN,WESTERN AREA URBAN,ILLINOIS,COLLEENBURGH,community_health_centre,,government_public,,urban,True,3,2,1,2,0,0,0,0,0,2,1,3,0,4,1,7,1,1,0,1,0,Nursing Aide,hr information_education_communication surveil...,,,,,,,,,,,,,,,,,not_completed,not_completed,not_completed,not_completed,not_completed,displayed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,partially_operational,not_in_place,not_in_place,fully_operational,partially_operational,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,partially_operational,partially_operational,partially_operational,partially_operational,not_available,,,,,,,,,not_available,not_available,not_available,not_available,risk_of_shortage,risk_of_shortage,not_available,sufficient_supplies,risk_of_shortage,risk_of_shortage,risk_of_shortage,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5.0,1.0,7.0,1.5,4.0,0.5,5.0,0.0,6.0,2.0,4.0,0.0,9.0,0.0,4.0,2.0,4.0,1.5,5.0,0.5,5.0,4.0,27.0,0.0,44.0,9.0,102.0
23,ZOE HOSPITAL/CHC,-26.352292,97.523226,WESTERN,WESTERN AREA URBAN,RHODE ISLAND,COLLEENBURGH,mch_post,,government_public,,urban,True,2,3,2,0,0,0,0,0,0,0,0,2,1,0,0,3,3,0,0,0,0,,consultations hr information_education_communi...,693.0,649.0,733.0,542.0,439.0,522.0,513.0,344.0,25.0,18.0,20.0,15.0,34.0,32.0,20.0,32.0,partially_completed,partially_completed,partially_completed,partially_completed,not_completed,displayed,not_displayed,displayed,displayed,displayed,displayed,displayed,partially_operational,partially_operational,fully_operational,fully_operational,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,adequate,adequate,not_in_place,not_in_place,partial,not_in_place,not_in_place,not_in_place,not_in_place,partially_operational,not_available,risk_of_shortage,not_available,risk_of_shortage,not_available,not_available,not_available,risk_of_shortage,risk_of_shortage,partially_achieved,not_available,not_available,not_available,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,risk_of_shortage,not_available,risk_of_shortage,risk_of_shortage,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,available,not_available,available,not_available,available,not_available,not_available,not_available,available,not_available,available,not_available,available,available,available,available,not_available,not_available,available,available,not_available,available,available,available,not_available,not_available,available,not_available,partially_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,available,,,,654.25,454.5,19.5,29.5,2.0,5.0,6.0,7.0,3.0,4.0,0.0,5.0,1.5,6.0,0.5,4.0,2.0,9.0,0.5,4.0,1.0,4.0,1.5,5.0,1.0,5.0,6.0,27.0,18.0,44.0,37.0,102.0
42,ROBERT REGIONAL HOSPITAL,-15.22435,87.25965,WESTERN,WESTERN AREA URBAN,RHODE ISLAND,COLLEENBURGH,community_health_centre,,government_public,,urban,True,4,6,2,2,1,0,0,2,0,3,1,8,1,3,2,4,0,0,0,0,0,,consultations hr information_education_communi...,880.0,1050.0,911.0,652.0,352.0,544.0,657.0,250.0,40.0,41.0,96.0,28.0,0.0,70.0,52.0,0.0,completed,partially_completed,partially_completed,not_completed,not_completed,displayed,displayed,displayed,displayed,displayed,not_displayed,displayed,not_in_place,partially_operational,fully_operational,fully_operational,partially_operational,fully_operational,fully_operational,fully_operational,not_in_place,adequate,partial,adequate,partial,partial,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,not_available,not_available,not_available,risk_of_shortage,risk_of_shortage,not_available,risk_of_shortage,risk_of_shortage,risk_of_shortage,risk_of_shortage,not_available,risk_of_shortage,risk_of_shortage,risk_of_shortage,risk_of_shortage,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,partially_available,not_available,partially_available,partially_available,available,partially_available,partially_available,not_available,not_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,partially_available,not_available,not_available,873.25,450.75,51.25,30.5,2.0,5.0,6.0,7.0,2.5,4.0,3.5,5.0,2.5,6.0,0.0,4.0,0.5,9.0,0.5,4.0,1.5,4.0,2.0,5.0,1.5,5.0,6.0,27.0,9.5,44.0,32.0,102.0
16,ROBERT HOSPITAL,43.308952,134.454731,NORTH-WESTERN,PORT LOKO,KENTUCKY,VALERIEBOROUGH,community_health_centre,,government_public,,urban,True,3,4,3,1,1,0,0,0,0,1,1,0,0,0,0,3,2,0,0,0,0,,consultations,301.0,226.0,352.0,149.0,155.0,120.0,140.0,97.0,25.0,26.0,20.0,10.0,23.0,17.0,15.0,23.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,257.0,128.0,20.25,19.5,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0
33,REBEKAH TERTIARY HOSPITAL,-64.696061,-155.675806,NORTH-WESTERN,PORT LOKO,MONTANA,VALERIEBOROUGH,community_health_post,,government_public,,peri_urban,True,2,0,3,0,0,0,0,0,0,0,0,3,2,0,0,3,3,0,0,0,0,,consultations hr information_education_communi...,780.0,805.0,760.0,1044.0,363.0,448.0,340.0,403.0,40.0,37.0,28.0,29.0,34.0,39.0,35.0,35.0,not_completed,completed,not_completed,not_completed,completed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,displayed,displayed,fully_operational,not_in_place,fully_operational,fully_operational,fully_operational,not_in_place,partially_operational,fully_operational,fully_operational,partial,partial,adequate,adequate,not_in_place,not_in_place,partially_operational,fully_operational,fully_operational,fully_operational,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,available_fully_achieved,not_available,available_fully_achieved,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,sufficient_supplies,not_available,not_available,not_available,not_available,sufficient_supplies,not_available,sufficient_supplies,sufficient_supplies,not_available,available,partially_available,,partially_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_available,partially_available,not_available,not_available,not_available,available,partially_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,available,not_available,not_available,available,not_available,not_available,not_available,not_available,not_available,not_available,partially_available,partially_available,available,partially_available,available,not_available,not_available,847.25,388.5,33.5,35.75,2.0,5.0,3.0,7.0,3.0,4.0,3.5,5.0,2.5,6.0,3.5,4.0,0.0,9.0,2.5,4.0,4.0,4.0,1.0,5.0,2.0,5.0,9.5,27.0,11.0,44.0,38.0,102.0
34,ANDREW HOSPITAL/CHC,44.582128,-149.870484,NORTH-WESTERN,PORT LOKO,OREGON,VALERIEBOROUGH,community_health_post,,government_public,,peri_urban,True,1,0,2,0,0,0,0,0,0,0,0,2,1,0,0,0,0,1,1,0,0,,consultations hr information_education_communi...,288.0,243.0,283.0,141.0,189.0,145.0,145.0,90.0,11.0,7.0,16.0,2.0,12.0,11.0,26.0,0.0,not_completed,completed,partially_completed,not_completed,completed,displayed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,fully_operational,not_in_place,fully_operational,fully_operational,not_in_place,not_in_place,partially_operational,fully_operational,not_in_place,partial,partial,partial,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,not_in_place,fully_operational,risk_of_shortage,risk_of_shortage,sufficient_supplies,sufficient_supplies,sufficient_supplies,not_available,not_available,risk_of_shortage,not_available,partially_achieved,available_fully_achieved,not_available,available_fully_achieved,sufficient_supplies,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,sufficient_supplies,not_available,risk_of_shortage,not_available,not_available,available,available,,available,partially_available,not_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,partially_available,not_available,not_available,available,available,partially_available,available,not_available,available,available,available,not_available,available,partially_available,available,not_available,not_available,partially_available,available,partially_available,not_available,not_available,not_available,not_available,available,partially_available,not_available,available,available,not_available,partially_available,238.75,142.25,9.0,12.25,2.5,5.0,1.0,7.0,3.0,4.0,1.5,5.0,1.0,6.0,2.0,4.0,4.5,9.0,2.5,4.0,2.0,4.0,0.5,5.0,1.5,5.0,11.0,27.0,18.5,44.0,40.5,102.0


From the above, it looks like there are no cases where facilities in the same location have two slightly different names. 

It is also possible that some of the duplicate facility names refer to independent clinics that happen to have the same name. Let's also consider the number of consultations in the facility to get a clearer picture. 

In [342]:
df_dup_2 = df[df.duplicated(['facility_name','consultations.head_count.month_1','consultations.head_count.month_2','consultations.head_count.month_3','consultations.head_count.month_4'],keep=False)].sort_values('facility_name')
df_dup_2

Unnamed: 0_level_0,facility_name,facility_gps_lat,facility_gps_lon,region_province,district,chiefdom,facility_location,facility_type,facility_type_other,managing_authority,managing_authority_other,setting,outpatient_only,capacity.number_consultation_rooms,capacity.number_inpatient_beds,capacity.number_maternity_beds,cho_number_in_post,cho_number_present_at_visit,cht_number_in_post,cht_number_present_at_visit,cha_number_in_post,cha_number_present_at_visit,cm_number_in_post,cm_number_present_at_visit,sechn_number_in_post,sechn_number_present_at_visit,lab_tech_number_in_post,lab_tech_number_present_at_visit,mch_aides_number_in_post,mch_aides_number_present_at_visit,cleaner_porter_number_in_post,cleaner_porter_number_present_at_visit,other_number_in_post,other_number_present_at_visit,other_staff_cadre,sections_to_review,consultations.head_count.month_1,consultations.head_count.month_2,consultations.head_count.month_3,consultations.head_count.month_4,consultations.general_outpatient.month_1,consultations.general_outpatient.month_2,consultations.general_outpatient.month_3,consultations.general_outpatient.month_4,consultations.deliveries.month_1,consultations.deliveries.month_2,consultations.deliveries.month_3,consultations.deliveries.month_4,consultations.pent_vaccines.month_1,consultations.pent_vaccines.month_2,consultations.pent_vaccines.month_3,consultations.pent_vaccines.month_4,hr_focal_point,hr_staff_received_info,hr_healthcare_provider_training,hr_healthcare_provider_revised_training,hr_daily_staff_list,iec_handwashing_procedure,iec_physical_distancing,iec_covering_nose_mouth,iec_early_symptom_recognition,iec_when_facility_vs_home,iec_rational_ppe_use,iec_helpline_number,surv_procedure_for_notification,surv_official_case_definition,surv_hotline_number,surv_timely_data_reported_to_district,ter_screening_area_set_up,ter_symptom_screening_questionnaires,ter_temperature_measurement_at_triage,ter_physical_distancing_in_waiting,ter_separate_waiting_for_symptomatic,chw_trained_precautions,chw_trained_community_service,chw_drugs,chw_gloves,chw_masks,chw_iec_materials,iso_designated_isolation_for_suspected,iso_distance_between_patients_in_waiting,iso_distance_between_patient_beds,iso_transfer_referral_protocol,ipc_ppe_medical_masks,ipc_ppe_disp_surgical_masks,ipc_ppe_eye_protection,ipc_ppe_examination_gloves,ipc_ppe_surgical_gloves,ipc_ppe_long_cuffed_gloves,ipc_ppe_heavy_duty_gloves,ipc_ppe_long_sleeved_gown,ipc_ppe_waterproof_aprons,ipc_ppe_plan_staff_trained_on_ppe,ipc_ppe_plan_ppe_poster_displayed,ipc_ppe_plan_fit_test_kit,ipc_ppe_plan_contingency_for_shortages,ipc_wcd_colour_coded_bins,ipc_wcd_clinical_waste_bags,ipc_wcd_laundry_receptacles_at_patient_rooms,ipc_wcd_incinerator,ipc_wash_clean_running_water,ipc_wash_hand_soap,ipc_wash_liquid_soap,ipc_wash_disp_hand_towels,ipc_wash_alcohol_based_hand_gel,ipc_ds_protocol_facility_disinfection,ipc_ds_protocol_equipment_sterilisation,ipc_ds_environmental_disinfectant,ipc_ds_cleaning_schedule_in_toilets,ipc_ds_protocol_corpse_handling,log_referral_plan,log_cellphone_landline_swradio,log_tracer_drugs,log_albendazole,log_amoxicillin,log_ampicillin,log_chlorhexidine_5,log_chlorhexidine_7,log_gentamicin,log_folic,log_ferrous_and_folic,log_compound_sodium,log_co_trimoxazole_400,log_co_trimoxazole_200,log_metronidazole_250,log_metronidazole_200,log_methyldopa,log_magnesium_sulphate,log_lidocaine,log_ibuprofen,log_surgical_spirit,log_sodium_chloride,log_povidone,log_paracetamol_500,log_paracetamol_250,log_ors,log_gauze,log_cotton_wool,log_cannula_iv_20,log_cannula_iv_24,log_zinc_sulphate,log_water,log_needle_23,log_needle_21,log_glove_giving,log_glove_surgical,log_glove_gyn,log_glove_exam,log_tape,log_syringe,log_oxytocin,log_diazepam,log_misoprostol,log_glucose,consultations.head_count.monthly_average_head_counts,consultations.general_outpatient.monthly_average_general_outpatient,consultations.deliveries.monthly_average_deliveries,consultations.pent_vaccines.monthly_average_pent_vaccines,score_human_resources,score_max_human_resources,score_information_education_communication,score_max_information_education_communication,score_surveillance,score_max_surveillance,score_triage_and_early_recognition,score_max_triage_and_early_recognition,score_chw,score_max_chw,score_isolation,score_max_isolation,score_infection_prevention_and_control_ppe,score_max_infection_prevention_and_control_ppe,score_infection_prevention_and_control_ppe_plan,score_max_infection_prevention_and_control_ppe_plan,score_infection_prevention_and_control_waste_collection_and_disposal,score_max_infection_prevention_and_control_waste_collection_and_disposal,score_infection_prevention_and_control_water_sanitation_and_hygiene,score_max_infection_prevention_and_control_water_sanitation_and_hygiene,score_infection_prevention_and_control_disinfection_and_sterilization,score_max_infection_prevention_and_control_disinfection_and_sterilization,score_infection_prevention_and_control,score_max_infection_prevention_and_control,question1.score_logistics_patient_and_sample_transfer,question1.score_max_logistics_patient_and_sample_transfer,score_total,score_max_total
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1
25,JOANNA CLINIC,15.754015,-153.871776,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,rural,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388,153,180,183,133,30,21,55,18,10,14,14,17,18,12,19,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,partially_operational,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,partial,not_in_place,,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,partially_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,partially_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,0.5,4.0,1.0,5.0,0.5,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,12.0,44.0,22.0,102.0
47,JOANNA CLINIC,60.190934,162.787117,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,peri_urban,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388,153,180,183,133,30,21,55,18,10,14,14,17,18,12,19,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,fully_operational,fully_operational,partially_operational,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,not_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,2.0,4.0,1.5,5.0,0.0,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,11.0,44.0,22.5,102.0


It's very unlikely that the consultation headcount numbers for four months would be exactly the same between two identically named clinics. These are therefore likely to truly refer to the same facility.



In [343]:
len(df_dup_2)

2

Our number of duplicates has now dropped lower. But maybe there are still some very similar entries where one or more of the consultation headcounts differ. We have not recorded those in df_dup_2. Let's search for matches with regard to a number of different column combinations and merge the resulting dataframes into one. 

We consider matching facility names and facility locations first.

In [344]:
df_dup_3 = df[df.duplicated(['facility_name','facility_location'],keep=False)].sort_values('facility_name')
df_dup_3

Unnamed: 0_level_0,facility_name,facility_gps_lat,facility_gps_lon,region_province,district,chiefdom,facility_location,facility_type,facility_type_other,managing_authority,managing_authority_other,setting,outpatient_only,capacity.number_consultation_rooms,capacity.number_inpatient_beds,capacity.number_maternity_beds,cho_number_in_post,cho_number_present_at_visit,cht_number_in_post,cht_number_present_at_visit,cha_number_in_post,cha_number_present_at_visit,cm_number_in_post,cm_number_present_at_visit,sechn_number_in_post,sechn_number_present_at_visit,lab_tech_number_in_post,lab_tech_number_present_at_visit,mch_aides_number_in_post,mch_aides_number_present_at_visit,cleaner_porter_number_in_post,cleaner_porter_number_present_at_visit,other_number_in_post,other_number_present_at_visit,other_staff_cadre,sections_to_review,consultations.head_count.month_1,consultations.head_count.month_2,consultations.head_count.month_3,consultations.head_count.month_4,consultations.general_outpatient.month_1,consultations.general_outpatient.month_2,consultations.general_outpatient.month_3,consultations.general_outpatient.month_4,consultations.deliveries.month_1,consultations.deliveries.month_2,consultations.deliveries.month_3,consultations.deliveries.month_4,consultations.pent_vaccines.month_1,consultations.pent_vaccines.month_2,consultations.pent_vaccines.month_3,consultations.pent_vaccines.month_4,hr_focal_point,hr_staff_received_info,hr_healthcare_provider_training,hr_healthcare_provider_revised_training,hr_daily_staff_list,iec_handwashing_procedure,iec_physical_distancing,iec_covering_nose_mouth,iec_early_symptom_recognition,iec_when_facility_vs_home,iec_rational_ppe_use,iec_helpline_number,surv_procedure_for_notification,surv_official_case_definition,surv_hotline_number,surv_timely_data_reported_to_district,ter_screening_area_set_up,ter_symptom_screening_questionnaires,ter_temperature_measurement_at_triage,ter_physical_distancing_in_waiting,ter_separate_waiting_for_symptomatic,chw_trained_precautions,chw_trained_community_service,chw_drugs,chw_gloves,chw_masks,chw_iec_materials,iso_designated_isolation_for_suspected,iso_distance_between_patients_in_waiting,iso_distance_between_patient_beds,iso_transfer_referral_protocol,ipc_ppe_medical_masks,ipc_ppe_disp_surgical_masks,ipc_ppe_eye_protection,ipc_ppe_examination_gloves,ipc_ppe_surgical_gloves,ipc_ppe_long_cuffed_gloves,ipc_ppe_heavy_duty_gloves,ipc_ppe_long_sleeved_gown,ipc_ppe_waterproof_aprons,ipc_ppe_plan_staff_trained_on_ppe,ipc_ppe_plan_ppe_poster_displayed,ipc_ppe_plan_fit_test_kit,ipc_ppe_plan_contingency_for_shortages,ipc_wcd_colour_coded_bins,ipc_wcd_clinical_waste_bags,ipc_wcd_laundry_receptacles_at_patient_rooms,ipc_wcd_incinerator,ipc_wash_clean_running_water,ipc_wash_hand_soap,ipc_wash_liquid_soap,ipc_wash_disp_hand_towels,ipc_wash_alcohol_based_hand_gel,ipc_ds_protocol_facility_disinfection,ipc_ds_protocol_equipment_sterilisation,ipc_ds_environmental_disinfectant,ipc_ds_cleaning_schedule_in_toilets,ipc_ds_protocol_corpse_handling,log_referral_plan,log_cellphone_landline_swradio,log_tracer_drugs,log_albendazole,log_amoxicillin,log_ampicillin,log_chlorhexidine_5,log_chlorhexidine_7,log_gentamicin,log_folic,log_ferrous_and_folic,log_compound_sodium,log_co_trimoxazole_400,log_co_trimoxazole_200,log_metronidazole_250,log_metronidazole_200,log_methyldopa,log_magnesium_sulphate,log_lidocaine,log_ibuprofen,log_surgical_spirit,log_sodium_chloride,log_povidone,log_paracetamol_500,log_paracetamol_250,log_ors,log_gauze,log_cotton_wool,log_cannula_iv_20,log_cannula_iv_24,log_zinc_sulphate,log_water,log_needle_23,log_needle_21,log_glove_giving,log_glove_surgical,log_glove_gyn,log_glove_exam,log_tape,log_syringe,log_oxytocin,log_diazepam,log_misoprostol,log_glucose,consultations.head_count.monthly_average_head_counts,consultations.general_outpatient.monthly_average_general_outpatient,consultations.deliveries.monthly_average_deliveries,consultations.pent_vaccines.monthly_average_pent_vaccines,score_human_resources,score_max_human_resources,score_information_education_communication,score_max_information_education_communication,score_surveillance,score_max_surveillance,score_triage_and_early_recognition,score_max_triage_and_early_recognition,score_chw,score_max_chw,score_isolation,score_max_isolation,score_infection_prevention_and_control_ppe,score_max_infection_prevention_and_control_ppe,score_infection_prevention_and_control_ppe_plan,score_max_infection_prevention_and_control_ppe_plan,score_infection_prevention_and_control_waste_collection_and_disposal,score_max_infection_prevention_and_control_waste_collection_and_disposal,score_infection_prevention_and_control_water_sanitation_and_hygiene,score_max_infection_prevention_and_control_water_sanitation_and_hygiene,score_infection_prevention_and_control_disinfection_and_sterilization,score_max_infection_prevention_and_control_disinfection_and_sterilization,score_infection_prevention_and_control,score_max_infection_prevention_and_control,question1.score_logistics_patient_and_sample_transfer,question1.score_max_logistics_patient_and_sample_transfer,score_total,score_max_total
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1
25,JOANNA CLINIC,15.754015,-153.871776,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,rural,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388,153,180,183,133,30,21,55,18,10,14,14,17,18,12,19,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,partially_operational,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,partial,not_in_place,,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,partially_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,partially_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,0.5,4.0,1.0,5.0,0.5,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,12.0,44.0,22.0,102.0
47,JOANNA CLINIC,60.190934,162.787117,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,peri_urban,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388,153,180,183,133,30,21,55,18,10,14,14,17,18,12,19,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,fully_operational,fully_operational,partially_operational,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,not_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,2.0,4.0,1.5,5.0,0.0,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,11.0,44.0,22.5,102.0


In [345]:
len(df_dup_3)

2

Now we consider cases where at least one of the months' consultation headcounts match. This seems unlikely enough that it's worth looking into.

In [346]:
# considering rows where consultation headcount month 1 matches
df_dup_4 = df[df.duplicated(['facility_name','consultations.head_count.month_1'],keep=False)].sort_values('facility_name')
# considering rows where consultation headcount month 2 matches
df_dup_5 = df[df.duplicated(['facility_name','consultations.head_count.month_2'],keep=False)].sort_values('facility_name')
# considering rows where consultation headcount month 3 matches
df_dup_6 = df[df.duplicated(['facility_name','consultations.head_count.month_3'],keep=False)].sort_values('facility_name')
# considering rows where consultation headcount month 4 matches
df_dup_7 = df[df.duplicated(['facility_name','consultations.head_count.month_4'],keep=False)].sort_values('facility_name')

There will be a lot of overlap among df_dup_3, df_dup_4, df_dup_5, df_dup_6, and df_dup_7. We want to save all the unique entries across these dataframes, without having extra duplicates. We start by merging the duplicate candidates into one dataframe, in the process creating lots of duplicates. Those duplicates need to be deleted later.  

In [347]:
df_dup_potential_redundant = pd.concat([df_dup_3,df_dup_4,df_dup_5,df_dup_6,df_dup_7])

We now find and delete the duplicate indices. 

In [348]:
df_dup_potential = df_dup_potential_redundant[~df_dup_potential_redundant.index.duplicated(keep='first')]
#dup_indices = df_dup_potential_redundant.index.duplicated(keep='first')
#df_dup_potential = df_dup_potential_redundant.drop(df_dup_potential_redundant[dup_indices].index)
df_dup_potential

Unnamed: 0_level_0,facility_name,facility_gps_lat,facility_gps_lon,region_province,district,chiefdom,facility_location,facility_type,facility_type_other,managing_authority,managing_authority_other,setting,outpatient_only,capacity.number_consultation_rooms,capacity.number_inpatient_beds,capacity.number_maternity_beds,cho_number_in_post,cho_number_present_at_visit,cht_number_in_post,cht_number_present_at_visit,cha_number_in_post,cha_number_present_at_visit,cm_number_in_post,cm_number_present_at_visit,sechn_number_in_post,sechn_number_present_at_visit,lab_tech_number_in_post,lab_tech_number_present_at_visit,mch_aides_number_in_post,mch_aides_number_present_at_visit,cleaner_porter_number_in_post,cleaner_porter_number_present_at_visit,other_number_in_post,other_number_present_at_visit,other_staff_cadre,sections_to_review,consultations.head_count.month_1,consultations.head_count.month_2,consultations.head_count.month_3,consultations.head_count.month_4,consultations.general_outpatient.month_1,consultations.general_outpatient.month_2,consultations.general_outpatient.month_3,consultations.general_outpatient.month_4,consultations.deliveries.month_1,consultations.deliveries.month_2,consultations.deliveries.month_3,consultations.deliveries.month_4,consultations.pent_vaccines.month_1,consultations.pent_vaccines.month_2,consultations.pent_vaccines.month_3,consultations.pent_vaccines.month_4,hr_focal_point,hr_staff_received_info,hr_healthcare_provider_training,hr_healthcare_provider_revised_training,hr_daily_staff_list,iec_handwashing_procedure,iec_physical_distancing,iec_covering_nose_mouth,iec_early_symptom_recognition,iec_when_facility_vs_home,iec_rational_ppe_use,iec_helpline_number,surv_procedure_for_notification,surv_official_case_definition,surv_hotline_number,surv_timely_data_reported_to_district,ter_screening_area_set_up,ter_symptom_screening_questionnaires,ter_temperature_measurement_at_triage,ter_physical_distancing_in_waiting,ter_separate_waiting_for_symptomatic,chw_trained_precautions,chw_trained_community_service,chw_drugs,chw_gloves,chw_masks,chw_iec_materials,iso_designated_isolation_for_suspected,iso_distance_between_patients_in_waiting,iso_distance_between_patient_beds,iso_transfer_referral_protocol,ipc_ppe_medical_masks,ipc_ppe_disp_surgical_masks,ipc_ppe_eye_protection,ipc_ppe_examination_gloves,ipc_ppe_surgical_gloves,ipc_ppe_long_cuffed_gloves,ipc_ppe_heavy_duty_gloves,ipc_ppe_long_sleeved_gown,ipc_ppe_waterproof_aprons,ipc_ppe_plan_staff_trained_on_ppe,ipc_ppe_plan_ppe_poster_displayed,ipc_ppe_plan_fit_test_kit,ipc_ppe_plan_contingency_for_shortages,ipc_wcd_colour_coded_bins,ipc_wcd_clinical_waste_bags,ipc_wcd_laundry_receptacles_at_patient_rooms,ipc_wcd_incinerator,ipc_wash_clean_running_water,ipc_wash_hand_soap,ipc_wash_liquid_soap,ipc_wash_disp_hand_towels,ipc_wash_alcohol_based_hand_gel,ipc_ds_protocol_facility_disinfection,ipc_ds_protocol_equipment_sterilisation,ipc_ds_environmental_disinfectant,ipc_ds_cleaning_schedule_in_toilets,ipc_ds_protocol_corpse_handling,log_referral_plan,log_cellphone_landline_swradio,log_tracer_drugs,log_albendazole,log_amoxicillin,log_ampicillin,log_chlorhexidine_5,log_chlorhexidine_7,log_gentamicin,log_folic,log_ferrous_and_folic,log_compound_sodium,log_co_trimoxazole_400,log_co_trimoxazole_200,log_metronidazole_250,log_metronidazole_200,log_methyldopa,log_magnesium_sulphate,log_lidocaine,log_ibuprofen,log_surgical_spirit,log_sodium_chloride,log_povidone,log_paracetamol_500,log_paracetamol_250,log_ors,log_gauze,log_cotton_wool,log_cannula_iv_20,log_cannula_iv_24,log_zinc_sulphate,log_water,log_needle_23,log_needle_21,log_glove_giving,log_glove_surgical,log_glove_gyn,log_glove_exam,log_tape,log_syringe,log_oxytocin,log_diazepam,log_misoprostol,log_glucose,consultations.head_count.monthly_average_head_counts,consultations.general_outpatient.monthly_average_general_outpatient,consultations.deliveries.monthly_average_deliveries,consultations.pent_vaccines.monthly_average_pent_vaccines,score_human_resources,score_max_human_resources,score_information_education_communication,score_max_information_education_communication,score_surveillance,score_max_surveillance,score_triage_and_early_recognition,score_max_triage_and_early_recognition,score_chw,score_max_chw,score_isolation,score_max_isolation,score_infection_prevention_and_control_ppe,score_max_infection_prevention_and_control_ppe,score_infection_prevention_and_control_ppe_plan,score_max_infection_prevention_and_control_ppe_plan,score_infection_prevention_and_control_waste_collection_and_disposal,score_max_infection_prevention_and_control_waste_collection_and_disposal,score_infection_prevention_and_control_water_sanitation_and_hygiene,score_max_infection_prevention_and_control_water_sanitation_and_hygiene,score_infection_prevention_and_control_disinfection_and_sterilization,score_max_infection_prevention_and_control_disinfection_and_sterilization,score_infection_prevention_and_control,score_max_infection_prevention_and_control,question1.score_logistics_patient_and_sample_transfer,question1.score_max_logistics_patient_and_sample_transfer,score_total,score_max_total
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1
25,JOANNA CLINIC,15.754015,-153.871776,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,rural,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388,153,180,183,133,30,21,55,18,10,14,14,17,18,12,19,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,partially_operational,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,partial,not_in_place,,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,partially_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,partially_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,0.5,4.0,1.0,5.0,0.5,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,12.0,44.0,22.0,102.0
47,JOANNA CLINIC,60.190934,162.787117,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,peri_urban,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388,153,180,183,133,30,21,55,18,10,14,14,17,18,12,19,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,fully_operational,fully_operational,partially_operational,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,not_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,2.0,4.0,1.5,5.0,0.0,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,11.0,44.0,22.5,102.0


I'm confident that the duplicates in df_dup_2 are genuine duplicates, so I'm going to remove them from df_dup_potential to make it easier to examine the more dubious cases. 

In [349]:
dup_2_indices = list(df_dup_2.index)

In [350]:
df_dup_potential_2 = df_dup_potential.drop(dup_2_indices,axis=0)

In [351]:
df_dup_potential_2

Unnamed: 0_level_0,facility_name,facility_gps_lat,facility_gps_lon,region_province,district,chiefdom,facility_location,facility_type,facility_type_other,managing_authority,managing_authority_other,setting,outpatient_only,capacity.number_consultation_rooms,capacity.number_inpatient_beds,capacity.number_maternity_beds,cho_number_in_post,cho_number_present_at_visit,cht_number_in_post,cht_number_present_at_visit,cha_number_in_post,cha_number_present_at_visit,cm_number_in_post,cm_number_present_at_visit,sechn_number_in_post,sechn_number_present_at_visit,lab_tech_number_in_post,lab_tech_number_present_at_visit,mch_aides_number_in_post,mch_aides_number_present_at_visit,cleaner_porter_number_in_post,cleaner_porter_number_present_at_visit,other_number_in_post,other_number_present_at_visit,other_staff_cadre,sections_to_review,consultations.head_count.month_1,consultations.head_count.month_2,consultations.head_count.month_3,consultations.head_count.month_4,consultations.general_outpatient.month_1,consultations.general_outpatient.month_2,consultations.general_outpatient.month_3,consultations.general_outpatient.month_4,consultations.deliveries.month_1,consultations.deliveries.month_2,consultations.deliveries.month_3,consultations.deliveries.month_4,consultations.pent_vaccines.month_1,consultations.pent_vaccines.month_2,consultations.pent_vaccines.month_3,consultations.pent_vaccines.month_4,hr_focal_point,hr_staff_received_info,hr_healthcare_provider_training,hr_healthcare_provider_revised_training,hr_daily_staff_list,iec_handwashing_procedure,iec_physical_distancing,iec_covering_nose_mouth,iec_early_symptom_recognition,iec_when_facility_vs_home,iec_rational_ppe_use,iec_helpline_number,surv_procedure_for_notification,surv_official_case_definition,surv_hotline_number,surv_timely_data_reported_to_district,ter_screening_area_set_up,ter_symptom_screening_questionnaires,ter_temperature_measurement_at_triage,ter_physical_distancing_in_waiting,ter_separate_waiting_for_symptomatic,chw_trained_precautions,chw_trained_community_service,chw_drugs,chw_gloves,chw_masks,chw_iec_materials,iso_designated_isolation_for_suspected,iso_distance_between_patients_in_waiting,iso_distance_between_patient_beds,iso_transfer_referral_protocol,ipc_ppe_medical_masks,ipc_ppe_disp_surgical_masks,ipc_ppe_eye_protection,ipc_ppe_examination_gloves,ipc_ppe_surgical_gloves,ipc_ppe_long_cuffed_gloves,ipc_ppe_heavy_duty_gloves,ipc_ppe_long_sleeved_gown,ipc_ppe_waterproof_aprons,ipc_ppe_plan_staff_trained_on_ppe,ipc_ppe_plan_ppe_poster_displayed,ipc_ppe_plan_fit_test_kit,ipc_ppe_plan_contingency_for_shortages,ipc_wcd_colour_coded_bins,ipc_wcd_clinical_waste_bags,ipc_wcd_laundry_receptacles_at_patient_rooms,ipc_wcd_incinerator,ipc_wash_clean_running_water,ipc_wash_hand_soap,ipc_wash_liquid_soap,ipc_wash_disp_hand_towels,ipc_wash_alcohol_based_hand_gel,ipc_ds_protocol_facility_disinfection,ipc_ds_protocol_equipment_sterilisation,ipc_ds_environmental_disinfectant,ipc_ds_cleaning_schedule_in_toilets,ipc_ds_protocol_corpse_handling,log_referral_plan,log_cellphone_landline_swradio,log_tracer_drugs,log_albendazole,log_amoxicillin,log_ampicillin,log_chlorhexidine_5,log_chlorhexidine_7,log_gentamicin,log_folic,log_ferrous_and_folic,log_compound_sodium,log_co_trimoxazole_400,log_co_trimoxazole_200,log_metronidazole_250,log_metronidazole_200,log_methyldopa,log_magnesium_sulphate,log_lidocaine,log_ibuprofen,log_surgical_spirit,log_sodium_chloride,log_povidone,log_paracetamol_500,log_paracetamol_250,log_ors,log_gauze,log_cotton_wool,log_cannula_iv_20,log_cannula_iv_24,log_zinc_sulphate,log_water,log_needle_23,log_needle_21,log_glove_giving,log_glove_surgical,log_glove_gyn,log_glove_exam,log_tape,log_syringe,log_oxytocin,log_diazepam,log_misoprostol,log_glucose,consultations.head_count.monthly_average_head_counts,consultations.general_outpatient.monthly_average_general_outpatient,consultations.deliveries.monthly_average_deliveries,consultations.pent_vaccines.monthly_average_pent_vaccines,score_human_resources,score_max_human_resources,score_information_education_communication,score_max_information_education_communication,score_surveillance,score_max_surveillance,score_triage_and_early_recognition,score_max_triage_and_early_recognition,score_chw,score_max_chw,score_isolation,score_max_isolation,score_infection_prevention_and_control_ppe,score_max_infection_prevention_and_control_ppe,score_infection_prevention_and_control_ppe_plan,score_max_infection_prevention_and_control_ppe_plan,score_infection_prevention_and_control_waste_collection_and_disposal,score_max_infection_prevention_and_control_waste_collection_and_disposal,score_infection_prevention_and_control_water_sanitation_and_hygiene,score_max_infection_prevention_and_control_water_sanitation_and_hygiene,score_infection_prevention_and_control_disinfection_and_sterilization,score_max_infection_prevention_and_control_disinfection_and_sterilization,score_infection_prevention_and_control,score_max_infection_prevention_and_control,question1.score_logistics_patient_and_sample_transfer,question1.score_max_logistics_patient_and_sample_transfer,score_total,score_max_total
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1


Now we're left with only four more potential duplicates to investigate.

For each potential match, let's make a mini dataframe.

In [352]:
display(df_dup_potential_2['facility_name'].isnull())

Series([], Name: facility_name, dtype: bool)

In [353]:
mini_dfs = []
mini_df_dict = {}
clinic_names = df_dup_potential_2['facility_name'].unique()
for name in clinic_names:
    if not pd.isnull(name): # if the facility name is not null
        mini_df = df_dup_potential_2[df_dup_potential_2['facility_name']==name]
    else: # when the facility name is null   
        mini_df = df_dup_potential_2[df_dup_potential_2['facility_name'].isnull()]
    mini_df_dict[name]=mini_df
    mini_dfs.append(mini_df)

In [354]:
for i,mini_df in enumerate(mini_dfs):
    facility_name = mini_df['facility_name'].iloc[0]
    print(f'Facility name: {facility_name}')
    indices = list(mini_df.index)
    print('Where the rows differ: ')
    display(mini_df[mini_df.columns[(mini_df.loc[indices[0]] != mini_df.loc[indices[1]]) & (~(pd.isnull(mini_df.loc[indices[0]]) & pd.isnull(mini_df.loc[indices[1]])))]])
    print('Where the rows are the same: ')
    display(mini_df[mini_df.columns[(mini_df.loc[indices[0]] == mini_df.loc[indices[1]]) | (pd.isnull(mini_df.loc[indices[0]]) & pd.isnull(mini_df.loc[indices[1]]))]])

* JOHN CLINIC: The two rows are sufficiently different that I'm not sure enough that this is the same clinic to merge the rows into one or delete one of them. I therefore leave them as is. 
* KEVIN HOSPITAL: The two rows match in enough places that they are probably referring to the same clinic. They still differ in many places, however. I will merge them, but record both originals in a separate file and make the merging easy to comment out.  
* RICHARD MCHP: The two rows match in enough places that they are probably referring to the same clinic. They still differ in many places, however. I will merge them, but record both originals in a separate file and make the merging easy to comment out.  
* N/A: The rows are very different, so I leave these as two facilities. 


### Merging rows

I will merge the following rows:
* KEVIN HOSPITAL (indices 133 and 39)
* RICHARD MCHP (indices 96 and 97)
* All the facility name pairs in df_dup_2

Before doing any merging, I will export the duplicates to a csv file so that the machine learning engineer can examine it. 

In [356]:
df_dup = df_dup_2#pd.concat([df_dup_2,mini_df_dict['KEVIN HOSPITAL'],mini_df_dict['RICHARD MCHP']])
print(f'Duplicated indices in df_dup: {sum(df_dup.index.duplicated())}')
df_dup.to_csv('similar_rows.csv')
df_dup

Duplicated indices in df_dup: 0


Unnamed: 0_level_0,facility_name,facility_gps_lat,facility_gps_lon,region_province,district,chiefdom,facility_location,facility_type,facility_type_other,managing_authority,managing_authority_other,setting,outpatient_only,capacity.number_consultation_rooms,capacity.number_inpatient_beds,capacity.number_maternity_beds,cho_number_in_post,cho_number_present_at_visit,cht_number_in_post,cht_number_present_at_visit,cha_number_in_post,cha_number_present_at_visit,cm_number_in_post,cm_number_present_at_visit,sechn_number_in_post,sechn_number_present_at_visit,lab_tech_number_in_post,lab_tech_number_present_at_visit,mch_aides_number_in_post,mch_aides_number_present_at_visit,cleaner_porter_number_in_post,cleaner_porter_number_present_at_visit,other_number_in_post,other_number_present_at_visit,other_staff_cadre,sections_to_review,consultations.head_count.month_1,consultations.head_count.month_2,consultations.head_count.month_3,consultations.head_count.month_4,consultations.general_outpatient.month_1,consultations.general_outpatient.month_2,consultations.general_outpatient.month_3,consultations.general_outpatient.month_4,consultations.deliveries.month_1,consultations.deliveries.month_2,consultations.deliveries.month_3,consultations.deliveries.month_4,consultations.pent_vaccines.month_1,consultations.pent_vaccines.month_2,consultations.pent_vaccines.month_3,consultations.pent_vaccines.month_4,hr_focal_point,hr_staff_received_info,hr_healthcare_provider_training,hr_healthcare_provider_revised_training,hr_daily_staff_list,iec_handwashing_procedure,iec_physical_distancing,iec_covering_nose_mouth,iec_early_symptom_recognition,iec_when_facility_vs_home,iec_rational_ppe_use,iec_helpline_number,surv_procedure_for_notification,surv_official_case_definition,surv_hotline_number,surv_timely_data_reported_to_district,ter_screening_area_set_up,ter_symptom_screening_questionnaires,ter_temperature_measurement_at_triage,ter_physical_distancing_in_waiting,ter_separate_waiting_for_symptomatic,chw_trained_precautions,chw_trained_community_service,chw_drugs,chw_gloves,chw_masks,chw_iec_materials,iso_designated_isolation_for_suspected,iso_distance_between_patients_in_waiting,iso_distance_between_patient_beds,iso_transfer_referral_protocol,ipc_ppe_medical_masks,ipc_ppe_disp_surgical_masks,ipc_ppe_eye_protection,ipc_ppe_examination_gloves,ipc_ppe_surgical_gloves,ipc_ppe_long_cuffed_gloves,ipc_ppe_heavy_duty_gloves,ipc_ppe_long_sleeved_gown,ipc_ppe_waterproof_aprons,ipc_ppe_plan_staff_trained_on_ppe,ipc_ppe_plan_ppe_poster_displayed,ipc_ppe_plan_fit_test_kit,ipc_ppe_plan_contingency_for_shortages,ipc_wcd_colour_coded_bins,ipc_wcd_clinical_waste_bags,ipc_wcd_laundry_receptacles_at_patient_rooms,ipc_wcd_incinerator,ipc_wash_clean_running_water,ipc_wash_hand_soap,ipc_wash_liquid_soap,ipc_wash_disp_hand_towels,ipc_wash_alcohol_based_hand_gel,ipc_ds_protocol_facility_disinfection,ipc_ds_protocol_equipment_sterilisation,ipc_ds_environmental_disinfectant,ipc_ds_cleaning_schedule_in_toilets,ipc_ds_protocol_corpse_handling,log_referral_plan,log_cellphone_landline_swradio,log_tracer_drugs,log_albendazole,log_amoxicillin,log_ampicillin,log_chlorhexidine_5,log_chlorhexidine_7,log_gentamicin,log_folic,log_ferrous_and_folic,log_compound_sodium,log_co_trimoxazole_400,log_co_trimoxazole_200,log_metronidazole_250,log_metronidazole_200,log_methyldopa,log_magnesium_sulphate,log_lidocaine,log_ibuprofen,log_surgical_spirit,log_sodium_chloride,log_povidone,log_paracetamol_500,log_paracetamol_250,log_ors,log_gauze,log_cotton_wool,log_cannula_iv_20,log_cannula_iv_24,log_zinc_sulphate,log_water,log_needle_23,log_needle_21,log_glove_giving,log_glove_surgical,log_glove_gyn,log_glove_exam,log_tape,log_syringe,log_oxytocin,log_diazepam,log_misoprostol,log_glucose,consultations.head_count.monthly_average_head_counts,consultations.general_outpatient.monthly_average_general_outpatient,consultations.deliveries.monthly_average_deliveries,consultations.pent_vaccines.monthly_average_pent_vaccines,score_human_resources,score_max_human_resources,score_information_education_communication,score_max_information_education_communication,score_surveillance,score_max_surveillance,score_triage_and_early_recognition,score_max_triage_and_early_recognition,score_chw,score_max_chw,score_isolation,score_max_isolation,score_infection_prevention_and_control_ppe,score_max_infection_prevention_and_control_ppe,score_infection_prevention_and_control_ppe_plan,score_max_infection_prevention_and_control_ppe_plan,score_infection_prevention_and_control_waste_collection_and_disposal,score_max_infection_prevention_and_control_waste_collection_and_disposal,score_infection_prevention_and_control_water_sanitation_and_hygiene,score_max_infection_prevention_and_control_water_sanitation_and_hygiene,score_infection_prevention_and_control_disinfection_and_sterilization,score_max_infection_prevention_and_control_disinfection_and_sterilization,score_infection_prevention_and_control,score_max_infection_prevention_and_control,question1.score_logistics_patient_and_sample_transfer,question1.score_max_logistics_patient_and_sample_transfer,score_total,score_max_total
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1
25,JOANNA CLINIC,15.754015,-153.871776,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,rural,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388,153,180,183,133,30,21,55,18,10,14,14,17,18,12,19,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,partially_operational,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,partial,not_in_place,,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,partially_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,partially_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,0.5,4.0,1.0,5.0,0.5,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,12.0,44.0,22.0,102.0
47,JOANNA CLINIC,60.190934,162.787117,SOUTHERN,BO,MISSISSIPPI,AMBERCHESTER,mch_post,,government_public,,peri_urban,True,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,1,4,0,TBA's,consultations hr information_education_communi...,388,153,180,183,133,30,21,55,18,10,14,14,17,18,12,19,not_completed,partially_completed,not_completed,not_completed,not_completed,not_displayed,not_displayed,not_displayed,not_displayed,not_displayed,displayed,not_displayed,not_in_place,not_in_place,fully_operational,fully_operational,partially_operational,not_in_place,not_in_place,fully_operational,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,not_in_place,fully_operational,fully_operational,fully_operational,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,not_available,not_available,not_available,not_available,partially_achieved,risk_of_shortage,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,not_available,not_available,not_available,risk_of_shortage,not_available,not_available,risk_of_shortage,not_available,available,not_available,not_available,available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,not_available,not_available,not_available,partially_available,not_available,partially_available,partially_available,partially_available,not_available,not_available,partially_available,not_available,not_available,not_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,not_available,partially_available,not_available,not_available,partially_available,partially_available,partially_available,partially_available,available,not_available,not_available,226.0,59.75,14.0,16.5,0.5,5.0,1.0,7.0,2.0,4.0,1.5,5.0,0.0,6.0,3.0,4.0,1.0,9.0,0.5,4.0,1.0,4.0,0.5,5.0,0.5,5.0,3.5,27.0,11.0,44.0,22.5,102.0


Deciding how to merge these rows is tricky. There is no obviously correct way to decide which row of a pair is more valid. I'll therefore write two functions: 

(1) A function that takes the first row of each pair when neither value is NaN, and takes the non-NaN value when one is NaN


(2) A function that does some more complex merging. 

In [373]:
def merge_super_simple(df_dup):
    
    
    df_dup = df_dup.groupby('facility_name', as_index=False).first()#.agg({'Hobbies' : 'first', 'Name' : ' '.join})
    
    return df_dup
def merge_simple(df_dup):
    
    
    df_dup = df_dup.groupby('facility_name', as_index=False).apply(lambda x: x, axis=1)#.agg({'Hobbies' : 'first', 'Name' : ' '.join})
    display(df_dup)
    """names = df_dup['facility_name'].unique()
    for i,name in enumerate(names):
        # remember to deal with NA name
        mini_df = df_dup[df_dup['facility_name']==name]
        indices = list(mini_df.index)
        display(mini_df)
        
        combo_series = mini_df.loc[indices[0]].combine_first(mini_df.loc[indices[1]])
        
    
    
        df_dup = pd.concat([df_dup, combo_series.to_frame().T])
        display(df_dup)
        #df = df.drop([48,87],axis=0)
        #df[df['facility_name']=='AMY CHP']
    """
    return df_dup

In [371]:
def merge_complex(df_dup):
    return df_dup

In [372]:
df_dup_simple = merge_simple(df_dup)
df_dup_simple.to_csv('merged_rows_simple.csv')
df_dup_complex = merge_complex(df_dup)
df_dup_complex.to_csv('merged_rows_complex.csv')

TypeError: merge_simple.<locals>.<lambda>() got an unexpected keyword argument 'axis'

In [360]:
def make_final_df(df,df_dup):
    # drop all rows in df whose indices are in dp_dup
    # merge df and df_dup
    return df

In [None]:
df = make_final_df(df,df_dup_simple) # edit the second parameter to use the duplicates dataframe of your choice

## Export final dataframe

In [None]:
df.to_csv('health_facility_assessment_cleaned.csv', header=True, index=True, index_label=None)