In [4]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
import requests
from tqdm import tqdm

# load 
pop = pd.read_csv("census_datacleaning/census_population_with_coords.csv")
fac = pd.read_csv("facilities_with_warehouses.csv")

# clean 
pop = pop.dropna(subset=["latitude", "longitude"])
fac = fac.dropna(subset=["Latitude", "Longitude"])
fac.columns = fac.columns.str.strip()
fac.rename(columns={"Latitude": "latitude", "Longitude": "longitude"}, inplace=True)

# normalize population columns to lowercase for consistency
pop.columns = pop.columns.str.strip().str.lower().str.replace(" ", "_").str.replace("/", "_")
pop.rename(columns={"city_town_village": "city/town/village", "census_district": "district"}, inplace=True)

# normalize and filter facility names
fac["Service Delivery Type"] = (
    fac["Service Delivery Type"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
)
fac = fac[~fac["Facility Name"].str.contains("prison|school", case=False, na=False)]

# osrm endpoint
OSRM_URL = "http://localhost:5001/route/v1/driving"

def osrm_distance(lat1, lon1, lat2, lon2):
    """Return (km, minutes) via OSRM or NaN if failure"""
    url = f"{OSRM_URL}/{lon1},{lat1};{lon2},{lat2}?overview=false"
    try:
        r = requests.get(url, timeout=8)
        data = r.json()
        if "routes" in data and data["routes"]:
            route = data["routes"][0]
            return route["distance"] / 1000, route["duration"] / 60
    except Exception:
        pass
    return np.nan, np.nan

# nearest
def nearest_facilities(pop_df, fac_df, subtype, n=1):
    if subtype == "Clinic":
        sub = fac_df[
            fac_df["Service Delivery Type"].str.lower().isin(["clinic", "clinic with maternity"])
        ]
    elif subtype == "Hospital":
        # Treat Primary Hospital and District Hospital as equivalent
        # Also include specific Referral Hospitals: Princess Marina and Nyangabgwe
        sub = fac_df[
            fac_df["Service Delivery Type"].str.lower().isin(["primary hospital", "district hospital"]) |
            ((fac_df["Service Delivery Type"].str.lower() == "referral hospital") & 
             (fac_df["Facility Name"].str.contains("Princess Marina|Nyangabgwe", case=False, na=False)))
        ]
    else:
        sub = fac_df[fac_df["Service Delivery Type"].str.lower() == subtype.lower()]

    if sub.empty:
        print(f"no facilities for {subtype}")
        return pop_df

    tree = cKDTree(sub[["latitude", "longitude"]].to_numpy())
    dist, idx = tree.query(pop_df[["latitude", "longitude"]].to_numpy(), k=n)
    dist = dist[:, None] if n == 1 else dist
    idx = idx[:, None] if n == 1 else idx
    nearest = [sub.iloc[idx[:, i]].reset_index(drop=True).add_suffix(f"_{i+1}") for i in range(n)]
    merged = pd.concat(nearest, axis=1)
    merged["crow_dist_km_1"] = dist[:, 0]
    return pd.concat([pop_df.reset_index(drop=True), merged], axis=1)

# find nearest 
results = {}
for subtype in ["Health Post", "Clinic", "Hospital"]:
    print(f"finding nearest {subtype.lower()}s...")
    results[subtype] = nearest_facilities(pop, fac, subtype)

# osrm routing 
for subtype, df in results.items():
    print(f"routing for {subtype.lower()}...")
    dist_list, time_list = [], []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        d_km, t_min = osrm_distance(row["latitude"], row["longitude"],
                                    row["latitude_1"], row["longitude_1"])
        dist_list.append(d_km)
        time_list.append(t_min)
    df[f"osrm_dist_km_{subtype}"] = dist_list
    df[f"osrm_time_min_{subtype}"] = time_list

# merge all 
merged = results["Health Post"].copy()
merged = merged.rename(columns={
    "Facility Name_1": "nearest_HealthPost_name",
    "latitude_1": "nearest_HealthPost_lat",
    "longitude_1": "nearest_HealthPost_lon"
})

for subtype in ["Clinic", "Hospital"]:
    keep = [
        "city/town/village",
        "Facility Name_1", "latitude_1", "longitude_1",
        f"osrm_dist_km_{subtype}", f"osrm_time_min_{subtype}"
    ]
    subdf = results[subtype][keep].drop_duplicates("city/town/village").rename(columns={
        "Facility Name_1": f"nearest_{subtype}_name",
        "latitude_1": f"nearest_{subtype}_lat",
        "longitude_1": f"nearest_{subtype}_lon"
    })
    merged = merged.merge(subdf, on="city/town/village", how="left")

# export 
merged.to_csv("population_nearest_facilities_osrm.csv", index=False)
print("done")


finding nearest health posts...
finding nearest clinics...
finding nearest hospitals...
routing for health post...


100%|██████████| 10457/10457 [01:33<00:00, 111.66it/s]



routing for clinic...


100%|██████████| 10457/10457 [01:35<00:00, 109.47it/s]
100%|██████████| 10457/10457 [01:35<00:00, 109.47it/s]


routing for hospital...


100%|██████████| 10457/10457 [01:59<00:00, 87.84it/s] 



done


In [5]:
fac = pd.read_csv("facilities_with_warehouses.csv")
fac.columns = fac.columns.str.strip()
fac = fac.dropna(subset=["Latitude", "Longitude"])
fac = fac.rename(columns={"Latitude": "latitude", "Longitude": "longitude"})
fac = fac[~fac["Facility Name"].str.contains("school|prison", case=False, na=False)]
fac["Service Delivery Type"] = (
    fac["Service Delivery Type"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
)

clinics = fac[fac["Service Delivery Type"].str.lower() == "clinic"]
clinics_maternity = fac[fac["Service Delivery Type"].str.lower() == "clinic with maternity"]

print("Clinics (non-maternity):", len(clinics))
print("Clinics with maternity:", len(clinics_maternity))
print("Total clinic-tier facilities:", len(clinics) + len(clinics_maternity))

Clinics (non-maternity): 164
Clinics with maternity: 82
Total clinic-tier facilities: 246


In [6]:
# load + clean 
fac = pd.read_csv("facilities_with_warehouses.csv")
fac.columns = fac.columns.str.strip()
fac = fac.dropna(subset=["Latitude", "Longitude"])
fac = fac.rename(columns={"Latitude": "latitude", "Longitude": "longitude"})
fac = fac[~fac["Facility Name"].str.contains("school|prison", case=False, na=False)]
fac["Service Delivery Type"] = (
    fac["Service Delivery Type"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
)

merged = pd.read_csv("population_nearest_facilities_osrm.csv")

# define facility categories (include Primary + District Hospitals, plus Princess Marina + Nyangabgwe only)
categories = {
    "Health Post": fac.loc[fac["Service Delivery Type"].str.lower() == "health post", "Facility Name"],
    "Clinic (incl. maternity)": fac.loc[fac["Service Delivery Type"].str.lower().isin(["clinic", "clinic with maternity"]), "Facility Name"],
    "Hospital": fac.loc[
        fac["Service Delivery Type"].str.lower().isin(["primary hospital", "district hospital"]) |
        ((fac["Service Delivery Type"].str.lower() == "referral hospital") & 
         (fac["Facility Name"].str.contains("Princess Marina|Nyangabgwe", case=False, na=False))),
        "Facility Name"
    ]
}

used_from_merge = {
    "Health Post": set(merged["nearest_HealthPost_name"].dropna().unique()),
    "Clinic (incl. maternity)": set(merged["nearest_Clinic_name"].dropna().unique()),
    "Hospital": set(merged["nearest_Hospital_name"].dropna().unique())
}

# summary 
print("Facility usage summary:\n")
summary = {}
for cat, fac_names in categories.items():
    all_set = set(fac_names)
    used_set = used_from_merge[cat]
    unused_set = all_set - used_set
    summary[cat] = {
        "total": len(all_set),
        "used": len(used_set),
        "unused": len(unused_set),
        "unused_list": sorted(list(unused_set))
    }
    print(f"{cat}: {summary[cat]['used']} of {summary[cat]['total']} used ({summary[cat]['unused']} unused)")
max_len = max(len(v["unused_list"]) for v in summary.values())
unused_df = pd.DataFrame({
    "Health Posts (unused)": summary["Health Post"]["unused_list"] + [None] * (max_len - len(summary["Health Post"]["unused_list"])),
    "Clinics (unused)": summary["Clinic (incl. maternity)"]["unused_list"] + [None] * (max_len - len(summary["Clinic (incl. maternity)"]["unused_list"])),
    "Hospitals (unused)": summary["Hospital"]["unused_list"] + [None] * (max_len - len(summary["Hospital"]["unused_list"]))
})
unused_df.to_csv("unused_facilities_check.csv", index=False)
print("\nSaved unused facility names to 'unused_facilities_check.csv'")


Facility usage summary:

Health Post: 41 of 320 used (279 unused)
Clinic (incl. maternity): 67 of 246 used (179 unused)
Hospital: 22 of 27 used (5 unused)

Saved unused facility names to 'unused_facilities_check.csv'
