In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm

# Load the de-duplicated 2022 census extract and normalise its headers to
# snake_case (trimmed, lower-cased, spaces -> underscores).
census = pd.read_csv("census_population_2022_deduped.csv").rename(
    columns=lambda header: header.strip().lower().replace(" ", "_")
)

# One row per distinct (district, place) pair, so each place is looked up only
# once. The free-text query includes the district and the country to give the
# geocoder more context than the bare place name.
towns = (
    census.loc[:, ["census_district", "city/town/village"]]
    .drop_duplicates()
    .assign(
        query=lambda pairs: (
            pairs["city/town/village"] + ", " + pairs["census_district"] + ", Botswana"
        )
    )
)



In [3]:
# geopy's default request timeout is 1 second, which the public Nominatim
# endpoint regularly exceeds; every timeout costs a retry plus a back-off wait.
# A longer timeout avoids those spurious failures.
geolocator = Nominatim(user_agent="botswana_census_geocoder", timeout=10)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)  # stay polite

tqdm.pandas()

# The full run takes hours at one request per second. Keep finished lookups in
# a kernel-level dict so that re-running this cell after an interruption
# resumes where it stopped instead of starting from scratch.
# Unsuccessful lookups (None) are cached too; call geocode_cache.clear() to
# force every query to be retried.
if "geocode_cache" not in globals():
    geocode_cache = {}


def geocode_cached(query):
    """Return the geocoder result for ``query``, reusing earlier answers.

    Parameters
    ----------
    query : str
        Free-text address to look up.

    Returns
    -------
    The object returned by ``geocode`` for this query, or ``None`` when the
    query is missing (NaN/None) or the lookup produced no result.
    """
    # A missing district or place name makes the concatenated query NaN;
    # never send that to the service as a search term.
    if pd.isna(query):
        return None
    if query not in geocode_cache:
        geocode_cache[query] = geocode(query)
    return geocode_cache[query]


# get coordinates
towns["location"] = towns["query"].progress_apply(geocode_cached)
towns["latitude"] = towns["location"].apply(lambda loc: loc.latitude if loc else None)
towns["longitude"] = towns["location"].apply(lambda loc: loc.longitude if loc else None)


  0%|          | 3/8321 [00:02<2:11:37,  1.05it/s]RateLimiter caught an error, retrying (0/2 tries). Called with (*('Gaborone West Extension 2 (Phase 1), Gaborone, Botswana',), **{}).
Traceback (most recent call last):
  File "/Users/elee/Documents/GitHub/thesiscode2026/.conda/lib/python3.11/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/Users/elee/Documents/GitHub/thesiscode2026/.conda/lib/python3.11/site-packages/urllib3/connection.py", line 565, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/elee/Documents/GitHub/thesiscode2026/.conda/lib/python3.11/http/client.py", line 1395, in getresponse
    response.begin()
  File "/Users/elee/Documents/GitHub/thesiscode2026/.conda/lib/python3.11/http/client.py", line 325, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^

KeyboardInterrupt: 

In [None]:
# Save the Nominatim results as a first-pass output.
# Several places will still be missing coordinates because of Nominatim's
# limited coverage. To fill those gaps, the Google Maps Geocoding API is used
# as a secondary source; see the geocode_google.py script.
nominatim_output_path = "census_villages_geocoded.csv"
towns.to_csv(nominatim_output_path, index=False)

In [None]:
geo = pd.read_csv("botswana_geocode/census_villages_geocoded_google.csv")
pop = pd.read_csv("census_population_2022.csv")


def _clean_names(names):
    """Trim and lower-case a Series of place names so variants compare equal."""
    return names.str.strip().str.lower()


# Standardize names to prevent merge issues
geo["city/town/village"] = _clean_names(geo["city/town/village"])
pop["city/town/village"] = _clean_names(pop["City/Town/Village"])

# Places were geocoded per (district, place) pair, so a place name alone is not
# a unique key: the same name can occur in several districts. Joining on the
# name only would then duplicate population rows and could attach another
# district's coordinates. Join on district + place whenever both tables expose
# a district column; otherwise fall back to the place name alone.
# NOTE(review): assumes the district columns hold text; confirm against the CSVs.
pop_district_col = next(
    (
        col
        for col in pop.columns
        if col.strip().lower().replace(" ", "_") == "census_district"
    ),
    None,
)

merge_keys = ["city/town/village"]
geo_keyed = geo
pop_keyed = pop
if pop_district_col is not None and "census_district" in geo.columns:
    # Temporary key column keeps the original district columns untouched.
    geo_keyed = geo.assign(_district_key=_clean_names(geo["census_district"]))
    pop_keyed = pop.assign(_district_key=_clean_names(pop[pop_district_col]))
    merge_keys = ["_district_key", "city/town/village"]

# Keep exactly one coordinate pair per key so the left join cannot multiply rows.
geo_lookup = geo_keyed[merge_keys + ["latitude", "longitude"]].drop_duplicates(
    subset=merge_keys
)

merged = pop_keyed.merge(
    geo_lookup,
    on=merge_keys,
    how="left",
    validate="many_to_one",  # fail loudly if the lookup keys are ever non-unique
).drop(columns="_district_key", errors="ignore")

# Every population row must appear exactly once in the output.
assert len(merged) == len(pop), "merge changed the number of population rows"

merged.to_csv("census_population_with_coords.csv", index=False)