In [1]:
import os
import time
import requests
import pandas as pd
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment, then
# stop immediately when the Google API key is absent or empty.
load_dotenv()
API_KEY = os.environ.get("GOOGLE_API_KEY")
if API_KEY is None or API_KEY == "":
    raise SystemExit("Missing GOOGLE_API_KEY in .env file. Create a .env file with: GOOGLE_API_KEY=your_key_here")


In [8]:
# Approximate centroid for each Botswana census district, keyed by the
# district name and stored as a (latitude, longitude) tuple in decimal
# degrees. Rows are listed as (name, latitude, longitude).
district_centroids = {
    name: (lat, lng)
    for name, lat, lng in (
        ("Barolong", -24.5, 25.5),
        ("Central Bobonong", -22.4, 28.3),
        ("Central Boteti", -22.0, 24.5),
        ("Central Kalahari Game Reserve", -22.8, 23.5),
        ("Central Kgalagadi Game Reserve", -24.5, 21.0),
        ("Central Mahalapye", -23.1, 26.8),
        ("Central Serowe/ Palapye", -22.87, 26.57),
        ("Central Tutume", -21.5, 27.5),
        ("Chobe", -18.6, 24.5),
        ("Francistown", -21.15, 27.5),
        ("Gaborone", -24.64, 25.92),
        ("Ghanzi", -21.7, 21.8),
        ("Jwaneng", -24.74, 24.73),
        ("Kgalagadi North", -24.5, 21.8),
        ("Kgalagadi South", -25.6, 22.0),
        ("Kgatleng", -24.4, 26.4),
        ("Kweneng East", -24.7, 25.7),
        ("Kweneng West", -23.8, 24.5),
        ("Lobatse", -25.2, 25.7),
        ("Ngamiland Delta", -19.5, 23.3),
        ("Ngamiland East", -20.5, 25.0),
        ("Ngamiland West", -19.8, 21.5),
        ("Ngwaketse", -25.3, 24.5),
        ("Ngwaketse West", -25.5, 23.5),
        ("North East", -20.9, 27.5),
        ("Orapa", -21.3, 25.2),
        ("Selibe Phikwe", -21.95, 27.82),
        ("South East", -24.7, 25.95),
        ("Sowa", -20.4, 26.3),
    )
}


def is_valid_botswana(lat, lng):
    """Tell whether (lat, lng) lies inside the Botswana bounding box.

    The box spans latitudes -27.5 to -17.0 and longitudes 19.0 to 29.8,
    both ends inclusive. The longitude is only examined when the latitude
    is already inside its range.
    """
    lat_in_range = -27.5 <= lat <= -17.0
    if not lat_in_range:
        return lat_in_range
    return 19.0 <= lng <= 29.8


def google_geocode_strict(query, district, api_key):
    """Strict Google Geocoding lookup that rejects ambiguous or wrong results.

    Only the first result returned by the API is considered. It is accepted
    when all of the following hold:

    * the request succeeds and the response status is ``"OK"``;
    * the coordinates pass ``is_valid_botswana``;
    * the result's ``formatted_address`` contains ``district``
      (case-insensitive substring match).

    Args:
        query: Free-text address sent as the ``address`` parameter.
        district: District name that must appear in the formatted address.
        api_key: Google Geocoding API key.

    Returns:
        A ``(lat, lng)`` tuple for an accepted result, otherwise ``None``.
        Network errors, non-JSON bodies and malformed payloads also yield
        ``None`` so the caller can fall back to another source.
    """
    # Without a textual district the address check below cannot be performed,
    # so the result is treated as unverifiable.
    if not isinstance(district, str):
        return None

    url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {"address": query, "region": "bw", "key": api_key}

    try:
        r = requests.get(url, params=params, timeout=10)
        data = r.json()
    except (requests.RequestException, ValueError):
        # Transport failure or a body that is not valid JSON. Catching only
        # these keeps KeyboardInterrupt/SystemExit propagating, so a long
        # geocoding run can still be interrupted.
        return None

    if not isinstance(data, dict) or data.get("status") != "OK":
        return None

    results = data.get("results")
    if not isinstance(results, list) or not results:
        return None

    result = results[0]
    try:
        loc = result["geometry"]["location"]
        lat, lng = loc["lat"], loc["lng"]
    except (KeyError, TypeError):
        # Payload does not have the expected geometry/location structure.
        return None

    # Reject coordinates outside Botswana bounding box
    if not is_valid_botswana(lat, lng):
        return None

    # Simple address validation: must mention district
    addr = (result.get("formatted_address") or "").lower()
    if district.lower() not in addr:
        return None

    return (lat, lng)


# Census-only placeholder names (compared after strip + lower-casing) that do
# not denote a real place and are therefore never sent to the geocoder.
_PLACEHOLDER_VILLAGES = frozenset({
    "localities with no affiliation",
    "unclassified",
    "unallocated",
    "location not specified",
})


def _fallback_centroid(district):
    """Return the centroid for ``district``, or Gaborone's when it is unknown."""
    if district not in district_centroids:
        print(f"Warning: District '{district}' not in centroids dict, using Gaborone")
        return district_centroids.get("Gaborone", (-24.64, 25.92))
    return district_centroids[district]


def safe_geocode(village, district, api_key):
    """Main geocode function with fallback to district centroid.

    Args:
        village: Place name from the census table. Missing values (anything
            that is not a string, e.g. NaN), blank strings and the census
            placeholder names are not geocoded.
        district: Census district name, used both to validate the Google
            result and to look up the fallback centroid.
        api_key: Google Geocoding API key.

    Returns:
        A ``(lat, lng)`` tuple: the strict Google result when one is
        accepted, otherwise the district centroid. An unknown district prints
        a warning and yields the Gaborone centroid.
    """
    # Nothing geocodable: skip the API call entirely.
    if not isinstance(village, str):
        return _fallback_centroid(district)
    if not village.strip() or village.strip().lower() in _PLACEHOLDER_VILLAGES:
        return _fallback_centroid(district)

    # A non-string district (e.g. NaN) cannot be validated against the
    # formatted address, so go straight to the fallback.
    if not isinstance(district, str):
        return _fallback_centroid(district)

    # Google strict attempt
    result = google_geocode_strict(f"{village}, Botswana", district, api_key)
    if result is not None:
        return result

    # Fallback: district centroid
    return _fallback_centroid(district)


# Load file from census_datacleaning folder
df = pd.read_csv("../census_datacleaning/census_population_2022_deduped.csv")

# Detect village and district columns (case-insensitive, handle variations).
# When several columns match, the last matching one is used.
print("Available columns:", list(df.columns))
village_col = None
district_col = None

for col in df.columns:
    col_lower = str(col).lower()
    if any(token in col_lower for token in ("village", "town", "city")):
        village_col = col
    if "district" in col_lower:
        district_col = col

if village_col is None or district_col is None:
    raise KeyError(f"Could not find village/town column or district column. Available: {list(df.columns)}")

print(f"Using village column: '{village_col}'")
print(f"Using district column: '{district_col}'")

# Geocode ALL rows (no nominatim leftovers).
# Results are memoised per (village, district) pair so a repeated pair costs
# neither a second API request nor a second throttle delay.
geocode_cache = {}
coords = []
for village, district in zip(df[village_col], df[district_col]):
    pair = (village, district)
    if pair not in geocode_cache:
        geocode_cache[pair] = safe_geocode(village, district, API_KEY)
        time.sleep(0.25)  # avoid throttling
    coords.append(geocode_cache[pair])

# Assign the coordinate columns in one step (overwriting them if they exist)
# so pandas stores them as numeric columns rather than object columns filled
# cell by cell.
df["latitude"] = [lat for lat, _ in coords]
df["longitude"] = [lng for _, lng in coords]


df.to_csv("census_population_2022_geocoded_google_strict.csv", index=False)
print("Done — saved as census_population_2022_geocoded_google_strict.csv")

Available columns: ['Census District', 'City/Town/Village', 'Total Population', 'Year']
Using village column: 'City/Town/Village'
Using district column: 'Census District'
Done — saved as census_population_2022_geocoded_google_strict.csv
Done — saved as census_population_2022_geocoded_google_strict.csv
