In [None]:
import os
import time
import requests
import pandas as pd
from dotenv import load_dotenv

# Read the Google API key from the environment (after loading .env);
# stop immediately if it is absent or empty.
load_dotenv()
API_KEY = os.environ.get("GOOGLE_API_KEY")
if API_KEY is None or API_KEY == "":
    raise SystemExit("Missing GOOGLE_API_KEY in .env file")

In [None]:
# Approximate centroid per census district, stored as (latitude, longitude).
# Used as the fallback position when a village cannot be geocoded; the
# values are rough and can be refined later.
district_centroids = {
    name: (lat, lng)
    for name, lat, lng in (
        ("Ngamiland Delta", -19.5, 23.3),
        ("Central Serowe-Palapye", -22.87, 26.57),
        ("Central Bobonong", -22.4, 28.3),
        ("Chobe", -18.6, 24.5),
        ("Ghanzi", -21.7, 21.8),
        ("Kgalagadi South", -25.6, 22.0),
        ("Kgalagadi North", -24.5, 21.8),
        ("Kgatleng", -24.4, 26.4),
        ("Kweneng East", -24.7, 25.7),
        ("Kweneng West", -23.8, 24.5),
        ("South East", -24.7, 25.95),
        ("Southern", -25.1, 25.3),
        ("North East", -20.9, 27.5),
        ("Central Mahalapye", -23.1, 26.8),
    )
}


def is_valid_botswana(lat, lng):
    """Return whether (lat, lng) lies inside the Botswana bounding box.

    The box spans latitudes -27.5 to -17.0 and longitudes 19.0 to 29.8,
    with all four edges inclusive.
    """
    south, north = -27.5, -17.0
    west, east = 19.0, 29.8

    lat_in_range = south <= lat <= north
    return lat_in_range and (west <= lng <= east)


def google_geocode_strict(query, district, api_key):
    """Geocode ``query`` with Google and keep only a result that passes strict checks.

    Only the first result in the response is considered. It is accepted when
    its coordinates fall inside the Botswana bounding box and its formatted
    address contains ``district`` (case-insensitive substring match).

    Args:
        query: Free-text address sent as the ``address`` request parameter.
        district: District name that must appear in the formatted address.
        api_key: Google Geocoding API key.

    Returns:
        ``(lat, lng)`` when a result passes every check, otherwise ``None``
        (request failure, unusable response, or rejected result).
    """
    url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {"address": query, "region": "bw", "key": api_key}

    try:
        r = requests.get(url, params=params, timeout=10)
        data = r.json()
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not JSON. Deliberately narrower
        # than a bare ``except`` so Ctrl+C / SystemExit still stop a long run.
        return None

    if not isinstance(data, dict) or data.get("status") != "OK":
        return None

    # Treat a structurally unexpected payload as "no result" instead of
    # letting a lookup error escape.
    try:
        result = data["results"][0]
        loc = result["geometry"]["location"]
        lat, lng = loc["lat"], loc["lng"]
    except (KeyError, IndexError, TypeError):
        return None

    # Reject coordinates outside Botswana bounding box
    if not is_valid_botswana(lat, lng):
        return None

    # Simple address validation: must mention district.
    # NOTE(review): this is a plain substring test; if the district labels
    # passed in are not phrased the way Google writes addresses, every result
    # is rejected here — confirm against real responses.
    addr = (result.get("formatted_address") or "").lower()
    if district.lower() not in addr:
        return None

    return (lat, lng)


def safe_geocode(village, district, api_key):
    """Return coordinates for ``village``, falling back to its district centroid.

    Args:
        village: Place name from the census table. Non-string values (for
            example a missing cell), blank names, and census placeholder
            labels are not sent to the geocoder.
        district: Key into ``district_centroids``; also passed to the strict
            geocoder for address validation.
        api_key: Google Geocoding API key.

    Returns:
        A ``(lat, lng)`` tuple: the strict geocoding result when one is
        accepted, otherwise the district centroid. When ``district`` has no
        centroid entry the fallback is ``(None, None)`` so one unknown
        district cannot abort a whole batch run.
    """
    # Census-only placeholders → skip geocoding entirely
    placeholders = {
        "localities with no affiliation",
        "unclassified",
        "unallocated",
        "location not specified",
    }

    # Resolve the fallback once; unknown districts yield blank coordinates
    # instead of raising KeyError.
    fallback = district_centroids.get(district, (None, None))

    # A missing cell is not a string, so there is nothing to geocode.
    if not isinstance(village, str):
        return fallback

    name = village.strip().lower()
    if not name or name in placeholders:
        return fallback

    # Google strict attempt
    query = f"{village}, Botswana"
    result = google_geocode_strict(query, district, api_key)

    # Fallback: district centroid
    return fallback if result is None else result


# Read the deduplicated census table.
df = pd.read_csv("../census_population_2022_deduped.csv")

# Start from empty coordinate columns, replacing any that already exist.
for coord_col in ("latitude", "longitude"):
    df[coord_col] = None

# Resolve coordinates for every row, one lookup at a time.
villages = df["city/town/village"]
districts = df["census_district"]
for idx, village, district in zip(df.index, villages, districts):
    lat, lng = safe_geocode(village, district, API_KEY)
    df.at[idx, "latitude"] = lat
    df.at[idx, "longitude"] = lng

    # Pause between rows to avoid throttling.
    time.sleep(0.25)

# Write the result next to the working directory and report completion.
output_path = "census_population_2022_geocoded_google_strict.csv"
df.to_csv(output_path, index=False)
print("Done — saved as census_population_2022_geocoded_google_strict.csv")