# Import libraries

In [30]:
import os
import pandas as pd
import datetime as dt
import numpy as np
import sys
from pathlib import Path
import requests
import time
import re
sys.path.append(os.path.abspath(".."))

from helpers.df_formating import convert_to_integer, convert_cols_to_snake_case, drop_row_if_not_complete, drop_if_unnamed, excel_time_to_minutes

def to_analyze(df, cols):
    to_analyze = df[cols]
    path_desktop = "/mnt/c/Users/matth/Desktop"
    path_file = f"{path_desktop}/to_analyze.csv"
    to_analyze.to_csv(path_file, index=False)

In [31]:
# --- read token from ../secrets.txt ---
secrets_path = Path("..") / "secrets.txt"

token = None
with open(secrets_path) as f:
    for line in f:
        if line.startswith("MAP_BOX_TOKEN="):
            token = line.strip().split("=", 1)[1]
            break

assert token is not None, "MAP_BOX_TOKEN not found in secrets.txt"

MAPBOX_TOKEN = token

In [32]:
def geocode_place_mapbox_v5(place: str, *, country="MW", proximity=(33.78, -13.97), limit=1):
    q = f"{place}, Malawi"
    url = f"https://api.mapbox.com/geocoding/v5/mapbox.places/{requests.utils.quote(q)}.json"
    params = {
        "access_token": MAPBOX_TOKEN,
        "country": country,
        "proximity": f"{proximity[0]},{proximity[1]}",
        "limit": limit,
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    feats = data.get("features", [])
    if not feats:
        return None

    lon, lat = feats[0]["center"]
    return {
        "query": place,
        "lon": lon,
        "lat": lat,
        "place_name": feats[0].get("place_name"),
        "relevance": feats[0].get("relevance"),
        "feature_id": feats[0].get("id"),
    }




In [33]:
base_dir ="../Nkhoma_data/Data"

In [34]:
# all the files
os.listdir(base_dir)

['old_theatre_books_clean.xlsx',
 'Theatre_Book-Database 2025-plain.xlsx',
 'old_theatre_books_clean.pkl',
 'Old Theatre Books.xlsx',
 '.ipynb_checkpoints',
 'Theatre_Book-Database 2022 Auswertung-Arbeitsversion.xlsx',
 'Theatre_Book-Database 2024 Auswertung-Arbeitsversion.xlsx',
 'theatre_book_database_2022_clean.pkl',
 'Theatre_Book-Database 2023 Auswertung-Arbeitsversion.xlsx',
 'theatre_book_database_2022_clean.xlsx']

# Lets clean Theatre_Book-Database 2022 Auswertung-Arbeitsversion.xlsx

In [35]:
file_to_clean = "Theatre_Book-Database 2022 Auswertung-Arbeitsversion.xlsx"
path = f"{base_dir}/{file_to_clean}"
df = pd.read_excel(path, engine="openpyxl")
df.head()

  warn(msg)


Unnamed: 0,Theatre Book #,Hospital ID #,DATE of Surgery,First Name,Last Name,Age (years),Sex,Village,Surgeon,1st Assistent/Instructor,...,Urgency,Surgery severity,ASA-Score,Year of birth,Operation time (minutes),Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34
0,220001,,2022-01-01,Elifa,Sumati,26.0,F,Nkhonde,Obs/Gyn,,...,,,,1997.0,00:00:00,,Calculated: do not fill out,Fill out for every patient,Fill out for all PAACS cases,Fill out if possible
1,220002,,2022-01-01,Siyatu,Isaac,27.0,F,Mozambique,Obs/Gyn,,...,,,,1996.0,00:00:00,,,,,
2,220003,,2022-01-02,Loness,Mapemphero,25.0,F,Chembe,Obs/Gyn,,...,,,,1998.0,00:00:00,,,,,
3,220004,,2022-01-03,Saizi,Nedson,48.0,M,Chilikumanda,Limbe,Caleb,...,Emergency,Major,ASA 3,1975.0,00:00:00,,,,,
4,220005,,2022-01-03,Beatrice,Hezekia,26.0,F,Mazengera,Obs/Gyn,,...,,,,1997.0,00:00:00,,,,,


In [36]:
df = convert_cols_to_snake_case(df)
df = drop_if_unnamed(df)
# coerce theatre_book to numeric (invalid entries → NaN)
df["theatre_book"] = pd.to_numeric(df["theatre_book"], errors="coerce")
# keep only rows with a valid theatre_book number
df = df.dropna(subset=["theatre_book"])
df = convert_to_integer(df, ['theatre_book', 'hospital_id', 'age_years', 'year_of_birth'])
# rewrite typos
df = df.rename(columns={
    "sarting_time": "starting_time",
    "asascore": "asa_score",
})

In [37]:
df.columns

Index(['theatre_book', 'hospital_id', 'date_of_surgery', 'first_name',
       'last_name', 'age_years', 'sex', 'village', 'surgeon',
       'first_assistent_instructor', 'second_assistent', 'anaestesist',
       'nurse', 'anesthesia', 'department', 'indication_for_surgery',
       'surgery_type', 'final_diagnosis_category', 'final_diagnosis_free_text',
       'side', 'main_procedure_category', 'procedure_free_text', 'histology',
       'starting_time', 'finishing_time', 'urgency', 'surgery_severity',
       'asa_score', 'year_of_birth', 'operation_time_minutes'],
      dtype='object')

In [38]:
to_analyze(df, ['village'])

In [39]:
# Drop hospital_id 99% missing
df = df.drop(columns=["hospital_id"])

In [40]:
# convert date_of_surgery to datetime
df["date_of_surgery"] = pd.to_datetime(
    df["date_of_surgery"],
    errors="coerce"
)

In [41]:
# Standardize age
df["age_years"].describe()

count       1507.0
mean     33.930325
std       18.71034
min            1.0
25%           21.0
50%           29.0
75%           45.0
max           97.0
Name: age_years, dtype: Float64

In [42]:
df["age_years"] = df["age_years"].astype("Int64")
df["age_years"].dtype
df["age_years"].isna().sum()

33

In [43]:
# Standardizing sex

In [44]:
df["sex"].value_counts(dropna=False)
df["sex"] = pd.Categorical(
    df["sex"],
    categories=["F", "M"]
)

In [45]:
# Cleaning surgeon
df["surgeon"].value_counts(dropna=False)

surgeon
Obs/Gyn     755
Limbe       197
Lam         154
Caleb        97
Terry        79
Widmann      57
Stuebing     48
Other        35
Vitu         34
Vaylann      32
obs/Gyn      21
Thoko        16
lam          13
NaN           2
Name: count, dtype: int64

In [46]:
df["surgeon"] = (
    df["surgeon"]
    .str.strip()
    .str.title()
)

In [47]:
# Classify when not an individual
def classify_surgeon(x):
    if pd.isna(x):
        return "unknown"
    if x == "Obs/Gyn":
        return "specialty"
    if x in ["Limbe", "Lam"]:
        return "facility"
    if x == "Other":
        return "unknown"
    return "individual"

df["surgeon_type"] = df["surgeon"].apply(classify_surgeon)

In [48]:
df["surgeon_type"] = pd.Categorical(
    df["surgeon_type"],
    categories=["individual", "facility", "specialty", "unknown"]
)

In [49]:
# normalize names

In [50]:
for col in ["first_name", "last_name"]:
    df[col] = (
        df[col]
        .astype("string")
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
        .str.title()
    )

In [51]:
df[["first_name", "last_name"]].isna().sum()

first_name    3
last_name     4
dtype: int64

In [52]:
# lowercase village
df["village"] = df["village"].str.lower()

In [53]:
df["village"].value_counts(dropna=False)

village
lilongwe     295
mazengera    177
dedza        165
NaN          114
tambala       58
            ... 
kde            1
zomba          1
ntchisi        1
mponera        1
mchedza        1
Name: count, Length: 414, dtype: int64

In [54]:
def normalize(text):
    if pd.isna(text):
        return pd.NA
    return (
        str(text)
        .lower()
        .strip()
        .replace(",", "")
    )

df["village_norm"] = df["village"].apply(normalize)

def classify_place(s):
    if pd.isna(s):
        return "missing"

    # Lilongwe Areas (Area 11, area23, a23)
    if re.match(r"^(area\s*\d+|a\s*\d+)$", s):
        return "lilongwe_area"

    # Mile-based informal locations (6 miles, 6miles)
    if re.match(r"^\d+\s*miles?$", s) or re.match(r"^\d+miles?$", s):
        return "distance_marker"

    # Known landmarks
    if s in {"airfield", "airport", "battalion", "batalion"}:
        return "landmark"

    # Countries / outside Malawi (very important to catch)
    if s in {"mozambique", "zambia", "tanzania"}:
        return "foreign_country"

    # Very short / code-like tokens
    if re.match(r"^[a-z]\d+$", s):
        return "code_like"

    # Everything else: assume village / town name
    return "named_place"

df["place_type"] = df["village_norm"].apply(classify_place)

df["place_type"].value_counts()


place_type
named_place        1384
missing             114
foreign_country      21
lilongwe_area        16
landmark              4
distance_marker       1
Name: count, dtype: int64

In [56]:
def build_geocode_query(row, default_city="Lilongwe"):
    s = row["village_norm"]
    t = row["place_type"]

    if t == "missing":
        return pd.Series([pd.NA, "none", 0.0])

    if t == "foreign_country":
        # keep but flag as very low confidence
        return pd.Series([f"{s}", "foreign", 0.2])

    if t == "lilongwe_area":
        # Area-based Lilongwe neighborhoods
        return pd.Series([f"{s.replace('a', 'Area ')}, {default_city}, Malawi", "lilongwe", 0.9])

    if t == "distance_marker":
        # e.g. "6 miles" → peri-urban Lilongwe
        return pd.Series([f"{s}, {default_city}, Malawi", "lilongwe_periurban", 0.7])

    if t == "landmark":
        # anchor landmark searches strongly
        return pd.Series([f"{s}, {default_city}, Malawi", "lilongwe_landmark", 0.8])

    if t == "named_place":
        # normal villages / towns
        return pd.Series([f"{s}, Malawi", "national", 0.85])

    # fallback
    return pd.Series([f"{s}, Malawi", "unknown", 0.4])


df[["geocode_text", "geo_prior", "confidence_prior"]] = (
    df.apply(build_geocode_query, axis=1)
)

df[["village", "place_type", "geocode_text", "geo_prior", "confidence_prior"]].head(20)


Unnamed: 0,village,place_type,geocode_text,geo_prior,confidence_prior
0,nkhonde,named_place,"nkhonde, Malawi",national,0.85
1,mozambique,foreign_country,mozambique,foreign,0.2
2,chembe,named_place,"chembe, Malawi",national,0.85
3,chilikumanda,named_place,"chilikumanda, Malawi",national,0.85
4,mazengera,named_place,"mazengera, Malawi",national,0.85
5,mazengera,named_place,"mazengera, Malawi",national,0.85
6,kamphika,named_place,"kamphika, Malawi",national,0.85
7,mazengera,named_place,"mazengera, Malawi",national,0.85
8,mzuzu,named_place,"mzuzu, Malawi",national,0.85
9,mazengera,named_place,"mazengera, Malawi",national,0.85


In [57]:
import requests
import time
from pathlib import Path

# Reuse MAPBOX_TOKEN from earlier
PROXIMITY_LILONGWE = (33.78, -13.97)  # lon, lat

def mapbox_geocode(text, proximity=PROXIMITY_LILONGWE, limit=1):
    if pd.isna(text):
        return None

    url = f"https://api.mapbox.com/geocoding/v5/mapbox.places/{requests.utils.quote(text)}.json"
    params = {
        "access_token": MAPBOX_TOKEN,
        "country": "MW",
        "proximity": f"{proximity[0]},{proximity[1]}",
        "limit": limit,
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    feats = data.get("features", [])
    if not feats:
        return None

    f = feats[0]
    lon, lat = f["center"]
    return {
        "geocode_text": text,
        "lon": lon,
        "lat": lat,
        "place_name": f.get("place_name"),
        "relevance": f.get("relevance", 0.0),
        "feature_id": f.get("id"),
    }


# ---- build unique geocoding table ----
unique_geo = (
    df[["geocode_text"]]
    .dropna()
    .drop_duplicates()
)

cache_path = Path("mapbox_geocoded_results.csv")
if cache_path.exists():
    cache = pd.read_csv(cache_path)
else:
    cache = pd.DataFrame(columns=[
        "geocode_text", "lon", "lat", "place_name", "relevance", "feature_id"
    ])

cached = set(cache["geocode_text"].astype(str))

new_results = []

for text in unique_geo["geocode_text"]:
    if text in cached:
        continue

    result = mapbox_geocode(text)
    new_results.append(
        result if result is not None else {
            "geocode_text": text,
            "lon": pd.NA,
            "lat": pd.NA,
            "place_name": pd.NA,
            "relevance": 0.0,
            "feature_id": pd.NA,
        }
    )
    time.sleep(0.1)

if new_results:
    cache = pd.concat([cache, pd.DataFrame(new_results)], ignore_index=True)
    cache.to_csv(cache_path, index=False)

cache.head(), cache["lat"].notna().mean()

  cache = pd.concat([cache, pd.DataFrame(new_results)], ignore_index=True)


(           geocode_text        lon        lat                place_name  \
 0       nkhonde, Malawi  34.701808 -15.300392      Malawi, Neno, Malawi   
 1            mozambique       <NA>       <NA>                      <NA>   
 2        chembe, Malawi  34.846135   -14.0218  Chembe, Mangochi, Malawi   
 3  chilikumanda, Malawi  33.739164 -13.215804                    Malawi   
 4     mazengera, Malawi  34.701808 -15.300392      Malawi, Neno, Malawi   
 
    relevance     feature_id  
 0        0.5  place.6260894  
 1        0.0           <NA>  
 2        1.0   place.870558  
 3        1.0   country.8862  
 4        0.5  place.6260894  ,
 0.9951573849878934)

In [59]:
import pandas as pd

# --- find the right column names if pandas added suffixes ---
def pick_col(base):
    if base in df.columns:
        return base
    for c in df.columns:
        if c.startswith(base + "_"):
            return c
    return None

feature_id_col = pick_col("feature_id")
place_name_col = pick_col("place_name")
relevance_col  = pick_col("relevance")
lat_col        = pick_col("lat")
lon_col        = pick_col("lon")

print("Using columns:", {
    "feature_id": feature_id_col,
    "place_name": place_name_col,
    "relevance": relevance_col,
    "lat": lat_col,
    "lon": lon_col
})

# --- country-level detection ---
if feature_id_col is not None:
    bad_country = df[feature_id_col].astype("string").str.startswith("country.", na=False)
else:
    # fallback: place_name equals "Malawi" or ends with ", Malawi" but is too generic
    bad_country = df[place_name_col].astype("string").str.fullmatch(r"malawi", case=False, na=False)

# --- loose Malawi bounds sanity check ---
out_of_malawi = (
    df[lon_col].notna() & df[lat_col].notna() &
    ((df[lon_col] < 32.6) | (df[lon_col] > 35.9) | (df[lat_col] < -17.2) | (df[lat_col] > -9.3))
)

min_relevance = 0.8
rel = df[relevance_col].fillna(0.0) if relevance_col is not None else 0.0

df["is_bad_geocode"] = (
    df[lat_col].isna() |
    df[lon_col].isna() |
    bad_country |
    (rel < min_relevance) |
    out_of_malawi
)

audit = (
    df.loc[df["is_bad_geocode"], ["village", "geocode_text", place_name_col, relevance_col]]
      .value_counts()
      .head(30)
)

df["is_bad_geocode"].mean(), audit


Using columns: {'feature_id': None, 'place_name': None, 'relevance': None, 'lat': None, 'lon': None}


KeyError: None

In [7]:
pkl_path = f"{base_dir}/theatre_book_database_2022_clean.pkl"
xlsx_path = f"{base_dir}/theatre_book_database_2022_clean.xlsx"

df.to_pickle(pkl_path)
df.to_excel(xlsx_path, index=False)

In [20]:
to_analyze(df, ['theatre_book', 'hospital_id', 'date_of_surgery'])