# Import libraries

In [1]:
import os
import pandas as pd
import datetime as dt
import numpy as np
import sys
from pathlib import Path
import requests
import time
import re
sys.path.append(os.path.abspath(".."))

from helpers.df_formating import (
    convert_to_integer, 
    convert_cols_to_snake_case,
    normalize,
    drop_row_if_not_complete, 
    drop_if_unnamed, 
    excel_time_to_minutes)

from helpers.geo_coding import prepare_village_for_geocoding, geocode_place_mapbox_v5, geocode_unique_queries_mapbox

def to_analyze(df, cols, out_dir="/mnt/c/Users/matth/Desktop"):
    """Export selected columns of ``df`` to ``to_analyze.csv`` for manual inspection.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the columns are taken from.
    cols : list of str
        Column labels to export.
    out_dir : str or path-like, optional
        Directory that receives ``to_analyze.csv``. The default is the
        previously hard-coded desktop folder, so existing calls are unchanged.

    Returns
    -------
    None
        The CSV is written without the index column.
    """
    # Local name deliberately differs from the function name (the original shadowed it).
    subset = df[cols]
    path_file = f"{out_dir}/to_analyze.csv"
    subset.to_csv(path_file, index=False)

In [2]:
# --- load the Mapbox token from ../secrets.txt ---
secrets_path = Path("..") / "secrets.txt"

token = None
with secrets_path.open() as f:
    for line in f:
        if not line.startswith("MAP_BOX_TOKEN="):
            continue
        # everything after the first "=" on the stripped line is the token
        token = line.strip().partition("=")[2]
        break

assert token is not None, "MAP_BOX_TOKEN not found in secrets.txt"

MAPBOX_TOKEN = token

In [3]:
# Folder that holds the theatre-book workbooks read and written by this notebook
base_dir = "../Nkhoma_data/Data"

In [4]:
# List every entry in base_dir (files and sub-directories alike)
os.listdir(base_dir)

['2022_theatre_books_clean_unknown_villages.xlsx',
 'old_theatre_books_clean.xlsx',
 'Theatre_Book-Database 2025-plain.xlsx',
 'old_theatre_books_clean.pkl',
 'Old Theatre Books.xlsx',
 '2022_theatre_books_clean.xlsx',
 '.ipynb_checkpoints',
 'Theatre_Book-Database 2022 Auswertung-Arbeitsversion.xlsx',
 'Theatre_Book-Database 2024 Auswertung-Arbeitsversion.xlsx',
 '2022_theatre_books_clean.pkl',
 'theatre_book_database_2022_clean.pkl',
 'Theatre_Book-Database 2023 Auswertung-Arbeitsversion.xlsx',
 '2022_theatre_books_clean_known_villages.pkl',
 '2022_theatre_books_clean_unknown_villages.pkl',
 '2022_theatre_books_clean_known_villages.xlsx',
 'theatre_book_database_2022_clean.xlsx']

In [5]:
# Workbook cleaned by this notebook
file_to_clean = "Theatre_Book-Database 2023 Auswertung-Arbeitsversion.xlsx"
path = f"{base_dir}/{file_to_clean}"
# Read the workbook with the openpyxl engine and preview the first rows
df = pd.read_excel(path, engine="openpyxl")
df.head()

  warn(msg)


Unnamed: 0,Theatre Book #,Hospital ID #,DATE of Surgery,First Name,Last Name,Age (years),Sex,Village,Surgeon,1st Assistent/Instructor,...,Urgency,Surgery severity,ASA-Score,Year of birth,Operation time (minutes),Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34
0,230001,,2023-01-01 00:00:00,SUZEN,WISDON,26.0,F,MITAMBO,Obs/Gyn,,...,Emergency,Major,,1997.0,00:45:00,,Calculated: do not fill out,Fill out for every patient,Fill out for all PAACS cases,Fill out if possible
1,230002,,2023-01-02 00:00:00,BEZAI,MANUEL,37.0,M,MAZENGER,Terry,Vitu,...,Urgent,Major,,1986.0,1899-12-29 13:15:00,,,,,
2,230003,,2023-01-02 00:00:00,ELLINA,LUPIYA,30.0,F,CHIDUMA,Obs/Gyn,Other,...,Elective,Major,,1993.0,1899-12-29 22:55:00,,,,,
3,230004,,2023-01-02 00:00:00,HAWA,SAMSON,28.0,F,TAMBALA,Obs/Gyn,Other,...,Emergency,Intermediate,,1995.0,1899-12-29 18:30:00,,,,,
4,230005,,2023-01-03 00:00:00,JUDITH,BONFACE,22.0,F,TAMBALA,Obs/Gyn,Other,...,Emergency,Intermediate,,2001.0,00:43:00,,,,,


In [6]:
# Column-header clean-up through the project helpers
df = df.pipe(convert_cols_to_snake_case).pipe(drop_if_unnamed)

# theatre_book must be numeric: invalid entries become NaN and those rows are dropped
df["theatre_book"] = pd.to_numeric(df["theatre_book"], errors="coerce")
df = df.dropna(subset=["theatre_book"])

df = df.pipe(convert_to_integer, ["theatre_book", "hospital_id", "age_years", "year_of_birth"])

# Fix misspelled column names
df = df.rename(columns={"sarting_time": "starting_time", "asascore": "asa_score"})

In [7]:
# Export the village column to CSV via to_analyze for manual inspection
to_analyze(df, ['village'])

In [8]:
# hospital_id is missing in ~99% of rows, so the column is removed
df = df.drop(columns="hospital_id")

In [9]:
# Parse date_of_surgery as datetime; values that cannot be parsed are coerced to NaT
df["date_of_surgery"] = pd.to_datetime(df["date_of_surgery"], errors="coerce")

In [10]:
# Standardize age: nullable Int64 keeps missing ages as <NA>.
# (The former bare `.describe()` / `.dtype` expressions were dropped: only the
# last expression of a cell is rendered, so their results were never shown.)
df["age_years"] = df["age_years"].astype("Int64")
# Number of rows without an age
df["age_years"].isna().sum()

139

In [11]:
# Standardizing sex.
# pd.Categorical maps every value outside `categories` to NaN, so trim and
# upper-case first: variants such as "f" or " M" are then kept instead of
# being silently lost. Anything that is still not "F"/"M" becomes NaN.
df["sex"] = pd.Categorical(
    df["sex"].str.strip().str.upper(),
    categories=["F", "M"],
)

In [12]:
# Trim whitespace and title-case the surgeon labels
df["surgeon"] = df["surgeon"].str.strip().str.title()
# Frequency of every label, missing values included
df["surgeon"].value_counts(dropna=False)

surgeon
Obs/Gyn      764
Limbe        281
Caleb        169
Lam          162
Widmann      152
Terry        105
NaN           88
Madalitso     86
Thoko         73
Vitu          62
Vaylann       52
Other         50
Stuebing      50
Wongani       40
Name: count, dtype: int64

In [13]:
def classify_surgeon(x):
    """Map a surgeon label to a coarse category.

    Returns ``"unknown"`` for missing values and for "other",
    ``"specialty"`` for "obs/gyn", ``"facility"`` for "limbe" or "lam",
    and ``"individual"`` for every other label. Matching ignores case and
    surrounding whitespace.
    """
    if pd.isna(x):
        return "unknown"

    known_labels = {
        "obs/gyn": "specialty",
        "limbe": "facility",
        "lam": "facility",
        "other": "unknown",
    }
    key = str(x).strip().lower()
    # any label not listed above is treated as a named person
    return known_labels.get(key, "individual")
# Derive surgeon_type by classifying each surgeon label (missing labels become "unknown")
df["surgeon_type"] = df["surgeon"].apply(classify_surgeon)

In [14]:
# Store surgeon_type as a categorical with an explicit, fixed category list
df["surgeon_type"] = pd.Categorical(
    df["surgeon_type"], categories=["individual", "facility", "specialty", "unknown"]
)

In [15]:
# Tidy the name columns: string dtype, trimmed, whitespace runs collapsed to one space, title-cased
for col in ("first_name", "last_name"):
    df[col] = df[col].astype("string").str.strip().str.replace(r"\s+", " ", regex=True).str.title()

In [16]:
# Count missing values in the first and last name columns
df[["first_name", "last_name"]].isna().sum()

first_name    114
last_name     123
dtype: int64

In [19]:
# Run the project helper that prepares `village` for geocoding.
# (The former bare `df["place_type"].value_counts()` was dropped: only the last
# expression of a cell is rendered, so its result was never shown.)
df = prepare_village_for_geocoding(df, col="village")
# Preview the raw village next to the columns used for geocoding
df[["village", "village_norm", "place_type", "geocode_query"]].head(20)

Unnamed: 0,village,village_norm,place_type,geocode_query
0,MITAMBO,mitambo,village,mitambo
1,MAZENGER,mazenger,village,mazenger
2,CHIDUMA,chiduma,village,chiduma
3,TAMBALA,tambala,village,tambala
4,TAMBALA,tambala,village,tambala
5,CHIPHWAN,chiphwan,village,chiphwan
6,MSUMATI,msumati,village,msumati
7,NGONGOL,ngongol,village,ngongol
8,NKONO,nkono,village,nkono
9,NKOPOKA,nkopoka,village,nkopoka


In [20]:
# Geocode the prepared queries with Mapbox, passing the named constant
# MAPBOX_TOKEN (same value as `token`, which was used here before).
# NOTE(review): this presumably calls the Mapbox API on every run; consider
# caching geo_df on disk so a full re-run does not repeat the requests.
df2, geo_df = geocode_unique_queries_mapbox(df, MAPBOX_TOKEN)
# quick sanity check (the former bare `geo_df.sort_values(...).head(20)` was
# dropped: it was not the cell's last expression, so it was never displayed)
df2[["village", "place_type", "geocode_query", "lat", "lon", "place_name", "relevance", "error"]].head(30)

Unnamed: 0,village,place_type,geocode_query,lat,lon,place_name,relevance,error
0,MITAMBO,village,mitambo,-12.650819,34.175367,"Mtambo, Nkhotakota, Malawi",0.964286,
1,MAZENGER,village,mazenger,-13.215804,33.739164,Malawi,0.5,
2,CHIDUMA,village,chiduma,-13.215804,33.739164,Malawi,0.5,
3,TAMBALA,village,tambala,-12.873253,33.629595,"Tambala, Kasungu, Malawi",1.0,
4,TAMBALA,village,tambala,-12.873253,33.629595,"Tambala, Kasungu, Malawi",1.0,
5,CHIPHWAN,village,chiphwan,-13.215804,33.739164,Malawi,0.5,
6,MSUMATI,village,msumati,-13.215804,33.739164,Malawi,0.5,
7,NGONGOL,village,ngongol,-13.215804,33.739164,Malawi,0.5,
8,NKONO,village,nkono,-13.215804,33.739164,Malawi,0.5,
9,NKOPOKA,village,nkopoka,-14.307734,35.117773,"Nkopola, Mangochi, Malawi",0.964286,


In [25]:
# Compare the normalised village name with the normalised first component of
# the geocoder's place_name
df2["village_norm_cmp"] = df2["village"].apply(normalize)
df2["place_first"] = df2["place_name"].str.split(",").str[0]
df2["place_first_norm_cmp"] = df2["place_first"].apply(normalize)

# Rows where both normalised names agree vs. rows where they differ
df_matched = df2[df2["village_norm_cmp"] == df2["place_first_norm_cmp"]]
df_diff = df2[df2["village_norm_cmp"] != df2["place_first_norm_cmp"]]

# Persist the full frame and both subsets, each as pickle and as Excel
for suffix, frame in [
    ("", df2),
    ("_known_villages", df_matched),
    ("_unknown_villages", df_diff),
]:
    pkl_path = f"{base_dir}/2023_theatre_books_clean{suffix}.pkl"
    xlsx_path = f"{base_dir}/2023_theatre_books_clean{suffix}.xlsx"
    frame.to_pickle(pkl_path)
    frame.to_excel(xlsx_path, index=False)

In [28]:
# Display the rows whose village name matched the geocoder result
# NOTE(review): the rendered output contains patient names; clear it before sharing
df_matched

Unnamed: 0,theatre_book,date_of_surgery,first_name,last_name,age_years,sex,village,surgeon,first_assistent_instructor,second_assistent,...,geocode_query,lon,lat,place_name,relevance,feature_id,error,place_first,village_norm_cmp,place_first_norm_cmp
3,230004,2023-01-02,Hawa,Samson,28,F,TAMBALA,Obs/Gyn,Other,Other,...,tambala,33.629595,-12.873253,"Tambala, Kasungu, Malawi",1.0,place.10791070,,Tambala,tambala,tambala
4,230005,2023-01-03,Judith,Bonface,22,F,TAMBALA,Obs/Gyn,Other,Other,...,tambala,33.629595,-12.873253,"Tambala, Kasungu, Malawi",1.0,place.10791070,,Tambala,tambala,tambala
10,230011,2023-01-06,Thomas,Richard,42,M,KAPHUKA,Vitu,Limbe,,...,kaphuka,34.181856,-14.148509,"Kaphuka, Dedza, Malawi",1.0,place.4499614,,Kaphuka,kaphuka,kaphuka
12,230013,2023-01-06,Saidi,Sanji,40,M,TAMBALA,Vitu,Lam,,...,tambala,33.629595,-12.873253,"Tambala, Kasungu, Malawi",1.0,place.10791070,,Tambala,tambala,tambala
20,230021,2023-01-08,Suzen,Rogie,19,F,MLANDA,Obs/Gyn,Other,Other,...,mlanda,34.447182,-14.478535,"Mlanda, Ntcheu, Malawi",1.0,place.7497886,,Mlanda,mlanda,mlanda
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,232122,2023-12-25,Lemita,Ephraim,32,F,DEDZA,Obs/Gyn,Obs/Gyn,,...,dedza,34.328238,-14.380727,"Dedza, Malawi",1.0,region.132254,,Dedza,dedza,dedza
2124,232125,2023-12-28,Olipa,Alfred,29,F,MPHANDE,Obs/Gyn,Obs/Gyn,,...,mphande,35.201466,-16.339190,"Mphande, Thyolo, Malawi",1.0,place.7817374,,Mphande,mphande,mphande
2125,232126,2023-12-28,Estere,Asidi,33,F,MAPEMBA,Obs/Gyn,Obs/Gyn,,...,mapemba,34.222142,-14.097290,"Mapemba, Dedza, Malawi",1.0,place.6547614,,Mapemba,mapemba,mapemba
2130,232131,2023-12-31,Stella,Chimwenje,28,F,KACHERE,Obs/Gyn,Obs/Gyn,,...,kachere,34.682674,-15.372861,"Kachere, Neno, Malawi",1.0,place.3582110,,Kachere,kachere,kachere
