# Import libraries

In [1]:
import os
import pandas as pd
import datetime as dt
import numpy as np
import sys
from pathlib import Path
import requests
import time
import re
sys.path.append(os.path.abspath("../.."))

from helpers.df_formating import (
    convert_to_integer, 
    convert_cols_to_snake_case,
    normalize,
    drop_row_if_not_complete, 
    drop_if_unnamed, 
    excel_time_to_minutes)

from helpers.geo_coding import prepare_village_for_geocoding, geocode_place_mapbox_v5, geocode_unique_queries_mapbox

def to_analyze(df, cols):
    to_analyze = df[cols]
    path_desktop = "/mnt/c/Users/matth/Desktop"
    path_file = f"{path_desktop}/to_analyze.csv"
    to_analyze.to_csv(path_file, index=False)

In [2]:
# --- read token from ../secrets.txt ---
secrets_path = Path("..") / "secrets.txt"

token = None
with open(secrets_path) as f:
    for line in f:
        if line.startswith("MAP_BOX_TOKEN="):
            token = line.strip().split("=", 1)[1]
            break

assert token is not None, "MAP_BOX_TOKEN not found in secrets.txt"

MAPBOX_TOKEN = token

In [3]:
base_dir ="../Nkhoma_data/Data"

In [4]:
# all the files
os.listdir(base_dir)

['2022_theatre_books_clean_unknown_villages.xlsx',
 'old_theatre_books_clean.xlsx',
 'Theatre_Book-Database 2025-plain.xlsx',
 'old_theatre_books_clean.pkl',
 'Old Theatre Books.xlsx',
 '2022_theatre_books_clean.xlsx',
 '.ipynb_checkpoints',
 'Theatre_Book-Database 2022 Auswertung-Arbeitsversion.xlsx',
 'Theatre_Book-Database 2024 Auswertung-Arbeitsversion.xlsx',
 '2022_theatre_books_clean.pkl',
 'theatre_book_database_2022_clean.pkl',
 'Theatre_Book-Database 2023 Auswertung-Arbeitsversion.xlsx',
 '2022_theatre_books_clean_known_villages.pkl',
 '2022_theatre_books_clean_unknown_villages.pkl',
 '2022_theatre_books_clean_known_villages.xlsx',
 'theatre_book_database_2022_clean.xlsx']

# Lets clean Theatre_Book-Database 2022 Auswertung-Arbeitsversion.xlsx

In [5]:
file_to_clean = "Theatre_Book-Database 2022 Auswertung-Arbeitsversion.xlsx"
path = f"{base_dir}/{file_to_clean}"
df = pd.read_excel(path, engine="openpyxl")
df.head()

  warn(msg)


Unnamed: 0,Theatre Book #,Hospital ID #,DATE of Surgery,First Name,Last Name,Age (years),Sex,Village,Surgeon,1st Assistent/Instructor,...,Urgency,Surgery severity,ASA-Score,Year of birth,Operation time (minutes),Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34
0,220001,,2022-01-01,Elifa,Sumati,26.0,F,Nkhonde,Obs/Gyn,,...,,,,1997.0,00:00:00,,Calculated: do not fill out,Fill out for every patient,Fill out for all PAACS cases,Fill out if possible
1,220002,,2022-01-01,Siyatu,Isaac,27.0,F,Mozambique,Obs/Gyn,,...,,,,1996.0,00:00:00,,,,,
2,220003,,2022-01-02,Loness,Mapemphero,25.0,F,Chembe,Obs/Gyn,,...,,,,1998.0,00:00:00,,,,,
3,220004,,2022-01-03,Saizi,Nedson,48.0,M,Chilikumanda,Limbe,Caleb,...,Emergency,Major,ASA 3,1975.0,00:00:00,,,,,
4,220005,,2022-01-03,Beatrice,Hezekia,26.0,F,Mazengera,Obs/Gyn,,...,,,,1997.0,00:00:00,,,,,


In [6]:
df = convert_cols_to_snake_case(df)
df = drop_if_unnamed(df)
# coerce theatre_book to numeric (invalid entries → NaN)
df["theatre_book"] = pd.to_numeric(df["theatre_book"], errors="coerce")
# keep only rows with a valid theatre_book number
df = df.dropna(subset=["theatre_book"])
df = convert_to_integer(df, ['theatre_book', 'hospital_id', 'age_years', 'year_of_birth'])
# rewrite typos
df = df.rename(columns={
    "sarting_time": "starting_time",
    "asascore": "asa_score",
})

In [7]:
df.columns

Index(['theatre_book', 'hospital_id', 'date_of_surgery', 'first_name',
       'last_name', 'age_years', 'sex', 'village', 'surgeon',
       'first_assistent_instructor', 'second_assistent', 'anaestesist',
       'nurse', 'anesthesia', 'department', 'indication_for_surgery',
       'surgery_type', 'final_diagnosis_category', 'final_diagnosis_free_text',
       'side', 'main_procedure_category', 'procedure_free_text', 'histology',
       'starting_time', 'finishing_time', 'urgency', 'surgery_severity',
       'asa_score', 'year_of_birth', 'operation_time_minutes'],
      dtype='object')

In [8]:
to_analyze(df, ['village'])

In [9]:
# Drop hospital_id 99% missing
df = df.drop(columns=["hospital_id"])

In [10]:
# convert date_of_surgery to datetime
df["date_of_surgery"] = pd.to_datetime(
    df["date_of_surgery"],
    errors="coerce"
)

In [11]:
# Standardize age
df["age_years"].describe()
df["age_years"] = df["age_years"].astype("Int64")
df["age_years"].dtype
df["age_years"].isna().sum()

33

In [12]:
# Standardizing sex
df["sex"].value_counts(dropna=False)
df["sex"] = pd.Categorical(
    df["sex"],
    categories=["F", "M"]
)

In [13]:
# Cleaning surgeon
df["surgeon"].value_counts(dropna=False)

surgeon
Obs/Gyn     755
Limbe       197
Lam         154
Caleb        97
Terry        79
Widmann      57
Stuebing     48
Other        35
Vitu         34
Vaylann      32
obs/Gyn      21
Thoko        16
lam          13
NaN           2
Name: count, dtype: int64

In [14]:
df["surgeon"] = (
    df["surgeon"]
    .str.strip()
    .str.title()
)

In [15]:
# Classify when not an individual
def classify_surgeon(x):
    if pd.isna(x):
        return "unknown"
    if x == "Obs/Gyn":
        return "specialty"
    if x in ["Limbe", "Lam"]:
        return "facility"
    if x == "Other":
        return "unknown"
    return "individual"

df["surgeon_type"] = df["surgeon"].apply(classify_surgeon)

In [16]:
df["surgeon_type"] = pd.Categorical(
    df["surgeon_type"],
    categories=["individual", "facility", "specialty", "unknown"]
)

In [17]:
# normalize names

In [18]:
for col in ["first_name", "last_name"]:
    df[col] = (
        df[col]
        .astype("string")
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
        .str.title()
    )

In [19]:
df[["first_name", "last_name"]].isna().sum()

first_name    3
last_name     4
dtype: int64

In [20]:
# Cleaning village

df["village"].value_counts(dropna=False)


village
Lilongwe      294
Dedza         162
Mazengera     129
NaN           114
MAZENGERA      48
             ... 
Ntchisi         1
Mtemwende       1
Mponera         1
Chiradzulu      1
MCHEDZA         1
Name: count, Length: 446, dtype: int64

In [21]:
df = prepare_village_for_geocoding(df, col="village")
df["place_type"].value_counts()
df[["village", "village_norm", "place_type", "geocode_query"]].head(20)

Unnamed: 0,village,village_norm,place_type,geocode_query
0,Nkhonde,nkhonde,village,nkhonde
1,Mozambique,mozambique,foreign_country,
2,Chembe,chembe,village,chembe
3,Chilikumanda,chilikumanda,village,chilikumanda
4,Mazengera,mazengera,village,mazengera
5,Mazengera,mazengera,village,mazengera
6,Kamphika,kamphika,village,kamphika
7,Mazengera,mazengera,village,mazengera
8,Mzuzu,mzuzu,village,mzuzu
9,Mazengera,mazengera,village,mazengera


In [22]:
df2, geo_df = geocode_unique_queries_mapbox(df, token)
# quick sanity checks
geo_df.sort_values(["error", "relevance"], ascending=[True, False]).head(20)
df2[["village", "place_type", "geocode_query", "lat", "lon", "place_name", "relevance", "error"]].head(30)

Unnamed: 0,village,place_type,geocode_query,lat,lon,place_name,relevance,error
0,Nkhonde,village,nkhonde,-15.300392,34.701808,"Malawi, Neno, Malawi",0.5,
1,Mozambique,foreign_country,,,,,,
2,Chembe,village,chembe,-14.0218,34.846135,"Chembe, Mangochi, Malawi",1.0,
3,Chilikumanda,village,chilikumanda,-13.215804,33.739164,Malawi,1.0,
4,Mazengera,village,mazengera,-15.300392,34.701808,"Malawi, Neno, Malawi",0.5,
5,Mazengera,village,mazengera,-15.300392,34.701808,"Malawi, Neno, Malawi",0.5,
6,Kamphika,village,kamphika,-13.215804,33.739164,Malawi,0.5,
7,Mazengera,village,mazengera,-15.300392,34.701808,"Malawi, Neno, Malawi",0.5,
8,Mzuzu,village,mzuzu,-11.460752,34.022642,"Mzuzu, Malawi",1.0,
9,Mazengera,village,mazengera,-15.300392,34.701808,"Malawi, Neno, Malawi",0.5,


In [23]:
df2["village_norm_cmp"] = df2["village"].apply(normalize)
df2["place_first"] = df2["place_name"].str.split(",").str[0]
df2["place_first_norm_cmp"] = df2["place_first"].apply(normalize)

df_matched = df2[df2["village_norm_cmp"] == df2["place_first_norm_cmp"]]
df_diff = df2[df2["village_norm_cmp"] != df2["place_first_norm_cmp"]]

In [24]:
df_matched = df2[df2["village"] == df2["place_first"]]

In [25]:
df_diff = df2[df2["village"] != df2["place_first"]]

In [26]:
pkl_path = f"{base_dir}/2022_theatre_books_clean.pkl"
xlsx_path = f"{base_dir}/2022_theatre_books_clean.xlsx"
df2.to_pickle(pkl_path)
df2.to_excel(xlsx_path, index=False)

In [27]:
pkl_path = f"{base_dir}/2022_theatre_books_clean_known_villages.pkl"
xlsx_path = f"{base_dir}/2022_theatre_books_clean_known_villages.xlsx"
df_matched.to_pickle(pkl_path)
df_matched.to_excel(xlsx_path, index=False)

In [28]:
pkl_path = f"{base_dir}/2022_theatre_books_clean_unknown_villages.pkl"
xlsx_path = f"{base_dir}/2022_theatre_books_clean_unknown_villages.xlsx"
df_diff.to_pickle(pkl_path)
df_diff.to_excel(xlsx_path, index=False)