# Import libraries

In [1]:
import os
import pandas as pd
import datetime as dt
import numpy as np
import re

# Usefull functions

In [2]:
def to_snake_case(col):
    col = col.strip().lower()
    col = col.replace("/", "_")
    col = re.sub(r"[^\w\s]", "", col)      # remove punctuation (#, /, -, ())
    col = re.sub(r"\s+", "_", col)         # spaces → underscores
    col = re.sub(r"_+", "_", col)          # collapse multiple underscores
    # fix leading digits (e.g. 1st_assistent → first_assistent)
    col = re.sub(r"^1st_", "first_", col)
    col = re.sub(r"^2nd_", "second_", col)
    col = re.sub(r"^3nd_", "third", col)
    
    return col.strip("_")


def excel_time_to_minutes(x):
    if pd.isna(x):
        return np.nan
    if isinstance(x, dt.time):
        return x.hour * 60 + x.minute + x.second / 60
    try:
        return pd.to_timedelta(x).total_seconds() / 60
    except Exception:
        return np.nan

In [3]:
base_dir ="../Nkhoma_data/Data"

In [4]:
# all the files
os.listdir(base_dir)

['old_theatre_books_clean.xlsx',
 'Theatre_Book-Database 2025-plain.xlsx',
 'old_theatre_books_clean.pkl',
 'Old Theatre Books.xlsx',
 '.ipynb_checkpoints',
 'Theatre_Book-Database 2022 Auswertung-Arbeitsversion.xlsx',
 'Theatre_Book-Database 2024 Auswertung-Arbeitsversion.xlsx',
 'theatre_book_database_2022_clean.pkl',
 'Theatre_Book-Database 2023 Auswertung-Arbeitsversion.xlsx',
 'theatre_book_database_2022_clean.xlsx']

# Lets clean Old Theatre Books.xlsx

In [5]:
file_to_clean = "Old Theatre Books.xlsx"
path = f"{base_dir}/{file_to_clean}"
df = pd.read_excel(path, engine="openpyxl")  # often works even if extension is wrong
df.head()

Unnamed: 0,Years,Caseload
0,2023.0,
1,2022.0,1539.0
2,2021.0,1244.0
3,2020.0,1103.0
4,2019.0,1325.0


In [6]:
df.columns = [to_snake_case(c) for c in df.columns]
# Remove rows where Caseload is missing
# (these are usually header artifacts or incomplete summary rows)
df = df.dropna(subset=["caseload"]).copy()

# Convert Years from float to integer
# (Excel often stores years as floats, e.g. 2022.0)
df["years"] = df["years"].astype(int)

# Sort the data chronologically by year
# and reset the index so it runs from 0..N cleanly
df = df.sort_values("years").reset_index(drop=True)

# Round Caseload values and convert to integers
# (useful if they were read as floats like 1539.0)
df["caseload"] = df["caseload"].round().astype(int)

# Create a complete sequence of years (snake_case!)
full_years = pd.DataFrame(
    {"years": range(df["years"].min(), df["years"].max() + 1)}
)

# Merge so missing years become explicit
df = full_years.merge(df, on="years", how="left")

df.head()

Unnamed: 0,years,caseload
0,1973,630.0
1,1974,576.0
2,1975,
3,1976,
4,1977,


In [7]:
pkl_path = f"{base_dir}/old_theatre_books_clean.pkl"
xlsx_path = f"{base_dir}/old_theatre_books_clean.xlsx"
df.to_pickle(pkl_path)
df.to_excel(xlsx_path, index=False)

# Next file

In [11]:
file_to_clean = "Theatre_Book-Database 2023 Auswertung-Arbeitsversion.xlsx"
path = f"{base_dir}/{file_to_clean}"
df = pd.read_excel(path, engine="openpyxl")
df.head()

  warn(msg)


Unnamed: 0,Theatre Book #,Hospital ID #,DATE of Surgery,First Name,Last Name,Age (years),Sex,Village,Surgeon,1st Assistent/Instructor,...,Urgency,Surgery severity,ASA-Score,Year of birth,Operation time (minutes),Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34
0,230001,,2023-01-01 00:00:00,SUZEN,WISDON,26.0,F,MITAMBO,Obs/Gyn,,...,Emergency,Major,,1997.0,00:45:00,,Calculated: do not fill out,Fill out for every patient,Fill out for all PAACS cases,Fill out if possible
1,230002,,2023-01-02 00:00:00,BEZAI,MANUEL,37.0,M,MAZENGER,Terry,Vitu,...,Urgent,Major,,1986.0,1899-12-29 13:15:00,,,,,
2,230003,,2023-01-02 00:00:00,ELLINA,LUPIYA,30.0,F,CHIDUMA,Obs/Gyn,Other,...,Elective,Major,,1993.0,1899-12-29 22:55:00,,,,,
3,230004,,2023-01-02 00:00:00,HAWA,SAMSON,28.0,F,TAMBALA,Obs/Gyn,Other,...,Emergency,Intermediate,,1995.0,1899-12-29 18:30:00,,,,,
4,230005,,2023-01-03 00:00:00,JUDITH,BONFACE,22.0,F,TAMBALA,Obs/Gyn,Other,...,Emergency,Intermediate,,2001.0,00:43:00,,,,,
