# Import libraries

In [21]:
import os
import pandas as pd
import datetime as dt
import numpy as np
import sys
from pathlib import Path
import requests
import time
import re
sys.path.append(os.path.abspath("../"))

from helpers.df_formating import (
    convert_to_integer, 
    convert_cols_to_snake_case,
    normalize,
    drop_row_if_not_complete, 
    drop_if_unnamed, 
    excel_time_to_minutes)

from helpers.geo_coding import prepare_village_for_geocoding, geocode_place_mapbox_v5, geocode_unique_queries_mapbox

def to_analyze(df, cols, out_dir="/mnt/c/Users/matth/Desktop", filename="to_analyze.csv"):
    """Export a subset of columns to a CSV file for manual inspection.

    Args:
        df: Source DataFrame.
        cols: Column labels to export (anything accepted by ``df[cols]``).
        out_dir: Directory the CSV is written to. The default keeps the
            original hard-coded desktop location, so existing calls behave
            as before.
        filename: Name of the CSV file created inside ``out_dir``.

    Returns:
        None. The selected columns are written to ``out_dir/filename``
        without the index.
    """
    # Use a distinct local name; the original shadowed the function itself.
    subset = df[cols]
    target = Path(out_dir) / filename
    subset.to_csv(target, index=False)

def is_only_id_columns(df: pd.DataFrame):
    """Return True when every column label looks like an identifier column.

    A label counts as an identifier when one of its words is ``id``,
    ``number``, ``nr`` or ``no``, or when it contains ``#``. Words are
    split on non-letters and on lower-to-upper camelCase boundaries, so
    ``hospital_id``, ``Patient No.``, ``PatientID`` and ``Theatre Book #``
    match, while ``side`` or ``final_diagnosis`` do not (a plain substring
    test would match both of those).

    Args:
        df: DataFrame whose column labels are inspected.

    Returns:
        bool: True if all labels look like identifiers. A frame with no
        columns yields True, because ``all`` of an empty sequence is True.
    """
    id_keywords = {"id", "number", "#", "nr", "no"}

    def looks_like_id(label) -> bool:
        # str() first: column labels are not guaranteed to be strings.
        spaced = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", str(label))
        words = re.findall(r"[a-z]+|#", spaced.lower())
        return any(word in id_keywords for word in words)

    return all(looks_like_id(label) for label in df.columns)

def drop_sparse_columns_or_skip(
    df: pd.DataFrame,
    empty_threshold: float = 0.95,
    treat_whitespace_as_empty: bool = True,
    also_treat_strings_as_na: tuple[str, ...] = ("na", "n/a", "null", "none", "")
) -> "pd.DataFrame | None":
    """Drop mostly-empty columns, or signal that the frame should be skipped.

    Works on a copy; the caller's frame is not modified.

    Args:
        df: Input frame. ``None`` or an empty frame is skipped.
        empty_threshold: Columns whose fraction of missing values is
            strictly greater than this value are dropped.
        treat_whitespace_as_empty: When True, cells consisting only of
            whitespace are converted to NaN before counting.
        also_treat_strings_as_na: Placeholder strings that count as
            missing. They are compared case-insensitively against the
            stripped cell text. Because cells are stripped before the
            comparison, keeping ``""`` in this tuple also blanks
            whitespace-only cells.

    Returns:
        The cleaned copy, or ``None`` when the input is ``None``/empty,
        when every cell is missing, when no column survives the threshold,
        or when ``is_only_id_columns`` reports that only identifier
        columns remain.
    """
    if df is None or df.empty:
        return None

    work = df.copy()

    if treat_whitespace_as_empty:
        work = work.replace(r"^\s+$", np.nan, regex=True)

    if also_treat_strings_as_na:
        na_set = {s.strip().lower() for s in also_treat_strings_as_na}

        def _blank_placeholder(value):
            if isinstance(value, str) and value.strip().lower() in na_set:
                return np.nan
            return value

        # DataFrame.applymap is deprecated in favour of DataFrame.map; the
        # lookup is done on the class so a column that happens to be named
        # "map" cannot be picked up by attribute access on the instance.
        elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
        work = elementwise(work, _blank_placeholder)

    if work.isna().to_numpy().all():
        return None

    # Drop columns whose missing fraction exceeds empty_threshold.
    missing_frac = work.isna().mean(axis=0)
    cols_to_drop = missing_frac[missing_frac > empty_threshold].index
    work = work.drop(columns=cols_to_drop)

    # If nothing meaningful remains -> skip
    if work.shape[1] == 0 or work.isna().to_numpy().all():
        return None

    # Checked only after sparse columns are gone: a frame that is left with
    # nothing but identifier columns carries no data worth keeping.
    if is_only_id_columns(work):
        return None

    return work


In [22]:
# Folder (relative to the notebook) that holds the theatre-book workbooks.
base_dir = "Nkhoma_data/Data"

In [23]:
# List every entry in base_dir except those whose name contains "clean" or
# "ipynb_checkpoints" (compared case-insensitively).
filtered = [
    entry
    for entry in os.listdir(base_dir)
    if not any(
        marker in entry.lower()
        for marker in ("clean", "ipynb_checkpoints")
    )
]

filtered

['Theatre_Book-Database 2025-plain.xlsx',
 'Old Theatre Books.xlsx',
 'Theatre_Book-Database 2022 Auswertung-Arbeitsversion.xlsx',
 'Theatre_Book-Database 2024 Auswertung-Arbeitsversion.xlsx',
 'Theatre_Book-Database 2023 Auswertung-Arbeitsversion.xlsx']

In [24]:
# Display the current value of `path`. In this export it is assigned inside
# the file loop of a later cell, so that cell must have been run beforehand.
path

'Nkhoma_data/Data/Theatre_Book-Database 2025-plain.xlsx'

In [27]:
# Re-display the list of candidate files.
filtered

['Theatre_Book-Database 2025-plain.xlsx',
 'Old Theatre Books.xlsx',
 'Theatre_Book-Database 2022 Auswertung-Arbeitsversion.xlsx',
 'Theatre_Book-Database 2024 Auswertung-Arbeitsversion.xlsx',
 'Theatre_Book-Database 2023 Auswertung-Arbeitsversion.xlsx']

In [29]:
# Only the first listed workbook is processed here; `path`, `df` and `clean`
# stay available at notebook level for the cells that follow.
for file in filtered[:1]:
    path = f"{base_dir}/{file}"
    df = pd.read_excel(path, engine="openpyxl")
    clean = drop_sparse_columns_or_skip(df, empty_threshold=0.95)
    if clean is not None:
        print(clean.shape)
        continue
    print("Skipping: dataframe is completely empty after cleaning.")


(3000, 1)


  warn(msg)
  work = work.applymap(


In [30]:
# Display the result that the file loop stored in `clean`.
clean

Unnamed: 0,Theatre Book #
0,250001
1,250002
2,250003
3,250004
4,250005
...,...
2995,252996
2996,252997
2997,252998
2998,252999


In [10]:
# Print the column labels of `df`.
# NOTE(review): this cell's execution count (10) is lower than that of the
# cells above, so its output presumably reflects an earlier state of `df`
# than the one loaded by the file loop — confirm by re-running in order.
print(df.columns)

Index(['theatre_book', 'hospital_id', 'date_of_surgery', 'first_name',
       'last_name', 'age_years', 'sex', 'village', 'surgeon',
       'first_assistent_instructor', 'second_assistent', 'anaestesist',
       'nurse', 'anesthesia', 'department', 'indication_for_surgery',
       'surgery_type', 'final_diagnosis_category', 'final_diagnosis_free_text',
       'side', 'main_procedure_category', 'procedure_free_text', 'histology',
       'starting_time', 'finishing_time', 'urgency', 'surgery_severity',
       'asa_score', 'year_of_birth', 'operation_time_minutes',
       'op_minutes_calc', 'op_minutes_recorded'],
      dtype='object')
