In [1]:
import pandas as pd
import pathlib
import re
import warnings

In [2]:
# suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
# Locate the input spreadsheet relative to the current working directory
# (<cwd>/../data/input/BT_BEYIN_KSIZ.xls) and load it into a DataFrame.
working_dir = pathlib.Path.cwd()
source_data = pathlib.Path(working_dir.parent, "data", "input", "BT_BEYIN_KSIZ.xls")
df_reports = pd.read_excel(source_data)

In [4]:
# Regular expressions used to pull identifiers out of the free-text reports.
# film_no_pattern: a standalone run of six or more digits.
film_no_pattern = r"\b\d{6,}\b"
# rapor_tarihi_pattern ("rapor tarihi" = report date): three digit groups of
# 1-2, 1-2 and 2-4 digits, each pair separated by "-", "/" or ".".
rapor_tarihi_pattern = r"\b\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}\b"

In [5]:
# useful functions
def pattern_find_and_remove(text: str, pattern: str) -> tuple[str, str]:
    """
    Extract the first match of ``pattern`` from ``text`` and remove it.

    Args:
        text: The string to search.
        pattern: A regular expression. The whole match (group 0) is what gets
            extracted, so the pattern should not rely on capture groups.

    Returns:
        A ``(extracted, remaining)`` tuple. ``extracted`` is the first match,
        or ``""`` when nothing matches. ``remaining`` is ``text`` with every
        regex match equal to ``extracted`` removed, stripped of surrounding
        whitespace.
    """
    first_match = re.search(pattern, text)
    extracted_pattern = first_match.group(0) if first_match else ""

    # Nothing found (or an empty match): leave the text alone apart from the strip.
    if not extracted_pattern:
        return "", text.strip()

    # Remove through the regex rather than str.replace: a plain substring
    # replace would also delete the value where it merely occurs inside a
    # longer token that the pattern itself (e.g. via \b anchors) never matched.
    remaining = re.sub(
        pattern,
        lambda m: "" if m.group(0) == extracted_pattern else m.group(0),
        text,
    )

    return extracted_pattern, remaining.strip()

def replace_patterns_from_text(
    text: str, patterns: list[str], replace_with: str = ""
) -> str:
    """
    Substitute each literal in ``patterns`` with ``replace_with``.

    The entries are plain substrings (not regular expressions) and are applied
    one after another in list order, each on the result of the previous one.
    The final string is returned with leading/trailing whitespace stripped.
    """
    cleaned = text
    for literal in patterns:
        cleaned = cleaned.replace(literal, replace_with)
    return cleaned.strip()

def apply_transformations(text) -> tuple[str, str, str]:
    """
    Clean one raw report and pull the film number and report date out of it.

    Steps, in order:
      1. remove the known field labels and colon separators,
      2. turn newlines, "*", ":" and ";" into spaces,
      3. extract and remove the first ``film_no_pattern`` match,
      4. extract and remove the first ``rapor_tarihi_pattern`` match.

    Args:
        text: The raw report text. A missing value (``None`` or NaN) yields
            three empty strings; any other non-string value is converted
            with ``str()`` first.

    Returns:
        ``(cleaned_text, rapor_tarihi, film_no)``. The last two are ``""``
        when the corresponding pattern is not found.
    """
    # pandas represents empty spreadsheet cells as NaN (a float), which has no
    # .replace(); NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return "", "", ""
    if not isinstance(text, str):
        text = str(text)

    text = replace_patterns_from_text(
        text,
        [
            "RAPOR TARİHİ",
            "FİLM NO",
            "TETKİK NO",
            "ÇEKİM TARİHİ",
            "ÇEKİM  TARİHİ",
            "TETKİK TARİHİ",
            "Tetkik no",
            " :",
            ": ",
            " : ",
        ],
    )
    # Second pass over a subset of the same literals: deleting a separator in
    # the first pass can join two fragments into a label again
    # (e.g. "FİLM : NO" becomes "FİLM NO").
    text = replace_patterns_from_text(
        text, ["RAPOR TARİHİ", "FİLM NO", "TETKİK NO", " :", ": ", " : "]
    )
    # Remaining line breaks and punctuation become spaces so that the
    # neighbouring tokens stay separated.
    text = replace_patterns_from_text(text, ["\n", "*"], " ")
    text = replace_patterns_from_text(text, [":", ";"], " ")

    film_no, text = pattern_find_and_remove(
        text,
        film_no_pattern,
    )
    rapor_tarihi, text = pattern_find_and_remove(text, rapor_tarihi_pattern)

    return text.strip(), rapor_tarihi, film_no

In [6]:
# Clean every report in ACIKLAMA and spread the returned 3-tuples over three
# new columns, keeping the original row index.
result = df_reports["ACIKLAMA"].apply(apply_transformations)
df_reports[["text", "report_date", "study_no"]] = pd.DataFrame(
    list(result), index=result.index
)

In [7]:
# number of reports for which no date string was extracted
(df_reports["report_date"] == "").sum()

1780

In [8]:
# Rename the identifier columns to English and discard the ACIKLAMA and
# AD_SOYAD columns.
df_reports = df_reports.rename(
    columns={"HASTA_NO": "patient_no", "PROTOKOL_NO": "protocol_no"}
).drop(columns=["ACIKLAMA", "AD_SOYAD"])

In [9]:
# Convert the extracted date strings to datetimes; anything unparseable
# (including the empty string used for "no date found") becomes NaT.
# dayfirst=True: the strings come from Turkish-language reports, which are
# assumed to use the day-first convention (dd.mm.yyyy) -- confirm against a
# sample. Without it an ambiguous value such as "05.03.2020" is read
# month-first, which silently corrupts the date-based ordering done later.
df_reports["report_date"] = pd.to_datetime(
    df_reports["report_date"], errors="coerce", dayfirst=True
)

In [10]:
# number of report rows per patient, as a (patient_no, report_count) frame
df_report_count = (
    df_reports.groupby("patient_no")
    .agg(report_count=("patient_no", "count"))
    .reset_index()
)

In [11]:
# Keep the first row of each patient, attach that patient's report count, then
# sort ascending by report count and, within equal counts, by report date.
df_reports = (
    df_reports.drop_duplicates(subset=["patient_no"], keep="first")
    .merge(df_report_count, on="patient_no", how="left")
    .sort_values(by=["report_count", "report_date"], ascending=True)
)

In [12]:
# write the resulting frame to <cwd>/../data/output/tasks.csv
df_reports.to_csv(pathlib.Path(working_dir.parent, "data", "output", "tasks.csv"))