In [None]:
import pandas as pd
import pathlib
import warnings
import json

from experiment.utils import transformation

In [None]:
# suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Load the two inputs of this notebook: the raw report export (Excel) and the
# JSON "db" file whose "annotated" entry is used further down to filter
# patients.

# Resolve the project root once and derive both paths from it.
PROJECT_ROOT = transformation.get_project_root()

# get the source file
source_data = PROJECT_ROOT / "data" / "input" / "BT_BEYIN_KSIZ.xls"
df_reports = pd.read_excel(source_data)

DB_PATH = PROJECT_ROOT / "data" / "output" / "db.json"

# Decode explicitly as UTF-8: without an encoding, open() falls back to the
# platform locale, so the same JSON file could load differently (or fail)
# from one machine to another.
with open(DB_PATH, "r", encoding="utf-8") as file:
    db = json.load(file)

In [None]:
# Run the project's transformation helper on every free-text entry in the
# ACIKLAMA column and spread its per-row result over three new columns:
# "text", "report_date" and "study_no".
# NOTE(review): this assumes the helper returns a 3-item sequence per row in
# exactly that order -- confirm against transformation.prepare_data_transformations.
extracted = df_reports["ACIKLAMA"].apply(
    transformation.prepare_data_transformations
)
extracted_frame = pd.DataFrame(extracted.tolist(), index=extracted.index)
df_reports[["text", "report_date", "study_no"]] = extracted_frame

In [None]:
# Sanity check: how many rows have an empty string as their extracted report
# date? The count is shown as the cell output.
empty_date_mask = df_reports["report_date"].values == ""
empty_date_mask.sum()

In [None]:
# Give the identifier columns English names, then discard the columns that
# are no longer needed: the raw free-text column (ACIKLAMA), whose contents
# were already extracted above, and AD_SOYAD.
column_renames = {
    "HASTA_NO": "patient_no",
    "PROTOKOL_NO": "protocol_no",
}
df_reports = df_reports.rename(columns=column_renames)
df_reports = df_reports.drop(columns=["ACIKLAMA", "AD_SOYAD"])

In [None]:
# Keep only the reports of patients that are not listed under the
# "annotated" key of the db file.
already_annotated = df_reports["patient_no"].isin(db["annotated"])
df_reports = df_reports.loc[~already_annotated]

In [None]:
# Parse the extracted report dates into datetimes. errors="coerce" turns any
# value that cannot be parsed into NaT instead of raising.
# NOTE(review): no explicit format/dayfirst is given -- confirm that the
# extracted date strings are parsed in the intended day/month order.
parsed_report_dates = pd.to_datetime(df_reports["report_date"], errors="coerce")
df_reports["report_date"] = parsed_report_dates

In [None]:
# Build a two-column frame (patient_no, report_count) holding the number of
# report rows each patient has.
df_report_count = (
    df_reports
    .groupby("patient_no")
    .agg(report_count=("patient_no", "count"))
    .reset_index()
)

In [None]:
# Reduce the data to one row per patient, attach that patient's report count
# and order the result by report count, then report date (both ascending).
# NOTE(review): keep="first" runs before the sort, so the row kept for each
# patient is the first in current row order, not necessarily the earliest
# report -- confirm this is intended.
one_row_per_patient = df_reports.drop_duplicates(
    subset=["patient_no"], keep="first"
)
with_report_count = one_row_per_patient.merge(
    df_report_count, on="patient_no", how="left"
)
df_reports = with_report_count.sort_values(
    by=["report_count", "report_date"], ascending=True
)

In [None]:
# Pass all report texts through the project's sentence-cleaning pipeline and
# store the output in a new "lemmatised" column.
# NOTE(review): assumes the pipeline returns one item per input text, in the
# same order -- confirm against transformation.sentence_cleaning_pipeline.
report_texts = df_reports["text"].to_list()
df_reports["lemmatised"] = transformation.sentence_cleaning_pipeline(report_texts)

In [None]:
# Number of rows (one per remaining patient) that will be exported.
df_reports.shape[0]

In [None]:
# Write the prepared frame to data/output/tasks.csv under the project root.
# The DataFrame index is written as the first (unnamed) CSV column.
tasks_csv_path = transformation.get_project_root() / "data" / "output" / "tasks.csv"
df_reports.to_csv(tasks_csv_path)