In [14]:
import pandas as pd
import pathlib
import warnings
from experiment.utils import transformation
import json

In [15]:
# suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [16]:
# Resolve the input/output locations relative to the notebook's working
# directory (both live under the parent directory's "data" folder).
working_dir = pathlib.Path.cwd()
source_data = working_dir.parent / "data" / "input" / "BT_BEYIN_KSIZ.xls"
DB_PATH = working_dir.parent / "data" / "output" / "db.json"

# Load the raw reports spreadsheet into a DataFrame.
df_reports = pd.read_excel(source_data)

# Load the JSON database. The encoding is pinned to UTF-8 so the read does not
# depend on the platform's locale default encoding.
with open(DB_PATH, "r", encoding="utf-8") as file:
    db = json.load(file)

In [17]:
# Apply the transformation helper to every ACIKLAMA entry and spread the
# per-row results over three new columns, aligned on the original row index.
# NOTE(review): assumes each helper result unpacks into exactly
# (text, report_date, study_no) -- confirm against the helper's return value.
result = df_reports["ACIKLAMA"].apply(transformation.prepare_data_transformations)
extracted_columns = pd.DataFrame(result.tolist(), index=result.index)
df_reports[["text", "report_date", "study_no"]] = extracted_columns

In [18]:
# Number of rows whose extracted report_date is the empty string.
df_reports["report_date"].eq("").sum()

1780

In [19]:
# Give the identifier columns English names, then discard the columns that are
# not carried forward.
column_renames = {"HASTA_NO": "patient_no", "PROTOKOL_NO": "protocol_no"}
df_reports = df_reports.rename(columns=column_renames).drop(
    columns=["ACIKLAMA", "AD_SOYAD"]
)

In [None]:
# Keep only the rows whose patient number is not listed under db["annotated"].
already_annotated = df_reports["patient_no"].isin(db["annotated"])
df_reports = df_reports.loc[~already_annotated]

In [20]:
# Parse report_date into datetimes; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
df_reports = df_reports.assign(
    report_date=pd.to_datetime(df_reports["report_date"], errors="coerce")
)

In [21]:
# Count how many report rows each patient has; the result has one row per
# patient_no with the tally in "report_count".
df_report_count = (
    df_reports.groupby("patient_no")
    .size()
    .reset_index(name="report_count")
)

In [22]:
# Reduce to one row per patient (the first row encountered for each
# patient_no), attach that patient's report count, and order the result by
# ascending report count, then ascending report date.
# NOTE(review): de-duplication runs before the sort, so the retained row is the
# first one in the current row order, not necessarily the earliest-dated
# report -- confirm this is intended.
first_row_per_patient = df_reports.drop_duplicates(subset=["patient_no"], keep="first")
with_report_counts = first_row_per_patient.merge(
    df_report_count, on="patient_no", how="left"
)
df_reports = with_report_counts.sort_values(
    ["report_count", "report_date"], ascending=[True, True]
)

In [23]:
# Pass all report texts through the sentence cleaning pipeline and store its
# output as the "lemmatised" column.
report_texts = df_reports["text"].tolist()
df_reports["lemmatised"] = transformation.sentence_cleaning_pipeline(report_texts)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gokasci/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Number of rows remaining after de-duplication.
df_reports.shape[0]

24288

In [26]:
# Write the prepared frame to data/output/tasks.csv (the DataFrame index is
# written as well, since to_csv is called with its defaults).
tasks_csv_path = working_dir.parent / "data" / "output" / "tasks.csv"
df_reports.to_csv(tasks_csv_path)