In [1]:
import pandas as pd
import pathlib
import warnings
import json

import experiment.utils.query as query_utils
from experiment.utils import transformation

In [2]:
# Silence every UserWarning for the rest of the session so library notices
# do not clutter the cell outputs.
warnings.filterwarnings(action="ignore", category=UserWarning)

In [None]:
# NOTE(review): this looks like an unfinished placeholder for loading a raw
# export from <project root>/tmp/data. The path built on the next line is not
# assigned to anything, and its last component is an empty string, so the
# file name still has to be filled in.
transformation.get_project_root() / "tmp" / "data" / ""
# NOTE(review): read_csv is given an empty path and its result is not bound to
# a name. Presumably this was meant to read the path above into df_reports
# (later cells expect ACIKLAMA / HASTA_NO / PROTOKOL_NO columns) — TODO confirm.
pd.read_csv("")

In [None]:
# Upsert `datasets` into raw_data.raw_annotation_project_datasets, resolving
# conflicts on the raw_annotation_project_datasets_sk_key constraint.
# NOTE(review): select_cols, cols_to_upsert and datasets are not defined in any
# cell visible above; they must already exist in the kernel for this to run.
upsert_kwargs = {
    "schema_name": "raw_data",
    "table_name": "raw_annotation_project_datasets",
    "select_cols": select_cols,
    "constraint": "raw_annotation_project_datasets_sk_key",
    "cols_to_upsert": cols_to_upsert,
    "values": datasets,
    "timestamp_col_name": "last_update",
}
query_utils.upsert_values(**upsert_kwargs)

In [3]:
# Read the labeled tasks straight from the database, pulling each field out of
# the `data` column with `->>` so it arrives as its own flat column.
labeled_tasks_sql = """
            --SELECT * FROM annotation.choices
            --SELECT * FROM annotation.labels
            SELECT id, 
                data ->> 'text' as text, 
                data ->> 'study_no' as study_no,
                data ->> 'patient_no' as patient_no,
                data ->> 'protocol_no' as protocol_no,
                data ->> 'report_date' as report_date,
                data ->> 'report_count' as report_count  
            FROM task
            WHERE is_labeled = TRUE
        """

# run the query and keep the result as the working frame
df_reports = query_utils.get_select_values(labeled_tasks_sql)

In [None]:
# Run the transformation helper on every ACIKLAMA entry, then spread the values
# it returns per row into the text / report_date / study_no columns.
parsed_rows = df_reports["ACIKLAMA"].apply(transformation.prepare_data_transformations)
parsed_columns = pd.DataFrame(parsed_rows.tolist(), index=parsed_rows.index)
df_reports[["text", "report_date", "study_no"]] = parsed_columns

In [None]:
# Count the rows whose report_date is an empty string.
report_date_values = df_reports["report_date"].values
is_blank_date = report_date_values == ""
is_blank_date.sum()

In [None]:
# Rename HASTA_NO / PROTOKOL_NO to patient_no / protocol_no, then discard the
# ACIKLAMA and AD_SOYAD columns.
column_renames = {"HASTA_NO": "patient_no", "PROTOKOL_NO": "protocol_no"}
renamed_reports = df_reports.rename(columns=column_renames)
df_reports = renamed_reports.drop(columns=["ACIKLAMA", "AD_SOYAD"])

In [None]:
# Collect the distinct patient numbers that already have a labeled task.
annotated_patients_sql = """
            SELECT 
                DISTINCT data ->> 'patient_no' as patient_no
            FROM task
            WHERE is_labeled = TRUE
        """

# fetch the matches and flatten the single column into a plain list
annotated_patients_df = query_utils.get_select_values(annotated_patients_sql)
annotated_patient_nos = annotated_patients_df["patient_no"].to_list()

In [None]:
# Keep only the reports whose patient has not been annotated yet.
# NOTE(review): isin() compares values as they are; presumably patient_no has
# the same dtype here as the entries of annotated_patient_nos — TODO confirm.
is_already_annotated = df_reports["patient_no"].isin(annotated_patient_nos)
df_reports = df_reports[~is_already_annotated]

In [None]:
# Parse report_date into datetimes; errors="coerce" turns unparseable values
# into NaT instead of raising.
parsed_report_dates = pd.to_datetime(df_reports["report_date"], errors="coerce")
df_reports["report_date"] = parsed_report_dates

In [None]:
# Count how many reports each patient has; the result has one row per
# patient_no with the tally in report_count.
reports_by_patient = df_reports.groupby("patient_no")
report_counts = reports_by_patient.agg(report_count=("patient_no", "count"))
df_report_count = report_counts.reset_index()

In [None]:
# Reduce to one row per patient, attach that patient's report count, and order
# the result by report count and then report date (both ascending).
#
# Any report_count column already present on df_reports (for example one
# selected by the database query, or one left behind by an earlier run of this
# cell) is dropped first. Without that, merge() would suffix the clashing
# columns as report_count_x / report_count_y and the sort on "report_count"
# would raise KeyError.
#
# NOTE(review): duplicates are removed before sorting, so the row kept for each
# patient is the first one in the current row order, not necessarily the
# earliest by report_date — confirm this is intended.
reports_without_count = df_reports.drop(columns=["report_count"], errors="ignore")
one_report_per_patient = reports_without_count.drop_duplicates(
    subset=["patient_no"], keep="first"
)
df_reports = one_report_per_patient.merge(
    df_report_count, on="patient_no", how="left"
).sort_values(["report_count", "report_date"], ascending=[True, True])

In [None]:
# Pass every report text through the sentence-cleaning pipeline and store what
# it returns in the lemmatised column.
report_texts = df_reports["text"].to_list()
df_reports["lemmatised"] = transformation.sentence_cleaning_pipeline(report_texts)

In [None]:
# Number of rows left in df_reports at this point.
df_reports.shape[0]

In [None]:
# Write the prepared reports to data/input/upload_tasks.csv under the project root.
# NOTE(review): to_csv is called with its defaults, which include writing the
# DataFrame index as the first column — confirm the consumer expects that.
upload_tasks_path = transformation.get_project_root() / "data" / "input" / "upload_tasks.csv"
df_reports.to_csv(upload_tasks_path)