In [None]:
import pandas as pd
import warnings

from experiment.utils import dbutils
from experiment.utils import transformation

In [None]:
# Silence warnings of category UserWarning (and its subclasses) so they do
# not clutter the cell outputs; other warning categories are unaffected.
warnings.filterwarnings(action="ignore", category=UserWarning)

In [None]:
# Database helper from the project's `dbutils` module; the cells below run
# their SQL through `db.read_sql_query`.
db = dbutils.DatabaseUtils()

In [None]:
# Load every row and column of the reports table straight from the database.
query = """
            SELECT * FROM annotation.reports
        """

# Run the query through the project DB helper. The preview of the loaded
# frame lives in the following cell, so nothing is displayed here (showing
# `head()` in both cells rendered the same table twice).
df_reports = db.read_sql_query(query)

In [None]:
# Preview the first five loaded reports.
# NOTE(review): if the reports contain patient data, clear this output
# before sharing or committing the notebook.
df_reports.head(n=5)

In [None]:
# Run the project's transformation helper on every raw report.
# NOTE(review): each result is presumably a 3-item sequence
# (text, report date, study number) — confirm against
# `transformation.prepare_data_transformations`.
result = df_reports["report"].apply(transformation.prepare_data_transformations)

# Expand the per-report results into a frame aligned on the original index
# and store its columns under the three target column names.
extracted_columns = ["text", "report_date", "study_no"]
extracted = pd.DataFrame(result.tolist(), index=result.index)
df_reports[extracted_columns] = extracted

In [None]:
# Spot-check the newly added columns on a few rows only; rendering the whole
# frame bloats the notebook and may expose report contents in saved output.
df_reports.head()

In [None]:
# Count the reports whose extracted date is an empty string.
report_dates = df_reports["report_date"].values
(report_dates == "").sum()

In [None]:
# Select the distinct patient numbers of all tasks already marked as labeled.
query = """
            SELECT 
                DISTINCT data ->> 'patient_no' as patient_no
            FROM task
            WHERE is_labeled = TRUE
        """

# Fetch the result and keep just the patient numbers as a plain Python list.
df_annotated = db.read_sql_query(query)
annotated_patient_nos = df_annotated["patient_no"].tolist()

In [None]:
# Keep only the reports of patients that have no labeled task yet.
# `.copy()` detaches the filtered rows from the unfiltered frame, so column
# assignments made on the result do not raise SettingWithCopyWarning.
# NOTE(review): the `->>` operator used to build `annotated_patient_nos`
# presumably yields text values — confirm `df_reports["patient_no"]` has a
# matching dtype, otherwise `isin` matches nothing and no patient is excluded.
is_annotated = df_reports["patient_no"].isin(annotated_patient_nos)
df_reports = df_reports.loc[~is_annotated].copy()

In [None]:
# Parse the extracted report dates into datetimes. `errors="coerce"` turns
# empty or unparsable values into NaT instead of raising.
# `assign` builds a new frame rather than writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning; the result is the same.
df_reports = df_reports.assign(
    report_date=pd.to_datetime(df_reports["report_date"], errors="coerce")
)

In [None]:
# Count how many report rows each patient has; the result has one row per
# patient with the columns `patient_no` and `report_count`.
reports_per_patient = df_reports.groupby("patient_no").agg(
    report_count=("patient_no", "count")
)
df_report_count = reports_per_patient.reset_index()

In [None]:
# Reduce to one row per patient, attach the report counts and order the result.
# Sorting by date *before* dropping duplicates makes the kept row well defined
# (the earliest dated report per patient; rows without a parsable date sort
# last) instead of depending on the row order returned by the database.
df_first_reports = df_reports.sort_values(
    ["patient_no", "report_date"]
).drop_duplicates(subset=["patient_no"], keep="first")

# Patients with the fewest reports come first; ties are ordered by report date.
df_reports = df_first_reports.merge(
    df_report_count, on="patient_no", how="left"
).sort_values(["report_count", "report_date"], ascending=[True, True])

In [None]:
# Number of rows left in the frame that is about to be exported.
df_reports.shape[0]

In [None]:
# Write the prepared reports to <project root>/data/input/upload_tasks.csv.
# assumes `get_project_root()` returns a pathlib.Path — TODO confirm
output_path = transformation.get_project_root() / "data" / "input" / "upload_tasks.csv"

# `to_csv` does not create missing directories, so make sure the target
# folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the DataFrame index is written as an unnamed first column
# (`to_csv` default) — confirm the consumer of this file expects it;
# otherwise pass `index=False`.
df_reports.to_csv(output_path)