In [1]:
import pandas as pd
import pathlib
import warnings
import json

import experiment.utils.query as query_utils
from experiment.utils import transformation

In [2]:
# suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
# Query text: every column of every row in the `annotation.reports` table.
query = """
            SELECT * FROM annotation.reports
        """

# Run the query through the project's query helper. The result is used as a
# pandas DataFrame by the cells below (presumably one row per report —
# TODO confirm against `get_select_values`).
df_reports = query_utils.get_select_values(query)

In [5]:
# Run the project's transformation on every raw `report` value.
# NOTE(review): each result is presumably a 3-item sequence
# (text, report date, study number) — confirm against
# `transformation.prepare_data_transformations`.
result = df_reports["report"].apply(transformation.prepare_data_transformations)

# Expand the per-report results into a frame that shares the original index,
# then store its columns as `text`, `report_date` and `study_no`.
df_reports[["text", "report_date", "study_no"]] = pd.DataFrame(
    data=result.tolist(),
    index=result.index,
)

In [6]:
# Sanity check: how many reports have an empty-string `report_date`?
df_reports["report_date"].eq("").sum()

1780

In [7]:
# Query text: the distinct `patient_no` values (read from the JSON `data`
# column) of all tasks that are already labeled.
query = """
            SELECT 
                DISTINCT data ->> 'patient_no' as patient_no
            FROM task
            WHERE is_labeled = TRUE
        """

# Execute the query and flatten its single `patient_no` column into a list.
annotated_patient_nos = (
    query_utils.get_select_values(query)["patient_no"]
    .tolist()
)

In [8]:
# Keep only the reports of patients that have no labeled task yet.
#
# `.copy()` detaches the filtered result from the unfiltered frame. Without it
# the result is a slice, and assigning a column on it later (as the
# `report_date` conversion does) raises pandas' SettingWithCopyWarning, which
# is not a UserWarning and therefore not silenced by the filter set above.
#
# NOTE(review): `annotated_patient_nos` is produced by a JSON `->>` text
# extraction, so its values are presumably strings; confirm that
# df_reports["patient_no"] has a matching dtype, otherwise `isin` matches
# nothing and no patient is filtered out.
df_reports = df_reports.loc[
    ~df_reports["patient_no"].isin(annotated_patient_nos)
].copy()

In [9]:
# Parse `report_date` into datetimes. With errors="coerce", values that cannot
# be parsed become NaT instead of raising.
df_reports["report_date"] = pd.to_datetime(
    df_reports["report_date"],
    errors="coerce",
)

In [10]:
# Count the remaining reports per patient.
# Result columns: `patient_no`, `report_count`.
df_report_count = (
    df_reports
    .groupby("patient_no")
    .agg(report_count=("patient_no", "count"))
    .reset_index()
)

In [11]:
# Reduce to one row per patient, attach the per-patient report count, and
# order the result. Steps, in the order the code executes them:
#   1. keep the first row of each `patient_no` in the current row order,
#   2. left-join `report_count` from `df_report_count`,
#   3. sort ascending by `report_count`, then by `report_date`.
#
# NOTE(review): de-duplication runs before the sort, so the row kept per
# patient is whichever comes first in the current row order, not necessarily
# the earliest-dated report. The query that loads the reports has no ORDER BY,
# so that order may not be stable — confirm this is intended.
df_reports = (
    df_reports
    .drop_duplicates(subset=["patient_no"], keep="first")
    .merge(df_report_count, on="patient_no", how="left")
    .sort_values(by=["report_count", "report_date"], ascending=[True, True])
)

In [13]:
# Number of rows currently in `df_reports`.
df_reports.shape[0]

24354

In [14]:
# Write the prepared frame to <project root>/data/input/upload_tasks.csv.
# NOTE(review): `to_csv` writes the DataFrame index as an extra unnamed first
# column by default; confirm the consumer of this file expects that column.
output_path = transformation.get_project_root() / "data" / "input" / "upload_tasks.csv"
df_reports.to_csv(output_path)