In [33]:
import warnings
import time

import pandas as pd

from experiment.utils import dbutils, logger, transformation
from experiment.utils.tables.upload_tasks_table import UploadTasksTable

In [34]:
# suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [35]:
# Database helper used by the queries and the upsert in the cells below.
db = dbutils.DatabaseUtils()

# Logger configured with logging_level "DEBUG"; its output path is
# <project root>/tmp/report_translation.log.
report_translation_log = transformation.get_project_root() / "tmp" / "report_translation.log"
lg = logger.Logger(output_path=report_translation_log, logging_level="DEBUG")

In [36]:
# Upper bound on how many not-yet-translated reports are picked (via .head())
# for translation and upsert in one run of this notebook.
TRANSLATE_N_MORE_REPORTS = 1

In [37]:
# Select the id, patient number and original text of every row in
# annotation.reports (no WHERE clause: the whole table is read).
query = """
            SELECT report_id, patient_no, report_original FROM annotation.reports
        """

# Run the query through the database helper and preview the first rows.
df_reports = db.read_sql_query(query)
df_reports.head()

Unnamed: 0,report_id,patient_no,report_original
0,23204350,2008726684,RAPOR TARİHİ:19/11/2022 FİLM NO:\n\nBeyin B...
1,22810477,2008231237,RAPOR TARİHİ: 14/07/2022 FİLM NO:\n\nBeyin ...
2,23072879,2004022945,RAPOR TARİHİ: 13/10/2022 FİLM NO: 12831139\n...
3,22664628,2008231238,RAPOR TARİHİ: 16.06.2022 FİLM NO:\n\nBeyin ...
4,23452265,2009543927,RAPOR TARİHİ: 10/01/2023 FİLM NO: \n\nKafa t...


In [38]:
# Run the project's report-preparation helper on every original report text and
# spread the values it returns per report over three new columns.
extracted_cols = ["report_original_clean", "report_date", "study_no"]

result = df_reports["report_original"].apply(transformation.prepare_data_transformations)
extracted = pd.DataFrame(result.tolist(), index=result.index)

# Columns are matched by position: first value -> clean text, second -> report
# date, third -> study number.
df_reports[extracted_cols] = extracted
df_reports.head()

Unnamed: 0,report_id,patient_no,report_original,report_original_clean,report_date,study_no
0,23204350,2008726684,RAPOR TARİHİ:19/11/2022 FİLM NO:\n\nBeyin B...,Beyin BT Kafa kaidesi ve kalvarial kemik yapı...,19/11/2022,
1,22810477,2008231237,RAPOR TARİHİ: 14/07/2022 FİLM NO:\n\nBeyin ...,Beyin BT Kafa kaidesi ve kalvarial kemik ...,14/07/2022,
2,23072879,2004022945,RAPOR TARİHİ: 13/10/2022 FİLM NO: 12831139\n...,Kafa tabanı ve verteks arasından elde olunan k...,13/10/2022,12831139.0
3,22664628,2008231238,RAPOR TARİHİ: 16.06.2022 FİLM NO:\n\nBeyin ...,Beyin BT Posterior fossada beyin sapı artefa...,16.06.2022,
4,23452265,2009543927,RAPOR TARİHİ: 10/01/2023 FİLM NO: \n\nKafa t...,Kafa tabanı ve verteks arasından elde olunan k...,10/01/2023,


In [39]:
# Collect the distinct patient numbers, read from the JSON `data` field, of all
# tasks that are flagged as labelled.
# NOTE(review): in PostgreSQL the `->>` operator yields text, so these values
# are presumably strings — confirm they match the dtype of
# df_reports["patient_no"] before relying on isin().
query = """
            SELECT 
                DISTINCT data ->> 'patient_no' as patient_no
            FROM task
            WHERE is_labeled = TRUE
        """

# Run the query and keep the single result column as a plain Python list.
annotated_patient_nos = db.read_sql_query(query)["patient_no"].to_list()

In [40]:
# Keep only the reports of patients that do not have a labelled task yet.
#
# annotated_patient_nos is produced by a JSON text extraction (`->>`), while the
# dtype of df_reports["patient_no"] depends on the reports table. Comparing both
# sides as strings prevents a str/int mismatch from making isin() match nothing
# and silently turning this filter into a no-op.
# NOTE(review): assumes patient numbers have one canonical textual form (no
# leading zeros / float formatting) — confirm against the data.
annotated_patient_keys = {
    str(patient_no) for patient_no in annotated_patient_nos if patient_no is not None
}
is_annotated = df_reports["patient_no"].astype(str).isin(annotated_patient_keys)

# .copy() detaches the result from the unfiltered frame, so later column
# assignments are not made on a slice (avoids SettingWithCopyWarning).
df_reports = df_reports.loc[~is_annotated].copy()

In [41]:
# Convert the extracted report date strings into datetimes.
#
# The reports write dates day-first and with mixed separators (for example
# "10/01/2023" and "16.06.2022"). Without dayfirst=True an ambiguous value such
# as "10/01/2023" is read as October 1st, and with mixed separators newer pandas
# versions infer one format from the first value and coerce the rest to NaT.
# Both would corrupt the later sort by report_date, so the separator is
# normalised and day-first parsing is requested explicitly. Unparseable values
# still become NaT (errors="coerce").
#
# The dtype guard keeps the cell idempotent: re-running it on an already
# converted column leaves the column untouched.
if not pd.api.types.is_datetime64_any_dtype(df_reports["report_date"]):
    report_date_text = (
        df_reports["report_date"]
        .astype("string")
        .str.strip()
        .str.replace(".", "/", regex=False)
    )
    # assign() builds a new frame instead of writing into a possibly sliced one.
    df_reports = df_reports.assign(
        report_date=pd.to_datetime(report_date_text, dayfirst=True, errors="coerce")
    )

In [42]:
# Count how many report rows each patient has; the result has one row per
# patient with the columns patient_no and report_count.
reports_per_patient = df_reports.groupby("patient_no")["patient_no"].count()
df_report_count = reports_per_patient.reset_index(name="report_count")

In [43]:
# 1) Keep a single report per patient. De-duplication runs before the sort, so
#    keep="first" retains whichever row comes first in the current frame order.
one_report_per_patient = df_reports.drop_duplicates(subset=["patient_no"], keep="first")

# 2) Attach each patient's total report count.
reports_with_count = one_report_per_patient.merge(df_report_count, on="patient_no", how="left")

# 3) Order by report count, then by report date, both ascending.
df_reports = reports_with_count.sort_values(
    by=["report_count", "report_date"], ascending=[True, True]
)

In [44]:
# Collect the ids of reports in annotation.upload_tasks that already have a
# non-NULL report_english_clean value, i.e. rows that already carry a translation.
query = """
            SELECT 
                report_id
            FROM annotation.upload_tasks
            WHERE report_english_clean IS NOT NULL 
        """

# Run the query and keep the single result column as a plain Python list.
upload_tasks_with_translation = db.read_sql_query(query)["report_id"].to_list()

In [45]:
# UploadTasksTable() is unpacked into two values: `reports_raw`, which is later
# passed to db.upsert_values as the upsert target, and `Base`.
reports_raw, Base = UploadTasksTable()
# NOTE(review): presumably Base is a SQLAlchemy declarative base and the line
# below would create the table on first use — confirm before enabling it.
#Base.metadata.create_all(db.engine)

In [46]:
# Drop reports whose id already has a translation, then take at most
# TRANSLATE_N_MORE_REPORTS rows from the top of the remaining (sorted) frame.
has_no_translation = ~df_reports["report_id"].isin(upload_tasks_with_translation)
df_upload_tasks = df_reports.loc[has_no_translation].head(TRANSLATE_N_MORE_REPORTS)

In [47]:
# Show which columns the selected rows carry before the upsert payload is built.
df_upload_tasks.columns

Index(['report_id', 'patient_no', 'report_original', 'report_original_clean',
       'report_date', 'study_no', 'report_count'],
      dtype='object')

In [48]:
# Translate every selected report and persist it right away, so finished
# translations survive even if a later translation call fails.

# OpenAI restriction: 3 requests per minute -> wait 20 s between two requests.
SECONDS_BETWEEN_TRANSLATIONS = 20

data_to_insert = []  # every record written during this run, kept for inspection
cols_to_upsert = []  # non-key payload columns; derived from the record below
n_reports = len(df_upload_tasks)

for position, (_, row) in enumerate(df_upload_tasks.iterrows(), start=1):
    record = {
        "report_id": row["report_id"],
        "patient_no": row["patient_no"],
        "report_original": row["report_original"],
        "report_original_clean": row["report_original_clean"],
        "report_english_clean": transformation.translate_report(row["report_original_clean"]),
        "study_no": row["study_no"],
        "report_count": row["report_count"],
    }

    # Derive the upsert columns from the payload itself. Taking them from the
    # DataFrame columns listed "report_date" (which is not part of the payload)
    # and left out "report_english_clean", the one column this cell produces.
    # NOTE(review): assumes db.upsert_values updates exactly these columns when
    # a row with the same report_id already exists — confirm against dbutils.
    cols_to_upsert = [column for column in record if column != "report_id"]

    # Send only the new record; re-sending the accumulated list would rewrite
    # every earlier row again on each iteration.
    db.upsert_values(reports_raw, [record], cols_to_upsert, ["report_id"])
    data_to_insert.append(record)

    # Rate-limit pause, skipped after the final request where nothing follows.
    if position < n_reports:
        time.sleep(SECONDS_BETWEEN_TRANSLATIONS)