In [1]:
import warnings
import time

import pandas as pd

from experiment.utils import dbutils, logger, transformation
from experiment.utils.tables.upload_tasks_table import UploadTasksTable

In [2]:
# suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
db = dbutils.DatabaseUtils()
lg = logger.Logger(
    logging_level="DEBUG",
    output_path=transformation.get_project_root() / "tmp" / "report_translation.log",
)

In [4]:
TRANSLATE_N_MORE_REPORTS = 71

In [5]:
# get reports directly from database
query = """
            SELECT * FROM annotation.reports
        """

# get values from the database
df_reports = db.read_sql_query(query)
df_reports.head()

Unnamed: 0,report_id,patient_no,full_name,report_original
0,23012646,2004001412,SADİ İYEM,RAPOR TARİHİ: 26/09/2022 FİLM NO: 12796199\n...
1,22394628,2004001562,İLKNUR ERTENER,RAPOR TARİHİ: 15.03.2022 FİLM NO: 12523533\...
2,22296380,2004001709,VEHBİYE AKBALIK,RAPOR TARİHİ:15/02/2022 FİLM NO:12493333\n\n...
3,23963976,2004002163,HAVVA AYDIN,RAPOR TARİHİ: 25/05/2023 FİLM NO:\n\nKontras...
4,21367900,2004002609,ZEYNEP YEŞİLOVA,RAPOR TARİHİ: 09.08.2021 FİLM NO: 11338850...


In [6]:
# extract relevant information
result = df_reports['report_original'].apply(transformation.prepare_data_transformations)
df_reports[["report_original_clean", "report_date", "study_no"]] = pd.DataFrame(
    result.tolist(), index=result.index
)
df_reports.head()

Unnamed: 0,report_id,patient_no,full_name,report_original,report_original_clean,report_date,study_no
0,23012646,2004001412,SADİ İYEM,RAPOR TARİHİ: 26/09/2022 FİLM NO: 12796199\n...,Kafa tabanı ve verteks arasından kontrastsız e...,26/09/2022,12796199.0
1,22394628,2004001562,İLKNUR ERTENER,RAPOR TARİHİ: 15.03.2022 FİLM NO: 12523533\...,Beyin BT Kafa tabanı ve verteks arasından el...,15.03.2022,12523533.0
2,22296380,2004001709,VEHBİYE AKBALIK,RAPOR TARİHİ:15/02/2022 FİLM NO:12493333\n\n...,Kontrastsız beyin BT tetkiki Kafa kaide...,15/02/2022,12493333.0
3,23963976,2004002163,HAVVA AYDIN,RAPOR TARİHİ: 25/05/2023 FİLM NO:\n\nKontras...,Kontrastlı beyin BT İnfratentorial yapıl...,25/05/2023,
4,21367900,2004002609,ZEYNEP YEŞİLOVA,RAPOR TARİHİ: 09.08.2021 FİLM NO: 11338850...,BEYİN BT Kafa tabanı ve verteks arasından el...,09.08.2021,11338850.0


In [7]:
# filter
query = """
            SELECT 
                DISTINCT data ->> 'patient_no' as patient_no
            FROM task
            WHERE is_labeled = TRUE
        """

# get values from the database
annotated_patient_nos = db.read_sql_query(query)["patient_no"].to_list()

In [8]:
# filter by non-annotated patients
df_reports = df_reports.loc[~df_reports["patient_no"].isin(annotated_patient_nos)]

In [9]:
# convert the report date column
df_reports['report_date'] = pd.to_datetime(df_reports['report_date'], errors='coerce')

In [10]:
# detect report count
df_report_count = df_reports.groupby("patient_no").agg(report_count=pd.NamedAgg(column="patient_no", aggfunc="count")).reset_index()

In [11]:
# merge report counts & order by report count/date & drop duplicates
df_reports = df_reports.drop_duplicates(subset=["patient_no"], keep="first").merge(
    df_report_count, on="patient_no", how="left"
).sort_values(["report_count", "report_date"], ascending=[True, True])

In [12]:
df_reports.columns

Index(['report_id', 'patient_no', 'full_name', 'report_original',
       'report_original_clean', 'report_date', 'study_no', 'report_count'],
      dtype='object')

In [13]:
# filter
query = """
            SELECT 
                report_id
            FROM annotation.upload_tasks
            WHERE report_english_clean IS NOT NULL 
        """

# get values from the database
upload_tasks_with_translation = db.read_sql_query(query)["report_id"].to_list()

In [14]:
reports_raw, Base = UploadTasksTable()
#Base.metadata.create_all(db.engine)

In [15]:
# use only non-translated reports
df_reports = df_reports.loc[~df_reports["report_id"].isin(upload_tasks_with_translation)].head(
    TRANSLATE_N_MORE_REPORTS
)

In [16]:
cols_to_upsert = df_reports.columns.to_list()
cols_to_upsert.remove("report_id")
data_to_insert = []
for _, row in df_reports.iterrows():

    data_to_insert.append(
        {
            "report_id": row["report_id"],
            "patient_no": row["patient_no"],
            "full_name": row["full_name"],
            "report_original": row["report_original"],
            "report_original_clean": row["report_original_clean"],
            "report_english_clean": transformation.translate_report(row["report_original_clean"]),
            "study_no": row["study_no"],
            "report_count": row["report_count"]
        }
    )

    db.upsert_values(reports_raw, data_to_insert, cols_to_upsert, ["report_id"])

    lg.logger.info(f"Report upserted: {row['report_id']}")

    # openai restriction: 3 RPM
    time.sleep(20)

2023-09-20 15:53:50,252 - Reports - INFO - Report upserted: 21199645
2023-09-20 15:54:18,331 - Reports - INFO - Report upserted: 21199484


KeyboardInterrupt: 