In [49]:
import warnings
import time

import pandas as pd

from experiment.utils import dbutils, logger, transformation
from experiment.utils.tables.upload_tasks_table import UploadTasksTable

In [50]:
# Silence every UserWarning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so UserWarnings raised by any library
# used below are hidden as well.
warnings.simplefilter("ignore", category=UserWarning)

In [51]:
# Shared handles for the cells below: the database helper and a DEBUG-level
# logger that writes to <project root>/tmp/report_translation.log.
db = dbutils.DatabaseUtils()
log_file_path = transformation.get_project_root() / "tmp" / "report_translation.log"
lg = logger.Logger(logging_level="DEBUG", output_path=log_file_path)

In [52]:
# Upper bound on how many not-yet-translated reports are selected for
# translation in one run (applied with .head() when the batch is built).
# NOTE(review): the translation loop's own comment cites an OpenAI limit of
# 200 requests per day, so a batch larger than that cannot finish in one day.
TRANSLATE_N_MORE_REPORTS: int = 300

In [53]:
# Load every report (id, patient number, original text) straight from the
# annotation.reports table and preview the first rows.
df_reports = db.read_sql_query(
    """
            SELECT report_id, patient_no, report_original FROM annotation.reports
        """
)
df_reports.head()

Unnamed: 0,report_id,patient_no,report_original
0,23204350,2008726684,RAPOR TARİHİ:19/11/2022 FİLM NO:\n\nBeyin B...
1,23072879,2004022945,RAPOR TARİHİ: 13/10/2022 FİLM NO: 12831139\n...
2,22123896,2004023205,RAPOR TARİHİ: 27/01/2022 FİLM NO: \n\nKafa ...
3,24087380,2008742855,RAPOR TARİHİ:17/07/2023 FİLM NO:\n\nKontrast...
4,21367900,2004002609,RAPOR TARİHİ: 09.08.2021 FİLM NO: 11338850...


In [54]:
# Run the project's preparation step on each original report text. Each call is
# expected to yield three values, which are spread positionally over the three
# new columns (cleaned text, report date, study number).
extracted_fields = pd.DataFrame(
    [
        transformation.prepare_data_transformations(report_text)
        for report_text in df_reports["report_original"]
    ],
    index=df_reports.index,
)
df_reports[["report_original_clean", "report_date", "study_no"]] = extracted_fields
df_reports.head()

Unnamed: 0,report_id,patient_no,report_original,report_original_clean,report_date,study_no
0,23204350,2008726684,RAPOR TARİHİ:19/11/2022 FİLM NO:\n\nBeyin B...,Beyin BT Kafa kaidesi ve kalvarial kemik yapı...,19/11/2022,
1,23072879,2004022945,RAPOR TARİHİ: 13/10/2022 FİLM NO: 12831139\n...,Kafa tabanı ve verteks arasından elde olunan k...,13/10/2022,12831139.0
2,22123896,2004023205,RAPOR TARİHİ: 27/01/2022 FİLM NO: \n\nKafa ...,Kafa tabanı ve verteks arasından elde olunan k...,27/01/2022,
3,24087380,2008742855,RAPOR TARİHİ:17/07/2023 FİLM NO:\n\nKontrast...,Kontrastsız beyin BT Kafa kaidesi ve ka...,17/07/2023,
4,21367900,2004002609,RAPOR TARİHİ: 09.08.2021 FİLM NO: 11338850...,BEYİN BT Kafa tabanı ve verteks arasından el...,09.08.2021,11338850.0


In [55]:
# Collect the distinct patient numbers that already have a labeled task; these
# patients are excluded from the translation candidates in the next step.
annotated_patient_nos = db.read_sql_query(
    """
            SELECT 
                DISTINCT data ->> 'patient_no' as patient_no
            FROM task
            WHERE is_labeled = TRUE
        """
)["patient_no"].to_list()

In [56]:
# Keep only reports of patients that do not have a labeled task yet.
#
# The annotated patient numbers are read with the JSON `->>` operator, which
# returns text (assuming a PostgreSQL backend - confirm), whereas
# df_reports["patient_no"] may be a numeric column. Series.isin never matches a
# number against its string form, so the filter could silently exclude nothing.
# Comparing both sides as strings makes the match independent of the dtypes.
annotated_patient_nos_as_text = {str(patient_no) for patient_no in annotated_patient_nos}
is_annotated = df_reports["patient_no"].astype(str).isin(annotated_patient_nos_as_text)

# .copy() yields an independent frame, so later column assignments do not
# operate on a slice of the unfiltered data.
df_reports = df_reports.loc[~is_annotated].copy()

In [57]:
# Convert the extracted report date strings to datetimes.
#
# The dates are written day-first ("19/11/2022") and use either "/" or "." as
# separator ("09.08.2021"). Without dayfirst=True an ambiguous date such as
# 09.08.2021 is read as September 8th, and mixed separators can be coerced to
# NaT when pandas infers a single format. So the separator is unified first and
# the parse is explicitly day-first. Missing or unparseable values become NaT.
#
# The dtype guard keeps the cell idempotent when it is re-run on an already
# converted column; .assign() returns a new frame instead of writing into a
# possibly sliced one.
if not pd.api.types.is_datetime64_any_dtype(df_reports["report_date"]):
    report_date_text = (
        df_reports["report_date"].astype("string").str.replace(".", "/", regex=False)
    )
    df_reports = df_reports.assign(
        report_date=pd.to_datetime(report_date_text, dayfirst=True, errors="coerce")
    )

In [58]:
# Count how many of the remaining reports belong to each patient; the result
# has one row per patient_no with the count in report_count.
df_report_count = (
    df_reports
    .groupby("patient_no")
    .agg(report_count=("patient_no", "count"))
    .reset_index()
)

In [59]:
# Reduce to one report per patient, attach the per-patient report count, and
# order the candidates by ascending report count, then ascending report date.
# NOTE(review): de-duplication runs before the sort, so the report kept for a
# patient is the first one in the current row order, not necessarily the
# earliest by date - confirm this is intended.
one_report_per_patient = df_reports.drop_duplicates(subset=["patient_no"], keep="first")
reports_with_count = one_report_per_patient.merge(
    df_report_count, on="patient_no", how="left"
)
df_reports = reports_with_count.sort_values(
    ["report_count", "report_date"], ascending=[True, True]
)

In [60]:
# Collect the ids of reports in annotation.upload_tasks that already carry an
# English translation, so they can be skipped when the next batch is chosen.
upload_tasks_with_translation = db.read_sql_query(
    """
            SELECT 
                report_id
            FROM annotation.upload_tasks
            WHERE report_english_clean IS NOT NULL 
        """
)["report_id"].to_list()

In [61]:
# UploadTasksTable() returns a pair: reports_raw is the table handle later
# passed to db.upsert_values, and Base is only referenced by the disabled
# statement below (presumably a declarative base - confirm).
reports_raw, Base = UploadTasksTable()
# Table creation, intentionally left disabled:
#Base.metadata.create_all(db.engine)

In [62]:
# Drop reports that already have a translation and take the first
# TRANSLATE_N_MORE_REPORTS of the remaining ones (in the current sort order).
has_no_translation = ~df_reports["report_id"].isin(upload_tasks_with_translation)
df_upload_tasks = df_reports.loc[has_no_translation].head(TRANSLATE_N_MORE_REPORTS)

In [63]:
# Show which columns the selected batch carries; the next cell derives the list
# of columns to upsert from them.
df_upload_tasks.columns

Index(['report_id', 'patient_no', 'report_original', 'report_original_clean',
       'report_date', 'study_no', 'report_count'],
      dtype='object')

In [64]:
# Translate the selected reports one at a time and persist each translation
# right away, so the work done so far is kept if a later request fails (for
# example when the API rate limit is hit).
#
# NOTE(review): cols_to_upsert is derived from the DataFrame columns, so it
# contains "report_date" (absent from the payload built below) and lacks
# "report_english_clean" (present in the payload). Confirm against
# db.upsert_values that this is the intended set of columns.
cols_to_upsert = df_upload_tasks.columns.to_list()
cols_to_upsert.remove("report_id")

# openai restriction: 3 RPM - 200 RPD
SECONDS_BETWEEN_REQUESTS = 20

data_to_insert = []
for _, row in df_upload_tasks.iterrows():

    record = {
        "report_id": row["report_id"],
        "patient_no": row["patient_no"],
        "report_original": row["report_original"],
        "report_original_clean": row["report_original_clean"],
        "report_english_clean": transformation.translate_report(row["report_original_clean"]),
        "study_no": row["study_no"],
        "report_count": row["report_count"]
    }
    # Kept so the processed records can be inspected after the loop.
    data_to_insert.append(record)

    # Upsert only the record that was just translated. Passing the whole
    # accumulated list would rewrite every earlier row on each iteration.
    db.upsert_values(reports_raw, [record], cols_to_upsert, ["report_id"])

    # Wait between requests to stay within the requests-per-minute limit.
    time.sleep(SECONDS_BETWEEN_REQUESTS)

RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-G5GXNK7ybi1FeYo1ZA7weCK9 on requests per day. Limit: 200 / day. Please try again in 7m12s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.