In [1]:
import time

from experiment.utils import dbutils, logger, transformation
from experiment.utils.tables.upload_tasks_table import UploadTasksTable
from experiment.api import label_studio

import openai

In [3]:
# Shared handles used by the rest of the notebook: `db` for all database
# access and `lg` for DEBUG-level logging.
db = dbutils.DatabaseUtils()

# The log file is written to <project root>/tmp/report_prompting.log.
log_path = transformation.get_project_root() / "tmp" / "report_prompting.log"
lg = logger.Logger(logging_level="DEBUG", output_path=log_path)

In [2]:
# Upper bound on how many additional reports are selected for prompting in
# one run of this notebook (applied with `.head(...)` when selecting reports).
PROMPT_N_MORE_REPORTS = 200

# Instruction passed as `prompt=` together with every report text.
PROMPT = (
    "Perform the following transformation on the report: "
    "Translate into English"
)

In [4]:
# UploadTasksTable() returns a pair: `reports_raw`, the table object that is
# later handed to db.upsert_values, and `Base`, whose `.metadata` can create
# the tables (presumably a SQLAlchemy declarative base — TODO confirm).
reports_raw, Base = UploadTasksTable()

In [5]:
# One-off setup, disabled by default: uncomment the two lines below to create the tables and run the dbt model that generates them from scratch
# Base.metadata.create_all(db.engine)
# db.run_dbt_model('upload_tasks')

[0m11:52:22  Running with dbt=1.6.1
[0m11:52:22  Registered adapter: postgres=1.6.1
[0m11:52:22  Found 8 models, 5 sources, 0 exposures, 0 metrics, 689 macros, 0 groups, 0 semantic models
[0m11:52:22  
[0m11:52:24  Concurrency: 5 threads (target='prod')
[0m11:52:24  
[0m11:52:24  1 of 1 START sql incremental model annotation.upload_tasks ..................... [RUN]
[0m11:52:26  1 of 1 OK created sql incremental model annotation.upload_tasks ................ [[32mMERGE 25687[0m in 2.07s]
[0m11:52:27  
[0m11:52:27  Finished running 1 incremental model in 0 hours 0 minutes and 4.44 seconds (4.44s).
[0m11:52:27  
[0m11:52:27  [32mCompleted successfully[0m
[0m11:52:27  
[0m11:52:27  Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1


### 1) Select Reports

In [6]:
# Load every row of annotation.upload_tasks, ordered by patient_report_count
# and then by report_length (both descending), so the first rows come from
# the patients with the most reports.
query = """
            SELECT * FROM annotation.upload_tasks ut 
            ORDER BY patient_report_count DESC, report_length DESC 
        """

df_reports = db.read_sql_query(query)

# preview the first five rows only
df_reports.head(5)

Unnamed: 0,report_id,patient_no,protocol_no,report_original,report_length,report_prompted,patient_report_count
0,5408,2005183286,21302877,RAPOR TARİHİ: 24.06.2021 FİLM NO: 12217711...,79,,27
1,6125,2006000036,22202602,RAPOR TARİHİ:01/02/2022 TETKİK NO: ...,83,,26
2,16913,2008509284,22344550,RAPOR TARİHİ: 21/02/2022 FİLM NO:12506694\n\...,165,,20
3,16909,2008509284,22344550,RAPOR TARİHİ:08/02/2022 FİLM NO:12506525\n\n...,113,,20
4,26,2004004697,22023602,RAPOR TARİHİ:27/12/2021 FİLM NO:\n\nKontrast...,128,,19


In [7]:
# Distinct patient numbers that already have at least one labeled task.
# The value is read out of the task's JSON `data` column with `->>`.
query = """
            SELECT 
                DISTINCT data ->> 'patient_no' as patient_no
            FROM task
            WHERE is_labeled = TRUE
        """

# flatten the single result column into a plain Python list
annotated_patient_nos = db.read_sql_query(query)["patient_no"].tolist()

In [8]:
# Report ids whose `report_prompted` column is already filled in, i.e. the
# reports that do not need to be sent to the model again.
query = """
            SELECT 
                report_id
            FROM annotation.upload_tasks
            WHERE report_prompted != '' 
        """

# flatten the single result column into a plain Python list
upload_tasks_prompted = db.read_sql_query(query)["report_id"].tolist()

In [9]:
# use only non-prompted reports & non-annotated patients
#
# `annotated_patient_nos` comes from `data ->> 'patient_no'`, which PostgreSQL
# returns as text. If `df_reports["patient_no"]` is numeric, a direct `isin`
# would never match and annotated patients would slip through, so both sides
# are compared as strings. (If the column is already text this is a no-op.)
annotated_patient_nos_as_text = {str(patient_no) for patient_no in annotated_patient_nos}
is_annotated_patient = (
    df_reports["patient_no"].astype(str).isin(annotated_patient_nos_as_text)
)
is_already_prompted = df_reports["report_id"].isin(upload_tasks_prompted)

# Both masks are built on df_reports and combined before indexing, so no
# implicit index alignment against an already-filtered frame is needed.
df_upload_tasks = df_reports.loc[~is_annotated_patient & ~is_already_prompted].head(
    PROMPT_N_MORE_REPORTS
)

### 2) Prompt Reports

In [10]:
# Prompt every selected report and persist each result as soon as it is
# available, so an interruption never loses work that was already paid for.

# openai restriction: 3 RPM - 200 RPD  ->  one request every 20 seconds
REQUEST_INTERVAL_SECONDS = 20
# After this many rate-limit errors in a row the limit is treated as
# exhausted and the loop stops instead of retrying for every remaining row.
MAX_CONSECUTIVE_RATE_LIMIT_HITS = 3

# Every column except report_id is upserted; report_id is passed separately
# to db.upsert_values (presumably as the conflict key — confirm in dbutils).
cols_to_upsert = df_upload_tasks.columns.to_list()
cols_to_upsert.remove("report_id")

data_to_insert = []  # all records written during this run
skipped_report_ids = []  # reports whose request was rejected by the rate limit
consecutive_rate_limit_hits = 0

for _, row in df_upload_tasks.iterrows():
    try:
        report_prompted = transformation.prompt_report(
            report=row["report_original"], prompt=PROMPT
        )
    except openai.error.RateLimitError:
        skipped_report_ids.append(row["report_id"])
        consecutive_rate_limit_hits += 1
        if consecutive_rate_limit_hits >= MAX_CONSECUTIVE_RATE_LIMIT_HITS:
            break
        # Back off before the next request instead of retrying immediately.
        time.sleep(REQUEST_INTERVAL_SECONDS)
        continue

    consecutive_rate_limit_hits = 0
    record = {
        "report_id": row["report_id"],
        "patient_no": row["patient_no"],
        "protocol_no": row["protocol_no"],
        "report_original": row["report_original"],
        "report_prompted": report_prompted,
        "report_length": row["report_length"],
        "patient_report_count": row["patient_report_count"],
    }

    # Write only the new record; re-sending the whole accumulated list on
    # every iteration would repeat all earlier upserts.
    db.upsert_values(reports_raw, [record], cols_to_upsert, ["report_id"])
    data_to_insert.append(record)

    time.sleep(REQUEST_INTERVAL_SECONDS)

print(
    f"Prompted {len(data_to_insert)} report(s); "
    f"{len(skipped_report_ids)} rejected by the rate limit: {skipped_report_ids}"
)

RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-G5GXNK7ybi1FeYo1ZA7weCK9 on requests per day. Limit: 200 / day. Please try again in 7m12s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.

### 3) Upload Tasks to Label Studio

In [None]:
# Select every prompted report that is not yet a Label Studio task and export
# the result as a CSV file for upload. `report_prompted` is exposed as `text`.
#
# NOT EXISTS is used instead of NOT IN on purpose: with NOT IN, a single task
# whose JSON has no 'report_id' puts a NULL into the subquery result, the
# predicate then evaluates to NULL for every row, and nothing is returned.
query = """
            SELECT
                ut.report_id,
                ut.patient_no,
                ut.protocol_no,
                ut.report_original,
                ut.report_prompted AS text,
                ut.report_length,
                ut.patient_report_count
            FROM
                annotation.upload_tasks ut
            WHERE
                ut.report_prompted != ''
                AND NOT EXISTS (
                    SELECT
                        1
                    FROM
                        public.task t
                    WHERE
                        (t.data ->> 'report_id')::INT = ut.report_id
                )
        """

# get values from the database
df_upload_tasks = db.read_sql_query(query)

# output tasks as a csv file
output_path = (
    transformation.get_project_root() / "tmp" / "data" / "upload_tasks.csv"
)
df_upload_tasks.to_csv(output_path, index=False)

# preview only; the complete result is in the CSV file
df_upload_tasks.head()

In [None]:
# Push the exported CSV into Label Studio as new tasks.
LABEL_STUDIO_PROJECT_ID = 7  # target Label Studio project
label_studio.upload_csv_tasks(
    project_id=LABEL_STUDIO_PROJECT_ID,
    csv_path=output_path,
)

In [None]:
# Final step of the notebook: stop Label Studio via the project helper.
# NOTE(review): what exactly gets stopped is defined in
# experiment.api.label_studio — confirm before running on a shared instance.
label_studio.stop_label_studio()