In [1]:
import time
import datetime

from experiment.utils import dbutils, logger, transformation
from experiment.utils.tables.upload_tasks_table import UploadTasksTable
from experiment.api import label_studio

import openai

In [2]:
db = dbutils.DatabaseUtils()
lg = logger.Logger(
    logging_level="DEBUG",
    log_file=transformation.get_project_root() / "tmp" / "report_prompting.log",
)

In [3]:
PROMPT_N_MORE_REPORTS = 200
PROMPT = "Perform the following transformation on the report: Translate into English" 
PRIORITIZE_BY = "normal" # "normal" or "emergency"

In [4]:
reports_raw, Base = UploadTasksTable()

In [5]:
# generate annotation tables
# Base.metadata.create_all(db.engine)

db.run_dbt_model('all')

[0m09:24:22  Running with dbt=1.6.1
[0m09:24:22  Registered adapter: postgres=1.6.1
[0m09:24:22  Found 8 models, 3 sources, 0 exposures, 0 metrics, 689 macros, 0 groups, 0 semantic models
[0m09:24:22  
[0m09:24:24  Concurrency: 5 threads (target='prod')
[0m09:24:24  
[0m09:24:24  1 of 3 START sql view model annotation.stg_reports ............................. [RUN]
[0m09:24:25  1 of 3 OK created sql view model annotation.stg_reports ........................ [[32mCREATE VIEW[0m in 0.83s]
[0m09:24:25  2 of 3 START sql table model annotation.reports ................................ [RUN]
[0m09:24:27  2 of 3 OK created sql table model annotation.reports ........................... [[32mSELECT 32132[0m in 1.57s]
[0m09:24:27  3 of 3 START sql incremental model annotation.upload_tasks ..................... [RUN]
[0m09:24:29  3 of 3 OK created sql incremental model annotation.upload_tasks ................ [[32mMERGE 25687[0m in 1.98s]
[0m09:24:29  
[0m09:24:29  Finished run

### 1) Select Reports

In [6]:
PRIORITIZE_BY_VALUES = {
    "normal": "ASC",
    "emergency": "DESC"
}

In [7]:
# get reports directly from database
query = f"""
            SELECT * FROM annotation.upload_tasks ut 
            ORDER BY patient_report_count {PRIORITIZE_BY_VALUES[PRIORITIZE_BY]}, report_length {PRIORITIZE_BY_VALUES[PRIORITIZE_BY]} 
        """

# get values from the database
df_reports = db.read_sql_query(query)
df_reports.head()

Unnamed: 0,report_id,patient_no,protocol_no,report_original,report_length,report_prompted,patient_report_count
0,3885,2005072330,24087556,\n.\n.\n..\n\n..\n.\n.\n.\n.\n.\n.\n.\n.\n\n.\...,1,Perform the following transformation on the re...,1
1,10572,2008067603,21389331,görüntüler yok \n.\n..\n.\n.\n\n.\n.\n.\n.\n.\...,3,There are no images.,1
2,18041,2008609782,24028540,SONUCU GİRİLECEKTİR......................SONUC...,3,The result will be entered.,1
3,14847,2008342803,24154059,DİĞER TETKİKLERİNİ OKUDUNUZ BU TETKİKLERİDE OK...,6,Did you read the other tests? Can you also rea...,1
4,21371,2008938063,24072123,RAPOR TARİHİ:21/05/2023 FİLM NO: \n\nraporla...,7,REPORT DATE: 21/05/2023 FILM NO: \n\nto be r...,1


In [8]:
# get annotated reports 
query = """
            SELECT 
                DISTINCT data ->> 'patient_no' as patient_no
            FROM task
            WHERE is_labeled = TRUE
        """

# get values from the database
annotated_patient_nos = db.read_sql_query(query)["patient_no"].to_list()

In [9]:
# get tasks that have been prompted
query = """
            SELECT 
                report_id
            FROM annotation.upload_tasks
            WHERE report_prompted != '' 
        """

# get values from the database
upload_tasks_prompted = db.read_sql_query(query)["report_id"].to_list()

In [10]:
# use only non-prompted reports & non-annotated patients
df_upload_tasks = (
    df_reports.loc[~df_reports["patient_no"].isin(annotated_patient_nos)]
    .loc[~df_reports["report_id"].isin(upload_tasks_prompted)]
    .head(PROMPT_N_MORE_REPORTS)
)

### 2) Prompt Reports

In [11]:
cols_to_upsert = df_upload_tasks.columns.to_list()
cols_to_upsert.remove("report_id")
data_to_insert = []
for _, row in df_upload_tasks.iterrows():
    try:
        data_to_insert.append(
            {
                "report_id": row["report_id"],
                "patient_no": row["patient_no"],
                "protocol_no": row["protocol_no"],
                "report_original": row["report_original"],
                "report_prompted": transformation.prompt_report(
                    report=row["report_original"], prompt=PROMPT
                ),
                "report_length": row["report_length"],
                "patient_report_count": row["patient_report_count"],
            }
        )
    

        db.upsert_values(reports_raw, data_to_insert, cols_to_upsert, ["report_id"])

        time.sleep(20)
    except openai.error.RateLimitError as rate_error:
        # openai restriction: 3 RPM - 200 RPD
        lg.log(f"Rate limit for: {datetime.datetime.now()}", "WARNING")

lg.log(f"Finished prompting {len(data_to_insert)} reports")

Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)

### 3) Upload Tasks to Label Studio

In [12]:
# get reports directly from database
query = """
            SELECT
                report_id,
                patient_no,
                protocol_no,
                report_original,
                report_prompted as text,
                report_length,
                patient_report_count
            FROM
                annotation.upload_tasks
            WHERE
                report_id NOT IN (
                SELECT
                    (DATA ->> 'report_id')::INT AS report_id
                FROM
                    public.task)
                AND report_prompted != ''
        """

# get values from the database
df_upload_tasks = db.read_sql_query(query)

# output tasks as a csv file
output_path = (
    transformation.get_project_root() / "tmp" / "data" / "upload_tasks.csv"
)
df_upload_tasks.to_csv(output_path, index=False)

In [13]:
# upload tasks to label studio
label_studio.upload_csv_tasks(csv_path=output_path, project_id=7)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

{"task_count":115,"annotation_count":0,"prediction_count":0,"duration":0.42647647857666016,"file_upload_ids":[62],"could_be_tasks_list":true,"found_formats":{".csv":1},"data_columns":["report_original","report_length","patient_report_count","text","patient_no","report_id","protocol_no"]}

100  110k  100   288  100  110k    277   106k  0:00:01  0:00:01 --:--:--  106k


In [14]:
label_studio.stop_label_studio()

Scaling dynos... done, now running [32mweb[39m at 0:Basic
