In [4]:
import time

from experiment.utils import dbutils, transformation
from experiment.utils.logging import logger
from experiment.utils.tables.upload_tasks_table import UploadTasksTable
from experiment.api import label_studio


In [5]:
# Shared database helper: later cells use it to run dbt (`run_dbt_model`),
# read SQL query results (`read_sql_query`) and upsert rows (`upsert_values`).
db = dbutils.DatabaseUtils()

In [6]:
# Upper bound on how many additional reports are selected for prompting in one run.
PROMPT_N_MORE_REPORTS = 200

# Instruction sent to the prompting helper together with each original report.
PROMPT = (
    "Perform the following transformation on the report: Translate into English"
)

# Ordering preference used when selecting reports: "normal" or "emergency".
PRIORITIZE_BY = "normal"

In [7]:
# UploadTasksTable() returns a pair: `reports_raw` is later handed to
# db.upsert_values as the upsert target; `Base` is only referenced by the
# disabled `Base.metadata.create_all(...)` call further down.
# NOTE(review): presumably a SQLAlchemy table and its declarative base — confirm.
reports_raw, Base = UploadTasksTable()

In [8]:
# Generate the annotation tables by running dbt through the database helper
# with the 'all' selector.
# NOTE(review): presumably this (re)builds annotation.upload_tasks, which the
# following cells query — confirm against run_dbt_model.
# Disabled alternative, left for reference:
# Base.metadata.create_all(db.engine)

db.run_dbt_model('all')

[0m09:24:27  Running with dbt=1.6.1
[0m09:24:27  Registered adapter: postgres=1.6.1
[0m09:24:28  Found 9 models, 1 snapshot, 3 sources, 0 exposures, 0 metrics, 689 macros, 0 groups, 0 semantic models
[0m09:24:28  
[0m09:24:30  Concurrency: 5 threads (target='prod')
[0m09:24:30  
[0m09:24:30  1 of 1 START snapshot snapshot.report_classifications_snapshot ................. [RUN]
[0m09:24:32  1 of 1 OK snapshotted snapshot.report_classifications_snapshot ................. [[32msuccess[0m in 1.56s]
[0m09:24:33  
[0m09:24:33  Finished running 1 snapshot in 0 hours 0 minutes and 4.65 seconds (4.65s).
[0m09:24:33  
[0m09:24:33  [32mCompleted successfully[0m
[0m09:24:33  
[0m09:24:33  Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1
[0m09:24:34  Running with dbt=1.6.1
[0m09:24:34  Registered adapter: postgres=1.6.1
[0m09:24:35  Found 9 models, 1 snapshot, 3 sources, 0 exposures, 0 metrics, 689 macros, 0 groups, 0 semantic models
[0m09:24:35  
[0m09:24:37  Concurrency: 5 thread

### 1) Select Reports

In [9]:
# Translates each PRIORITIZE_BY option into the SQL sort direction that is
# interpolated into the ORDER BY clause of the report-selection query.
PRIORITIZE_BY_VALUES = dict(normal="ASC", emergency="DESC")

In [10]:
# Select every upload task whose report_id does not yet appear in public.task
# (NOTE(review): presumably Label Studio's task table — confirm), ordered by
# the configured priority.
#
# PRIORITIZE_BY_VALUES acts as a whitelist: only "ASC"/"DESC" can reach the SQL
# text, and an unknown PRIORITIZE_BY fails with a KeyError before the query runs.
sort_direction = PRIORITIZE_BY_VALUES[PRIORITIZE_BY]

# The subquery skips tasks that carry no report_id: a single NULL inside
# NOT IN (...) turns the predicate into NULL for every outer row, which would
# silently return no reports at all.
query = f"""
            SELECT * FROM annotation.upload_tasks ut
            WHERE
                report_id NOT IN (
                SELECT
                    (DATA ->> 'report_id')::INT AS report_id
                FROM
                    public.task
                WHERE
                    (DATA ->> 'report_id') IS NOT NULL)
                ORDER BY patient_report_count {sort_direction}, report_length {sort_direction}
        """

# get values from the database and preview the first rows
df_reports = db.read_sql_query(query)
df_reports.head()

Unnamed: 0,report_id,patient_no,protocol_no,report_original,report_length,report_prompted,patient_report_count
0,7198,2006074567,23109997,RAPOR TARİHİ:07/11/2022 FİLM NO:\n\nBeyin BT...,47,REPORT DATE: 07/11/2022 FILM NO:\n\nBrain CT...,1
1,15962,2008430651,23744022,\nKafa tabanı ve verteks arasından elde olunan...,47,In sections obtained between the base of the s...,1
2,3014,2004983506,23842325,RAPOR TARİHİ: 24/04/2023 FİLM NO: 13184017\n...,47,Report Date: 24/04/2023\nFilm No: 13184017\n\n...,1
3,4923,2005145142,22865336,RAPOR TARİHİ:30/07/2022 FİLM NO:\n\nBeyin BT...,47,REPORT DATE: 30/07/2022 FILM NUMBER: \n\nBrain...,1
4,24075,2009242981,23060585,RAPOR TARİHİ: 03/10/2022 FİLM NO:12819590\n\...,47,DATE OF REPORT: 03/10/2022 FILM NO: 12819590...,1


In [11]:
# Distinct patient numbers that already have at least one labelled task.
# The value is read with `->>`, so it arrives as text.
query = """
            SELECT 
                DISTINCT data ->> 'patient_no' as patient_no
            FROM task
            WHERE is_labeled = TRUE
        """

# run the query, then pull the single result column out as a plain list
annotated_patients = db.read_sql_query(query)
annotated_patient_nos = annotated_patients["patient_no"].tolist()

In [12]:
# Report ids in annotation.upload_tasks whose prompted text is already filled in.
query = """
            SELECT 
                report_id
            FROM annotation.upload_tasks
            WHERE report_prompted != '' 
        """

# run the query, then pull the single result column out as a plain list
prompted_tasks = db.read_sql_query(query)
upload_tasks_prompted = prompted_tasks["report_id"].tolist()

In [13]:
# Use only non-prompted reports of non-annotated patients, capped at
# PROMPT_N_MORE_REPORTS rows (the order of df_reports is preserved).
#
# annotated_patient_nos is extracted from JSON with `->>` and is therefore
# text. Comparing on the string form keeps the exclusion working even when
# df_reports["patient_no"] is a numeric column; comparing int against str
# would never match and no annotated patient would be filtered out.
annotated_patient_keys = {
    str(patient_no) for patient_no in annotated_patient_nos if patient_no is not None
}
is_annotated_patient = df_reports["patient_no"].astype(str).isin(annotated_patient_keys)
is_already_prompted = df_reports["report_id"].isin(upload_tasks_prompted)

# One combined mask on the original frame instead of chaining two `.loc`
# calls, where the second mask was built from the unfiltered frame.
df_upload_tasks = df_reports.loc[~is_annotated_patient & ~is_already_prompted].head(
    PROMPT_N_MORE_REPORTS
)

### 2) Prompt Reports

In [14]:
# Prompt each selected report and persist the result immediately, so progress
# survives an interruption of this long-running cell.

# openai restriction: 3 RPM - 200 RPD -> wait between consecutive requests
SECONDS_BETWEEN_PROMPTS = 20

# every column except the key is overwritten on conflict
cols_to_upsert = [col for col in df_upload_tasks.columns if col != "report_id"]

# records that were prompted AND written to the database successfully
data_to_insert = []

for idx, (_, row) in enumerate(df_upload_tasks.iterrows(), start=1):
    try:
        record = {
            "report_id": row["report_id"],
            "patient_no": row["patient_no"],
            "protocol_no": row["protocol_no"],
            "report_original": row["report_original"],
            "report_prompted": transformation.prompt_report(
                report=row["report_original"], prompt=PROMPT
            ),
            "report_length": row["report_length"],
            "patient_report_count": row["patient_report_count"],
        }

        # Upsert only the current record: re-sending the whole accumulated list
        # on every iteration rewrites earlier rows again and again.
        db.upsert_values(reports_raw, [record], cols_to_upsert, ["report_id"])

        # Count the record only once it is safely stored.
        data_to_insert.append(record)
        logger.info(f"{idx}. report translated & uploaded")
    except Exception as error:
        # Best effort: keep going with the next report, but say which one failed.
        logger.error(f"{idx}. report (report_id={row['report_id']}) failed: {error}")
    finally:
        # Always pause, also after a failure; otherwise a rate-limit error is
        # followed by an immediate retry burst against the same limit.
        time.sleep(SECONDS_BETWEEN_PROMPTS)

logger.info(f"Finished prompting {len(data_to_insert)} reports")

2023-11-25 12:25:18,178 - AI Reports - INFO - 1. report translated & uploaded
2023-11-25 12:25:51,667 - AI Reports - INFO - 2. report translated & uploaded
2023-11-25 12:26:26,482 - AI Reports - INFO - 3. report translated & uploaded
2023-11-25 12:27:03,446 - AI Reports - INFO - 4. report translated & uploaded
2023-11-25 12:27:39,694 - AI Reports - INFO - 5. report translated & uploaded
2023-11-25 12:28:14,251 - AI Reports - INFO - 6. report translated & uploaded
2023-11-25 12:28:44,933 - AI Reports - INFO - 7. report translated & uploaded
2023-11-25 12:29:19,759 - AI Reports - INFO - 8. report translated & uploaded
2023-11-25 12:29:54,160 - AI Reports - INFO - 9. report translated & uploaded
2023-11-25 12:30:27,692 - AI Reports - INFO - 10. report translated & uploaded
2023-11-25 12:31:04,907 - AI Reports - INFO - 11. report translated & uploaded
2023-11-25 12:31:42,828 - AI Reports - INFO - 12. report translated & uploaded
2023-11-25 12:32:20,463 - AI Reports - INFO - 13. report tran

### 3) Upload Tasks to Label Studio

In [15]:
# Fetch every prompted report that has no task in public.task yet and expose
# the prompted text under the column name `text`.
#
# The subquery skips tasks that carry no report_id: a single NULL inside
# NOT IN (...) turns the predicate into NULL for every outer row, which would
# silently produce an empty export.
query = """
            SELECT
                report_id,
                patient_no,
                protocol_no,
                report_original,
                report_prompted as text,
                report_length,
                patient_report_count
            FROM
                annotation.upload_tasks
            WHERE
                report_id NOT IN (
                SELECT
                    (DATA ->> 'report_id')::INT AS report_id
                FROM
                    public.task
                WHERE
                    (DATA ->> 'report_id') IS NOT NULL)
                AND report_prompted != ''
        """

# get values from the database
df_upload_tasks = db.read_sql_query(query)

# output tasks as a csv file under <project root>/tmp/data
output_path = (
    transformation.get_project_root() / "tmp" / "data" / "upload_tasks.csv"
)
df_upload_tasks.to_csv(output_path, index=False)

In [16]:
# Push the exported CSV to Label Studio as new tasks of the target project.
LABEL_STUDIO_PROJECT_ID = 7
label_studio.upload_csv_tasks(
    csv_path=output_path,
    project_id=LABEL_STUDIO_PROJECT_ID,
)

Scaling dynos... done, now running [32mweb[39m at 1:Basic
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

{"task_count":262,"annotation_count":0,"prediction_count":0,"duration":0.32439541816711426,"file_upload_ids":[65],"could_be_tasks_list":true,"found_formats":{".csv":1},"data_columns":["report_length","report_original","patient_report_count","protocol_no","report_id","patient_no","text"]}

100  260k  100   288  100  259k    194   175k  0:00:01  0:00:01 --:--:--  176k


In [17]:
# Stop the Label Studio instance once the upload cell has run.
# NOTE(review): the captured output shows the web dyno being scaled to 0 —
# presumably a Heroku deployment; confirm.
label_studio.stop_label_studio()

Scaling dynos... done, now running [32mweb[39m at 0:Basic
