In [1]:
import pandas as pd
import warnings

from experiment.utils import dbutils
from experiment.utils import transformation

from experiment.utils.tables.upload_tasks_table import UploadTasksTable

In [2]:
# suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
db = dbutils.DatabaseUtils()

In [4]:
# get reports directly from database
query = """
            SELECT * FROM annotation.reports
        """

# get values from the database
df_reports = db.read_sql_query(query)
df_reports.head()

Unnamed: 0,report_id,patient_no,full_name,report_original,report_english
0,23012646,2004001412,SADİ İYEM,RAPOR TARİHİ: 26/09/2022 FİLM NO: 12796199\n...,
1,22394628,2004001562,İLKNUR ERTENER,RAPOR TARİHİ: 15.03.2022 FİLM NO: 12523533\...,
2,22296380,2004001709,VEHBİYE AKBALIK,RAPOR TARİHİ:15/02/2022 FİLM NO:12493333\n\n...,
3,23963976,2004002163,HAVVA AYDIN,RAPOR TARİHİ: 25/05/2023 FİLM NO:\n\nKontras...,
4,21367900,2004002609,ZEYNEP YEŞİLOVA,RAPOR TARİHİ: 09.08.2021 FİLM NO: 11338850...,


In [5]:
# extract relevant information
result = df_reports['report_original'].apply(transformation.prepare_data_transformations)
df_reports[["text", "report_date", "study_no"]] = pd.DataFrame(
    result.tolist(), index=result.index
)

In [6]:
df_reports

Unnamed: 0,report_id,patient_no,full_name,report_original,report_english,text,report_date,study_no
0,23012646,2004001412,SADİ İYEM,RAPOR TARİHİ: 26/09/2022 FİLM NO: 12796199\n...,,Kafa tabanı ve verteks arasından kontrastsız e...,26/09/2022,12796199
1,22394628,2004001562,İLKNUR ERTENER,RAPOR TARİHİ: 15.03.2022 FİLM NO: 12523533\...,,Beyin BT Kafa tabanı ve verteks arasından el...,15.03.2022,12523533
2,22296380,2004001709,VEHBİYE AKBALIK,RAPOR TARİHİ:15/02/2022 FİLM NO:12493333\n\n...,,Kontrastsız beyin BT tetkiki Kafa kaide...,15/02/2022,12493333
3,23963976,2004002163,HAVVA AYDIN,RAPOR TARİHİ: 25/05/2023 FİLM NO:\n\nKontras...,,Kontrastlı beyin BT İnfratentorial yapıl...,25/05/2023,
4,21367900,2004002609,ZEYNEP YEŞİLOVA,RAPOR TARİHİ: 09.08.2021 FİLM NO: 11338850...,,BEYİN BT Kafa tabanı ve verteks arasından el...,09.08.2021,11338850
...,...,...,...,...,...,...,...,...
29530,24165013,2009595219,MUKADDES DUMAN,RAPOR TARİHİ:20/07/2023 FİLM NO:\n\nKontrast...,,Kontrastsız beyin BT İnfratentorial yapılar ...,20/07/2023,
29531,24165077,2009595225,ZEYNEP ALBAYRAK,RAPOR TARİHİ : 20/07/2023 FİLM NO : \n\nKaf...,,Kafa tabanı ve verteks arasından elde olunan k...,20/07/2023,
29532,24165121,2009595227,BERİL SARGIN,RAPOR TARİHİ:20/07/2023 FİLM NO:\n\nKontrast...,,Kontrastsız beyin BT Kafa kaidesi ve ka...,20/07/2023,
29533,24167884,2009595436,MUHAMMED DABUL,RAPOR TARİHİ: 21/07/2023 TETKİK NO:...,,BEYİN BT Kafa tabanı ve verteks arasından eld...,21/07/2023,13333104


In [7]:
# check empty dates
(df_reports['report_date'].values == '').sum() 

1665

In [8]:
# filter
query = """
            SELECT 
                DISTINCT data ->> 'patient_no' as patient_no
            FROM task
            WHERE is_labeled = TRUE
        """

# get values from the database
annotated_patient_nos = db.read_sql_query(query)["patient_no"].to_list()

In [9]:
# filter by non-annotated patients
df_reports = df_reports.loc[~df_reports["patient_no"].isin(annotated_patient_nos)]

In [10]:
# convert the report date column
df_reports['report_date'] = pd.to_datetime(df_reports['report_date'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reports['report_date'] = pd.to_datetime(df_reports['report_date'], errors='coerce')


In [11]:
# detect report count
df_report_count = df_reports.groupby("patient_no").agg(report_count=pd.NamedAgg(column="patient_no", aggfunc="count")).reset_index()

In [12]:
# merge report counts & order by report count/date & drop duplicates
df_reports = df_reports.drop_duplicates(subset=["patient_no"], keep="first").merge(
    df_report_count, on="patient_no", how="left"
).sort_values(["report_count", "report_date"], ascending=[True, True])

In [13]:
len(df_reports)

24288

In [14]:
df_reports.columns

Index(['report_id', 'patient_no', 'full_name', 'report_original',
       'report_english', 'text', 'report_date', 'study_no', 'report_count'],
      dtype='object')

In [15]:
reports_raw, Base = UploadTasksTable()
#Base.metadata.create_all(db.engine)

cols_to_upsert = df_reports.columns.to_list()
cols_to_upsert.remove("report_id")
data_to_insert = []
for _, row in df_reports.iterrows():

    data_to_insert.append(
        {
            "report_id": row["report_id"],
            "patient_no": row["patient_no"],
            "full_name": row["full_name"],
            "report_original": row["report_original"],
            "report_english": row["report_english"],
            "text": row["text"],
            "study_no": row["study_no"],
            "report_count": row["report_count"], 
            # "report_date": row["report_date"],
        }
    )

db.upsert_values(reports_raw, data_to_insert, cols_to_upsert, ["report_id"])