In [1]:
# Standard library
import os
import sys
from pprint import pprint

# Third-party
import numpy as np
import pandas as pd
from dotenv import load_dotenv

# Make the parent of the current working directory importable so the
# llm_data_quality_assistant package can be found. The membership check keeps
# this cell idempotent: re-running it no longer appends a duplicate entry to
# sys.path each time.
_PARENT_DIR = os.path.abspath("..")
if _PARENT_DIR not in sys.path:
    sys.path.append(_PARENT_DIR)

# Project-local imports; these must come after the sys.path adjustment above.
from llm_data_quality_assistant.pipeline import Pipeline
from llm_data_quality_assistant.enums import Models, CorruptionTypes

# Load settings from a .env file (presumably the API credentials needed by the
# LLM call later in the notebook — confirm). Kept as the last expression of the
# cell so its return value is displayed as the cell output.
load_dotenv()

True

In [2]:
# Use the first ten records of the radiology sample file as the gold standard.
gold_standard = pd.read_csv(
    "../datasets/llm_dataset/Radiology_modality_sample.csv"
).head(10)

# Stack five copies of the gold standard end to end with a fresh 0..n-1 index,
# so every record (and therefore every dicom_uid) occurs five times.
gold_standard_extended = pd.concat([gold_standard] * 5, ignore_index=True)
print(gold_standard_extended)

                        dicom_uid  rows  columns        series_desc modality
0   1.2.840.166348.444565830.2739  1024     1024  PELVIS_ULTRASOUND       US
1   1.2.840.431243.214684544.9328  2048     2048            HAND_AP       XR
2   1.2.840.285501.600086396.5032   512      512      ABDOMEN_W_CON       CT
3   1.2.840.382511.108756150.8848   512      512  CHEST_PE_PROTOCOL       CT
4    1.2.840.178211.688829522.637  2048     2048        ABDOMEN_KUB       XR
5    1.2.840.588030.38009097.1579  2048     2048           CHEST_PA       XR
6    1.2.840.414409.93803138.7993  2048     2048       FOOT_LATERAL       XR
7   1.2.840.740439.136605933.5326   512      512       BRAIN_WO_CON       CT
8   1.2.840.717539.706621359.1289  2048     2048           CHEST_PA       XR
9   1.2.840.588402.765606324.5613   512      512      SINUS_CORONAL       CT
10  1.2.840.166348.444565830.2739  1024     1024  PELVIS_ULTRASOUND       US
11  1.2.840.431243.214684544.9328  2048     2048            HAND_AP       XR

In [3]:
# Cell-level corruption kinds to inject; no row-level corruption is requested.
cell_corruptions = [
    CorruptionTypes.CellCorruptionTypes.NULL,
    CorruptionTypes.CellCorruptionTypes.OUTLIER,
]

# NOTE(review): no random seed is set anywhere in view, so the coordinates
# printed below presumably differ from run to run — confirm whether the
# pipeline exposes a seed and pin it for reproducibility.
corrupted_datasets, corrupted_coords = Pipeline.generate_corrupted_datasets(
    dataset=gold_standard_extended,
    cell_corruption_types=cell_corruptions,
    row_corruption_types=[],
    # The key column is excluded from corruption.
    columns_to_exclude=["dicom_uid"],
    severity=0.1,
    output_size=1,
)

# Show the coordinate pairs recorded for the first generated dataset
# (presumably [row, column] positions of the corrupted cells — confirm).
print(corrupted_coords[0])

[[23  3]
 [ 8  1]
 [16  0]
 [ 2  4]
 [25  1]
 [38  1]
 [23  4]
 [23  1]
 [ 8  3]
 [18  2]
 [13  1]
 [ 9  4]
 [30  4]
 [19  2]
 [48  2]
 [10  1]
 [42  3]
 [28  2]
 [24  0]
 [23  0]
 [42  1]
 [ 4  3]
 [ 9  0]
 [35  3]
 [22  4]]


In [4]:
# Pass the first corrupted dataset to the LLM merge step, keyed on dicom_uid.
# verbose=True presumably causes the JSON record dump shown as this cell's
# output — confirm against the pipeline implementation.
merged_df = Pipeline.merge_with_llm(
    dataset=corrupted_datasets[0],
    primary_key="dicom_uid",
    model_name=Models.GeminiModels.GEMINI_2_0_FLASH,
    verbose=True,
)


[
  {
    "dicom_uid": "1.2.840.166348.444565830.2739",
    "rows": "1024",
    "columns": "1024",
    "series_desc": "PELVIS_ULTRASOUND",
    "modality": "US"
  },
  {
    "dicom_uid": "1.2.840.166348.444565830.2739",
    "rows": "1024",
    "columns": "1024",
    "series_desc": "PELVIS_ULTRASOUND",
    "modality": "US"
  },
  {
    "dicom_uid": "1.2.840.166348.444565830.2739",
    "rows": "1024",
    "columns": "1024",
    "series_desc": "PELVIS_ULTRASOUND",
    "modality": "US"
  },
  {
    "dicom_uid": "1.2.840.166348.444565830.2739",
    "rows": "1024",
    "columns": "1024",
    "series_desc": "PELVIS_ULTRASOUND",
    "modality": "US"
  },
  {
    "dicom_uid": "1.2.840.166348.444565830.2739",
    "rows": "1024",
    "columns": "1024",
    "series_desc": "PELVIS_ULTRASOUND",
    "modality": "US"
  }
][
  {
    "dicom_uid": "1.2.840.178211.688829522.637",
    "rows": "2048",
    "columns": "2048",
    "series_desc": "ABDOMEN_KUB",
    "modality": "XR"
  },
  {
    "dicom_uid": "1.2

In [5]:
print(merged_df)

# Score the merged result against the gold standard using the recorded
# coordinates of the first corrupted dataset.
# NOTE(review): the printed merged_df has rows with the same dicom_uid grouped
# together, whereas gold_standard_extended repeats the ten-record sample in its
# original order. Confirm that evaluate_micro aligns the two frames (and the
# corrupted_coords row positions) correctly rather than comparing by position.
stats_micro = Pipeline.evaluate_micro(
    gold_standard=gold_standard_extended,
    generated_dataset=merged_df,
    corrupted_coords=corrupted_coords[0],
)

                        dicom_uid  rows columns        series_desc modality
0   1.2.840.166348.444565830.2739  1024    1024  PELVIS_ULTRASOUND       US
1   1.2.840.166348.444565830.2739  1024    1024  PELVIS_ULTRASOUND       US
2   1.2.840.166348.444565830.2739  1024    1024  PELVIS_ULTRASOUND       US
3   1.2.840.166348.444565830.2739  1024    1024  PELVIS_ULTRASOUND       US
4   1.2.840.166348.444565830.2739  1024    1024  PELVIS_ULTRASOUND       US
5    1.2.840.178211.688829522.637  2048    2048        ABDOMEN_KUB       XR
6    1.2.840.178211.688829522.637  2048    2048        ABDOMEN_KUB       XR
7    1.2.840.178211.688829522.637  2048    2048        ABDOMEN_KUB       XR
8    1.2.840.178211.688829522.637  2048    2048        ABDOMEN_KUB       XR
9    1.2.840.178211.688829522.637  2048    2048        ABDOMEN_KUB       XR
10  1.2.840.285501.600086396.5032   512     512      ABDOMEN_W_CON       CT
11  1.2.840.285501.600086396.5032   512     512      ABDOMEN_W_CON       CT
12  1.2.840.