In [1]:
import sys
import os

# Make the directory one level above the notebook's working directory importable,
# so the local `llm_data_quality_assistant` package can be imported below.
# The membership check keeps this cell idempotent: re-running it no longer
# appends another duplicate entry to sys.path each time.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from llm_data_quality_assistant.pipeline import Pipeline
from llm_data_quality_assistant.enums import Models, CorruptionTypes
import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
import numpy as np

# Read variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key for the LLM calls made
# later in the notebook — confirm which variables the pipeline expects.
load_dotenv()

True

In [2]:
# Load the gold-standard radiology sample that serves as ground truth.
gold_standard = pd.read_csv("../datasets/llm_dataset/Radiology_modality_sample.csv")

# Stack five copies of the gold standard on top of each other ...
n_copies = 5
stacked_copies = pd.concat([gold_standard] * n_copies, ignore_index=True)

# ... then regroup by "dicom_uid" so that all copies of the same record sit in
# consecutive rows (groups are emitted in sorted key order).
# NOTE(review): groupby drops rows whose key is missing by default, so records
# without a dicom_uid would silently disappear here — confirm none exist.
frames_per_uid = [uid_frame for _uid, uid_frame in stacked_copies.groupby("dicom_uid")]
gold_standard_extended = pd.concat(frames_per_uid, ignore_index=True)
print(gold_standard_extended)

                         dicom_uid  rows  columns          series_desc  \
0    1.2.840.109739.671568005.4240  2048     2048         FOOT_LATERAL   
1    1.2.840.109739.671568005.4240  2048     2048         FOOT_LATERAL   
2    1.2.840.109739.671568005.4240  2048     2048         FOOT_LATERAL   
3    1.2.840.109739.671568005.4240  2048     2048         FOOT_LATERAL   
4    1.2.840.109739.671568005.4240  2048     2048         FOOT_LATERAL   
..                             ...   ...      ...                  ...   
495  1.2.840.991648.163637892.3869   256      256  SPINE_LUMBAR_SAG_T2   
496  1.2.840.991648.163637892.3869   256      256  SPINE_LUMBAR_SAG_T2   
497  1.2.840.991648.163637892.3869   256      256  SPINE_LUMBAR_SAG_T2   
498  1.2.840.991648.163637892.3869   256      256  SPINE_LUMBAR_SAG_T2   
499  1.2.840.991648.163637892.3869   256      256  SPINE_LUMBAR_SAG_T2   

    modality  
0         XR  
1         XR  
2         XR  
3         XR  
4         XR  
..       ...  
495   

In [3]:
# Build the corrupted dataset(s) from the extended gold standard.
# Only cell-level corruptions (NULL and OUTLIER) are requested; no row-level
# corruptions are applied, and the "dicom_uid" column is excluded from corruption.
# NOTE(review): severity=0.15 presumably is the fraction of eligible cells to
# corrupt, and output_size=1 the number of datasets to generate — confirm
# against the Pipeline documentation.
cell_corruptions = [
    CorruptionTypes.CellCorruptionTypes.NULL,
    CorruptionTypes.CellCorruptionTypes.OUTLIER,
]
corruption_settings = dict(
    dataset=gold_standard_extended,
    cell_corruption_types=cell_corruptions,
    row_corruption_types=[],
    columns_to_exclude=["dicom_uid"],
    severity=0.15,
    output_size=1,
)
corrupted_datasets = Pipeline.generate_corrupted_datasets(**corruption_settings)


In [4]:
# Merge the rows of the first corrupted dataset with the LLM, grouping by the
# "dicom_uid" primary key and showing a progress bar while it runs.
# NOTE(review): rpm presumably is a requests-per-minute limit for the LLM API —
# confirm against Pipeline.merge_with_llm.
rpm = 30
corrupted_input = corrupted_datasets[0]
llm_model = Models.GeminiModels.GEMINI_2_0_FLASH_LITE
merged_df = Pipeline.merge_with_llm(
    dataset=corrupted_input,
    rpm=rpm,
    primary_key="dicom_uid",
    model_name=llm_model,
    status_bar=True,
)


Merging groups with LLM:   0%|          | 0/100 [00:00<?, ?it/s]

Merging groups with LLM: 100%|██████████| 100/100 [03:21<00:00,  2.01s/it]


In [5]:


# Both evaluations compare the same three frames: the ground truth, the
# LLM-merged result, and the corrupted input it was produced from.
evaluation_inputs = {
    "gold_standard": gold_standard_extended,
    "cleaned_dataset": merged_df,
    "corrupted_dataset": corrupted_datasets[0],
}

# Micro evaluation: a single set of metrics for the dataset as a whole.
stats_micro = Pipeline.evaluate_micro(**evaluation_inputs)
pprint(stats_micro)

# Macro evaluation: metrics reported separately for each column.
stats_macro = Pipeline.evaluate_macro(**evaluation_inputs)
pprint(stats_macro)

{'accuracy': 1.0,
 'column_names': ['dicom_uid', 'rows', 'columns', 'series_desc', 'modality'],
 'f1_score': 1.0,
 'false_negative': 0,
 'false_negative_rate': 0.0,
 'false_positive': 0,
 'false_positive_rate': 0.0,
 'num_columns': 5,
 'num_rows': 500,
 'precision': 1.0,
 'recall': 1.0,
 'true_negative': 2200,
 'true_positive': 300}
{'column_names': ['dicom_uid', 'rows', 'columns', 'series_desc', 'modality'],
 'num_columns': 5,
 'num_rows': 500,
 'stats': [{'accuracy': 1.0,
            'column_name': 'dicom_uid',
            'f1_score': 0.0,
            'false_negative': 0,
            'false_negative_rate': 0.0,
            'false_positive': 0,
            'false_positive_rate': 0.0,
            'num_entries': 500,
            'precision': 0.0,
            'recall': 0.0,
            'true_negative': 500,
            'true_positive': 0},
           {'accuracy': 1.0,
            'column_name': 'rows',
            'f1_score': 1.0,
            'false_negative': 0,
            'false_negat