In [22]:
import sys
import os

# Add the parent directory to the path so the package is importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends another copy of the same entry to sys.path.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from llm_data_quality_assistant.pipeline import Pipeline
from llm_data_quality_assistant.enums import Models, CorruptionTypes
import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
import numpy as np

# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the LLM API credentials used by the
# merge step further down — confirm which variables are required.
load_dotenv()

True

In [23]:
# Load the clean reference table.
gold_standard = pd.read_csv("../datasets/self_generated_dataset/Radiology_modality_sample.csv")

# Number of copies of every gold-standard row to stack into the extended frame.
corrupted_versions = 5

# Stack `corrupted_versions` independent copies of the table on top of each other.
stacked_copies = pd.concat(
    [gold_standard.copy() for _ in range(corrupted_versions)],
    ignore_index=True,
)

# Split by primary key and re-concatenate so that all copies of the same
# dicom_uid end up on consecutive rows (groups are emitted in sorted key order).
frames_per_uid = [uid_frame for _uid, uid_frame in stacked_copies.groupby("dicom_uid")]
gold_standard_extended = pd.concat(frames_per_uid, ignore_index=True)

print(gold_standard_extended)

                         dicom_uid  rows  columns          series_desc  \
0    1.2.840.109739.671568005.4240  2048     2048         FOOT_LATERAL   
1    1.2.840.109739.671568005.4240  2048     2048         FOOT_LATERAL   
2    1.2.840.109739.671568005.4240  2048     2048         FOOT_LATERAL   
3    1.2.840.109739.671568005.4240  2048     2048         FOOT_LATERAL   
4    1.2.840.109739.671568005.4240  2048     2048         FOOT_LATERAL   
..                             ...   ...      ...                  ...   
495  1.2.840.991648.163637892.3869   256      256  SPINE_LUMBAR_SAG_T2   
496  1.2.840.991648.163637892.3869   256      256  SPINE_LUMBAR_SAG_T2   
497  1.2.840.991648.163637892.3869   256      256  SPINE_LUMBAR_SAG_T2   
498  1.2.840.991648.163637892.3869   256      256  SPINE_LUMBAR_SAG_T2   
499  1.2.840.991648.163637892.3869   256      256  SPINE_LUMBAR_SAG_T2   

    modality  
0         XR  
1         XR  
2         XR  
3         XR  
4         XR  
..       ...  
495   

In [24]:
# Corruption kinds requested from the pipeline, split by granularity.
cell_level_corruptions = [
    CorruptionTypes.CellCorruptionTypes.NULL,
    CorruptionTypes.CellCorruptionTypes.OUTLIER,
    CorruptionTypes.CellCorruptionTypes.TYPO,
]
row_level_corruptions = [CorruptionTypes.RowCorruptionTypes.DELETE_ROWS]

# Ask for a single corrupted variant of the extended gold standard; the
# primary-key column is passed via `columns_to_exclude`.
corrupted_datasets = Pipeline.generate_corrupted_datasets(
    dataset=gold_standard_extended,
    cell_corruption_types=cell_level_corruptions,
    row_corruption_types=row_level_corruptions,
    columns_to_exclude=["dicom_uid"],
    severity=0.15,
    output_size=1,
)

# Save the (only) corrupted variant next to the source dataset.
corrupt_csv_path = "../datasets/self_generated_dataset/Radiology_modality_sample_corrupt.csv"
corrupted_datasets[0].to_csv(corrupt_csv_path, index=False, header=True, encoding="utf-8")


In [25]:
import string

# Run configuration for the LLM merge.
rpm = 30
model_name = Models.GeminiModels.GEMINI_2_0_FLASH_LITE
context_rows = 50

# Characters that get replaced by "_" in the output file name: all ASCII
# punctuation except the single quote. This is kept as a local value on
# purpose — assigning to `string.punctuation` would change the stdlib constant
# for every other module running in this kernel.
filename_unsafe_chars = string.punctuation.replace("'", "")
raw_file_name = str(model_name.value) + f"_{corrupted_versions}_copies_{context_rows}_rows_context"
file_name = raw_file_name.translate(
    str.maketrans({char: "_" for char in filename_unsafe_chars})
)

# Random rows of the corrupted table, rendered as text and handed to the model
# as extra prompt context. The sample size is clamped because
# DataFrame.sample raises ValueError when n exceeds the number of rows.
corrupted_df = corrupted_datasets[0]
sample_size = min(context_rows, len(corrupted_df))
additional_context = f"""
{corrupted_df.sample(n=sample_size).to_string(index=False)}
"""

# Let the LLM merge the rows that share the same dicom_uid.
merged_df = Pipeline.merge_with_llm(
    dataset=corrupted_df,
    rpm=rpm,
    primary_key="dicom_uid",
    model_name=model_name,
    verbose=False,
    additional_prompt=additional_context,
    status_bar=True
)


Merging groups with LLM: 100%|██████████| 100/100 [03:46<00:00,  2.27s/it]


In [26]:
# Store the LLM-repaired table under a name derived from the run configuration.
# NOTE(review): no `index=False` is passed here, so the DataFrame index is
# written as an extra column, unlike the corrupted-dataset export — confirm
# that this is intended.
repair_csv_path = f"../analysis/repairs/radiology/{file_name}_repair.csv"
merged_df.to_csv(repair_csv_path)

In [27]:
import json

# Both evaluations compare the same three tables, so build the keyword
# arguments once and reuse them.
evaluation_inputs = dict(
    gold_standard=gold_standard_extended,
    cleaned_dataset=merged_df,
    corrupted_dataset=corrupted_datasets[0],
)

stats_micro = Pipeline.evaluate_micro(**evaluation_inputs)
pprint(stats_micro)

stats_macro = Pipeline.evaluate_macro(**evaluation_inputs)
pprint(stats_macro)

# Write each result dict to its own JSON file, micro first, then macro.
result_files = (
    (f"../analysis/results/radiology_self_generated/{file_name}_results_micro.json", stats_micro),
    (f"../analysis/results/radiology_self_generated/{file_name}_results_macro.json", stats_macro),
)
for result_path, result_stats in result_files:
    with open(result_path, "w") as f:
        json.dump(result_stats, f, indent=4)

{'accuracy': 0.998,
 'column_names': ['dicom_uid', 'rows', 'columns', 'series_desc', 'modality'],
 'f1_score': 0.9913344887348354,
 'false_negative': 2,
 'false_negative_rate': 0.006944444444444444,
 'false_positive': 3,
 'false_positive_rate': 0.00135623869801085,
 'num_columns': 5,
 'num_rows': 500,
 'precision': 0.9896193771626297,
 'recall': 0.9930555555555556,
 'true_negative': 2209,
 'true_positive': 286}
{'column_names': ['dicom_uid', 'rows', 'columns', 'series_desc', 'modality'],
 'num_columns': 5,
 'num_rows': 500,
 'stats': [{'accuracy': 1.0,
            'column_name': 'dicom_uid',
            'f1_score': 0.0,
            'false_negative': 0,
            'false_negative_rate': 0.0,
            'false_positive': 0,
            'false_positive_rate': 0.0,
            'num_entries': 500,
            'precision': 0.0,
            'recall': 0.0,
            'true_negative': 500,
            'true_positive': 0},
           {'accuracy': 1.0,
            'column_name': 'rows',
      