In [1]:
import sys
import os

# Add the parent directory to the path so the package is importable
sys.path.append(os.path.abspath(".."))

from llm_data_quality_assistant.corruptor import RowCorruptionTypes, CellCorruptionTypes
from llm_data_quality_assistant.enums import Models
import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
import numpy as np
import time

load_dotenv()


True

In [2]:
# Load the corrupted allergen sample and its cleaned gold-standard counterpart.
corrupt_dataset = pd.read_csv(
    "../datasets/parker_datasets/allergen/allergen_corrupted_first1000.csv"
)
gold_standard = pd.read_csv(
    "../datasets/parker_datasets/allergen/allergen_cleaned_gold_first1000.csv"
)

# Reorder columns of corrupt_dataset to match gold_standard
# (a gold-standard column missing from the corrupted file raises KeyError here).
corrupt_dataset = corrupt_dataset[gold_standard.columns]

# Read the sidecar files with an explicit encoding: without it, open() falls back
# to the platform's locale encoding, whereas pd.read_csv above defaults to UTF-8.
with open(
    "../datasets/parker_datasets/allergen/allergen.partialkey", "r", encoding="utf-8"
) as f:
    partial_keys = f.read()

with open(
    "../datasets/parker_datasets/allergen/allergen.rules", "r", encoding="utf-8"
) as f:
    rules = f.read()


# print(partial_keys)
# print(rules)
# print(corrupt_dataset.head(2))
# print(gold_standard.head(2))
# print(type(gold_standard.get("code").iloc[0]))
# print(type(corrupt_dataset.get("code").iloc[0]))

In [3]:
# NOTE(review): everything in this cell is disabled exploratory code and does not run.
# When enabled, it keeps only the corrupt rows whose "code" appears in the gold
# standard (and vice versa), sorts both frames by "code", resets their indexes,
# and prints the resulting shapes and frames.
# shortened_corrupt_df = corrupt_dataset[corrupt_dataset["code"].isin(gold_standard["code"])]
# shortened_corrupt_df = shortened_corrupt_df.sort_values(by="code").reset_index(drop=True)

# print("Shape shortened corrupt dataset:")
# print(shortened_corrupt_df.shape)

# print(shortened_corrupt_df)

# shortened_gold_standard = gold_standard[gold_standard["code"].isin(shortened_corrupt_df["code"])]
# shortened_gold_standard = shortened_gold_standard.sort_values(by="code").reset_index(drop=True)

# print("Shape shortened gold standard dataset:")
# print(shortened_gold_standard.shape)

# print(shortened_gold_standard)


In [4]:
# Clean the corrupted dataset with the LLM; evaluation happens in a later cell.
# NOTE(review): Pipeline is imported but not referenced in this cell — the work is
# delegated to jupyter_helper_functions. Presumably the helper wraps Pipeline; confirm.
# `Models` is already imported in the first cell, so it is not re-imported here.
from llm_data_quality_assistant.pipeline import Pipeline
import jupyter_helper_functions
import string

# Remove the single quote from string.punctuation. This rebinds the attribute on
# the stdlib module itself, so every later reader of string.punctuation in this
# kernel sees the shortened set.
# NOTE(review): presumably the helper's standardization step relies on this — confirm.
string.punctuation = string.punctuation.replace("'", "")

# Use a primary key for merging
primary_key = "code"
model = Models.GeminiModels.GEMINI_2_0_FLASH_LITE
rows_of_context = 50
# Fixed seed so the context rows placed in the prompt are the same on every run;
# an unseeded sample would change the prompt (and the results) on each execution.
context_sample_seed = 42

# Free-text tag that ends up in the output file name to tell experiment variants apart.
extra = "simple approach"

file_name = jupyter_helper_functions.sanitize_filename(
    f"{model.value}_{rows_of_context}_rows_context_{extra}"
)

rpm = 30  # presumably a requests-per-minute cap for the LLM calls — confirm in the helper

# The context rows are drawn from the *corrupted* frame, not from the gold standard.
context_rows = corrupt_dataset.sample(rows_of_context, random_state=context_sample_seed)
additional_prompt = f"""
Here are rows of the dataset to provide context for the cleaning process:
{context_rows.to_string(index=False)}
"""


# Merge/clean with LLM; the helper returns the repaired frame and the elapsed time.
merged_df, time_taken = jupyter_helper_functions.merge_with_llm_timed(
    dataset=corrupt_dataset,
    primary_key=primary_key,
    model=model,
    rpm=rpm,
    additional_prompt=additional_prompt,
)


Merging groups with LLM: 100%|██████████| 103/103 [03:39<00:00,  2.13s/it]


In [5]:
# Write the LLM-repaired frame to the repairs folder, keyed by this run's file_name.
jupyter_helper_functions.save_dataframe_csv(
    merged_df,
    f"../analysis/repairs/allergen/merged_dataset_{file_name}.csv",
)

In [6]:
# Evaluate results: hand the gold standard, the LLM-repaired frame and the original
# corrupted frame to the helper, together with the time the merge cell took.
# Judging by the captured output, the helper prints standardized micro and macro
# evaluation dictionaries; what it writes under results_dir is not visible here.
jupyter_helper_functions.standardize_and_evaluate(
    gold_standard=gold_standard,
    merged_df=merged_df,
    corrupt_dataset=corrupt_dataset,
    primary_key=primary_key,
    time_delta=time_taken,
    # Plain literal: the original f-string had no placeholders to interpolate.
    results_dir="../analysis/results/allergen/",
    file_name=file_name,
)



("Standardized micro evaluation: {'num_rows': 206, 'num_columns': 22, "
 "'column_names': ['code', 'nuts', 'almondnuts', 'brazil_nuts', "
 "'macadamia_nuts', 'hazelnut', 'pistachio', 'walnut', 'cashew', 'celery', "
 "'crustaceans', 'eggs', 'fish', 'gluten', 'lupin', 'milk', 'molluscs', "
 "'mustard', 'peanut', 'sesame', 'soy', 'sulfite'], 'true_positive': 105, "
 "'false_positive': 49, 'false_negative': 253, 'true_negative': 4125, "
 "'precision': 0.6818181818181818, 'recall': 0.29329608938547486, 'f1_score': "
 "0.41015624999999994, 'accuracy': 0.9333627537511032, 'false_positive_rate': "
 "0.011739338763775755, 'false_negative_rate': 0.7067039106145251, "
 "'time_taken': 219.03006076812744}")
("Standardized macro evaluation: {'num_rows': 206, 'num_columns': 22, "
 "'column_names': ['code', 'nuts', 'almondnuts', 'brazil_nuts', "
 "'macadamia_nuts', 'hazelnut', 'pistachio', 'walnut', 'cashew', 'celery', "
 "'crustaceans', 'eggs', 'fish', 'gluten', 'lupin', 'milk', 'molluscs', "
 "'must