In [69]:
import sys
import os

# Make the parent of the current working directory importable so the local
# `llm_data_quality_assistant` package resolves. This has to happen before the
# package imports below. Guarded so that re-running the cell does not append
# the same entry to sys.path again.
repo_root = os.path.abspath("..")
if repo_root not in sys.path:
    sys.path.append(repo_root)

from llm_data_quality_assistant.corruptor import RowCorruptionTypes, CellCorruptionTypes
from llm_data_quality_assistant.enums import Models
import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
import numpy as np
import time

# Load settings from a `.env` file into the environment. Kept as the cell's
# last expression so the notebook displays the boolean it returns.
load_dotenv()


True

In [70]:
# All allergen inputs live in one directory; naming it once keeps the four
# paths below in sync.
allergen_dir = "../datasets/parker_datasets/allergen"

# The corrupted input and the cleaned reference frame.
corrupt_dataset = pd.read_csv(f"{allergen_dir}/allergen_corrupted_first1000.csv")
gold_standard = pd.read_csv(f"{allergen_dir}/allergen_cleaned_gold_first1000.csv")

# Plain-text side files shipped with the dataset, read whole for inspection.
with open(f"{allergen_dir}/allergen.partialkey", "r") as f:
    partial_keys = f.read()

with open(f"{allergen_dir}/allergen.rules", "r") as f:
    rules = f.read()

print(partial_keys)
print(rules)
print(corrupt_dataset.head(2))
print(gold_standard.head(2))

# Show the Python type of the first "code" value in each frame. Indexing with
# [] instead of .get() means a missing column raises a KeyError naming "code"
# rather than an AttributeError on None.
print(type(gold_standard["code"].iloc[0]))
print(type(corrupt_dataset["code"].iloc[0]))

code ->
    nuts,
    almondnuts,
    brazil_nuts,
    macadamia_nuts,
    hazelnut,
    pistachio,
    walnut,
    cashew,
    celery,
    crustaceans,
    eggs,
    fish,
    gluten,
    lupin,
    milk,
    molluscs,
    mustard,
    peanut,
    sesame,
    soy,
    sulfite

-- Attribute contracts
@nuts:integer
@almondnuts:integer
@brazil_nuts:integer
@macadamia_nuts:integer
@hazelnut:integer
@pistachio:integer
@walnut:integer
@cashew:integer
@celery:integer
@crustaceans:integer
@eggs:integer
@fish:integer
@gluten:integer
@lupin:integer
@milk:integer
@molluscs:integer
@mustard:integer
@peanut:integer
@sesame:integer
@soy:integer
@sulfite:integer


The attributes
of this dataset indicate the presence (‘2’), traces (‘1’), or absence
(‘0’) of allergens in a product.

Everything value except of the code has to be between 0 and 2.

--NOT ALLOWED THAT
nuts < almondnuts
nuts < brazil_nuts
nuts < macadamia_nuts
nuts < hazelnut
nuts < pistachio
nuts < walnut
nuts < cashew

IT ALLWAYS MUST BE

In [71]:
# shortened_corrupt_df = corrupt_dataset[corrupt_dataset["code"].isin(gold_standard["code"])]
# shortened_corrupt_df = shortened_corrupt_df.sort_values(by="code").reset_index(drop=True)

# print("Shape shortened corrupt dataset:")
# print(shortened_corrupt_df.shape)

# print(shortened_corrupt_df)

# shortened_gold_standard = gold_standard[gold_standard["code"].isin(shortened_corrupt_df["code"])]
# shortened_gold_standard = shortened_gold_standard.sort_values(by="code").reset_index(drop=True)

# print("Shape shortened gold standard dataset:")
# print(shortened_gold_standard.shape)

# print(shortened_gold_standard)


In [72]:
# Clean and evaluate using the new Pipeline API
from llm_data_quality_assistant.pipeline import Pipeline
from llm_data_quality_assistant.enums import Models
import string

# NOTE(review): this rebinds the attribute on the stdlib `string` module, so
# every later reader of `string.punctuation` in this kernel sees the value
# without "'". Kept as-is in case the pipeline depends on it — confirm, and
# switch to a local variable if it does not.
string.punctuation = string.punctuation.replace("'", "")  # Remove single quotes from punctuation

# Use a primary key for merging
primary_key = "code"
model = Models.OpenAIModels.GPT_4_1_MINI
rows_of_context = 200
# Fixed seed so the same context rows (and therefore the same prompt) are
# drawn on every run.
context_seed = 42

# Build a filesystem-friendly run name: every punctuation character (except
# the single quote removed above) and every space becomes an underscore.
file_name = str(model.value) + f"_{rows_of_context}_rows_context"
for p in string.punctuation + " ":
    file_name = file_name.replace(p, "_")

# NOTE(review): presumably 0 disables request-rate limiting — confirm against
# Pipeline.merge_with_llm.
rpm = 0

# DataFrame.sample raises when asked for more rows than the frame holds, so
# clamp the request to the available row count.
context_rows = corrupt_dataset.sample(
    n=min(rows_of_context, len(corrupt_dataset)),
    random_state=context_seed,
)

additional_prompt = f"""
Here are rows of the dataset to provide context for the cleaning process:
{context_rows.to_string(index=False)}
"""


# Merge/clean with LLM, timing the whole call in seconds.
start_time = time.time()
merged_df = Pipeline.merge_with_llm(
    dataset=corrupt_dataset,
    primary_key=primary_key,
    model_name=model,
    rpm=rpm,
    additional_prompt=additional_prompt,
    verbose=False,
    status_bar=True,
)
time_delta = time.time() - start_time

Merging groups with LLM: 100%|██████████| 103/103 [06:39<00:00,  3.88s/it]


In [73]:

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing the merged result.
merged_output_dir = "../analysis/merged_lukas/allergen"
os.makedirs(merged_output_dir, exist_ok=True)
merged_df.to_csv(f"{merged_output_dir}/merged_dataset_{file_name}.csv", index=False)

In [74]:
import json

print(merged_df)


def _report(title, stats):
    """Record the merge run time in `stats` and pretty-print it under a banner.

    Args:
        title: Upper-case label shown in the banner (e.g. "MICRO").
        stats: Mutable mapping returned by a Pipeline evaluation call; it gains
            a "time_taken" entry holding `time_delta`.
    """
    stats["time_taken"] = time_delta
    print("====================================")
    print(f"{title} EVALUATION RESULTS")
    print("====================================")
    pprint(stats)


# Evaluate results
stats_micro = Pipeline.evaluate_micro(
    gold_standard=gold_standard,
    cleaned_dataset=merged_df,
    corrupted_dataset=corrupt_dataset,
)
_report("MICRO", stats_micro)

stats_macro = Pipeline.evaluate_macro(
    gold_standard=gold_standard,
    cleaned_dataset=merged_df,
    corrupted_dataset=corrupt_dataset,
)
_report("MACRO", stats_macro)

# open(..., "w") fails if the parent directory is missing, so create it first.
results_dir = "../analysis/results/allergen"
os.makedirs(results_dir, exist_ok=True)

# One JSON file per evaluation level, named after the run.
for level, stats in (("micro", stats_micro), ("macro", stats_macro)):
    with open(f"{results_dir}/{file_name}_results_{level}.json", "w") as f:
        json.dump(stats, f, indent=4)



              code  nuts  almondnuts  brazil_nuts  macadamia_nuts  hazelnut  \
0    4104420006065     0           0            0               0         0   
1    4104420006065     0           0            0               0         0   
2    4104420007963     2           0            0               0         2   
3    4104420007963     2           0            0               0         2   
4    4104420007987     2           0            0               0         1   
..             ...   ...         ...          ...             ...       ...   
201       42256199     1           0            0               0         0   
202       42330660     0           0            0               0         0   
203       42330660     0           0            0               0         0   
204       42373186     1           0            0               0         0   
205       42373186     1           0            0               0         0   

     pistachio  walnut  cashew  celery  ...  fish  