In [1]:
import sys
import os

# Make the repository root importable: ".." is resolved against the current
# working directory, so this assumes the kernel was started in the notebook's
# own folder (one level below the package root).
sys.path.append(os.path.abspath(".."))

from llm_data_quality_assistant import pipeline
from llm_data_quality_assistant.corruptor import RowCorruptionTypes, CellCorruptionTypes
from llm_data_quality_assistant.enums import Models
import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
import numpy as np

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the LLM API credentials used by the
# pipeline come from — confirm against the package's configuration.
load_dotenv()


True

In [2]:
# Directory holding the allergen benchmark files, relative to the notebook's
# working directory. Kept in one place so the four reads below cannot drift apart.
ALLERGEN_DIR = "../datasets/parker_datasets/allergen"

# Dataset to be cleaned and the reference ("gold standard") it is scored against.
corrupt_dataset = pd.read_csv(f"{ALLERGEN_DIR}/SHORT_allergen.csv")
gold_standard = pd.read_csv(f"{ALLERGEN_DIR}/gold_standard_alergene_pivoted.csv")

# The key/rules files contain non-ASCII characters (typographic quotes), so the
# encoding is pinned instead of relying on the platform default.
with open(f"{ALLERGEN_DIR}/allergen.partialkey", "r", encoding="utf-8") as f:
    partial_keys = f.read()

with open(f"{ALLERGEN_DIR}/allergen.rules", "r", encoding="utf-8") as f:
    rules = f.read()


print(partial_keys)
print(rules)
print(corrupt_dataset.head(2))
print(gold_standard.head(2))
# Sanity check: the "code" column must have the same element type in both frames,
# otherwise the isin()-based matching further down silently matches nothing.
# Indexing with ["code"] (rather than .get) raises a clear KeyError if the column is absent.
print(type(gold_standard["code"].iloc[0]))
print(type(corrupt_dataset["code"].iloc[0]))

code ->
    nuts,
    almondnuts,
    brazil_nuts,
    macadamia_nuts,
    hazelnut,
    pistachio,
    walnut,
    cashew,
    celery,
    crustaceans,
    eggs,
    fish,
    gluten,
    lupin,
    milk,
    molluscs,
    mustard,
    peanut,
    sesame,
    soy,
    sulfite

-- Attribute contracts
@nuts:integer
@almondnuts:integer
@brazil_nuts:integer
@macadamia_nuts:integer
@hazelnut:integer
@pistachio:integer
@walnut:integer
@cashew:integer
@celery:integer
@crustaceans:integer
@eggs:integer
@fish:integer
@gluten:integer
@lupin:integer
@milk:integer
@molluscs:integer
@mustard:integer
@peanut:integer
@sesame:integer
@soy:integer
@sulfite:integer


The attributes
of this dataset indicate the presence (‘2’), traces (‘1’), or absence
(‘0’) of allergens in a product.

Everything value except of the code has to be between 0 and 2.

--NOT ALLOWED THAT
nuts < almondnuts
nuts < brazil_nuts
nuts < macadamia_nuts
nuts < hazelnut
nuts < pistachio
nuts < walnut
nuts < cashew

IT ALLWAYS MUST BE

In [3]:
# Restrict the corrupt data to products (codes) that exist in the gold standard.
# Each code occurs on several rows, and later cells compare the frames position by
# position, so the sort must be stable: rows sharing a code keep their file order
# instead of being permuted arbitrarily by the default quicksort.
shortened_corrupt_df = (
    corrupt_dataset[corrupt_dataset["code"].isin(gold_standard["code"])]
    .sort_values(by="code", kind="mergesort")
    .reset_index(drop=True)
)

print("Shape shortened corrupt dataset:")
print(shortened_corrupt_df.shape)

print(shortened_corrupt_df)

# Mirror the same filter + stable ordering on the gold standard so that row i of
# both frames refers to the same product.
shortened_gold_standard = (
    gold_standard[gold_standard["code"].isin(shortened_corrupt_df["code"])]
    .sort_values(by="code", kind="mergesort")
    .reset_index(drop=True)
)

print("Shape shortened gold standard dataset:")
print(shortened_gold_standard.shape)

print(shortened_gold_standard)


Shape shortened corrupt dataset:
(18, 22)
             code  nuts  almondnuts  brazil_nuts  macadamia_nuts  hazelnut  \
0   4104420006065     0           0            0               0         0   
1   4104420006065     0           0            0               0         0   
2   4104420007963     2           0            0               0         0   
3   4104420007963     2           0            0               0         2   
4   4104420007987     2           0            0               0         0   
5   4104420007987     2           0            0               0         2   
6   4104420010628     0           0            0               0         0   
7   4104420010628     2           0            0               0         0   
8   4104420014701     0           0            0               0         0   
9   4104420014701     0           0            0               0         0   
10  4104420015289     1           0            0               0         0   
11  4104420015289     

In [4]:
p = pipeline.Pipeline(shortened_gold_standard)

# The cell-wise comparison below is positional, so both frames must have the same
# shape AND list the same product on every row; otherwise the "corrupted"
# coordinates would just reflect misaligned rows.
assert shortened_gold_standard.shape == shortened_corrupt_df.shape
assert (
    shortened_gold_standard["code"].to_numpy() == shortened_corrupt_df["code"].to_numpy()
).all(), "gold standard and corrupt dataset rows are not aligned by code"

# Find coordinates where entries differ
diff = shortened_gold_standard.values != shortened_corrupt_df.values
corrupted_coords = np.argwhere(diff)

print("Corrupted coordinates (row, col):")
print(corrupted_coords)

# Process in chunks of 2 rows
chunk_size = 2
chunks = [
    shortened_corrupt_df.iloc[i:i+chunk_size]
    for i in range(0, len(shortened_corrupt_df), chunk_size)
]

# Clean every chunk with the rule text and stitch the results back together in
# the original chunk order.
merged_chunks = []
for chunk in chunks:
    cleaned_chunk = p.clean_single_dataset(chunk, rules)
    merged_chunks.append(cleaned_chunk)

merged_df = pd.concat(merged_chunks, ignore_index=True)


# Re-apply the gold-standard filter and ordering to the cleaned output. The sort is
# stable so rows sharing a code stay in the order they were cleaned in, keeping
# them aligned with corrupted_coords.
shortened_merged_df = merged_df[merged_df["code"].isin(gold_standard["code"])]
shortened_merged_df = (
    shortened_merged_df
    .sort_values(by="code", kind="mergesort")
    .reset_index(drop=True)
)

print("Shape merged dataset:")
print(merged_df.shape)

print("Shape shortened merged dataset:")
print(shortened_merged_df.shape)

results = p.evaluate_micro(shortened_merged_df, [corrupted_coords])
print(results)

# True only if cleaning changed nothing at all relative to the corrupt input.
same_values_only = (shortened_merged_df.values == shortened_corrupt_df.values).all()
print(same_values_only)


Corrupted coordinates (row, col):
[[ 2  2]
 [ 2  5]
 [ 2  8]
 [ 2 13]
 [ 2 15]
 [ 2 20]
 [ 3  2]
 [ 3  8]
 [ 4  2]
 [ 4  5]
 [ 4  6]
 [ 4  7]
 [ 5  2]
 [ 5  6]
 [ 5  7]
 [ 6  1]
 [ 6  2]
 [ 6 13]
 [ 6 18]
 [ 6 19]
 [ 7  1]
 [ 7  2]
 [ 7 13]
 [ 7 18]
 [ 7 19]
 [10  2]
 [11  1]
 [11  2]
 [11 18]
 [15 13]
 [16  2]
 [16 13]
 [17  2]]
Shape merged dataset:
(18, 22)
Shape shortened merged dataset:
(18, 22)
{'num_rows': 18, 'num_columns': 22, 'column_names': ['code', 'nuts', 'almondnuts', 'brazil_nuts', 'macadamia_nuts', 'hazelnut', 'pistachio', 'walnut', 'cashew', 'celery', 'crustaceans', 'eggs', 'fish', 'gluten', 'lupin', 'milk', 'molluscs', 'mustard', 'peanut', 'sesame', 'soy', 'sulfite'], 'true_positive': 5, 'false_positive': 4, 'false_negative': 28, 'true_negative': 359, 'precision': 0.5555555555555556, 'recall': 0.15151515151515152, 'f1_score': 0.2380952380952381, 'accuracy': 0.9191919191919192, 'false_positive_rate': 0.011019283746556474, 'false_negative_rate': 0.8484848484848485}
Fals

In [5]:
# Dump the cleaned result and the gold standard one after the other so they can
# be compared by eye.
for heading, frame in (
    ("Shortened Merged DataFrame:", shortened_merged_df),
    ("Shortened Gold Standard DataFrame:", shortened_gold_standard),
):
    print(heading)
    print(frame)

Shortened Merged DataFrame:
             code  nuts  almondnuts  brazil_nuts  macadamia_nuts  hazelnut  \
0   4104420006065     0           0            0               0         0   
1   4104420006065     0           0            0               0         0   
2   4104420007963     2           0            0               0         2   
3   4104420007963     2           0            0               0         2   
4   4104420007987     2           0            0               0         2   
5   4104420007987     2           0            0               0         2   
6   4104420010628     2           0            0               0         0   
7   4104420010628     2           0            0               0         0   
8   4104420014701     0           0            0               0         0   
9   4104420014701     0           0            0               0         0   
10  4104420015289     2           0            0               0         0   
11  4104420015289     2           0 