In [1]:
import sys
import os

# Add the parent directory to sys.path so the llm_data_quality_assistant
# package is importable. os.path.abspath("..") is resolved against the
# kernel's current working directory, so this assumes the notebook is run
# from its own folder. Keep this line above the package import below, which
# relies on it when the package is not installed in the environment.
sys.path.append(os.path.abspath(".."))
import pandas as pd
from llm_data_quality_assistant.pipeline import Pipeline

from pprint import pprint
import json

In [2]:
# Load the three versions of the allergen data: the corrupted input, the
# gold-standard ground truth, and the repair output to be evaluated.
corrupt_dataset = pd.read_csv("../datasets/parker_datasets/allergen/allergen_corrupted_first1000.csv")

gold_standard = pd.read_csv("../datasets/parker_datasets/allergen/allergen_cleaned_gold_first1000.csv")

repaired_dataset = pd.read_csv("../datasets/parker_datasets/allergen/parker-repair/allergen_repair.csv")

# reindex() silently creates all-NaN columns for labels it cannot find, which
# would quietly distort the evaluation. Fail loudly instead if the repair
# output lacks any column of the corrupted dataset.
missing_columns = corrupt_dataset.columns.difference(repaired_dataset.columns)
assert missing_columns.empty, (
    f"repaired_dataset is missing columns: {list(missing_columns)}"
)

# Reindex columns to match corrupt_dataset
repaired_dataset = repaired_dataset.reindex(columns=corrupt_dataset.columns)

# Sort every frame by "code" so rows can be compared positionally. A stable
# sort keeps rows that share a code in their original file order; the default
# quicksort gives no such guarantee, so ties could end up ordered differently
# in each frame.
# NOTE(review): this only helps if "code" can repeat and the three files list
# tied rows in the same relative order - confirm against the data.
corrupt_dataset = corrupt_dataset.sort_values(by="code", kind="mergesort").reset_index(drop=True)
gold_standard = gold_standard.sort_values(by="code", kind="mergesort").reset_index(drop=True)
repaired_dataset = repaired_dataset.sort_values(by="code", kind="mergesort").reset_index(drop=True)

# Check if all three dataframes have the exact same row order.
# Lengths are compared first because `==` between Series of different length
# raises a ValueError instead of evaluating to False.
same_order = all(
    len(frame) == len(corrupt_dataset)
    and bool((corrupt_dataset["code"] == frame["code"]).all())
    for frame in (gold_standard, repaired_dataset)
)
print("All dataframes have the exact same row order:", same_order)

# The evaluation cells compare the frames row by row, so stop here rather than
# produce metrics from misaligned data.
assert same_order, "Datasets are not row-aligned on 'code'; evaluation would be meaningless."

All dataframes have the exact same row order: True


In [3]:
# Evaluate the repaired dataset against the gold standard with
# Pipeline.evaluate_micro, show the resulting stats, and persist them as JSON.
micro_eval_inputs = {
    "gold_standard": gold_standard,
    "corrupted_dataset": corrupt_dataset,
    "cleaned_dataset": repaired_dataset,
}
micro_stats = Pipeline.evaluate_micro(**micro_eval_inputs)

pprint(micro_stats)

# Path is relative to the kernel's working directory; "w" overwrites any
# previous results file.
micro_results_path = "../analysis/results/allergen/parker_results_micro.json"
with open(micro_results_path, "w") as micro_results_file:
    json.dump(micro_stats, micro_results_file, indent=4)

{'accuracy': 0.9377758164165931,
 'column_names': ['code',
                  'nuts',
                  'almondnuts',
                  'brazil_nuts',
                  'macadamia_nuts',
                  'hazelnut',
                  'pistachio',
                  'walnut',
                  'cashew',
                  'celery',
                  'crustaceans',
                  'eggs',
                  'fish',
                  'gluten',
                  'lupin',
                  'milk',
                  'molluscs',
                  'mustard',
                  'peanut',
                  'sesame',
                  'soy',
                  'sulfite'],
 'f1_score': 0.4382470119521912,
 'false_negative': 248,
 'false_negative_rate': 0.6927374301675978,
 'false_positive': 34,
 'false_positive_rate': 0.008145663632007666,
 'num_columns': 22,
 'num_rows': 206,
 'precision': 0.7638888888888888,
 'recall': 0.30726256983240224,
 'true_negative': 4140,
 'true_positive': 110}


In [4]:
# Evaluate the repaired dataset against the gold standard with
# Pipeline.evaluate_macro, show the resulting stats, and persist them as JSON.
macro_eval_inputs = {
    "gold_standard": gold_standard,
    "corrupted_dataset": corrupt_dataset,
    "cleaned_dataset": repaired_dataset,
}
macro_stats = Pipeline.evaluate_macro(**macro_eval_inputs)

pprint(macro_stats)

# Path is relative to the kernel's working directory; "w" overwrites any
# previous results file.
macro_results_path = "../analysis/results/allergen/parker_results_macro.json"
with open(macro_results_path, "w") as macro_results_file:
    json.dump(macro_stats, macro_results_file, indent=4)

{'column_names': ['code',
                  'nuts',
                  'almondnuts',
                  'brazil_nuts',
                  'macadamia_nuts',
                  'hazelnut',
                  'pistachio',
                  'walnut',
                  'cashew',
                  'celery',
                  'crustaceans',
                  'eggs',
                  'fish',
                  'gluten',
                  'lupin',
                  'milk',
                  'molluscs',
                  'mustard',
                  'peanut',
                  'sesame',
                  'soy',
                  'sulfite'],
 'num_columns': 22,
 'num_rows': 206,
 'stats': [{'accuracy': 1.0,
            'column_name': 'code',
            'f1_score': 0.0,
            'false_negative': 0,
            'false_negative_rate': 0.0,
            'false_positive': 0,
            'false_positive_rate': 0.0,
            'num_entries': 206,
            'precision': 0.0,
            'recall': 0.0,
