In [1]:
import sys
import os

# Add the parent directory to the path so the package is importable
sys.path.append(os.path.abspath(".."))

In [2]:
import pandas as pd
from pprint import pprint
from llm_data_quality_assistant.llm_integration import combine_results_1
from llm_data_quality_assistant.pipeline import Pipeline

# Read the dataframes
# LLM cleaned
llm_path = "../analysis/repairs/allergen/merged_dataset_gemini_2_0_flash_lite_50_rows_context.csv"
df_llm = pd.read_csv(llm_path)

# Parker cleaned
parker_path = "../datasets/parker_datasets/allergen/parker-repair/allergen_repair.csv"
df_parker = pd.read_csv(parker_path)

# Original corrupted
orig_path = "../datasets/parker_datasets/allergen/allergen_corrupted_first1000.csv"
df_original = pd.read_csv(orig_path)

# Check if all DataFrames have the same shape
if not (df_llm.shape == df_parker.shape == df_original.shape):
    raise ValueError("Not all DataFrames have the same shape.")

# Ensure all DataFrames have the same order of 'code'
if "code" not in df_parker.columns or "code" not in df_llm.columns or "code" not in df_original.columns:
    raise ValueError("All DataFrames must contain a 'code' column.")

# Ensure all DataFrames have the same column order
column_order = df_parker.columns.tolist()
df_llm = df_llm[column_order]
df_original = df_original[column_order]


df_parker = df_parker.sort_values("code").reset_index(drop=True)
df_llm = df_llm.sort_values("code").reset_index(drop=True)
df_original = df_original.sort_values("code").reset_index(drop=True)

if not (
    (df_parker["code"].tolist() == df_llm["code"].tolist() == df_original["code"].tolist())
):
    raise ValueError("The 'code' column does not have the same order in all DataFrames.")


# Calculate the percentage of differing cells between df_parker and df_llm (excluding 'code' column)
diff_mask = (df_parker[column_order].values != df_llm[column_order].values)
num_diff = diff_mask.sum()
total_cells = diff_mask.size
percent_diff = (num_diff / total_cells) * 100

print(f"Percentage of differing cells between df_parker and df_llm (excluding 'code'): {percent_diff:.2f}%")


# Combine results
combined_df = combine_results_1(df_llm=df_llm, df_parker=df_parker, df_original=df_original)

# Show the result
combined_df.head()


Percentage of differing cells between df_parker and df_llm (excluding 'code'): 0.31%


Unnamed: 0,eggs,gluten,hazelnut,pistachio,code,lupin,crustaceans,milk,sulfite,walnut,...,nuts,fish,molluscs,almondnuts,cashew,peanut,sesame,soy,brazil_nuts,macadamia_nuts
0,0,2,0,0,42239499,0,0,0,0,0,...,2,0,0,0,0,0,0,2,0,0
1,0,2,0,0,42239499,0,0,0,0,0,...,2,0,0,0,0,0,0,2,0,0
2,0,0,0,0,42256199,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,0,0,0,0,42256199,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0,2,0,0,42330660,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:

gold_standard_path = "../datasets/parker_datasets/allergen/allergen_cleaned_gold_first1000.csv"
gold_standard = pd.read_csv(gold_standard_path)
# Ensure gold_standard has the same column order and row order as combined_df
gold_standard = gold_standard[column_order]
gold_standard = gold_standard.sort_values("code").reset_index(drop=True)


# cleaned_dataset = combined_df (already defined)
# corrupted_dataset = df_original (already defined)

micro_eval = Pipeline.evaluate_micro(
    gold_standard=gold_standard,
    cleaned_dataset=combined_df,
    corrupted_dataset=df_original
)


print("Micro evaluation result:")
pprint(micro_eval)

Micro evaluation result:
{'accuracy': 0.9386584289496911,
 'column_names': ['eggs',
                  'gluten',
                  'hazelnut',
                  'pistachio',
                  'code',
                  'lupin',
                  'crustaceans',
                  'milk',
                  'sulfite',
                  'walnut',
                  'mustard',
                  'celery',
                  'nuts',
                  'fish',
                  'molluscs',
                  'almondnuts',
                  'cashew',
                  'peanut',
                  'sesame',
                  'soy',
                  'brazil_nuts',
                  'macadamia_nuts'],
 'f1_score': 0.45275590551181105,
 'false_negative': 243,
 'false_negative_rate': 0.6787709497206704,
 'false_positive': 35,
 'false_positive_rate': 0.008385241974125539,
 'num_columns': 22,
 'num_rows': 206,
 'precision': 0.7666666666666667,
 'recall': 0.32122905027932963,
 'true_negative': 4139,
 'true_po

In [4]:
# Save repaired dataset to repairs/allergen with consistent naming for option 1
repaired_path = "../analysis/repairs/allergen/merged_dataset_gemini_2_0_flash_lite_50_rows_context_option_1.csv"
combined_df.to_csv(repaired_path, index=False)
print(f"Repaired dataset saved to {repaired_path}")

# Save micro and macro evaluation results to results/allergen with consistent naming for option 1
import json
results_micro_path = "../analysis/results/allergen/gemini_2_0_flash_lite_50_rows_context_option_1_results_micro.json"
with open(results_micro_path, "w") as f:
    json.dump(micro_eval, f, indent=2)
print(f"Micro evaluation results saved to {results_micro_path}")

macro_eval = Pipeline.evaluate_macro(
    gold_standard=gold_standard,
    cleaned_dataset=combined_df,
    corrupted_dataset=df_original
)
print("Macro evaluation result:")
pprint(macro_eval)

results_macro_path = "../analysis/results/allergen/gemini_2_0_flash_lite_50_rows_context_option_1_results_macro.json"
with open(results_macro_path, "w") as f:
    json.dump(macro_eval, f, indent=2)
print(f"Macro evaluation results saved to {results_macro_path}")


Repaired dataset saved to ../analysis/repairs/allergen/merged_dataset_gemini_2_0_flash_lite_50_rows_context_option_1.csv
Micro evaluation results saved to ../analysis/results/allergen/gemini_2_0_flash_lite_50_rows_context_option_1_results_micro.json
Macro evaluation result:
{'column_names': ['eggs',
                  'gluten',
                  'hazelnut',
                  'pistachio',
                  'code',
                  'lupin',
                  'crustaceans',
                  'milk',
                  'sulfite',
                  'walnut',
                  'mustard',
                  'celery',
                  'nuts',
                  'fish',
                  'molluscs',
                  'almondnuts',
                  'cashew',
                  'peanut',
                  'sesame',
                  'soy',
                  'brazil_nuts',
                  'macadamia_nuts'],
 'num_columns': 22,
 'num_rows': 206,
 'stats': [{'accuracy': 1.0,
            'column_name