In [1]:
import sys
import os

# Add the parent directory to the path so the package is importable.
# Guarded so that re-running this cell does not pile up duplicate entries.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import pandas as pd
from pprint import pprint
from llm_data_quality_assistant.llm_integration import combine_results_1
from llm_data_quality_assistant.pipeline import Pipeline

# Load the three versions of the EudraCT dataset, align them column- and
# row-wise, report how much the two repairs disagree, and combine them.

# LLM cleaned
eudract_llm_path = "../analysis/repairs/eudract/merged_dataset_gemini_2_0_flash_lite_50_rows_context.csv"
df_llm = pd.read_csv(eudract_llm_path)

# Original corrupted
eudract_orig_path = "../datasets/parker_datasets/eudract/eudract_corrupted_first1000.csv"
df_original = pd.read_csv(eudract_orig_path)

# Parker cleaned, restricted to (and ordered like) the columns of the corrupted data
eudract_parker_path = "../datasets/parker_datasets/eudract/parker-repair/eudract_repair.csv"
df_parker = pd.read_csv(eudract_parker_path)
df_parker = df_parker[df_original.columns]


print(df_llm.shape, df_parker.shape, df_original.shape)

# Check if all DataFrames have the same shape
if not (df_llm.shape == df_parker.shape == df_original.shape):
    raise ValueError("Not all DataFrames have the same shape.")

# The key column must exist everywhere, otherwise rows cannot be lined up
if "eudract_number" not in df_parker.columns or "eudract_number" not in df_llm.columns or "eudract_number" not in df_original.columns:
    raise ValueError("All DataFrames must contain a 'eudract_number' column.")

# Ensure all DataFrames have the same column order
column_order = df_parker.columns.tolist()
df_llm = df_llm[column_order]
df_original = df_original[column_order]

# Bring all frames into the same row order by sorting on the key.
# NOTE(review): "eudract_number" can repeat, so the order of rows *within* one
# key is only consistent across frames if the input files already list those
# rows in the same order -- confirm for new input files.
df_parker = df_parker.sort_values("eudract_number").reset_index(drop=True)
df_llm = df_llm.sort_values("eudract_number").reset_index(drop=True)
df_original = df_original.sort_values("eudract_number").reset_index(drop=True)

if not (
    (df_parker["eudract_number"].tolist() == df_llm["eudract_number"].tolist() == df_original["eudract_number"].tolist())
):
    raise ValueError("The 'eudract_number' column does not have the same order in all DataFrames.")

# Calculate the percentage of differing cells between df_parker and df_llm.
# The key column is really excluded here (it is identical by construction and
# would only inflate the denominator), matching what the message below says.
compare_columns = [col for col in column_order if col != "eudract_number"]
parker_cells = df_parker[compare_columns]
llm_cells = df_llm[compare_columns]

# NaN != NaN is True, so cells that are missing in both frames must be masked
# out explicitly or they would be counted as differences.
both_missing = parker_cells.isna().values & llm_cells.isna().values
diff_mask = (parker_cells.values != llm_cells.values) & ~both_missing
num_diff = diff_mask.sum()
total_cells = diff_mask.size
# Guard against an empty comparison (no rows or no non-key columns)
percent_diff = (num_diff / total_cells) * 100 if total_cells else 0.0

print(f"Percentage of differing cells between df_parker and df_llm (excluding 'eudract_number'): {percent_diff:.2f}%")

# Combine results
combined_df = combine_results_1(df_llm=df_llm, df_parker=df_parker, df_original=df_original)

# Show the result
combined_df.head()

(3133, 9) (3133, 9) (3133, 9)
Percentage of differing cells between df_parker and df_llm (excluding 'eudract_number'): 0.02%


Unnamed: 0,eudract_number,arms,controlled,crossover,double_blind,open,parallel_group,randomised,single_blind
0,2004-000232-91,2+,Yes,No,No,Yes,Yes,Yes,No
1,2004-000232-91,2+,Yes,No,No,Yes,Yes,Yes,No
2,2004-000232-91,2+,Yes,No,No,Yes,Yes,Yes,No
3,2004-000299-15,2+,No,No,Yes,No,Yes,Yes,No
4,2004-000299-15,2+,No,No,Yes,No,Yes,Yes,No


In [None]:
# Evaluate the combined repair against the gold standard (micro and macro)
# and persist both the repaired dataset and the evaluation results.
import json
import os

gold_standard_path = "../datasets/parker_datasets/eudract/eudract_cleaned_gold_first1000.csv"
gold_standard = pd.read_csv(gold_standard_path)
# Ensure gold_standard has the same column order and row order as combined_df
gold_standard = gold_standard[column_order]
gold_standard = gold_standard.sort_values("eudract_number").reset_index(drop=True)

# The evaluation is only meaningful if the gold standard lines up with the
# other frames, so apply the same shape and key-order checks that were applied
# to the LLM, Parker and corrupted frames.
if not (gold_standard.shape == combined_df.shape == df_original.shape):
    raise ValueError("gold_standard, combined_df and df_original do not have the same shape.")
if gold_standard["eudract_number"].tolist() != df_original["eudract_number"].tolist():
    raise ValueError("The 'eudract_number' column of gold_standard does not match df_original.")

# cleaned_dataset = combined_df (already defined)
# corrupted_dataset = df_original (already defined)

micro_eval = Pipeline.evaluate_micro(
    gold_standard=gold_standard,
    cleaned_dataset=combined_df,
    corrupted_dataset=df_original
)

print("Micro evaluation result:")
pprint(micro_eval)

# Save repaired dataset to repairs/eudract with _o1_combined suffix
repaired_path = "../analysis/repairs/eudract/eudract_o1_combined.csv"
# Create the target directory if needed so a fresh checkout can run this cell
os.makedirs(os.path.dirname(repaired_path), exist_ok=True)
combined_df.to_csv(repaired_path, index=False)
print(f"Repaired dataset saved to {repaired_path}")

# Save micro and macro evaluation results to results/eudract with _o1_combined suffix
results_micro_path = "../analysis/results/eudract/eudract_o1_combined_micro.json"
os.makedirs(os.path.dirname(results_micro_path), exist_ok=True)
with open(results_micro_path, "w") as f:
    json.dump(micro_eval, f, indent=2)
print(f"Micro evaluation results saved to {results_micro_path}")

macro_eval = Pipeline.evaluate_macro(
    gold_standard=gold_standard,
    cleaned_dataset=combined_df,
    corrupted_dataset=df_original
)
print("Macro evaluation result:")
pprint(macro_eval)

results_macro_path = "../analysis/results/eudract/eudract_o1_combined_macro.json"
os.makedirs(os.path.dirname(results_macro_path), exist_ok=True)
with open(results_macro_path, "w") as f:
    json.dump(macro_eval, f, indent=2)
print(f"Macro evaluation results saved to {results_macro_path}")

Micro evaluation result:
{'accuracy': 0.9436110224491967,
 'column_names': ['eudract_number',
                  'arms',
                  'controlled',
                  'crossover',
                  'double_blind',
                  'open',
                  'parallel_group',
                  'randomised',
                  'single_blind'],
 'f1_score': 0.6547980894485453,
 'false_negative': 1454,
 'false_negative_rate': 0.49088453747467925,
 'false_positive': 136,
 'false_positive_rate': 0.005389340202100258,
 'num_columns': 9,
 'num_rows': 3133,
 'precision': 0.9172749391727494,
 'recall': 0.5091154625253207,
 'true_negative': 25099,
 'true_positive': 1508}
