In [1]:
import sys
import os

# Make the parent directory importable: append its absolute path to sys.path
# before the project imports below run. Note that ".." is resolved against the
# current working directory, not against the location of this notebook file.
# Presumably jupyter_helper_functions / llm_data_quality_assistant are found
# via this entry — TODO confirm.
sys.path.append(os.path.abspath(".."))

import jupyter_helper_functions
import pandas as pd
from pprint import pprint

from llm_data_quality_assistant.pipeline import Pipeline

import json

In [2]:
# Load the three tables compared in this notebook (roles taken from the file
# names): the corrupted input, the gold standard, and the repaired output
# that is being evaluated.
corrupt_dataset = jupyter_helper_functions.load_dataset(
    "../datasets/parker_datasets/eudract/eudract_corrupted_first1000.csv"
)
gold_standard = jupyter_helper_functions.load_dataset(
    "../datasets/parker_datasets/eudract/eudract_cleaned_gold_first1000.csv"
)
repaired_dataset = jupyter_helper_functions.load_dataset(
    "../analysis/repairs/eudract/merged_dataset_gemini_2_0_flash_lite_50_rows_context_combined_option3.csv"
)

# Remove the "active_comparator" and "placebo" columns from the repaired data
# when they are present. They are dropped by name (no emptiness check is done
# here). errors="ignore" skips labels that do not exist, and the non-mutating
# drop keeps this cell idempotent on re-run instead of altering the loaded
# frame in place.
repaired_dataset = repaired_dataset.drop(
    columns=["active_comparator", "placebo"], errors="ignore"
)

primary_key = "eudract_number"

# Standardize datasets. The result is a dict keyed by dataset role; the three
# working names are rebound to the standardized versions.
# NOTE(review): Pipeline is reached through jupyter_helper_functions here even
# though it is also imported directly at the top — presumably the same class;
# confirm.
out = jupyter_helper_functions.Pipeline.standardize_datasets(
    gold_standard=gold_standard,
    cleaned_dataset=repaired_dataset,
    corrupted_dataset=corrupt_dataset,
    primary_key=primary_key,
)
gold_standard = out["gold_standard"]
repaired_dataset = out["cleaned_dataset"]
corrupt_dataset = out["corrupted_dataset"]

# Check that all three datasets list their primary keys in exactly the same
# order before any evaluation is run on them.
corrupt_keys = corrupt_dataset[primary_key].tolist()
gold_keys = gold_standard[primary_key].tolist()
repaired_keys = repaired_dataset[primary_key].tolist()
assert (
    corrupt_keys == gold_keys == repaired_keys
), "Datasets are not in the same order!"
print("All datasets have the exact same order of eudract_number.")

All datasets have the exact same order of eudract_number.


In [3]:
# Micro evaluation: score the repaired dataset against the gold standard,
# with the corrupted dataset passed alongside. The printed result holds one
# set of confusion counts and derived metrics for the whole table.
micro_results_path = "../analysis/results/eudract/gemini_2_0_flash_lite_50_rows_context_option_3_results_micro.json"

micro_stats = jupyter_helper_functions.Pipeline.evaluate_micro(
    cleaned_dataset=repaired_dataset,
    corrupted_dataset=corrupt_dataset,
    gold_standard=gold_standard,
)

pprint(micro_stats)

# Persist the same statistics that were printed above.
jupyter_helper_functions.save_json(micro_stats, micro_results_path)

{'accuracy': 0.9435755576834415,
 'column_names': ['eudract_number',
                  'arms',
                  'controlled',
                  'crossover',
                  'double_blind',
                  'open',
                  'parallel_group',
                  'randomised',
                  'single_blind'],
 'f1_score': 0.6545059717698154,
 'false_negative': 1455,
 'false_negative_rate': 0.4912221471978393,
 'false_positive': 136,
 'false_positive_rate': 0.005389340202100258,
 'num_columns': 9,
 'num_rows': 3133,
 'precision': 0.9172245891661595,
 'recall': 0.5087778528021607,
 'true_negative': 25099,
 'true_positive': 1507}


In [4]:
# Macro evaluation: same three inputs as the micro evaluation, but the
# printed result carries a 'stats' list with one metrics entry per column.
macro_results_path = "../analysis/results/eudract/gemini_2_0_flash_lite_50_rows_context_option_3_results_macro.json"

macro_stats = jupyter_helper_functions.Pipeline.evaluate_macro(
    cleaned_dataset=repaired_dataset,
    corrupted_dataset=corrupt_dataset,
    gold_standard=gold_standard,
)

pprint(macro_stats)

# Persist the same statistics that were printed above.
jupyter_helper_functions.save_json(macro_stats, macro_results_path)

{'column_names': ['eudract_number',
                  'arms',
                  'controlled',
                  'crossover',
                  'double_blind',
                  'open',
                  'parallel_group',
                  'randomised',
                  'single_blind'],
 'num_columns': 9,
 'num_rows': 3133,
 'stats': [{'accuracy': 1.0,
            'column_name': 'eudract_number',
            'f1_score': 0.0,
            'false_negative': 0,
            'false_negative_rate': 0.0,
            'false_positive': 0,
            'false_positive_rate': 0.0,
            'num_entries': 3133,
            'precision': 0.0,
            'recall': 0.0,
            'true_negative': 3133,
            'true_positive': 0},
           {'accuracy': 0.9119055218640281,
            'column_name': 'arms',
            'f1_score': 0.8319123020706456,
            'false_negative': 270,
            'false_negative_rate': 0.2833158447009444,
            'false_positive': 6,
            'false_po