In [1]:
import sys
import os

# Add the parent directory to the path so the package is importable
sys.path.append(os.path.abspath(".."))
import pandas as pd
from llm_data_quality_assistant.pipeline import Pipeline

from pprint import pprint
import json

In [2]:
# Load the three versions of the EudraCT sample: the corrupted input, the gold
# standard, and the repair stored under "parker-repair".
corrupt_dataset = pd.read_csv("../datasets/parker_datasets/eudract/eudract_corrupted_first1000.csv")
gold_standard = pd.read_csv("../datasets/parker_datasets/eudract/eudract_cleaned_gold_first1000.csv")

# The repair file carries two extra columns that are removed before evaluation.
# The drop is chained onto the read (no inplace=True) so the frame is built in a
# single expression. Note that drop() raises KeyError if either column is
# missing from the CSV; it does not silently skip absent columns.
repaired_dataset = pd.read_csv(
    "../datasets/parker_datasets/eudract/parker-repair/eudract_repair.csv"
).drop(columns=["active_comparator", "placebo"])
print(corrupt_dataset.shape, gold_standard.shape, repaired_dataset.shape)

# NOTE(review): standardize_datasets presumably aligns the frames on the primary
# key and returns them in the order they were passed — confirm against Pipeline.
output = Pipeline.standardize_datasets(
    datasets=[corrupt_dataset, gold_standard, repaired_dataset],
    primary_key="eudract_number",
)
corrupt_dataset, gold_standard, repaired_dataset = output[0], output[1], output[2]

# Sanity check before evaluating: all three frames must list eudract_number in
# exactly the same row order.
assert (
    corrupt_dataset["eudract_number"].tolist()
    == gold_standard["eudract_number"].tolist()
    == repaired_dataset["eudract_number"].tolist()
), "Datasets are not in the same order!"
print("All datasets have the exact same order of eudract_number.")


(3133, 9) (3133, 9) (3133, 9)
All datasets have the exact same order of eudract_number.


In [3]:
# Micro evaluation of the repair: a single set of metrics for the whole table
# (the printed counts cover num_rows * num_columns cells).
micro_stats = Pipeline.evaluate_micro(
    cleaned_dataset=repaired_dataset,
    corrupted_dataset=corrupt_dataset,
    gold_standard=gold_standard,
)
pprint(micro_stats)

# Persist the metrics as indented JSON.
with open("../analysis/results/eudract/parker_results_micro.json", "w") as f:
    json.dump(obj=micro_stats, fp=f, indent=4)

{'accuracy': 0.9437528815122176,
 'column_names': ['eudract_number',
                  'arms',
                  'controlled',
                  'crossover',
                  'double_blind',
                  'open',
                  'parallel_group',
                  'randomised',
                  'single_blind'],
 'f1_score': 0.6553672316384181,
 'false_negative': 1454,
 'false_negative_rate': 0.49088453747467925,
 'false_positive': 132,
 'false_positive_rate': 0.005230830196156133,
 'num_columns': 9,
 'num_rows': 3133,
 'precision': 0.9195121951219513,
 'recall': 0.5091154625253207,
 'true_negative': 25103,
 'true_positive': 1508}


In [4]:
# Macro evaluation of the repair: the result holds one metrics entry per column
# under the "stats" key, as shown in the printed output.
macro_stats = Pipeline.evaluate_macro(
    cleaned_dataset=repaired_dataset,
    corrupted_dataset=corrupt_dataset,
    gold_standard=gold_standard,
)
pprint(macro_stats)

# Persist the per-column metrics as indented JSON.
with open("../analysis/results/eudract/parker_results_macro.json", "w") as f:
    json.dump(obj=macro_stats, fp=f, indent=4)

{'column_names': ['eudract_number',
                  'arms',
                  'controlled',
                  'crossover',
                  'double_blind',
                  'open',
                  'parallel_group',
                  'randomised',
                  'single_blind'],
 'num_columns': 9,
 'num_rows': 3133,
 'stats': [{'accuracy': 1.0,
            'column_name': 'eudract_number',
            'f1_score': 0.0,
            'false_negative': 0,
            'false_negative_rate': 0.0,
            'false_positive': 0,
            'false_positive_rate': 0.0,
            'num_entries': 3133,
            'precision': 0.0,
            'recall': 0.0,
            'true_negative': 3133,
            'true_positive': 0},
           {'accuracy': 0.9119055218640281,
            'column_name': 'arms',
            'f1_score': 0.8319123020706456,
            'false_negative': 270,
            'false_negative_rate': 0.2833158447009444,
            'false_positive': 6,
            'false_po