In [1]:
import sys
import os

# Make the parent directory importable. ".." is resolved against the current
# working directory, so this assumes the notebook is run from its own folder.
# It needs to run before the `llm_data_quality_assistant` imports below unless
# the package is already installed in the environment.
sys.path.append(os.path.abspath(".."))

from llm_data_quality_assistant import pipeline
from llm_data_quality_assistant.corruptor import RowCorruptionTypes, CellCorruptionTypes
from llm_data_quality_assistant.enums import Models
import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
import numpy as np
import time

# Load environment variables from a `.env` file, if one is found
# (presumably the API credentials used by the LLM client -- TODO confirm).
load_dotenv()

True

# 2. Load and Explore EudraCT Data
Load the corrupted EudraCT sample, its gold standard, and the accompanying partial-key and rule files, then print a short preview of each to understand their structure and content.

In [2]:
# Columns that contain wrong values according to Bronselaer & Acosta (2023).
unreliable_columns = ["placebo", "active_comparator"]

corrupt_dataset = pd.read_csv(
    "../datasets/parker_datasets/eudract/eudract_corrupted_first1000.csv"
)

# Report the unreliable columns that are already absent, then drop whichever
# ones are present. `errors="ignore"` keeps one missing column from preventing
# the removal of the other; `drop` returns a new frame instead of mutating.
absent_columns = [
    column for column in unreliable_columns if column not in corrupt_dataset.columns
]
if absent_columns:
    print(f"Columns not found in the dataset (nothing to drop): {absent_columns}")
corrupt_dataset = corrupt_dataset.drop(columns=unreliable_columns, errors="ignore")

gold_standard = pd.read_csv(
    "../datasets/parker_datasets/eudract/eudract_cleaned_gold_first1000.csv"
)

# Partial-key definition and edit rules shipped with the dataset, read as raw text.
with open("../datasets/parker_datasets/eudract/eudract.partialkey", "r") as f:
    partial_keys = f.read()

with open("../datasets/parker_datasets/eudract/eudract.rules", "r") as f:
    rules = f.read()

print(partial_keys)
print(rules)
print(corrupt_dataset.head(2))
print(gold_standard.head(2))

# Show the Python type of the key values in each frame to spot dtype mismatches.
# Direct indexing raises a clear KeyError if the column is missing, whereas
# `.get(...)` would return None and fail later with an AttributeError.
print(type(gold_standard["eudract_number"].iloc[0]))
print(type(corrupt_dataset["eudract_number"].iloc[0]))

Columns 'placebo' and 'active_comparator' not found in the dataset.
eudract_number ->
    single_blind,
    double_blind,
    open,
    controlled,
    placebo,
    active_comparator,
    randomised,
    crossover,
    parallel_group,
    arms

-- Attributes
@open:STRING
@single_blind:STRING
@double_blind:STRING
@randomised:STRING
@controlled:STRING
@placebo:STRING
@active_comparator:STRING
@crossover:STRING
@parallel_group:STRING
@arms:STRING

-- Overview of attributes
open notin {'Yes', 'No'}
single_blind notin {'Yes', 'No'}
double_blind notin {'Yes', 'No'}
randomised notin {'Yes', 'No'}
controlled notin {'Yes', 'No'}
placebo notin {'Yes', 'No'}
active_comparator notin {'Yes', 'No'}
crossover notin {'Yes', 'No'}
parallel_group notin {'Yes', 'No'}
arms notin {'0', '1', '2+'}

-- eudract rules for masking
open == 'Yes' & single_blind == 'Yes'
open == 'Yes' & double_blind == 'Yes'
single_blind == 'Yes' & double_blind == 'Yes'
open == 'No' & single_blind == 'No' & double_blind == 'No'

-

# 3. Clean and Merge Data with LLM
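Use the LLM pipeline to merge the corrupted dataset on the primary key `eudract_number`, giving the model a random sample of rows as additional context, and evaluate the merged result against the gold standard. Note that the rules and partial key loaded above are shown for reference only; they are not passed to `merge_with_llm` in this cell.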

In [3]:
from llm_data_quality_assistant.pipeline import Pipeline
import json
import string

# --- Configuration -----------------------------------------------------------
# Use a primary key for merging
primary_key = "eudract_number"
model = Models.GeminiModels.GEMINI_2_0_FLASH_LITE
rows_of_context = 50  # adjust as needed
rpm = 30  # or 0 if you want no rate limit
random_seed = 42  # fixes which rows are sampled as context, so runs are repeatable

merged_dir = "../analysis/merged_lukas/eudract"
results_dir = "../analysis/results/eudract"

# --- Output file name --------------------------------------------------------
# Replace every punctuation character except the single quote, and spaces, with
# "_". The character set is built locally rather than by reassigning
# `string.punctuation`, which would change the value for every other module
# running in the kernel.
chars_to_replace = string.punctuation.replace("'", "") + " "
file_name = str(model.value) + f"_{rows_of_context}_rows_context"
file_name = file_name.translate(str.maketrans({ch: "_" for ch in chars_to_replace}))

# Create the output folders up front so a missing directory cannot discard the
# result of the long-running merge below.
os.makedirs(merged_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

# --- Prompt context ----------------------------------------------------------
# Never request more rows than the dataset holds (`sample` would raise).
context_rows = corrupt_dataset.sample(
    n=min(rows_of_context, len(corrupt_dataset)),
    random_state=random_seed,
)
additional_prompt = f"""
Here are rows of the dataset to provide context for the cleaning process:
{context_rows.to_string(index=False)}
"""

# --- Merge -------------------------------------------------------------------
# perf_counter is monotonic, so the measured duration is unaffected by
# system clock adjustments during the run.
start_time = time.perf_counter()
merged_df = Pipeline.merge_with_llm(
    dataset=corrupt_dataset,
    primary_key=primary_key,
    model_name=model,
    rpm=rpm,
    additional_prompt=additional_prompt,
    verbose=False,
    status_bar=True,
)
time_delta = time.perf_counter() - start_time

merged_df.to_csv(f"{merged_dir}/merged_dataset_{file_name}.csv", index=False)

# --- Evaluation --------------------------------------------------------------
# Micro and macro evaluation share the same inputs, report format and output
# location; each result is written as soon as it is computed.
evaluators = {
    "micro": Pipeline.evaluate_micro,
    "macro": Pipeline.evaluate_macro,
}
stats_by_level = {}
for level, evaluate in evaluators.items():
    stats = evaluate(
        gold_standard=gold_standard,
        cleaned_dataset=merged_df,
        corrupted_dataset=corrupt_dataset,
    )
    stats["time_taken"] = time_delta
    stats_by_level[level] = stats

    print("====================================")
    print(f"{level.upper()} EVALUATION RESULTS")
    print("====================================")
    pprint(stats)

    with open(f"{results_dir}/{file_name}_results_{level}.json", "w") as f:
        json.dump(stats, f, indent=4)

# Keep the per-level names available for any later cells.
stats_micro = stats_by_level["micro"]
stats_macro = stats_by_level["macro"]

Merging groups with LLM:   0%|          | 0/524 [00:00<?, ?it/s]

Merging groups with LLM: 100%|██████████| 524/524 [17:40<00:00,  2.02s/it]



MICRO EVALUATION RESULTS
{'accuracy': 0.9468737808986771,
 'column_names': ['eudract_number',
                  'arms',
                  'controlled',
                  'crossover',
                  'double_blind',
                  'open',
                  'parallel_group',
                  'randomised',
                  'single_blind'],
 'f1_score': 0.6781263429308123,
 'false_negative': 1384,
 'false_negative_rate': 0.46725185685347737,
 'false_positive': 114,
 'false_positive_rate': 0.004517535169407569,
 'num_columns': 9,
 'num_rows': 3133,
 'precision': 0.9326241134751773,
 'recall': 0.5327481431465226,
 'time_taken': 1060.6060194969177,
 'true_negative': 25121,
 'true_positive': 1578}
MACRO EVALUATION RESULTS
{'column_names': ['eudract_number',
                  'arms',
                  'controlled',
                  'crossover',
                  'double_blind',
                  'open',
                  'parallel_group',
                  'randomised',
                