# 1. Import Required Libraries
Import libraries such as pandas, numpy, and transformers for data handling and model training.

In [21]:
import sys
import os

# Add the parent directory to the path so the package is importable
sys.path.append(os.path.abspath(".."))

from llm_data_quality_assistant import pipeline
from llm_data_quality_assistant.corruptor import RowCorruptionTypes, CellCorruptionTypes
from llm_data_quality_assistant.enums import Models
import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
import numpy as np

load_dotenv()

# For language model fine-tuning (optional, placeholder)
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

True

# 2. Load and Explore EudraCT Data
Load the EudraCT dataset and perform exploratory data analysis to understand its structure and content.

In [22]:
corrupt_dataset = pd.read_csv("../datasets/parker_datasets/eudract/eudract.csv")
# Drop the columns that contain wrong values accoording to Bronsealer & Acosta (2023)
corrupt_dataset.drop(columns=["placebo", "active_comparator"], inplace=True)
gold_standard = pd.read_csv("../datasets/parker_datasets/eudract/eudract_gold_standard_pivoted.csv")


with open("../datasets/parker_datasets/eudract/eudract.partialkey", "r") as f:
    partial_keys = f.read()

with open("../datasets/parker_datasets/eudract/eudract.rules", "r") as f:
    rules = f.read()

print(partial_keys)
print(rules)
print(corrupt_dataset.head(2))
print(gold_standard.head(2))
print(type(gold_standard.get("eudract_number").iloc[0]))
print(type(corrupt_dataset.get("eudract_number").iloc[0]))

eudract_number ->
    single_blind,
    double_blind,
    open,
    controlled,
    placebo,
    active_comparator,
    randomised,
    crossover,
    parallel_group,
    arms

-- Attributes
@open:STRING
@single_blind:STRING
@double_blind:STRING
@randomised:STRING
@controlled:STRING
@placebo:STRING
@active_comparator:STRING
@crossover:STRING
@parallel_group:STRING
@arms:STRING

-- Overview of attributes
open notin {'Yes', 'No'}
single_blind notin {'Yes', 'No'}
double_blind notin {'Yes', 'No'}
randomised notin {'Yes', 'No'}
controlled notin {'Yes', 'No'}
placebo notin {'Yes', 'No'}
active_comparator notin {'Yes', 'No'}
crossover notin {'Yes', 'No'}
parallel_group notin {'Yes', 'No'}
arms notin {'0', '1', '2+'}

-- eudract rules for masking
open == 'Yes' & single_blind == 'Yes'
open == 'Yes' & double_blind == 'Yes'
single_blind == 'Yes' & double_blind == 'Yes'
open == 'No' & single_blind == 'No' & double_blind == 'No'

-- eudract rules for control
controlled == 'No' & placebo == 'Yes'
co

# 3. Preprocess the Data
Clean and preprocess the dataset, including alignment and filtering by key column.

In [23]:
shortened_corrupt_df = corrupt_dataset[corrupt_dataset["eudract_number"].isin(gold_standard["eudract_number"])].head(50)
shortened_corrupt_df = shortened_corrupt_df.sort_values(by="eudract_number").reset_index(drop=True)

shortened_gold_standard = gold_standard[gold_standard["eudract_number"].isin(shortened_corrupt_df["eudract_number"])].head(50)
shortened_gold_standard = shortened_gold_standard.sort_values(by="eudract_number").reset_index(drop=True)

print("Shape shortened corrupt dataset:")
print(shortened_corrupt_df.shape)
print(shortened_corrupt_df.head())

print("Shape shortened gold standard dataset:")
print(shortened_gold_standard.shape)
print(shortened_gold_standard.head())

Shape shortened corrupt dataset:
(50, 9)
   eudract_number randomised crossover parallel_group open single_blind  \
0  2004-000232-91        Yes        No            Yes  Yes           No   
1  2004-000232-91        Yes        No            NaN  Yes           No   
2  2004-000232-91        Yes       NaN            NaN  NaN          NaN   
3  2004-000299-15        Yes        No             No  NaN           No   
4  2004-000299-15        Yes        No            Yes   No           No   

  double_blind controlled arms  
0           No        Yes  NaN  
1           No        NaN  NaN  
2          NaN        Yes  NaN  
3          Yes        Yes  NaN  
4          Yes         No  NaN  
Shape shortened gold standard dataset:
(50, 9)
  arms controlled crossover double_blind  eudract_number open parallel_group  \
0   2+        Yes        No           No  2004-000232-91  Yes            Yes   
1   2+        Yes        No           No  2004-000232-91  Yes            Yes   
2   2+        Yes      

In [24]:
p = pipeline.Pipeline(shortened_gold_standard)

assert shortened_gold_standard.shape == shortened_corrupt_df.shape

diff = shortened_gold_standard.values != shortened_corrupt_df.values
corrupted_coords = np.argwhere(diff)

print("Corrupted coordinates (row, col):")
print(corrupted_coords)

# Example: Clean the dataset using the pipeline
merged_df = p.clean_single_dataset(additional_prompt=(rules))

shortened_merged_df = merged_df[merged_df["eudract_number"].isin(gold_standard["eudract_number"])]
shortened_merged_df = shortened_merged_df.sort_values(by="eudract_number").reset_index(drop=True)

print("Shape merged dataset:")
print(merged_df.shape)
print("Shape shortened merged dataset:")
print(shortened_merged_df.shape)

# Placeholder for language model fine-tuning
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
# ...

Corrupted coordinates (row, col):
[[ 0  0]
 [ 0  3]
 [ 0  4]
 [ 0  5]
 [ 0  6]
 [ 0  8]
 [ 1  0]
 [ 1  3]
 [ 1  4]
 [ 1  5]
 [ 1  6]
 [ 1  7]
 [ 1  8]
 [ 2  0]
 [ 2  2]
 [ 2  3]
 [ 2  4]
 [ 2  5]
 [ 2  6]
 [ 2  8]
 [ 3  0]
 [ 3  3]
 [ 3  4]
 [ 3  8]
 [ 4  0]
 [ 4  4]
 [ 4  7]
 [ 4  8]
 [ 5  0]
 [ 5  4]
 [ 5  5]
 [ 5  6]
 [ 5  8]
 [ 6  0]
 [ 6  3]
 [ 6  4]
 [ 6  5]
 [ 6  7]
 [ 6  8]
 [ 7  0]
 [ 7  3]
 [ 7  4]
 [ 7  5]
 [ 7  8]
 [ 8  0]
 [ 8  4]
 [ 8  5]
 [ 8  8]
 [ 9  0]
 [ 9  4]
 [ 9  5]
 [ 9  8]
 [10  0]
 [10  4]
 [10  5]
 [10  8]
 [11  0]
 [11  4]
 [11  5]
 [11  8]
 [12  0]
 [12  4]
 [12  5]
 [12  8]
 [13  0]
 [13  2]
 [13  3]
 [13  4]
 [13  5]
 [13  6]
 [13  8]
 [14  0]
 [14  2]
 [14  4]
 [14  6]
 [14  8]
 [15  0]
 [15  2]
 [15  4]
 [15  6]
 [15  8]
 [16  0]
 [16  2]
 [16  4]
 [16  6]
 [16  8]
 [17  0]
 [17  2]
 [17  4]
 [17  6]
 [17  8]
 [18  0]
 [18  2]
 [18  4]
 [18  6]
 [18  8]
 [19  0]
 [19  2]
 [19  4]
 [19  6]
 [19  8]
 [20  0]
 [20  3]
 [20  4]
 [20  5]
 [20  6]
 [20  8]
 [2

# 5. Evaluate the Model
Evaluate the performance of the cleaned dataset and (optionally) the trained model using appropriate metrics.

In [27]:
results = p.evaluate_micro(shortened_merged_df, [corrupted_coords])
pprint(results)

same_values_only = (shortened_merged_df.values == shortened_corrupt_df.values).all()
print("All values same as corrupt dataset:", same_values_only)

print("Shortened Merged DataFrame:")
print(shortened_merged_df)
print("Shortened Gold Standard DataFrame:")
print(shortened_gold_standard)

{'accuracy': 1.0,
 'column_names': ['arms',
                  'controlled',
                  'crossover',
                  'double_blind',
                  'eudract_number',
                  'open',
                  'parallel_group',
                  'randomised',
                  'single_blind'],
 'f1_score': 1.0,
 'false_negative': 0,
 'false_negative_rate': 0.0,
 'false_positive': 0,
 'false_positive_rate': 0.0,
 'num_columns': 9,
 'num_rows': 50,
 'precision': 1.0,
 'recall': 1.0,
 'true_negative': 210,
 'true_positive': 240}
All values same as corrupt dataset: False
Shortened Merged DataFrame:
   arms controlled crossover double_blind  eudract_number open parallel_group  \
0    2+        Yes        No           No  2004-000232-91  Yes            Yes   
1    2+        Yes        No           No  2004-000232-91  Yes            Yes   
2    2+        Yes        No           No  2004-000232-91  Yes            Yes   
3    2+        Yes        No          Yes  2004-000299-15   No 

# 6. Save the Model
Save the trained model and tokenizer for future use. (Optional placeholder for actual saving code.)

In [26]:
# Placeholder for saving the model and tokenizer
# model.save_pretrained('./eudract_model')
# tokenizer.save_pretrained('./eudract_tokenizer')
print("Model and tokenizer saving placeholder.")

Model and tokenizer saving placeholder.
