# 1. Import Required Libraries
Import pandas, numpy, the project's `Models` enum, and the notebook helper functions, then load environment variables with `load_dotenv()`.

In [4]:
import sys
import os

# Add the parent directory to the import path so the package is importable
# from this notebook. The membership check keeps the cell idempotent:
# re-running it does not pile up duplicate entries in `sys.path`.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from llm_data_quality_assistant.enums import Models
import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
import numpy as np
import jupyter_helper_functions
import string
import time

load_dotenv()

True

# 2. Load and Explore Flight Data
Load the corrupted flight dataset and the matching gold-standard dataset that the cleaned result is later evaluated against.

In [5]:
# Input files: the corrupted flight data to be repaired and the gold standard
# it is evaluated against later in the notebook.
corrupt_path = "../datasets/parker_datasets/flight/flight_cleaned_corrupted_first1000_int.csv"
gold_path = "../datasets/parker_datasets/flight/flight_cleaned_gold_first1000_int.csv"

corrupt_dataset = jupyter_helper_functions.load_dataset(corrupt_path)
gold_standard = jupyter_helper_functions.load_dataset(gold_path)

# For a quick sanity check, display `corrupt_dataset.head(2)` and
# `gold_standard.head(2)` in a separate cell.

# 3. Clean and Merge Data with LLM
Use the LLM to merge and clean the corrupted dataset on the primary key `composed_key`, supplying a sample of dataset rows as additional prompt context. The results are evaluated in section 4.

In [6]:
from llm_data_quality_assistant.pipeline import Pipeline
from llm_data_quality_assistant.enums import Models
import jupyter_helper_functions
import string
import time
# Drop the apostrophe from `string.punctuation`. This rebinds the attribute on
# the stdlib `string` module itself, so every later reader of
# `string.punctuation` in this kernel sees the reduced set.
# NOTE(review): presumably the helper functions depend on this when sanitizing
# or standardizing text -- confirm before moving or removing it.
string.punctuation = string.punctuation.replace("'", "")

# Column handed to the merge helper as the primary key.
primary_key = "composed_key"
model = Models.GeminiModels.GEMINI_2_5_FLASH_LITE_PREVIEW_06_17
rows_of_context = 50  # number of dataset rows embedded in the prompt
context_seed = 42  # fixes which rows are sampled, so the prompt is reproducible

# Run label; it becomes part of the output file names written later.
extra = "simple approach"
file_name = jupyter_helper_functions.sanitize_filename(
    f"{model.value}_{rows_of_context}_rows_context_{extra}"
)

# Passed through as `rpm`; presumably a requests-per-minute limit -- TODO confirm.
rpm = 15

# Sample with a fixed seed: an unseeded sample would give the LLM different
# context rows on every run and make the evaluation results non-reproducible.
context_rows = corrupt_dataset.sample(rows_of_context, random_state=context_seed)
additional_prompt = f"""
Here are rows of the dataset to provide context for the cleaning process:
{context_rows.to_string(index=False)}
"""

# Merge/clean with the LLM; the helper returns the merged frame and the time taken.
# NOTE(review): the recorded output shows a run aborting at 653/1000 groups after
# ~45 minutes with a ConnectError. Nothing in this cell retries or keeps partial
# results -- check whether the helper does before launching a long run.
merged_df, time_taken = jupyter_helper_functions.merge_with_llm_timed(
    dataset=corrupt_dataset,
    primary_key=primary_key,
    model=model,
    rpm=rpm,
    additional_prompt=additional_prompt,
)


Merging groups with LLM:  65%|██████▌   | 653/1000 [44:56<23:52,  4.13s/it]  


ConnectError: [Errno -3] Temporary failure in name resolution

# 4. Evaluate the Results
Evaluate the cleaned dataset using micro and macro evaluation metrics.

In [None]:
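# Evaluation is performed by jupyter_helper_functions.standardize_and_evaluate in the next cell.
# NOTE: the results recorded below this cell were presumably produced by an earlier version of it.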

MICRO EVALUATION RESULTS
{'accuracy': 0.7491018450784361,
 'column_names': ['composed_key',
                  'actual_arrival',
                  'actual_departure',
                  'scheduled_arrival',
                  'scheduled_departure'],
 'f1_score': 0.6398553260996384,
 'false_negative': 20042,
 'false_negative_rate': 0.4222657649116154,
 'false_positive': 10826,
 'false_positive_rate': 0.14326359389680682,
 'num_columns': 5,
 'num_rows': 24606,
 'precision': 0.7169451198786833,
 'recall': 0.5777342350883846,
 'true_negative': 64741,
 'true_positive': 27421}
MACRO EVALUATION RESULTS
{'column_names': ['composed_key',
                  'actual_arrival',
                  'actual_departure',
                  'scheduled_arrival',
                  'scheduled_departure'],
 'num_columns': 5,
 'num_rows': 24606,
 'stats': [{'accuracy': 1.0,
            'column_name': 'composed_key',
            'f1_score': 0.0,
            'false_negative': 0,
            'false_negative_rate': 0.0

In [None]:
import json
import time
# Write the LLM-merged frame to disk; the file name carries the run label.
# Requires `merged_df` and `time_taken` from the merge cell, so this cell
# fails if that cell did not finish.
repaired_csv_path = f"../analysis/repairs/flight/merged_dataset_{file_name}.csv"
jupyter_helper_functions.save_dataframe_csv(merged_df, repaired_csv_path)

# Hand the gold standard, the merged result and the corrupted input to the
# evaluation helper, together with the merge duration and the output location.
evaluation_kwargs = {
    "gold_standard": gold_standard,
    "merged_df": merged_df,
    "corrupt_dataset": corrupt_dataset,
    "primary_key": primary_key,
    "time_delta": time_taken,
    "results_dir": "../analysis/results/flight/",
    "file_name": file_name,
}
jupyter_helper_functions.standardize_and_evaluate(**evaluation_kwargs)
