In [9]:
# --- Standard library ---
import os
import sys
from pprint import pprint

# --- Third-party ---
import numpy as np
import pandas as pd
from dotenv import load_dotenv

# The package is not imported from site-packages here: put the parent
# directory on the module search path so the local checkout is importable.
sys.path.append(os.path.abspath(".."))

# --- Local package (must come after the sys.path tweak above) ---
from llm_data_quality_assistant import pipeline
from llm_data_quality_assistant.corruptor import (
    RowCorruptionTypes,
    CellCorruptionTypes,
)
from llm_data_quality_assistant.enums import Models

# Load variables from a .env file into the process environment
# (presumably the credentials for the LLM backend -- confirm).
load_dotenv()


True

In [10]:
# Load the corrupted flight table and its gold standard, then align the gold
# standard's columns (selection and order) with the corrupted table.
corrupt_dataset = pd.read_csv("../datasets/parker_datasets/flight/flight.csv")
gold_standard = pd.read_csv(
    "../datasets/parker_datasets/flight/flight_gold_standard_pivoted.csv"
).loc[:, corrupt_dataset.columns]

# Side files shipped with the dataset, read as raw text.
# `rules` is later passed to the LLM as additional prompt text;
# `partial_keys` is loaded here but not used in the cells visible below.
with open("../datasets/parker_datasets/flight/flight.partialkey") as f:
    partial_keys = f.read()
with open("../datasets/parker_datasets/flight/flight.rules") as f:
    rules = f.read()

# Quick look at both tables, followed by the Python type of the first key in
# each, to confirm the key columns are comparable.
print(corrupt_dataset.head(10), gold_standard.head(10), sep="\n")
print(
    type(gold_standard.get("composed_key").iloc[0]),
    type(corrupt_dataset.get("composed_key").iloc[0]),
    sep="\n",
)

                   composed_key  scheduled_departure  actual_departure  \
0  2011-12-01 - CO-1099-EWR-ORD               2160.0            2153.0   
1  2011-12-01 - CO-1099-EWR-ORD               2160.0               NaN   
2  2011-12-01 - CO-1099-EWR-ORD               2160.0               NaN   
3  2011-12-01 - CO-1099-EWR-ORD               2160.0               NaN   
4  2011-12-01 - CO-1099-EWR-ORD               2160.0               NaN   
5  2011-12-01 - CO-1099-EWR-ORD               2160.0               NaN   
6  2011-12-01 - CO-1099-EWR-ORD                  NaN            2153.0   
7  2011-12-01 - CO-1099-EWR-ORD                  NaN            2153.0   
8  2011-12-01 - CO-1099-EWR-ORD                  NaN            2153.0   
9  2011-12-01 - CO-1099-EWR-ORD                  NaN            2167.0   

   scheduled_arrival  actual_arrival  
0             2316.0          2297.0  
1             2316.0             NaN  
2             2316.0             NaN  
3             2316.0         

In [11]:
# Number of distinct flights (unique composed_key values) kept for this run.
# Kept small so the LLM step in the next cell stays cheap.
num_keys = 20

# Restrict both tables to the keys they have in common and bring them into the
# same key order. Each composed_key occurs on many rows and the frames are
# compared row by row later on, so the sort must be stable: the default
# quicksort may reorder rows that share a key, which would make the positional
# comparison non-reproducible.
shortened_corrupt_df = (
    corrupt_dataset[corrupt_dataset["composed_key"].isin(gold_standard["composed_key"])]
    .sort_values(by="composed_key", kind="mergesort")
    .reset_index(drop=True)
)
shortened_gold_standard = (
    gold_standard[gold_standard["composed_key"].isin(shortened_corrupt_df["composed_key"])]
    .sort_values(by="composed_key", kind="mergesort")
    .reset_index(drop=True)
)

# First `num_keys` unique keys, in sorted order.
selected_keys = shortened_corrupt_df["composed_key"].unique()[:num_keys]
# Legacy name (it never held just one key); kept in case a later cell uses it.
first_1_keys = selected_keys

# Filter both DataFrames down to only those keys.
shortened_corrupt_df = shortened_corrupt_df[
    shortened_corrupt_df["composed_key"].isin(selected_keys)
].reset_index(drop=True)
shortened_gold_standard = shortened_gold_standard[
    shortened_gold_standard["composed_key"].isin(selected_keys)
].reset_index(drop=True)

print(shortened_corrupt_df)
print(shortened_gold_standard)

                     composed_key  scheduled_departure  actual_departure  \
0    2011-12-01 - AA-1007-MIA-PHX                  NaN            2769.0   
1    2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
2    2011-12-01 - AA-1007-MIA-PHX                  NaN            2769.0   
3    2011-12-01 - AA-1007-MIA-PHX                  NaN            2769.0   
4    2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
..                            ...                  ...               ...   
516  2011-12-01 - AA-2957-DFW-CVG               2275.0            2285.0   
517  2011-12-01 - AA-2957-DFW-CVG               2275.0               NaN   
518  2011-12-01 - AA-2957-DFW-CVG               2275.0            2285.0   
519  2011-12-01 - AA-2957-DFW-CVG               2275.0            2284.0   
520  2011-12-01 - AA-2957-DFW-CVG               2275.0               NaN   

     scheduled_arrival  actual_arrival  
0                  NaN          3043.0  
1    

In [12]:

# The evaluation below compares the frames cell by cell, so the corrupted
# subset and the gold standard must have identical shapes.
assert shortened_gold_standard.shape == shortened_corrupt_df.shape

# Let the LLM merge the rows that share a primary key. The dataset's rule file
# is passed along as additional prompt text.
rpm = 30  # handed to merge_with_llm; presumably a requests-per-minute limit -- confirm
merged_df = pipeline.Pipeline.merge_with_llm(
    dataset=shortened_corrupt_df,
    primary_key="composed_key",
    rpm=rpm,
    additional_prompt=rules,
    model_name=Models.GeminiModels.GEMINI_2_0_FLASH_LITE
)

# Keep only rows whose key is part of the evaluated gold-standard subset and
# bring them into key order. A stable sort is used so rows sharing a key keep
# their relative order (the default quicksort gives no such guarantee).
shortened_merged_df = merged_df[
    merged_df["composed_key"].isin(shortened_gold_standard["composed_key"])
]
shortened_merged_df = (
    shortened_merged_df
    .sort_values(by="composed_key", kind="mergesort")
    .reset_index(drop=True)
)

print("Shape merged dataset:")
print(merged_df.shape)

print("Shape shortened merged dataset:")
print(shortened_merged_df.shape)

# Fail early with a clear message if the LLM output dropped or added rows;
# a positional comparison is meaningless on frames of different shape.
assert shortened_merged_df.shape == shortened_corrupt_df.shape, (
    "merged dataset shape differs from the corrupted dataset shape"
)

results = pipeline.Pipeline.evaluate_micro(
    corrupted_dataset=shortened_corrupt_df,
    gold_standard=shortened_gold_standard,
    cleaned_dataset=shortened_merged_df,
)
pprint(results)

# Sanity check: did the merge change anything at all?
# NaN != NaN under plain `==`, so cells that are missing in both frames are
# explicitly counted as equal; otherwise untouched missing values would make
# this report False even when nothing changed.
merged_values = shortened_merged_df.to_numpy()
corrupt_values = shortened_corrupt_df.to_numpy()
both_missing = pd.isna(merged_values) & pd.isna(corrupt_values)
same_values_only = bool(((merged_values == corrupt_values) | both_missing).all())
print(same_values_only)


Shape merged dataset:
(521, 5)
Shape shortened merged dataset:
(521, 5)
{'accuracy': 0.7834932821497121,
 'column_names': ['composed_key',
                  'scheduled_departure',
                  'actual_departure',
                  'scheduled_arrival',
                  'actual_arrival'],
 'f1_score': 0.6670602125147579,
 'false_negative': 313,
 'false_negative_rate': 0.3564920273348519,
 'false_positive': 251,
 'false_positive_rate': 0.1453387376954256,
 'num_columns': 5,
 'num_rows': 521,
 'precision': 0.6924019607843137,
 'recall': 0.643507972665148,
 'true_negative': 1476,
 'true_positive': 565}
False
{'accuracy': 0.7834932821497121,
 'column_names': ['composed_key',
                  'scheduled_departure',
                  'actual_departure',
                  'scheduled_arrival',
                  'actual_arrival'],
 'f1_score': 0.6670602125147579,
 'false_negative': 313,
 'false_negative_rate': 0.3564920273348519,
 'false_positive': 251,
 'false_positive_rate': 0.1453387376

In [13]:
# Print the LLM-merged rows and the gold standard one after the other for a
# visual comparison; each heading goes on its own line above its frame.
print("Shortened Merged DataFrame:", shortened_merged_df, sep="\n")
print("Shortened Gold Standard DataFrame:", shortened_gold_standard, sep="\n")

Shortened Merged DataFrame:
                     composed_key  scheduled_departure  actual_departure  \
0    2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
1    2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
2    2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
3    2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
4    2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
..                            ...                  ...               ...   
516  2011-12-01 - AA-2957-DFW-CVG               2275.0            2284.0   
517  2011-12-01 - AA-2957-DFW-CVG               2275.0            2284.0   
518  2011-12-01 - AA-2957-DFW-CVG               2275.0            2284.0   
519  2011-12-01 - AA-2957-DFW-CVG               2275.0            2284.0   
520  2011-12-01 - AA-2957-DFW-CVG               2275.0            2284.0   

     scheduled_arrival  actual_arrival  
0               30