In [1]:
import sys
import os

# Add the parent directory to the path so the package is importable.
# os.path.abspath("..") is resolved against the current working directory,
# so this assumes the kernel is started in the notebook's own folder — TODO confirm.
sys.path.append(os.path.abspath(".."))

from llm_data_quality_assistant import pipeline
from llm_data_quality_assistant.corruptor import RowCorruptionTypes, CellCorruptionTypes
from llm_data_quality_assistant.enums import Models
import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
import numpy as np

# Read key/value pairs from a .env file into the process environment
# (presumably credentials needed by the LLM backend — verify against the package).
load_dotenv()


True

In [2]:
# Load the corrupted flight table, then the gold standard restricted to (and
# ordered by) the corrupted table's columns so both frames share one layout.
corrupt_dataset = pd.read_csv("../datasets/parker_datasets/flight/flight.csv")
gold_standard = pd.read_csv(
    "../datasets/parker_datasets/flight/flight_gold_standard_pivoted.csv"
)[corrupt_dataset.columns]

# The partial-key and rules files are kept as raw text.
with open("../datasets/parker_datasets/flight/flight.partialkey", "r") as f:
    partial_keys = f.read()
with open("../datasets/parker_datasets/flight/flight.rules", "r") as f:
    rules = f.read()

# Preview the first two rows of each table.
for frame in (corrupt_dataset, gold_standard):
    print(frame.head(2))

# Check that the key column holds the same Python type in both tables.
for frame in (gold_standard, corrupt_dataset):
    print(type(frame.get("composed_key").iloc[0]))

                   composed_key  scheduled_departure  actual_departure  \
0  2011-12-01 - CO-1099-EWR-ORD               2160.0            2153.0   
1  2011-12-01 - CO-1099-EWR-ORD               2160.0               NaN   

   scheduled_arrival  actual_arrival  
0             2316.0          2297.0  
1             2316.0             NaN  
                  composed_key  scheduled_departure  actual_departure  \
0  2012-01-02 - UA-938-DEN-ORD              48894.0           48896.0   
1  2012-01-02 - UA-938-DEN-ORD              48894.0           48896.0   

   scheduled_arrival  actual_arrival  
0            49037.0         49049.0  
1            49037.0         49049.0  
<class 'str'>
<class 'str'>


In [3]:
# Keep only rows whose composed_key occurs in both tables, ordered by key.
#
# kind="stable" matters here: many rows share the same composed_key and a later
# cell compares the two frames position by position. The default sort
# ("quicksort") is not stable, so rows with equal keys could be shuffled
# differently in each frame and the positional comparison would be misaligned.
# A stable sort keeps equal-key rows in their original file order.
# NOTE(review): this still assumes both files list the rows of a key in the
# same relative order — confirm against how the gold standard was produced.
shortened_corrupt_df = corrupt_dataset[corrupt_dataset["composed_key"].isin(gold_standard["composed_key"])]
shortened_corrupt_df = shortened_corrupt_df.sort_values(by="composed_key", kind="stable").reset_index(drop=True)

shortened_gold_standard = gold_standard[gold_standard["composed_key"].isin(shortened_corrupt_df["composed_key"])]
shortened_gold_standard = shortened_gold_standard.sort_values(by="composed_key", kind="stable").reset_index(drop=True)

# Get the first unique composed_key (in sorted order) to keep the experiment small
first_1_keys = shortened_corrupt_df["composed_key"].unique()[:1]

# Filter both DataFrames to only those keys
shortened_corrupt_df = shortened_corrupt_df[shortened_corrupt_df["composed_key"].isin(first_1_keys)].reset_index(drop=True)
shortened_gold_standard = shortened_gold_standard[shortened_gold_standard["composed_key"].isin(first_1_keys)].reset_index(drop=True)

print(shortened_corrupt_df)
print(shortened_gold_standard)

                    composed_key  scheduled_departure  actual_departure  \
0   2011-12-01 - AA-1007-MIA-PHX                  NaN            2769.0   
1   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
2   2011-12-01 - AA-1007-MIA-PHX                  NaN            2769.0   
3   2011-12-01 - AA-1007-MIA-PHX                  NaN            2769.0   
4   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
5   2011-12-01 - AA-1007-MIA-PHX               2755.0            2756.0   
6   2011-12-01 - AA-1007-MIA-PHX               2755.0            2756.0   
7   2011-12-01 - AA-1007-MIA-PHX                  NaN            2769.0   
8   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
9   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
10  2011-12-01 - AA-1007-MIA-PHX                  NaN            2768.0   
11  2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
12  2011-12-01 - AA-1007-

In [4]:
def _cells_equal(left: pd.DataFrame, right: pd.DataFrame) -> np.ndarray:
    """Return a boolean array marking cells that hold the same value.

    Unlike a plain ``==`` on the underlying arrays, two missing values (NaN)
    at the same position count as equal; NaN never equals itself under ``==``.

    Args:
        left: First frame.
        right: Second frame; must have the same shape as ``left``.

    Returns:
        Boolean array of the frames' shape, True where the cells match.
    """
    left_vals = left.to_numpy(dtype=object)
    right_vals = right.to_numpy(dtype=object)
    both_missing = pd.isna(left_vals) & pd.isna(right_vals)
    return (left_vals == right_vals) | both_missing


p = pipeline.Pipeline(shortened_gold_standard)

# The comparison below is positional, so both frames must line up exactly.
assert shortened_gold_standard.shape == shortened_corrupt_df.shape

# Find coordinates where entries differ (a cell missing in both frames is not a difference)
diff = ~_cells_equal(shortened_gold_standard, shortened_corrupt_df)
corrupted_coords = np.argwhere(diff)

print("Corrupted coordinates (row, col):")
print(corrupted_coords)

# Clean the whole subset in a single call, passing the rules text along.
merged_df = p.clean_single_dataset(shortened_corrupt_df, rules)

# Stable sort: all rows may share one composed_key, and an unstable sort could
# permute them, breaking the row alignment that corrupted_coords relies on.
shortened_merged_df = merged_df[merged_df["composed_key"].isin(gold_standard["composed_key"])]
shortened_merged_df = shortened_merged_df.sort_values(by="composed_key", kind="stable").reset_index(drop=True)

print("Shape merged dataset:")
print(merged_df.shape)

print("Shape shortened merged dataset:")
print(shortened_merged_df.shape)

# If the cleaner dropped rows or altered a key, positional evaluation would be
# meaningless; fail loudly instead of reporting misleading metrics.
assert shortened_merged_df.shape == shortened_gold_standard.shape, (
    "cleaned dataset no longer lines up with the gold standard"
)

results = p.evaluate_micro(shortened_merged_df, [corrupted_coords])
print(results)

# True only if the cleaner changed nothing at all (NaN-aware).
same_values_only = _cells_equal(shortened_merged_df, shortened_corrupt_df).all()
print(same_values_only)


Corrupted coordinates (row, col):
[[ 0  1]
 [ 0  2]
 [ 0  3]
 [ 0  4]
 [ 2  1]
 [ 2  2]
 [ 2  3]
 [ 2  4]
 [ 3  1]
 [ 3  2]
 [ 3  3]
 [ 3  4]
 [ 5  2]
 [ 6  2]
 [ 7  1]
 [ 7  2]
 [ 7  3]
 [ 7  4]
 [ 8  3]
 [ 8  4]
 [ 9  3]
 [ 9  4]
 [10  1]
 [10  3]
 [12  1]
 [12  2]
 [12  3]
 [12  4]
 [13  1]
 [13  3]
 [14  1]
 [14  2]
 [14  3]
 [14  4]
 [15  1]
 [15  2]
 [15  3]
 [15  4]
 [16  1]
 [16  3]
 [17  1]
 [17  2]
 [17  3]
 [17  4]
 [18  1]
 [18  2]
 [18  3]
 [18  4]
 [19  1]
 [19  2]
 [19  3]
 [19  4]
 [20  1]
 [20  2]
 [20  3]
 [20  4]]
Shape merged dataset:
(21, 5)
Shape shortened merged dataset:
(21, 5)
{'num_rows': 21, 'num_columns': 5, 'column_names': ['composed_key', 'scheduled_departure', 'actual_departure', 'scheduled_arrival', 'actual_arrival'], 'true_positive': 53, 'false_positive': 3, 'false_negative': 3, 'true_negative': 46, 'precision': 0.9464285714285714, 'recall': 0.9464285714285714, 'f1_score': 0.9464285714285714, 'accuracy': 0.9428571428571428, 'false_positive_rate': 0.0612

In [5]:
# Print the cleaned frame and the gold standard back to back, each under its
# own heading, for a visual comparison.
for heading, frame in (
    ("Shortened Merged DataFrame:", shortened_merged_df),
    ("Shortened Gold Standard DataFrame:", shortened_gold_standard),
):
    print(heading)
    print(frame)

Shortened Merged DataFrame:
                    composed_key  scheduled_departure  actual_departure  \
0   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
1   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
2   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
3   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
4   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
5   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
6   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
7   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
8   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
9   2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
10  2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0   
11  2011-12-01 - AA-1007-MIA-PHX               2755.0            2768.0 