In [41]:
import sys
import os

# Make the parent directory importable so `llm_data_quality_assistant` resolves
# from the source tree. ".." is resolved against the current working directory,
# so this only works when the kernel is started from the notebook's own folder.
sys.path.append(os.path.abspath(".."))
import pandas as pd
from llm_data_quality_assistant.pipeline import Pipeline

from pprint import pprint
import json

In [42]:
# Input CSVs, named once so the role of each file is explicit.
FLIGHT_DIR = "../datasets/parker_datasets/flight"
CORRUPT_CSV = f"{FLIGHT_DIR}/flight_cleaned_corrupted_first1000_int.csv"
# FIXME(review): this is the same file as CORRUPT_CSV, so the "gold standard"
# is byte-for-byte the corrupted data and every metric computed against it is
# meaningless. Point this at the real ground-truth CSV (its name is not
# visible from this notebook, so it is deliberately not guessed here).
GOLD_STANDARD_CSV = f"{FLIGHT_DIR}/flight_cleaned_corrupted_first1000_int.csv"
REPAIRED_CSV = f"{FLIGHT_DIR}/parker_repairs/flight_repair.csv"


def _same_key_order(left, right, key="composed_key"):
    """Return True when both frames hold the same `key` values at the same row positions.

    The comparison is positional on the raw values. `Series == Series` raises
    ValueError when the two indexes are not identically labelled (for example
    when the frames have different lengths); here that case yields False.
    """
    left_keys = left[key].to_numpy()
    right_keys = right[key].to_numpy()
    return len(left_keys) == len(right_keys) and bool((left_keys == right_keys).all())


corrupt_dataset = pd.read_csv(CORRUPT_CSV)
gold_standard = pd.read_csv(GOLD_STANDARD_CSV)
repaired_dataset = pd.read_csv(REPAIRED_CSV)

# Make the misconfiguration above impossible to miss in the cell output.
# DataFrame.equals treats NaNs in matching positions as equal.
if gold_standard.equals(corrupt_dataset):
    print(
        "WARNING: gold_standard is identical to corrupt_dataset; "
        "evaluation results below will not be meaningful."
    )

# NOTE(review): presumably aligns the three frames on `composed_key` so they can
# be compared cell by cell; the checks below verify that assumption. Assumes one
# frame is returned per input, in input order -- confirm against Pipeline.
output = Pipeline.standardize_datasets(
    datasets=[corrupt_dataset, gold_standard, repaired_dataset],
    primary_key="composed_key",
)
corrupt_dataset, gold_standard, repaired_dataset = output

print(gold_standard.head())
print(corrupt_dataset.head())
print(repaired_dataset.head())

# Check if all three dataframes have the same column order
same_column_order = (
    list(corrupt_dataset.columns) == list(gold_standard.columns) == list(repaired_dataset.columns)
)
print("All dataframes have the same column order:", same_column_order)

# Check if all three dataframes have the exact same row order
same_order = _same_key_order(corrupt_dataset, gold_standard) and _same_key_order(
    corrupt_dataset, repaired_dataset
)
print("All dataframes have the exact same row order:", same_order)


                   composed_key  actual_arrival  actual_departure  \
0  2011-12-01 - AA-1007-MIA-PHX          3055.0            2756.0   
1  2011-12-01 - AA-1007-MIA-PHX          3043.0            2769.0   
2  2011-12-01 - AA-1007-MIA-PHX          3055.0            2768.0   
3  2011-12-01 - AA-1007-MIA-PHX          3043.0            2768.0   
4  2011-12-01 - AA-1007-MIA-PHX          3055.0            2768.0   

   scheduled_arrival  scheduled_departure  
0             3065.0               2755.0  
1                NaN                  NaN  
2             3065.0               2755.0  
3             3038.0               2755.0  
4                NaN                  NaN  
                   composed_key  actual_arrival  actual_departure  \
0  2011-12-01 - AA-1007-MIA-PHX          3055.0            2756.0   
1  2011-12-01 - AA-1007-MIA-PHX          3043.0            2769.0   
2  2011-12-01 - AA-1007-MIA-PHX          3055.0            2768.0   
3  2011-12-01 - AA-1007-MIA-PHX          3043

In [43]:
# Micro evaluation: per the recorded output this yields one set of metrics
# (accuracy, precision, recall, confusion counts) for the table as a whole.
micro_stats = Pipeline.evaluate_micro(
    gold_standard=gold_standard,
    corrupted_dataset=corrupt_dataset,
    cleaned_dataset=repaired_dataset,
)
pprint(micro_stats)

# Persist the metrics. Create the results directory first so a fresh checkout
# does not fail with FileNotFoundError when opening the file for writing.
MICRO_RESULTS_PATH = "../analysis/results/flight/parker_results_micro.json"
os.makedirs(os.path.dirname(MICRO_RESULTS_PATH), exist_ok=True)
with open(MICRO_RESULTS_PATH, "w", encoding="utf-8") as f:
    json.dump(micro_stats, f, indent=4)


{'accuracy': 0.2891002194586686,
 'column_names': ['composed_key',
                  'actual_arrival',
                  'actual_departure',
                  'scheduled_arrival',
                  'scheduled_departure'],
 'f1_score': 0.0,
 'false_negative': 0,
 'false_negative_rate': 0.0,
 'false_positive': 87462,
 'false_positive_rate': 0.7108997805413314,
 'num_columns': 5,
 'num_rows': 24606,
 'precision': 0.0,
 'recall': 0.0,
 'true_negative': 35568,
 'true_positive': 0}


In [44]:
# Macro evaluation: per the recorded output this yields a `stats` list with one
# metrics entry per column, alongside the column names and row/column counts.
macro_stats = Pipeline.evaluate_macro(
    gold_standard=gold_standard,
    corrupted_dataset=corrupt_dataset,
    cleaned_dataset=repaired_dataset,
)
pprint(macro_stats)

# Persist the metrics. Create the results directory first so a fresh checkout
# does not fail with FileNotFoundError when opening the file for writing.
MACRO_RESULTS_PATH = "../analysis/results/flight/parker_results_macro.json"
os.makedirs(os.path.dirname(MACRO_RESULTS_PATH), exist_ok=True)
with open(MACRO_RESULTS_PATH, "w", encoding="utf-8") as f:
    json.dump(macro_stats, f, indent=4)

{'column_names': ['composed_key',
                  'actual_arrival',
                  'actual_departure',
                  'scheduled_arrival',
                  'scheduled_departure'],
 'num_columns': 5,
 'num_rows': 24606,
 'stats': [{'accuracy': 1.0,
            'column_name': 'composed_key',
            'f1_score': 0.0,
            'false_negative': 0,
            'false_negative_rate': 0.0,
            'false_positive': 0,
            'false_positive_rate': 0.0,
            'num_entries': 24606,
            'precision': 0.0,
            'recall': 0.0,
            'true_negative': 24606,
            'true_positive': 0},
           {'accuracy': 0.4455010972933431,
            'column_name': 'actual_arrival',
            'f1_score': 0.0,
            'false_negative': 0,
            'false_negative_rate': 0.0,
            'false_positive': 13644,
            'false_positive_rate': 0.5544989027066569,
            'num_entries': 24606,
            'precision': 0.0,
            'recal