# 00 Data Quality and Canonical Table
Build `order_delivery_legs` at order-seller grain and run quality acceptance checks.


In [1]:
from pathlib import Path
import sys
import pandas as pd

# Locate the project root relative to the working directory: when the kernel
# runs from a directory named 'notebooks' the root is one level up, otherwise
# the working directory itself is used.
if Path.cwd().name == 'notebooks':
    ROOT = Path('..')
else:
    ROOT = Path('.')

# Expose ROOT/src to the import system so the `utils` import that follows
# resolves; the membership test keeps re-runs from appending duplicates.
_src_dir = str(ROOT / 'src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from utils import build_order_delivery_legs, quality_summary

# Data locations under the project root: raw inputs are read from data/raw,
# derived tables are written to data/processed.
_data_dir = ROOT / 'data'
raw_dir = _data_dir / 'raw'
processed_dir = _data_dir / 'processed'

# Create the output directory (and any missing parents) up front; an
# already-existing directory is not an error.
processed_dir.mkdir(parents=True, exist_ok=True)


In [2]:
# Build the order-seller table from the raw directory, then compute the
# quality summary that this cell and the following cells report on.
legs = build_order_delivery_legs(raw_dir)
summary = quality_summary(legs)

print('Row counts:', summary['row_counts'])

# Coverage values are rounded to 4 decimals for display only.
coverage_display = {name: round(share, 4) for name, share in summary['coverage'].items()}
print('Coverage:', coverage_display)

print('Duplicate order-seller rows:', summary['duplicate_order_seller'])

# First eight entries of the status distribution, scaled by 100 so they
# print as percentages with two decimals.
print('Top statuses (%):')
top_status_pct = summary['status_distribution'].head(8) * 100
print(top_status_pct.round(2).to_string())


Row counts: {'rows': 100785, 'orders': 99441, 'order_seller_legs': 100785}
Coverage: {'customer_state_coverage': 1.0, 'seller_state_coverage': 0.9923}
Duplicate order-seller rows: 0
Top statuses (%):
order_status
delivered      97.06
shipped         1.10
canceled        0.62
unavailable     0.61
invoiced        0.31
processing      0.30
created         0.00
approved        0.00


In [3]:
# Report the missing rates from the quality summary as percentages
# (scaled by 100, rounded to two decimals).
print('Missing rates for critical columns (%):')
missing_pct = summary['missing_rates'] * 100
print(missing_pct.round(2).to_string())


Missing rates for critical columns (%):
order_delivered_customer_date    2.94
order_delivered_carrier_date     1.77
seller_state                     0.77
seller_id                        0.77
shipping_limit_date              0.77
order_id                         0.00
order_purchase_timestamp         0.00
customer_state                   0.00
customer_id                      0.00
order_estimated_delivery_date    0.00


In [4]:
# Acceptance checks: any failed assertion stops the notebook before the
# table is written out by the next cell.

# State coverage must be at least 0.99 for both sides of the leg.
coverage_checks = (
    ('customer_state_coverage', 'Customer state coverage below 99%'),
    ('seller_state_coverage', 'Seller state coverage below 99%'),
)
for coverage_key, failure_message in coverage_checks:
    assert summary['coverage'][coverage_key] >= 0.99, failure_message

# The table is meant to hold one row per order-seller pair.
assert summary['duplicate_order_seller'] == 0, 'Duplicate order-seller legs detected'

# These two timestamp columns must be fully populated.
for col in ('order_purchase_timestamp', 'order_estimated_delivery_date'):
    assert not legs[col].isna().any(), f'{col} has nulls after parsing'

print('All acceptance checks passed.')


All acceptance checks passed.


In [5]:
# Output targets: Parquet is attempted first, CSV is the fallback.
parquet_path = processed_dir.joinpath('order_delivery_legs.parquet')
csv_path = processed_dir.joinpath('order_delivery_legs.csv')

try:
    legs.to_parquet(parquet_path, index=False)
    print(f'Saved: {parquet_path}')
except Exception as e:
    # Best-effort fallback: any exception raised while writing Parquet is
    # reported in the message below and the table is written as CSV instead.
    legs.to_csv(csv_path, index=False)
    print(f'Parquet unavailable ({e}); saved CSV fallback: {csv_path}')


Saved: ..\data\processed\order_delivery_legs.parquet
