# Data Exploration

This notebook explores the generated logistics data to understand patterns and validate data quality.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Display options: show every column, and at most 100 rows, when rendering frames
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot appearance: whitegrid matplotlib style first, then the 'husl' seaborn palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

## Load Data

Load the generated sample data from Parquet files.

In [None]:
# Load data
# Relative path to the directory holding the generated parquet tables
DATA_DIR = '../data/generated'

# Each table lives in '<DATA_DIR>/<table>.parquet'; read them in a fixed order
# and unpack straight into the four frames used by the rest of the notebook.
packages, tracking_events, customers, locations = (
    pd.read_parquet(f'{DATA_DIR}/{table}.parquet')
    for table in ('packages', 'tracking_events', 'customers', 'locations')
)

# One line per table with its thousands-separated row count
print(
    *(
        f"{label}: {len(frame):,} records"
        for label, frame in (
            ('Packages', packages),
            ('Tracking Events', tracking_events),
            ('Customers', customers),
            ('Locations', locations),
        )
    ),
    sep='\n',
)

## Package Analysis

In [None]:
# Package overview: preview the first five rows of the packages table
packages.head(n=5)

In [None]:
# Service type distribution: one bar per service type, tallest first
service_counts = packages['service_type'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
service_counts.plot.bar(ax=ax)
ax.set(
    title='Package Distribution by Service Type',
    xlabel='Service Type',
    ylabel='Count',
)
plt.xticks(rotation=45)  # slant the category labels
fig.tight_layout()
plt.show()

In [None]:
# Status distribution: one bar per package status, tallest first
status_counts = packages['status'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
status_counts.plot.bar(ax=ax)
ax.set(
    title='Package Distribution by Status',
    xlabel='Status',
    ylabel='Count',
)
plt.xticks(rotation=45)  # slant the category labels
fig.tight_layout()
plt.show()

In [None]:
# Weight distribution: 50-bin histogram followed by summary statistics
weights = packages['weight_lbs']

fig, ax = plt.subplots(figsize=(10, 6))
weights.hist(bins=50, ax=ax)
ax.set(
    title='Package Weight Distribution',
    xlabel='Weight (lbs)',
    ylabel='Count',
)
fig.tight_layout()
plt.show()

print("Weight Statistics:")
print(weights.describe())

In [None]:
# Packages over time
# Derive the calendar day of creation as a standalone Series rather than writing
# a 'created_date' column back onto `packages`. Keeping the shared frame
# untouched means later cells (e.g. the null-count check) still describe only
# the loaded columns, whether or not this cell has been run.
# The Series is renamed so the grouped index is still called 'created_date'.
created_date = pd.to_datetime(packages['created_at']).dt.date.rename('created_date')

# Number of packages created on each day (days with no packages are absent)
daily_packages = packages.groupby(created_date).size()

fig, ax = plt.subplots(figsize=(14, 6))
daily_packages.plot(ax=ax)
ax.set_title('Daily Package Volume')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Packages')
plt.tight_layout()
plt.show()

## Customer Analysis

In [None]:
# Customer overview: preview the first five rows of the customers table
customers.head(n=5)

In [None]:
# Customer type distribution
# Two side-by-side panels: customer-type shares (pie) and tier counts (bar)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
type_ax, tier_ax = axes

type_counts = customers['customer_type'].value_counts()
type_counts.plot.pie(autopct='%1.1f%%', ax=type_ax)
type_ax.set_title('Customer Type Distribution')

tier_counts = customers['tier'].value_counts()
tier_counts.plot.bar(ax=tier_ax)
tier_ax.set(
    title='Customer Tier Distribution',
    xlabel='Tier',
    ylabel='Count',
)

fig.tight_layout()
plt.show()

In [None]:
# Geographic distribution
# value_counts() sorts descending, so the first 20 entries are the 20 states
# with the most customers.
state_counts = customers['address_state'].value_counts().iloc[:20]

fig, ax = plt.subplots(figsize=(12, 6))
state_counts.plot.bar(ax=ax)
ax.set(
    title='Top 20 States by Customer Count',
    xlabel='State',
    ylabel='Customer Count',
)
plt.xticks(rotation=45)  # slant the state labels
fig.tight_layout()
plt.show()

## Tracking Events Analysis

In [None]:
# Event type distribution: horizontal bars, one per tracking event type
event_type_counts = tracking_events['event_type'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
event_type_counts.plot.barh(ax=ax)
ax.set(
    title='Tracking Event Type Distribution',
    xlabel='Count',
    ylabel='Event Type',
)
fig.tight_layout()
plt.show()

In [None]:
# Events per package
# Row count of tracking_events for each package_id. Only packages that appear
# in tracking_events are represented; a package with no events has no entry.
events_per_package = tracking_events.groupby('package_id').size()

fig, ax = plt.subplots(figsize=(10, 6))
events_per_package.hist(bins=30, ax=ax)
ax.set(
    title='Distribution of Events per Package',
    xlabel='Number of Events',
    ylabel='Count',
)
fig.tight_layout()
plt.show()

print("Events per Package Statistics:")
print(events_per_package.describe())

## Location Analysis

In [None]:
# Location types: one bar per location type, tallest first
location_type_counts = locations['location_type'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
location_type_counts.plot.bar(ax=ax)
ax.set(
    title='Location Type Distribution',
    xlabel='Location Type',
    ylabel='Count',
)
plt.xticks(rotation=45)  # slant the category labels
fig.tight_layout()
plt.show()

In [None]:
# Hub locations map (simple scatter)
# Keep only the rows typed as a hub or a distribution center
is_hub_or_dc = locations['location_type'].isin(['hub', 'distribution_center'])
hubs = locations[is_hub_or_dc]

fig, ax = plt.subplots(figsize=(14, 8))
ax.scatter(x=hubs['longitude'], y=hubs['latitude'], c='red', s=50, alpha=0.7)
ax.set(
    title='Hub and Distribution Center Locations',
    xlabel='Longitude',
    ylabel='Latitude',
)
# Fixed viewing window: longitude -130..-65, latitude 24..50. Points outside it
# are not visible. NOTE(review): presumably the contiguous United States — confirm.
ax.set_xlim(-130, -65)
ax.set_ylim(24, 50)
fig.tight_layout()
plt.show()

## Data Quality Checks

In [None]:
# Check for nulls: number of missing values in every column of packages
null_counts = packages.isna().sum()

print("Null counts in packages:")
print(null_counts)

In [None]:
# Check for duplicates
# duplicated() flags every repeat after the first occurrence, so each sum is
# the number of surplus rows sharing an already-seen ID.
duplicate_package_ids = packages['package_id'].duplicated().sum()
duplicate_customer_ids = customers['customer_id'].duplicated().sum()
duplicate_event_ids = tracking_events['event_id'].duplicated().sum()

print(f"Duplicate package IDs: {duplicate_package_ids}")
print(f"Duplicate customer IDs: {duplicate_customer_ids}")
print(f"Duplicate event IDs: {duplicate_event_ids}")

In [None]:
# Validate referential integrity
# Collect every customer ID that a package references as sender or recipient.
# Missing IDs are dropped first: a null is not a reference to any customer, and
# because it can never be found in the customers table it would otherwise always
# be reported as an orphan. Missing values are reported by the null-count check.
sender_ids = set(packages['sender_customer_id'].dropna().unique())
recipient_ids = set(packages['recipient_customer_id'].dropna().unique())
package_customers = sender_ids | recipient_ids

# IDs that exist in the customers table
known_customers = set(customers['customer_id'].unique())

# Referenced by at least one package but absent from the customers table
orphan_customers = package_customers - known_customers

print(f"Orphan customer references: {len(orphan_customers)}")

## Summary Statistics

In [None]:
# Summary
print("=== Dataset Summary ===")

# Packages: total plus counts for three specific statuses
status = packages['status']
delivered = status == 'delivered'
print("\nPackages:")
print(f"  Total: {len(packages):,}")
print(f"  Delivered: {delivered.sum():,} ({delivered.mean()*100:.1f}%)")
print(f"  In Transit: {(status == 'in_transit').sum():,}")
print(f"  Exceptions: {(status == 'exception').sum():,}")

# Customers: total plus business / residential counts
customer_type = customers['customer_type']
print("\nCustomers:")
print(f"  Total: {len(customers):,}")
print(f"  Business: {(customer_type == 'business').sum():,}")
print(f"  Residential: {(customer_type == 'residential').sum():,}")

# Tracking events: the average divides by ALL packages, including any that
# have no events at all
avg_events_per_package = len(tracking_events) / len(packages)
print("\nTracking Events:")
print(f"  Total: {len(tracking_events):,}")
print(f"  Avg per package: {avg_events_per_package:.1f}")

# Locations: total plus hub / distribution-center counts
location_type = locations['location_type']
print("\nLocations:")
print(f"  Total: {len(locations):,}")
print(f"  Hubs: {(location_type == 'hub').sum()}")
print(f"  Distribution Centers: {(location_type == 'distribution_center').sum()}")