The following code block imports some of the libraries we will be using.

In [None]:
import pathlib
import sqlite3

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

The following code block reads data from the *Orders*, *Order Details*, and *Products* tables in the *northwind.db* database. These tables are combined to create a modified version of the order data, which is eventually stored in the `orders` variable and includes information that may be useful in investigating patterns in late orders.

In [None]:
# Read select tables from database.
# NOTE: using a sqlite3 connection as a context manager only commits or rolls
# back the open transaction; it does not close the connection. Close it
# explicitly so the database handle is released once the tables are loaded.
conn = sqlite3.connect('northwind.db')
try:
    orders = pd.read_sql('SELECT * FROM Orders', conn)
    order_details = pd.read_sql('SELECT * FROM "Order Details"', conn)
    products = pd.read_sql(
        'SELECT ProductID, ProductName, SupplierID, CategoryID FROM Products',
        conn,
    )
finally:
    conn.close()

# Attach product attributes to every order line. The join key is named
# explicitly so the merge cannot silently pick up other same-named columns.
order_details = order_details.merge(products, how='left', on='ProductID')

# Get summary from order details data (one row per order)
order_detail_summary = order_details.groupby(
    'OrderID'
).agg(
    unique_products=('ProductID', 'nunique'),
    total_quantity=('Quantity', 'sum'),
    unique_suppliers=('SupplierID', 'nunique'),
).reset_index()

# Left join keeps every order, even one without any order lines.
orders = orders.merge(order_detail_summary, how='left', on='OrderID')

# Convert date columns to datetime formats
for date_column in ('OrderDate', 'RequiredDate', 'ShippedDate'):
    orders[date_column] = pd.to_datetime(orders[date_column])

# Limit columns to subset
columns_to_keep = [
    'OrderID',
    'CustomerID',
    'EmployeeID',
    'OrderDate',
    'RequiredDate',
    'ShippedDate',
    'ShipVia',
    'ShipCity',
    'ShipCountry',
    'ShipRegion',
    'unique_products',
    'total_quantity',
    'unique_suppliers',
]
# Take an explicit copy so the column assignments below operate on an
# independent frame and do not trigger SettingWithCopyWarning.
orders = orders[columns_to_keep].copy()

# Create date-based columns (durations expressed as fractional days)
one_day = pd.Timedelta(days=1)
orders['order_to_ship_days'] = (orders['ShippedDate'] - orders['OrderDate']) / one_day
orders['order_to_required_days'] = (orders['RequiredDate'] - orders['OrderDate']) / one_day
orders['lateness'] = (orders['ShippedDate'] - orders['RequiredDate']) / one_day
# An order is late when it shipped after its required date. Rows with a
# missing ShippedDate have NaN lateness; NaN > 0 is False, so they get 0.
orders['late'] = (orders['lateness'] > 0).astype(int)

The following code block prints the first five rows of the `orders` object.

In [None]:
# Preview the first five rows of the prepared orders data.
orders.head(n=5)