In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

DATA_PATH = Path().cwd().parent / "data"


def read_data(file_name):
    return pd.read_csv(DATA_PATH / file_name, parse_dates=True)


shipments_df = read_data("shipments.csv")
po_df = read_data("purchase_orders.csv")
suppliers_df = read_data("suppliers.csv")
sku_df = read_data("skus.csv")
transit_events_df = read_data("transit_events.csv")
sites_df = read_data("sites.csv")
deployments_df = read_data("deployments.csv")
rma_returns_df = read_data("rma_returns.csv")
inventory_df = read_data("inventory_snapshots.csv")
_datetime_map = {
    "shipments": (shipments_df, ["ship_date", "eta_date", "actual_delivery_date"]),
    "purchase_orders": (po_df, ["order_date", "promised_date"]),
    "transit_events": (transit_events_df, ["event_ts"]),
    "deployments": (deployments_df, ["planned_date", "actual_date"]),
}
for _, (df, cols) in _datetime_map.items():
    for col in cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")

Forecast site-level inventory for each SKU for the next 30/60/90 days

In [3]:
po_order_dates = po_df[["po_id", "order_date"]].rename(columns={"order_date": "po_order_date"})
shipments_df = shipments_df.merge(po_order_dates, on="po_id", how="left")
# remove suspect shipments where PO order date is after ship date
shipments_df = shipments_df.where(
    shipments_df["po_order_date"].notna()
    & shipments_df["ship_date"].notna()
    & (shipments_df["po_order_date"] <= shipments_df["ship_date"])
)
# remove null rows created by filtering
shipments_df = shipments_df.dropna(subset=["po_order_date", "ship_date"])
shipments_df

Unnamed: 0,shipment_id,po_id,ship_qty,mode,incoterm,origin_country,dest_site_id,ship_date,eta_date,status,po_order_date
0,SH-00000001,PO-0104861,6.0,Road,CIF,Mexico,ST-00452,2025-02-03,2025-02-10,Delivered,2024-05-25
4,SH-00000005,PO-0003236,6.0,Road,DAP,Brazil,ST-01995,2025-06-19,2025-06-27,Delivered,2023-04-01
6,SH-00000007,PO-0089238,4.0,Sea,DAP,Germany,ST-01749,2023-08-09,2023-09-13,In Transit,2023-02-02
7,SH-00000008,PO-0075655,8.0,Road,FOB,Austria,ST-00867,2025-08-28,2025-09-05,Delivered,2025-06-03
9,SH-00000010,PO-0099357,6.0,Road,DDP,Czechia,ST-00151,2025-01-02,2025-01-13,In Transit,2024-05-09
...,...,...,...,...,...,...,...,...,...,...,...
217491,SH-00217492,PO-0013523,2.0,Sea,DAP,Canada,ST-01941,2025-05-26,2025-06-26,In Transit,2024-05-13
217492,SH-00217493,PO-0093856,5.0,Sea,DAP,Spain,ST-00658,2025-02-14,2025-03-21,In Transit,2023-02-17
217494,SH-00217495,PO-0148870,3.0,Air,FOB,Malaysia,ST-00088,2024-03-13,2024-03-20,In Transit,2023-10-15
217496,SH-00217497,PO-0038120,7.0,Road,EXW,Mexico,ST-01876,2024-08-14,2024-08-27,Delivered,2023-02-12


i am assuming that event_status= Delivered  - means the shipment was delivered

In [4]:

transit_events_delivery = transit_events_df[
    transit_events_df["event_status"] == "Delivered"
][["shipment_id", "event_ts"]].rename(columns={"event_ts": "actual_delivery_date"})
shipments_df = shipments_df.merge(
    transit_events_delivery,
    on="shipment_id",
    how="left",
    suffixes=("", "_from_events")
)
# actual_delivery_date Missing:
# 80768 (72%)
# Distinct:
# 17931 (16%)

shipments_df

Unnamed: 0,shipment_id,po_id,ship_qty,mode,incoterm,origin_country,dest_site_id,ship_date,eta_date,status,po_order_date,actual_delivery_date
0,SH-00000001,PO-0104861,6.0,Road,CIF,Mexico,ST-00452,2025-02-03,2025-02-10,Delivered,2024-05-25,NaT
1,SH-00000005,PO-0003236,6.0,Road,DAP,Brazil,ST-01995,2025-06-19,2025-06-27,Delivered,2023-04-01,NaT
2,SH-00000007,PO-0089238,4.0,Sea,DAP,Germany,ST-01749,2023-08-09,2023-09-13,In Transit,2023-02-02,2023-05-31 07:00:00
3,SH-00000007,PO-0089238,4.0,Sea,DAP,Germany,ST-01749,2023-08-09,2023-09-13,In Transit,2023-02-02,2024-02-29 03:00:00
4,SH-00000008,PO-0075655,8.0,Road,FOB,Austria,ST-00867,2025-08-28,2025-09-05,Delivered,2025-06-03,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...
112836,SH-00217492,PO-0013523,2.0,Sea,DAP,Canada,ST-01941,2025-05-26,2025-06-26,In Transit,2024-05-13,2025-05-16 08:00:00
112837,SH-00217493,PO-0093856,5.0,Sea,DAP,Spain,ST-00658,2025-02-14,2025-03-21,In Transit,2023-02-17,2024-08-02 16:00:00
112838,SH-00217495,PO-0148870,3.0,Air,FOB,Malaysia,ST-00088,2024-03-13,2024-03-20,In Transit,2023-10-15,NaT
112839,SH-00217497,PO-0038120,7.0,Road,EXW,Mexico,ST-01876,2024-08-14,2024-08-27,Delivered,2023-02-12,NaT


In [None]:
deployments_df["actual_date"].max() # of course we are living in future.

Timestamp('2025-12-28 00:00:00')

In [7]:
latest_inventory_date = inventory_df["snapshot_date"].max()
latest_inventory_date

'2025-10-31'

In [None]:
# actual inventory state = 