In [1]:
"""
Inventory Optimization with Deep Reinforcement Learning (TensorFlow/Keras)
=========================================================================
Goal:
- Train an RL agent to decide "how much to order today" to minimize total inventory cost
  under stochastic demand and lead time uncertainty.

What this file includes:
1) A small, realistic single-item inventory simulator (custom environment)
2) A Deep Q-Network (DQN) agent in TensorFlow/Keras
3) Training loop + evaluation vs. a classic (s, S) reorder policy baseline

Typical supply chain costs modeled:
- Holding cost (per unit per day)
- Stockout penalty (per unit of demand that cannot be filled from on-hand stock; unmet demand is not carried over)
- Ordering cost (variable per unit + optional fixed cost per order)

State (what the agent sees):
- On-hand inventory
- Pipeline inventory by lead-time remaining (vector)
- Current day-of-week (optional seasonality hook)

Action (what the agent controls):
- Order quantity chosen from a discrete set (e.g., {0, 5, 10, ..., 50})

Business interpretation:
- The agent learns a dynamic reorder point / safety stock behavior that adapts
  to pipeline and upcoming demand patterns, instead of a fixed static rule.
"""

from __future__ import annotations

import random
from dataclasses import dataclass
from collections import deque
from typing import Deque, Tuple, Dict, List

import numpy as np
import tensorflow as tf

In [3]:
# =============================================================================
# 0) Reproducibility + global config
# =============================================================================
# One seed is pushed into every random source this notebook touches.
SEED = 42
# Python's `random` drives replay sampling and the epsilon-greedy draws.
random.seed(SEED)
# Legacy global NumPy state; the environment below uses its own
# np.random.default_rng(seed) instead, so this only covers stray np.random calls.
np.random.seed(SEED)
# TensorFlow's global generator.
tf.random.set_seed(SEED)

# Training horizon settings
EPISODES = 500          # number of training episodes run by main()
DAYS_PER_EPISODE = 180  # simulate 180 days per episode (roughly half-year)

# DQN hyperparameters
GAMMA = 0.99               # discount factor used in the bootstrap target
LEARNING_RATE = 1e-3       # Adam learning rate
BATCH_SIZE = 128           # transitions sampled per gradient step
REPLAY_CAPACITY = 50_000   # max transitions kept; oldest are dropped first
LEARN_START = 2_000        # wait until replay has enough transitions
TRAIN_EVERY = 1            # gradient steps frequency (in environment steps)
TARGET_UPDATE_EVERY = 1_000  # agent steps between hard copies online -> target network

# Exploration schedule (epsilon-greedy)
EPS_START = 1.0            # epsilon at global_step 0
EPS_END = 0.05             # floor: epsilon never drops below this value
EPS_DECAY_STEPS = 50_000   # anneal epsilon over these steps

# Action space: discrete order quantities
MAX_ORDER_QTY = 50
ORDER_STEP = 5
ACTIONS = np.arange(0, MAX_ORDER_QTY + 1, ORDER_STEP)  # e.g., [0,5,10,...,50]
N_ACTIONS = len(ACTIONS)   # size of the Q-network output layer

# Environment settings (can be tuned to match your case study)
# Used as the default for InventoryEnvConfig.max_lead_time.
MAX_LEAD_TIME = 5  # maximum possible lead time (days)

In [4]:
# =============================================================================
# 1) Inventory simulator environment (single-item, stochastic demand, lead time)
# =============================================================================
@dataclass
class InventoryEnvConfig:
    """
    Tunable parameters of the single-item inventory simulator.

    Groups the demand model, the lead-time range, the per-day cost rates and
    the caps used to clip/normalize observations. All fields have defaults, so
    callers may override only the ones relevant to their case study.
    """
    # Demand process
    mean_demand: float = 20.0        # mean of the base daily demand
    demand_std: float = 5.0          # for normal demand (clipped at >=0)
    use_weekly_seasonality: bool = True  # scale demand by a day-of-week multiplier

    # Lead time (stochastic). You can replace with empirical distribution.
    # The environment draws an integer lead time in [min_lead_time, max_lead_time].
    min_lead_time: int = 1
    max_lead_time: int = MAX_LEAD_TIME  # also the number of pipeline buckets

    # Cost parameters
    holding_cost: float = 1.0        # cost per unit held per day
    stockout_cost: float = 10.0      # penalty per unit of demand not filled that day
    unit_order_cost: float = 2.0     # variable ordering cost per unit
    fixed_order_cost: float = 20.0   # fixed cost if you order > 0 (set 0 if not needed)

    # Inventory bounds (for normalization / clipping)
    # These caps only affect the observation; the simulated stock is not truncated.
    max_inventory: int = 300         # on-hand upper bound (soft cap via clipping)
    max_pipeline_per_bucket: int = 200  # pipeline bucket upper bound (soft cap)


class InventoryEnv:
    """
    A lightweight single-item inventory control environment designed for DQN.

    Timeline of one call to ``step`` (one simulated day t):
    1) Receive the delivery stored in pipeline bucket 0, then shift the pipeline
    2) Place the chosen order: sample a lead time L and add the quantity to
       bucket L-1, so it is received at the start of day t+L
    3) Sample demand D_t and fulfill it from on-hand stock (unmet demand is
       penalized and dropped, it is not carried over)
    4) Charge holding, stockout and ordering costs; reward = -total_cost
    5) Advance the day counter and return the next observation

    Pipeline convention, as seen in any observation: ``pipeline[i]`` is received
    at the start of the (i+1)-th upcoming ``step`` call. Bucket 0 therefore
    reaches the shelf before the next demand is served.
    """

    def __init__(self, config: InventoryEnvConfig, horizon_days: int, seed: int = 0):
        """
        Args:
            config: simulator parameters (demand, lead time, costs, caps).
            horizon_days: episode length; ``step`` reports done once this many
                days have been simulated.
            seed: seed for the environment's private NumPy generator.

        Raises:
            ValueError: if the lead-time bounds are not 1 <= min <= max.
        """
        # A lead time below 1 would map to bucket index -1 and silently land in
        # the LAST pipeline bucket, so reject it up front.
        if config.min_lead_time < 1:
            raise ValueError(
                f"min_lead_time must be >= 1, got {config.min_lead_time}"
            )
        if config.max_lead_time < config.min_lead_time:
            raise ValueError(
                "max_lead_time must be >= min_lead_time, got "
                f"min={config.min_lead_time}, max={config.max_lead_time}"
            )

        self.cfg = config
        self.horizon_days = horizon_days
        self.rng = np.random.default_rng(seed)

        # Internal state
        self.day: int = 0
        self.on_hand: int = 0

        # One bucket per possible lead-time day; see the class docstring for
        # the indexing convention.
        self.pipeline: np.ndarray = np.zeros(self.cfg.max_lead_time, dtype=np.int32)

        # For logging
        self.total_cost: float = 0.0

    @property
    def state_dim(self) -> int:
        """Length of the observation vector."""
        # on_hand (1) + pipeline buckets (max_lead_time) + day_of_week one scalar (1)
        return 1 + self.cfg.max_lead_time + 1

    def reset(self) -> np.ndarray:
        """
        Start a new episode and return the first observation.

        The random generator is NOT re-seeded, so consecutive episodes see
        different demand and lead-time draws.
        """
        self.day = 0
        self.total_cost = 0.0

        # Initialize with some starting inventory and an empty pipeline
        self.on_hand = int(self.cfg.mean_demand * 2)  # start with ~2 days cover
        self.pipeline = np.zeros(self.cfg.max_lead_time, dtype=np.int32)

        return self._get_obs()

    def step(self, action_index: int) -> Tuple[np.ndarray, float, bool, Dict]:
        """
        Simulate one day.

        Args:
            action_index: index into the module-level ACTIONS array
                (discrete order quantities).

        Returns:
            (obs_next, reward, done, info) where reward is the negative of the
            day's total cost and info carries the raw quantities for logging.
        """
        assert 0 <= action_index < N_ACTIONS
        order_qty = int(ACTIONS[action_index])

        # --- 1) Receive today's delivery (bucket 0) and age the pipeline by a day
        arrivals_today = int(self.pipeline[0])
        self.on_hand += arrivals_today
        self.pipeline[:-1] = self.pipeline[1:]
        self.pipeline[-1] = 0

        # --- 2) Place new order into pipeline with stochastic lead time
        if order_qty > 0:
            lead_time = int(self.rng.integers(self.cfg.min_lead_time, self.cfg.max_lead_time + 1))
            # After the shift above, bucket (lead_time - 1) is received at the
            # start of the step that is lead_time days from now.
            self.pipeline[lead_time - 1] += order_qty
        else:
            lead_time = 0  # no order placed; reported as 0 in info

        # --- 3) Sample demand and fulfill what we can from the shelf
        demand = int(max(0, self._sample_demand(self.day)))

        fulfilled = min(self.on_hand, demand)
        self.on_hand -= fulfilled
        short = demand - fulfilled  # unmet demand; penalized, not carried over

        # --- 4) Compute costs (holding is charged on end-of-day stock)
        holding_cost = self.cfg.holding_cost * self.on_hand
        stockout_cost = self.cfg.stockout_cost * short

        variable_order_cost = self.cfg.unit_order_cost * order_qty
        fixed_order_cost = self.cfg.fixed_order_cost if order_qty > 0 else 0.0

        total_cost_today = holding_cost + stockout_cost + variable_order_cost + fixed_order_cost
        self.total_cost += total_cost_today

        # Reward is negative cost (RL maximizes reward)
        reward = -float(total_cost_today)

        # --- 5) Next day
        self.day += 1
        done = self.day >= self.horizon_days

        obs_next = self._get_obs()

        info = {
            "day": self.day,
            "demand": demand,
            "fulfilled": fulfilled,
            "short": short,
            "arrivals_today": arrivals_today,
            "order_qty": order_qty,
            "lead_time": lead_time,
            "on_hand": int(self.on_hand),
            "pipeline": self.pipeline.copy(),
            "cost_today": float(total_cost_today),
            "total_cost": float(self.total_cost),
        }
        return obs_next, reward, done, info

    def _sample_demand(self, day: int) -> int:
        """
        Demand model:
        - Base normal distribution around mean_demand
        - Optional weekly multiplier indexed by ``day % 7``
        The result may be negative; ``step`` clips it at zero.
        Replace this with your real demand generator or historical bootstrap.
        """
        base = self.rng.normal(self.cfg.mean_demand, self.cfg.demand_std)

        if not self.cfg.use_weekly_seasonality:
            return int(round(base))

        # Simple weekly pattern: demand ramps up toward the end of the week
        dow = day % 7  # 0..6
        seasonal_multiplier = [1.0, 1.0, 1.05, 1.05, 1.10, 1.15, 1.10][dow]
        return int(round(base * seasonal_multiplier))

    def _get_obs(self) -> np.ndarray:
        """
        Observation as a normalized float32 vector:
        [on_hand, pipeline_0, ..., pipeline_{L-1}, day_of_week]

        on_hand and each pipeline bucket are clipped to their configured caps
        before scaling, so every component lies in [0, 1].
        """
        on_hand = np.clip(self.on_hand, 0, self.cfg.max_inventory)
        pipe = np.clip(self.pipeline, 0, self.cfg.max_pipeline_per_bucket)

        dow = self.day % 7

        obs = np.concatenate([
            np.array([on_hand], dtype=np.float32),
            pipe.astype(np.float32),
            np.array([dow], dtype=np.float32),
        ])

        # Normalize for NN stability
        obs[0] /= float(self.cfg.max_inventory)
        obs[1:1 + self.cfg.max_lead_time] /= float(self.cfg.max_pipeline_per_bucket)
        obs[-1] /= 6.0  # day_of_week in [0,6] -> [0,1]

        return obs.astype(np.float32)

In [5]:
# =============================================================================
# 2) Replay Buffer for DQN
# =============================================================================
class ReplayBuffer:
    """
    Fixed-capacity FIFO store of (s, a, r, s2, done) transitions.

    Once ``capacity`` transitions are held, adding another one drops the oldest.
    """

    def __init__(self, capacity: int):
        self.buffer: Deque = deque(maxlen=capacity)

    def add(self, s: np.ndarray, a: int, r: float, s2: np.ndarray, done: bool):
        """Append one transition as a 5-tuple."""
        transition = (s, a, r, s2, done)
        self.buffer.append(transition)

    def sample(self, batch_size: int):
        """
        Draw ``batch_size`` distinct transitions uniformly at random (via
        Python's ``random`` module) and return them column-wise as arrays:
        states float32, actions int32, rewards float32, next states float32,
        done flags float32.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)

        states_arr = np.asarray(states, dtype=np.float32)
        actions_arr = np.asarray(actions).astype(np.int32)
        rewards_arr = np.asarray(rewards, dtype=np.float32)
        next_states_arr = np.asarray(next_states, dtype=np.float32)
        dones_arr = np.asarray(dones, dtype=np.float32)
        return states_arr, actions_arr, rewards_arr, next_states_arr, dones_arr

    def __len__(self) -> int:
        """Number of transitions currently stored."""
        return len(self.buffer)

In [6]:
# =============================================================================
# 3) DQN Model (Q-network) in TensorFlow/Keras
# =============================================================================
def build_q_network(state_dim: int, n_actions: int) -> tf.keras.Model:
    """
    Build the Q(s, .) approximator as a Keras functional model.

    Args:
        state_dim: length of the state vector fed to the network.
        n_actions: number of discrete actions; one linear output per action.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping a batch of states to a batch
        of per-action Q-values.
    """
    state_in = tf.keras.Input(shape=(state_dim,), name="state")

    # Three ReLU hidden layers, widest first
    hidden = state_in
    for width in (128, 128, 64):
        hidden = tf.keras.layers.Dense(width, activation="relu")(hidden)

    # Linear head: Q-values are unbounded, so no activation here
    q_head = tf.keras.layers.Dense(n_actions, activation=None, name="q_values")(hidden)

    return tf.keras.Model(inputs=state_in, outputs=q_head)

In [7]:
# =============================================================================
# 4) DQN Agent (epsilon-greedy + target network)
# =============================================================================
class DQNAgent:
    """
    DQN agent with an epsilon-greedy behaviour policy and a hard-updated
    target network.

    ``global_step`` counts exploratory ``act`` calls; it drives both the
    epsilon schedule and the target-network sync interval.
    """

    def __init__(self, state_dim: int, n_actions: int):
        self.state_dim = state_dim
        self.n_actions = n_actions

        # Online network (trainable) and target network (stable bootstrap target)
        self.q_online = build_q_network(state_dim, n_actions)
        self.q_target = build_q_network(state_dim, n_actions)
        self.q_target.set_weights(self.q_online.get_weights())

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

        # Step counter for epsilon schedule + target updates
        self.global_step = 0

        # global_step value at the most recent target-network sync
        self._last_target_sync = 0

    def epsilon(self) -> float:
        """
        Linear decay: EPS_START -> EPS_END over EPS_DECAY_STEPS.
        Past EPS_DECAY_STEPS the value stays at EPS_END (it never reaches 0).
        """
        frac = min(1.0, self.global_step / float(EPS_DECAY_STEPS))
        return EPS_START + frac * (EPS_END - EPS_START)

    def act(self, state: np.ndarray, greedy: bool = False) -> int:
        """
        Choose an action index for ``state``.

        Args:
            state: 1-D state vector.
            greedy: if False (default), use epsilon-greedy selection and
                advance ``global_step``. If True, return the argmax action
                without exploring and without touching ``global_step``
                (useful for evaluation).

        Returns:
            Index of the selected action in [0, n_actions).
        """
        if not greedy:
            # Epsilon is read BEFORE the counter advances, so step 0 uses EPS_START
            eps = self.epsilon()
            self.global_step += 1

            if random.random() < eps:
                return random.randrange(self.n_actions)

        # Greedy action from Q-network
        s = state.reshape(1, -1).astype(np.float32)
        q = self.q_online(s, training=False).numpy()[0]
        return int(np.argmax(q))

    @tf.function
    def _train_step(self, s, a, r, s2, done):
        """
        One gradient update using DQN target:
        y = r + gamma * (1-done) * max_a' Q_target(s2, a')
        loss = MSE(Q_online(s, a), y)
        """
        # The target is computed outside the tape, so no gradient flows into it
        q_next = self.q_target(s2, training=False)             # [B, A]
        max_q_next = tf.reduce_max(q_next, axis=1)             # [B]
        y = r + GAMMA * (1.0 - done) * max_q_next              # [B]

        with tf.GradientTape() as tape:
            q_all = self.q_online(s, training=True)            # [B, A]

            # Gather Q(s,a) for taken actions
            idx = tf.stack([tf.range(tf.shape(a)[0]), a], axis=1)
            q_sa = tf.gather_nd(q_all, idx)                    # [B]

            loss = tf.reduce_mean(tf.square(y - q_sa))

        grads = tape.gradient(loss, self.q_online.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.q_online.trainable_variables))
        return loss

    def learn(self, replay: ReplayBuffer) -> float:
        """
        Sample a batch and update the online network.
        Also syncs the target network once at least TARGET_UPDATE_EVERY agent
        steps have elapsed since the previous sync.

        Returns:
            The batch loss as a Python float.
        """
        s, a, r, s2, done = replay.sample(BATCH_SIZE)

        loss = self._train_step(
            tf.convert_to_tensor(s),
            tf.convert_to_tensor(a),
            tf.convert_to_tensor(r),
            tf.convert_to_tensor(s2),
            tf.convert_to_tensor(done),
        )

        # Target network hard update. Comparing against the last sync step
        # (instead of `global_step % TARGET_UPDATE_EVERY == 0`) keeps the sync
        # from being skipped when learn() is not invoked on every single step.
        if self.global_step - self._last_target_sync >= TARGET_UPDATE_EVERY:
            self.q_target.set_weights(self.q_online.get_weights())
            self._last_target_sync = self.global_step

        return float(loss.numpy())

In [8]:
# =============================================================================
# 5) Baseline policy: classic (s, S) reorder rule (for comparison)
# =============================================================================
class ReorderPolicy_sS:
    """
    Classic (s, S) rule on the inventory position
    (on-hand stock plus everything in the pipeline):
    when the position is at or below ``s``, order enough to bring it up to
    ``S``; otherwise order nothing.
    """

    def __init__(self, s: int, S: int):
        # The order-up-to level cannot sit below the reorder point
        assert S >= s
        self.s, self.S = s, S

    def act(self, on_hand: int, pipeline: np.ndarray) -> int:
        """Return the order quantity for the given stock and pipeline."""
        position = on_hand + int(pipeline.sum())

        # Guard clause: enough stock on hand/on order, nothing to do
        if position > self.s:
            return 0

        gap = self.S - position
        return max(0, gap)


def order_qty_to_action_index(order_qty: int) -> int:
    """
    Map a desired order quantity to the index of the closest entry in ACTIONS.

    Quantities below the smallest or above the largest action resolve to the
    first or last index respectively. If two actions are equally close, the
    one with the lower index wins.

    The lookup is a direct nearest-neighbour search over ACTIONS, so it does
    not rely on ACTIONS being exact multiples of ORDER_STEP.
    """
    distances = np.abs(ACTIONS - order_qty)
    return int(np.argmin(distances))

In [9]:
# =============================================================================
# 6) Training + Evaluation helpers
# =============================================================================
def run_episode_with_agent(env: InventoryEnv, agent: DQNAgent, replay: ReplayBuffer, training: bool) -> Dict:
    """
    Runs 1 episode using the DQN agent.

    If training=True:
      - actions come from the agent's epsilon-greedy ``act``
      - transitions are stored in ``replay``
      - the agent trains periodically once the buffer is warm
    If training=False:
      - actions are purely greedy w.r.t. ``agent.q_online`` (no exploration,
        ``agent.global_step`` is left untouched) and ``replay`` is not used

    Returns:
        dict with "total_reward", "total_cost" (from the last step's info),
        "avg_loss" (0.0 when no update ran) and "updates".
    """
    s = env.reset()
    done = False

    total_reward = 0.0
    total_loss = 0.0
    n_updates = 0

    while not done:
        if training:
            a = agent.act(s)
        else:
            # agent.act() never explores with less than EPS_END probability,
            # so evaluation takes the argmax of the online network directly.
            q = agent.q_online(s.reshape(1, -1).astype(np.float32), training=False).numpy()[0]
            a = int(np.argmax(q))

        s2, r, done, info = env.step(a)

        total_reward += r

        if training:
            replay.add(s, a, r, s2, done)

            # Train only when replay has enough samples
            if len(replay) >= LEARN_START and (agent.global_step % TRAIN_EVERY == 0):
                loss = agent.learn(replay)
                total_loss += loss
                n_updates += 1

        s = s2

    return {
        "total_reward": float(total_reward),
        "total_cost": float(info["total_cost"]),
        "avg_loss": float(total_loss / max(1, n_updates)),
        "updates": int(n_updates),
    }


def run_episode_with_policy(env: InventoryEnv, policy: ReorderPolicy_sS) -> Dict:
    """
    Runs 1 episode using a non-learning baseline policy.

    The policy is fed the environment's actual ``on_hand`` and ``pipeline``
    values rather than quantities reconstructed from the observation: the
    observation is clipped to the normalization caps and stored as float32,
    so de-normalizing it can misreport stock once a cap is exceeded.

    Returns:
        dict with "total_reward" and "total_cost" (from the last step's info).
    """
    env.reset()
    done = False

    total_reward = 0.0

    while not done:
        # Hand the policy a copy so it can never mutate the env's pipeline
        order_qty = policy.act(int(env.on_hand), env.pipeline.copy())
        a = order_qty_to_action_index(order_qty)

        _, r, done, info = env.step(a)
        total_reward += r

    return {"total_reward": float(total_reward), "total_cost": float(info["total_cost"])}

In [10]:
# =============================================================================
# 7) Main: Train DQN and compare vs baseline
# =============================================================================
def main():
    """
    End-to-end experiment: build the environment, train the DQN agent for
    EPISODES episodes, then compare its episode cost against an (s, S)
    baseline over 50 evaluation episodes and print a summary.
    """
    env_settings = {
        "mean_demand": 20.0,
        "demand_std": 6.0,
        "use_weekly_seasonality": True,
        "min_lead_time": 1,
        "max_lead_time": 5,
        "holding_cost": 1.0,
        "stockout_cost": 12.0,
        "unit_order_cost": 2.0,
        "fixed_order_cost": 15.0,
        "max_inventory": 300,
        "max_pipeline_per_bucket": 200,
    }
    cfg = InventoryEnvConfig(**env_settings)

    # Environment shared by training and both evaluation policies
    env = InventoryEnv(cfg, horizon_days=DAYS_PER_EPISODE, seed=SEED)
    print(f"State dim = {env.state_dim}, Actions = {N_ACTIONS} ({ACTIONS.tolist()})")

    # Learner and its experience store
    agent = DQNAgent(state_dim=env.state_dim, n_actions=N_ACTIONS)
    replay = ReplayBuffer(capacity=REPLAY_CAPACITY)

    # Baseline (s, S) parameters from the usual rule of thumb:
    #   safety stock  ~ z * sigma_demand * sqrt(lead_time), z = 1.65
    #   reorder point ~ mean_demand * avg_lead_time + safety stock
    #   order-up-to   = reorder point + 80 (hand-picked, tune as needed)
    avg_lt = (cfg.min_lead_time + cfg.max_lead_time) / 2.0
    safety_stock = int(round(1.65 * cfg.demand_std * np.sqrt(avg_lt)))
    s = int(round(cfg.mean_demand * avg_lt + safety_stock))
    S = s + 80
    baseline = ReorderPolicy_sS(s=s, S=S)
    print(f"Baseline (s,S) = ({s}, {S})  | safety_stock≈{safety_stock}, avg_lt≈{avg_lt:.2f}")

    # --- Training loop, with a rolling summary every 25 episodes
    history = []
    for ep in range(1, EPISODES + 1):
        history.append(run_episode_with_agent(env, agent, replay, training=True))

        if ep % 25:
            continue

        recent = history[-25:]
        avg_cost = np.mean([h["total_cost"] for h in recent])
        avg_loss = np.mean([h["avg_loss"] for h in recent])
        eps = agent.epsilon()
        print(f"EP {ep:4d} | avg_cost(last25)={avg_cost:8.2f} | avg_loss={avg_loss:8.5f} | eps={eps:5.3f}")

    # --- Evaluation
    # Pushing global_step far past EPS_DECAY_STEPS pins agent.epsilon() at its
    # floor, EPS_END (not zero), for any action chosen through agent.act().
    agent.global_step = int(EPS_DECAY_STEPS * 10)

    eval_runs = 50
    dqn_costs = []
    base_costs = []

    # The two policies alternate on the same env, one episode each per round
    for _ in range(eval_runs):
        dqn_costs.append(run_episode_with_agent(env, agent, replay, training=False)["total_cost"])
        base_costs.append(run_episode_with_policy(env, baseline)["total_cost"])

    print("\n=== Evaluation Results (lower cost is better) ===")
    print(f"DQN:      mean_cost={np.mean(dqn_costs):.2f} | std={np.std(dqn_costs):.2f}")
    print(f"Baseline: mean_cost={np.mean(base_costs):.2f} | std={np.std(base_costs):.2f}")

    base_mean = np.mean(base_costs)
    dqn_mean = np.mean(dqn_costs)
    improvement = (base_mean - dqn_mean) / base_mean * 100.0
    print(f"Relative improvement vs baseline: {improvement:.2f}%")

    # First ten episode costs of each policy, as a sanity check
    print("\nSample episode costs:")
    print("DQN:     ", [round(x, 1) for x in dqn_costs[:10]])
    print("Baseline:", [round(x, 1) for x in base_costs[:10]])


if __name__ == "__main__":
    main()

State dim = 7, Actions = 11 ([0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50])
Baseline (s,S) = (77, 157)  | safety_stock≈17, avg_lt≈3.00
EP   25 | avg_cost(last25)=64845.08 | avg_loss=13507.69178 | eps=0.914
EP   50 | avg_cost(last25)=44195.52 | avg_loss=32081.96900 | eps=0.829
EP   75 | avg_cost(last25)=28632.84 | avg_loss=68666.94621 | eps=0.744
EP  100 | avg_cost(last25)=22893.96 | avg_loss=90401.99663 | eps=0.658
EP  125 | avg_cost(last25)=21435.76 | avg_loss=92433.20650 | eps=0.573
EP  150 | avg_cost(last25)=20392.36 | avg_loss=100011.53665 | eps=0.487
EP  175 | avg_cost(last25)=19173.68 | avg_loss=94663.01036 | eps=0.402
EP  200 | avg_cost(last25)=19005.88 | avg_loss=88652.05983 | eps=0.316
EP  225 | avg_cost(last25)=18573.40 | avg_loss=87852.41867 | eps=0.231
EP  250 | avg_cost(last25)=18289.92 | avg_loss=84201.79950 | eps=0.145
EP  275 | avg_cost(last25)=18762.28 | avg_loss=87433.49176 | eps=0.059
EP  300 | avg_cost(last25)=18474.12 | avg_loss=74763.30404 | eps=0.050
EP  325 | avg_c