In [1]:
import gym
import numpy as np
import random
import math
from gym import spaces
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

# -----------------------------
# Supply Chain Gym Environment
# -----------------------------
class SupplyChainEnv(gym.Env):
    """
    A simple single-product supply chain simulation environment.

    At each time step the agent selects a production multiplier (0, 1, or 2)
    that scales ``baseline_production``. With probability ``DISRUPTION_PROB``
    the production of that step is lost entirely. Demand is a random integer
    in ``[demand_min, demand_max]`` scaled by a sinusoidal seasonal effect.

    The observation is a float32 array:
      - current inventory level (raw units, not normalized)
      - current seasonal signal, sin(2 * pi * step / season_period)

    The reward is the negative total cost incurred in the step:
      cost = production_cost * production
           + holding_cost * new_inventory
           + shortage_penalty * shortage

    Randomness is drawn from the module-level ``random`` generator, so call
    ``random.seed(...)`` before an episode if reproducibility is needed.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self):
        super().__init__()

        # Define action and observation spaces
        # Action: 0 (no production), 1 (normal production), 2 (increased production)
        self.action_space = spaces.Discrete(3)
        # Observation: [inventory level, seasonal factor].
        # The bounds are created as float32 up front so that Box does not have
        # to down-cast float64 arrays (which emits a precision warning).
        self.observation_space = spaces.Box(
            low=np.array([0.0, -1.0], dtype=np.float32),
            high=np.array([100.0, 1.0], dtype=np.float32),
            dtype=np.float32,
        )

        # Environment parameters
        self.baseline_production = 1.0      # base production units
        self.production_cost = 0.5          # cost per unit produced
        self.holding_cost = 0.05            # cost per unit in inventory per step
        self.shortage_penalty = 2.0         # penalty per unit short
        self.DISRUPTION_PROB = 0.1          # probability that production is disrupted (production becomes 0)

        # Demand generation parameters
        self.demand_min = 1
        self.demand_max = 3
        self.seasonal_variation = 0.3       # amplitude of seasonal effect
        self.season_period = 20             # period of the seasonal sine wave

        # Episode parameters
        self.max_steps = 100
        self.current_step = 0
        self.inventory = 0.0

    def reset(self):
        """Start a new episode with empty inventory and return the first observation."""
        self.current_step = 0
        self.inventory = 0.0
        return self._get_obs()

    def _seasonal_signal(self):
        """Return sin(2 * pi * current_step / season_period), a value in [-1, 1]."""
        return math.sin(2 * math.pi * self.current_step / self.season_period)

    def _get_obs(self):
        """Return the observation ``[inventory, seasonal signal]`` as float32."""
        # Inventory is reported raw (not normalized).
        return np.array([self.inventory, self._seasonal_signal()], dtype=np.float32)

    def step(self, action):
        """
        Advance the simulation by one time step.

        Action: production multiplier
            0: no production
            1: normal production (1 unit)
            2: increased production (2 units)

        Returns the 4-tuple ``(observation, reward, done, info)`` where
        ``reward`` is the negative step cost, ``done`` becomes True once
        ``max_steps`` steps have been taken, and ``info`` holds the
        per-step quantities for debugging.
        """
        # Convert action into production target
        multiplier = float(action)  # 0, 1, or 2
        planned_production = multiplier * self.baseline_production

        # Check for production disruption: a disrupted step produces nothing
        disruption = random.random() < self.DISRUPTION_PROB
        production = 0.0 if disruption else planned_production

        # Generate demand (seasonal effect)
        # Seasonal factor (scales demand): 1 + variation * sin(...)
        # Evaluated before current_step is advanced, so it matches the
        # seasonal signal of the observation the agent acted on.
        seasonal_effect = 1 + self.seasonal_variation * self._seasonal_signal()
        demand = random.randint(self.demand_min, self.demand_max) * seasonal_effect

        # Update inventory: unmet demand is lost (no backlog is carried over)
        available = self.inventory + production
        if available >= demand:
            shortage = 0.0
            new_inventory = available - demand
        else:
            shortage = demand - available
            new_inventory = 0.0

        # Compute cost
        cost = (
            self.production_cost * production
            + self.holding_cost * new_inventory
            + self.shortage_penalty * shortage
        )
        reward = -cost  # Our goal is to minimize cost

        # Update state variables
        self.inventory = new_inventory
        self.current_step += 1
        done = self.current_step >= self.max_steps

        # Optional: info dict for debugging
        info = {
            'production': production,
            'planned_production': planned_production,
            'disruption': disruption,
            'demand': demand,
            'shortage': shortage,
            'inventory': self.inventory,
            'cost': cost
        }

        return self._get_obs(), reward, done, info

    def render(self, mode='human'):
        """Print the current step counter and inventory level."""
        print(f"Step: {self.current_step}, Inventory: {self.inventory:.2f}")

# -----------------------------
# Deep Q-Network (DQN) Agent
# -----------------------------
class DQN(nn.Module):
    """Fully connected Q-network: two hidden ReLU layers, one output per action."""

    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        # Layers are listed in forward order; the Sequential container keeps
        # the parameter names net.0, net.2 and net.4.
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Map a batch of states to a batch of per-action Q-value estimates."""
        q_values = self.net(state)
        return q_values

class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity=10000):
        # A deque with maxlen drops the oldest transition once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns five numpy arrays: states, actions, rewards, next states and
        done flags, each with one entry per sampled transition.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one array per field.
        states, actions, rewards, next_states, dones = (
            np.array(column) for column in zip(*picked)
        )
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

class DQNAgent:
    """Epsilon-greedy DQN agent with a policy network, a target network and a replay buffer."""

    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99, epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=0.995):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        # Exploration schedule: epsilon is multiplied by epsilon_decay after
        # every gradient step and is never allowed below epsilon_end.
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

        # The target network starts as an exact copy of the policy network and
        # is kept in eval mode; it is only changed through update_target().
        self.policy_net = DQN(state_dim, action_dim).to(self.device)
        self.target_net = DQN(state_dim, action_dim).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.replay_buffer = ReplayBuffer()

    def select_action(self, state):
        """Return a random action with probability epsilon, otherwise the greedy one."""
        explore = random.random() < self.epsilon
        if explore:
            return random.randrange(self.action_dim)

        # Greedy branch: add a batch dimension and pick the highest Q-value.
        batched_state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.policy_net(batched_state)
        return q_values.argmax().item()

    def update(self, batch_size):
        """Run one gradient step on a sampled mini-batch, then decay epsilon.

        Does nothing (and leaves epsilon untouched) while the replay buffer
        holds fewer than ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < batch_size:
            return

        batch = self.replay_buffer.sample(batch_size)
        states, actions, rewards, next_states, dones = batch

        # Per-sample scalars get a trailing dimension so they line up with
        # the (batch, 1) Q-value columns below.
        states = torch.FloatTensor(states).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(self.device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(self.device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)

        # Q(s, a) of the actions that were actually taken
        current_q = self.policy_net(states).gather(1, actions)

        # max_a' Q_target(s', a'), computed without tracking gradients
        with torch.no_grad():
            next_q = self.target_net(next_states).max(dim=1, keepdim=True)[0]

        # One-step TD target; the bootstrap term is zeroed where done is set
        not_done = 1 - dones
        target_q = rewards + self.gamma * next_q * not_done

        criterion = nn.MSELoss()
        loss = criterion(current_q, target_q)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay epsilon
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = max(decayed, self.epsilon_end)

    def update_target(self):
        """Copy the policy network's current weights into the target network."""
        policy_weights = self.policy_net.state_dict()
        self.target_net.load_state_dict(policy_weights)

# -----------------------------
# Training Loop
# -----------------------------
def train(num_episodes=500, batch_size=64, target_update=10):
    """Train a DQNAgent on SupplyChainEnv and return it with the reward history.

    Args:
        num_episodes: number of episodes to run.
        batch_size: mini-batch size passed to ``agent.update`` after every step.
        target_update: the target network is synced on every episode whose
            zero-based index is a multiple of this value.

    Returns:
        ``(agent, episode_rewards)`` where ``episode_rewards`` is the list of
        total rewards, one entry per episode.
    """
    env = SupplyChainEnv()
    agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)

    episode_rewards = []

    for episode in range(num_episodes):
        state = env.reset()
        total_reward = 0.0
        done = False

        while not done:
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)

            agent.replay_buffer.push(state, action, reward, next_state, done)
            total_reward += reward
            state = next_state

            # One learning step per environment step
            agent.update(batch_size)

        # Update target network periodically
        if episode % target_update == 0:
            agent.update_target()

        episode_rewards.append(total_reward)

        # Progress report every 10 episodes, averaged over the last 10
        if (episode + 1) % 10 != 0:
            continue
        avg_reward = np.mean(episode_rewards[-10:])
        print(f"Episode {episode+1}, Average Reward: {avg_reward:.2f}, Epsilon: {agent.epsilon:.2f}")

    return agent, episode_rewards

# Script entry point: run training with the default hyperparameters and keep
# the trained agent and the per-episode reward history.
if "__main__" == __name__:
    trained_agent, rewards = train()


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Episode 10, Average Reward: -202.72, Epsilon: 0.01
Episode 20, Average Reward: -173.71, Epsilon: 0.01
Episode 30, Average Reward: -162.69, Epsilon: 0.01
Episode 40, Average Reward: -158.16, Epsilon: 0.01
Episode 50, Average Reward: -164.72, Epsilon: 0.01
Episode 60, Average Reward: -162.80, Epsilon: 0.01
Episode 70, Average Reward: -160.51, Epsilon: 0.01
Episode 80, Average Reward: -162.06, Epsilon: 0.01
Episode 90, Average Reward: -166.02, Epsilon: 0.01
Episode 100, Average Reward: -158.39, Epsilon: 0.01
Episode 110, Average Reward: -170.03, Epsilon: 0.01
Episode 120, Average Reward: -156.74, Epsilon: 0.01
Episode 130, Average Reward: -165.59, Epsilon: 0.01
Episode 140, Average Reward: -158.50, Epsilon: 0.01
Episode 150, Average Reward: -163.35, Epsilon: 0.01
Episode 160, Average Reward: -167.54, Epsilon: 0.01
Episode 170, Average Reward: -166.29, Epsilon: 0.01
Episode 180, Average Reward: -169.00, Epsilon: 0.01
Episode 190, Average Reward: -167.75, Epsilon: 0.01
Episode 200, Average 