In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class MultiPeriodNewsvendorEnv(gym.Env):
    """
    Multi-period newsvendor problem with vendor lead time and lost sales.
    """
    metadata = {"render_modes": ["human"]}

    def __init__(self, lead_time=5, episode_length=40, mu=100, price=50, cost=25, holding=0.5, penalty=5, max_order=300):
        super().__init__()
        self.lead_time = lead_time
        self.episode_length = episode_length
        self.mu = mu  # Mean of Poisson demand
        self.price = price
        self.cost = cost
        self.holding = holding
        self.penalty = penalty
        self.max_order = max_order

        self.action_space = spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(lead_time + 1,), dtype=np.float32)

        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.t = 0
        self.pipeline = [0] * self.lead_time
        self.inventory = 0
        self.state = np.array([self.inventory] + self.pipeline, dtype=np.float32)
        return self.state, {}

    def step(self, action):
        order_qty = int(action[0] * self.max_order)
        demand = np.random.poisson(self.mu)

        # Receive oldest order
        self.inventory += self.pipeline.pop(0)
        self.pipeline.append(order_qty)

        sold = min(self.inventory, demand)
        lost_sales = max(0, demand - self.inventory)
        leftover = max(0, self.inventory - demand)

        revenue = self.price * sold
        cost = self.cost * order_qty
        holding_cost = self.holding * leftover
        penalty_cost = self.penalty * lost_sales

        reward = revenue - cost - holding_cost - penalty_cost
        self.inventory = leftover

        self.state = np.array([self.inventory] + self.pipeline, dtype=np.float32)
        self.t += 1
        terminated = self.t >= self.episode_length

        return self.state, reward, terminated, False, {}

    def render(self):
        print(f"Time: {self.t}, Inventory: {self.inventory}, Pipeline: {self.pipeline}")

In [7]:
# Guardar el modelo entrenado

# Evaluar el modelo
env = MultiPeriodNewsvendorEnv()
obs, _ = env.reset()
total_reward = 0
for _ in range(env.episode_length):
    action= env.action_space.sample()
    print(action*300)
    obs, reward, done, _, _ = env.step(action)
    total_reward += reward
    env.render()
    if done:
        break

print(f"Total reward: {total_reward}")

[291.09695]
Time: 1, Inventory: 0, Pipeline: [0, 0, 0, 0, 291]
[277.9776]
Time: 2, Inventory: 0, Pipeline: [0, 0, 0, 291, 277]
[11.105144]
Time: 3, Inventory: 0, Pipeline: [0, 0, 291, 277, 11]
[151.51865]
Time: 4, Inventory: 0, Pipeline: [0, 291, 277, 11, 151]
[248.24957]
Time: 5, Inventory: 0, Pipeline: [291, 277, 11, 151, 248]
[264.42224]
Time: 6, Inventory: 192, Pipeline: [277, 11, 151, 248, 264]
[69.916565]
Time: 7, Inventory: 365, Pipeline: [11, 151, 248, 264, 69]
[46.619904]
Time: 8, Inventory: 288, Pipeline: [151, 248, 264, 69, 46]
[125.67666]
Time: 9, Inventory: 351, Pipeline: [248, 264, 69, 46, 125]
[236.43262]
Time: 10, Inventory: 493, Pipeline: [264, 69, 46, 125, 236]
[261.69342]
Time: 11, Inventory: 647, Pipeline: [69, 46, 125, 236, 261]
[4.663822]
Time: 12, Inventory: 627, Pipeline: [46, 125, 236, 261, 4]
[143.28151]
Time: 13, Inventory: 586, Pipeline: [125, 236, 261, 4, 143]
[39.12287]
Time: 14, Inventory: 619, Pipeline: [236, 261, 4, 143, 39]
[186.2598]
Time: 15, Invento

In [2]:

import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv



# Crear y validar el entorno
env = MultiPeriodNewsvendorEnv()
check_env(env, warn=True)

# Envolver el entorno para entrenamiento
vec_env = DummyVecEnv([lambda: env])

# Inicializar el agente PPO
model = PPO("MlpPolicy", vec_env, verbose=1, learning_rate=1e-4, n_steps=2048, batch_size=64, gamma=0.99)

# Entrenar el modelo
model.learn(total_timesteps=100_000)





Using cpu device
-----------------------------
| time/              |      |
|    fps             | 7099 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 3748          |
|    iterations           | 2             |
|    time_elapsed         | 1             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00012248501 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.42         |
|    explained_variance   | -9.42e-06     |
|    learning_rate        | 0.0001        |
|    loss                 | 1.8e+08       |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.000979     |
|    std                  | 0.999         |
|    value_loss          

<stable_baselines3.ppo.ppo.PPO at 0x16b3f5a30>

In [8]:
# Guardar el modelo entrenado
model.save("ppo_newsvendor")

# Evaluar el modelo
obs, _ = env.reset()
total_reward = 0
for _ in range(env.episode_length):
    action, _ = model.predict(obs)
    print(action*300)
    obs, reward, done, _, _ = env.step(action)
    total_reward += reward
    env.render()
    if done:
        break

print(f"Total reward: {total_reward}")

[0.]
Time: 1, Inventory: 0, Pipeline: [0, 0, 0, 0, 0]
[141.79575]
Time: 2, Inventory: 0, Pipeline: [0, 0, 0, 0, 141]
[260.26855]
Time: 3, Inventory: 0, Pipeline: [0, 0, 0, 141, 260]
[83.57681]
Time: 4, Inventory: 0, Pipeline: [0, 0, 141, 260, 83]
[0.]
Time: 5, Inventory: 0, Pipeline: [0, 141, 260, 83, 0]
[34.459476]
Time: 6, Inventory: 0, Pipeline: [141, 260, 83, 0, 34]
[300.]
Time: 7, Inventory: 35, Pipeline: [260, 83, 0, 34, 300]
[0.]
Time: 8, Inventory: 187, Pipeline: [83, 0, 34, 300, 0]
[0.]
Time: 9, Inventory: 164, Pipeline: [0, 34, 300, 0, 0]
[175.71413]
Time: 10, Inventory: 56, Pipeline: [34, 300, 0, 0, 175]
[0.]
Time: 11, Inventory: 0, Pipeline: [300, 0, 0, 175, 0]
[0.]
Time: 12, Inventory: 205, Pipeline: [0, 0, 175, 0, 0]
[154.95032]
Time: 13, Inventory: 124, Pipeline: [0, 175, 0, 0, 154]
[0.]
Time: 14, Inventory: 14, Pipeline: [175, 0, 0, 154, 0]
[221.62448]
Time: 15, Inventory: 108, Pipeline: [0, 0, 154, 0, 221]
[46.113102]
Time: 16, Inventory: 9, Pipeline: [0, 154, 0, 221, 