In [29]:
# Define the environment
import gym
from gym import spaces
import numpy as np
from stable_baselines3 import PPO, A2C, DQN
from sb3_contrib import TRPO

# Define the market environment
class MarketEnvironment(gym.Env):
    def __init__(self):
        super(MarketEnvironment, self).__init__()

        # Define action and observation space
        # They must be gym.spaces objects
        # Example when using discrete actions, Box(2,) for two sellers
        self.action_space = spaces.MultiDiscrete([101, 101])

        # Prices could range from 0 to 100, there are four buyers
        self.observation_space = spaces.Box(low=0, high=100, shape=(6,))
        self.prices_history = []


        # Initialize state
        self.reset()

    def step(self, action):
        # Execute one time step within the environment
        assert self.action_space.contains(action)

        # Simple model: buyers buy from the cheapest seller
        sorted_sellers = np.argsort(action)
        self.state[0] = action[sorted_sellers[0]]
        self.state[1] = action[sorted_sellers[1]]

        # Distribute the buyers
        for i in range(2, 6):
            if self.state[i] >= self.state[0]:
                self.state[0] += self.state[i]
                self.state[i] = 0
            elif self.state[i] >= self.state[1]:
                self.state[1] += self.state[i]
                self.state[i] = 0

        # Set reward as the profit of the sellers
        reward = self.state[0] + self.state[1]

        # Set done flag if all buyers have bought the products
        done = np.sum(self.state[2:]) == 0

        # Save the prices to history
        self.prices_history.append(action)

        return self.state, reward, done, {}


    def reset(self):
        # Reset the state of the environment to an initial state
        self.state = np.zeros(6)
        # Initialize buyers' willingness to pay
        self.state[2:6] = np.random.uniform(low=0, high=100, size=4)
        return self.state

# Initialize environment
env = MarketEnvironment()

# Initialize reinforcement learning agents
model1 = PPO("MlpPolicy", env, verbose=1)
model2 = TRPO("MlpPolicy", env, verbose=1)
# Train agents
model1.learn(total_timesteps=20000)
model2.learn(total_timesteps=20000)



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 10.3     |
|    ep_rew_mean     | 1.23e+03 |
| time/              |          |
|    fps             | 918      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 9.52         |
|    ep_rew_mean          | 1.15e+03     |
| time/                   |              |
|    fps                  | 761          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0024758424

<sb3_contrib.trpo.trpo.TRPO at 0x7f0f5571d340>

In [39]:
def evaluate_model(model, env, num_episodes=100):
    episode_rewards = []
    episode_actions = []
    for i in range(num_episodes):
        obs = env.reset()
        done = False
        episode_reward = 0
        actions = []
        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            episode_reward += reward
            actions.append(action)
        episode_rewards.append(episode_reward)
        episode_actions.append(actions)
    return np.mean(episode_rewards), episode_actions

# Evaluate the first agent
mean_reward1, actions1 = evaluate_model(model1, env)
print(f"Mean reward for the first agent: {mean_reward1}")

# Evaluate the second agent
mean_reward2, actions2 = evaluate_model(model2, env)
print(f"Mean reward for the second agent: {mean_reward2}")


Mean reward for the first agent: 2160.6553933399573
Mean reward for the second agent: 12321.024391415709
