In [18]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
from tqdm import tqdm
from IPython.display import clear_output
import matplotlib.pyplot as plt

from stable_baselines3.common.env_util import make_vec_env


In [19]:
# Simple Q-network
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)


# Hyperparameters
BATCH_SIZE = 64
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 500
LR = 1e-3
TARGET_UPDATE = 10
MEM_SIZE = 10000

# env = gym.make("CartPole-v1")
env = make_vec_env("CartPole-v1", n_envs=4)

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

policy_net = QNetwork(state_size, action_size)
target_net = QNetwork(state_size, action_size)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=LR)
memory = deque(maxlen=MEM_SIZE)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy_net.to(device)
target_net.to(device)

def select_action(state, steps_done):
    eps_threshold = EPS_END + (EPS_START - EPS_END) * np.exp(
        -1.0 * steps_done / EPS_DECAY
    )
    if random.random() < eps_threshold:
        return env.action_space.sample()
    else:
        with torch.no_grad():
            return policy_net(torch.FloatTensor(state)).argmax().item()


def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    batch = random.sample(memory, BATCH_SIZE)

    states, actions, rewards, next_states, dones = zip(*batch)

    states = torch.FloatTensor(states).to(device)
    actions = torch.LongTensor(actions).to(device)
    rewards = torch.FloatTensor(rewards).to(device)
    next_states = torch.FloatTensor(next_states).to(device)
    print(type(states))

    dones = torch.FloatTensor(dones)

    q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    next_q_values = target_net(next_states).max(1)[0]
    target = rewards + (1 - dones) * GAMMA * next_q_values

    loss = nn.MSELoss()(q_values, target.detach())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


num_episodes = 200
steps_done = 0

progress_bar = tqdm(range(num_episodes), desc="Training")

# for episode in progress_bar:
#     env.reset()
#     state, _ = env.reset()
#     total_reward = 0
#     done = False
#     while not done:
#         action = select_action(state, steps_done)
#         steps_done += 1
#         next_state, reward, terminated, truncated, _ = env.step(action)
#         done = terminated or truncated
#         memory.append((state, action, reward, next_state, float(done)))
#         state = next_state
#         total_reward += reward

#         optimize_model()

#     if episode % TARGET_UPDATE == 0:
#         target_net.load_state_dict(policy_net.state_dict())

#     progress_bar.set_postfix({"Episode": episode, "Reward": total_reward})
all_rewards = []
plotting = False  # Set to True to enable plotting

for episode in progress_bar:
    state, _ = env.reset()
    total_reward = 0
    done = False
    while not done:
        action = select_action(state, steps_done)
        steps_done += 1
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        memory.append((state, action, reward, next_state, float(done)))
        state = next_state
        total_reward += reward

        optimize_model()

    if episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

    all_rewards.append(total_reward)

    # Clear and update the plot every few episodes to reduce flicker
    if episode % 10 == 0 and plotting:
        clear_output(wait=True)
        plt.figure(figsize=(10, 5))
        plt.title("Total Rewards by Episode")
        plt.xlabel("Episode")
        plt.ylabel("Total Reward")
        plt.plot(all_rewards)
        plt.show()

    progress_bar.set_postfix({"Episode": episode, "Reward": total_reward})


env.close()

Training:   0%|          | 0/200 [00:00<?, ?it/s]


ValueError: too many values to unpack (expected 2)

In [15]:
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make("CartPole-v1")

model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=20000)

mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

obs, _ = env.reset()
for _ in range(1000):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, truncated, info = env.step(action)
    env.render()
    if done or truncated:
        obs, _ = env.reset()

env.close()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 26.8     |
|    ep_rew_mean      | 26.8     |
|    exploration_rate | 0.949    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 21402    |
|    time_elapsed     | 0        |
|    total_timesteps  | 107      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.516    |
|    n_updates        | 1        |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 21.5     |
|    ep_rew_mean      | 21.5     |
|    exploration_rate | 0.918    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7995     |
|    time_elapsed     | 0        |
|    total_timesteps  | 172      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.484    |
|    n_updates        | 17       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 20.3     |
|    ep_rew_mean      | 20.3     |
|    exploration_rate | 0.884    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 6096     |
|    time_elapsed     | 0        |
|    total_timesteps  | 244      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.444    |
|    n_updates      

  gym.logger.warn(
