# Reinforcement Learning on CartPole-v1

In [None]:
# Dependencies: uncomment to install into the kernel's environment.
#%pip install stable_baselines3
#%pip install gymnasium
#%pip install pygame
#%pip install opencv-python

In [24]:
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.monitor import Monitor
import gymnasium as gym
import os

import tensorflow as tf
# Make every GPU that TensorFlow detects visible to TensorFlow.
# NOTE(review): no other cell in this notebook calls TensorFlow, and the training log below
# reports "Using cpu device", so this presumably has no effect on the DQN run -- confirm
# whether the TensorFlow import and this call are still needed.
tf.config.set_visible_devices(tf.config.list_physical_devices('GPU'), 'GPU')

In [25]:
# Output locations, both under ./Training:
#   save_path -> saved model files (also given to EvalCallback as best_model_save_path)
#   log_path  -> passed to DQN as tensorboard_log
training_root = 'Training'
save_path = os.path.join(training_root, 'Saved Models')
log_path = os.path.join(training_root, 'Logs')

In [34]:
# Training environment. rgb_array rendering lets frames be grabbed later without a window.
env = gym.make("CartPole-v1", render_mode="rgb_array")

# Monitor records per-episode rewards/lengths (read back later via env.get_episode_rewards()).
env = Monitor(env)

# Dedicated evaluation environment. Evaluating on the training env itself would reset it in the
# middle of data collection and mix evaluation episodes into the training Monitor statistics,
# so the callback gets its own independent, Monitor-wrapped copy.
eval_env = Monitor(gym.make("CartPole-v1"))

# Stop training as soon as an evaluation yields a new best mean reward that reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Evaluate every `eval_freq` training steps; the best model so far is written under save_path.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [35]:
# Linearly decaying learning-rate schedule
def custom_learning_rate(progress_remaining):
    """Return the learning rate for the given fraction of training still remaining.

    The rate is proportional to ``progress_remaining``: 0.001 when it is 1 and 0 when it is 0.
    """
    initial_rate = 0.001
    return initial_rate * progress_remaining

# Hyperparameters handed to the DQN constructor.
# Note: this constant learning rate -- not the custom_learning_rate schedule -- is what the
# model cell passes as `learning_rate`.
learning_rate = 0.0001
gamma = 0.9  # adjusted gamma value

In [39]:
# Build the DQN agent with an MLP policy on the monitored environment
dqn_settings = dict(
    verbose=1,
    tensorboard_log=log_path,
    learning_rate=learning_rate,
    gamma=gamma,
)
model = DQN("MlpPolicy", env, **dqn_settings)

# Train; eval_callback runs periodic evaluations during learning.
# Try with more timesteps. The author reports decent results after training for 50000 steps.
model.learn(total_timesteps=100000, callback=eval_callback)

# Persist the agent as it stands at the end of training
final_model_path = os.path.join(save_path, 'dqn_cartpole')
model.save(final_model_path)

Using cpu device
Wrapping the env in a DummyVecEnv.
Logging to Training/Logs/DQN_10
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 18.8     |
|    ep_rew_mean      | 18.8     |
|    exploration_rate | 0.993    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 13801    |
|    time_elapsed     | 0        |
|    total_timesteps  | 75       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 20.9     |
|    ep_rew_mean      | 20.9     |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 6895     |
|    time_elapsed     | 0        |
|    total_timesteps  | 167      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.522    |
|    n_updates        | 16       |
----------------------------------
------

In [40]:
# Drop the in-memory agent, then restore one from disk to demonstrate saving and loading.
del model
# 'best_model' sits in the directory given to EvalCallback as best_model_save_path
# (presumably the checkpoint that callback wrote -- verify the file exists before loading).
best_model_path = os.path.join(save_path, 'best_model')
model = DQN.load(best_model_path)

In [41]:
# Get the five highest episode rewards recorded by the Monitor wrapper
episode_rewards = env.get_episode_rewards()
sorted_episode_rewards = list(episode_rewards)
sorted_episode_rewards.sort(reverse=True)
sorted_episode_rewards[:5]

[499.0, 480.0, 346.0, 332.0, 296.0]

#### Evaluation

In [42]:
# Evaluate the loaded agent over 10 episodes without rendering; displays (mean, std) of the reward
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=False)
mean_reward, std_reward

(289.5, 37.53465065775889)

In [43]:
# Shut down the environment now that training and evaluation are finished.
# NOTE(review): any later cell that still uses `env` must re-create it first.
env.close()

#### Test Model

In [47]:
import pygame
import cv2

# Roll out a single episode with the trained agent and show each frame in an OpenCV window.
# A dedicated environment is created here because `env` has already been closed by an earlier
# cell; stepping or rendering a closed environment is unreliable.
test_env = gym.make("CartPole-v1", render_mode="rgb_array")

try:
    obs, info = test_env.reset()
    episode_over = False
    while not episode_over:
        # Greedy (deterministic) action from the trained policy
        action, _states = model.predict(obs, deterministic=True)
        # predict() returns a NumPy value; the Discrete action space wants a plain int
        obs, reward, terminated, truncated, info = test_env.step(int(action))
        episode_over = terminated or truncated

        # render() yields an RGB frame, whereas cv2.imshow expects BGR channel order
        rgb_array = test_env.render()
        cv2.imshow("CartPole-v1", cv2.cvtColor(rgb_array, cv2.COLOR_RGB2BGR))
        cv2.waitKey(1)  # Wait for 1 millisecond to display the image
except pygame.error as e:
    # Tolerate the pygame surface being torn down mid-episode; re-raise anything else untouched
    if str(e) == "display Surface quit":
        print("Pygame window closed. Exiting the loop.")
    else:
        raise
finally:
    # Always release the environment and close the windows, even if the rollout failed
    test_env.close()
    cv2.destroyAllWindows()

#### View Logs in Tensorboard

In [48]:
# Directory of the training run to inspect in TensorBoard.
# "DQN_10" is the name of the log directory generated for this run (see the "Logging to ..." line
# in the training output). You will change this name in your code.
run_name = "DQN_10"
train_log_path = os.path.join(log_path, run_name)

In [49]:
# Launch TensorBoard on the selected run directory via a shell command.
# The server keeps running (and this cell keeps executing) until it is interrupted with CTRL+C.
!tensorboard --logdir={train_log_path}


Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.16.2 at http://localhost:6006/ (Press CTRL+C to quit)
^C
