<a href="https://colab.research.google.com/github/pj0620/google-colab-notebooks/blob/main/Minesweeper_rl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random
from scipy.signal import convolve2d
%pip install stable-baselines3[extra]

import numpy as np
import gymnasium as gym
from gymnasium import spaces

!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]

!apt-get install swig cmake ffmpeg
!pip install git+https://github.com/DLR-RM/rl-baselines3-zoo

Collecting shimmy~=1.3.0 (from shimmy[atari]~=1.3.0; extra == "extra"->stable-baselines3[extra])
  Using cached Shimmy-1.3.0-py3-none-any.whl.metadata (3.7 kB)
Collecting autorom~=0.6.1 (from autorom[accept-rom-license]~=0.6.1; extra == "extra"->stable-baselines3[extra])
  Using cached AutoROM-0.6.1-py3-none-any.whl.metadata (2.4 kB)
Using cached AutoROM-0.6.1-py3-none-any.whl (9.4 kB)
Using cached Shimmy-1.3.0-py3-none-any.whl (37 kB)
Installing collected packages: shimmy, autorom
  Attempting uninstall: shimmy
    Found existing installation: Shimmy 0.2.1
    Uninstalling Shimmy-0.2.1:
      Successfully uninstalled Shimmy-0.2.1
  Attempting uninstall: autorom
    Found existing installation: AutoROM 0.4.2
    Uninstalling AutoROM-0.4.2:
      Successfully uninstalled AutoROM-0.4.2
Successfully installed autorom-0.6.1 shimmy-1.3.0
Collecting shimmy<1.0,>=0.1.0 (from shimmy[atari]<1.0,>=0.1.0; extra == "atari"->gymnasium[atari])
  Using cached Shimmy-0.2.1-py3-none-any.whl.metadata (2

Collecting autorom~=0.4.2 (from autorom[accept-rom-license]~=0.4.2; extra == "accept-rom-license"->gymnasium[accept-rom-license])
  Using cached AutoROM-0.4.2-py3-none-any.whl.metadata (2.8 kB)
Using cached AutoROM-0.4.2-py3-none-any.whl (16 kB)
Installing collected packages: autorom
  Attempting uninstall: autorom
    Found existing installation: AutoROM 0.6.1
    Uninstalling AutoROM-0.6.1:
      Successfully uninstalled AutoROM-0.6.1
Successfully installed autorom-0.4.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 49 not upgraded.
Need to get 1,116 kB of archives.
A

# Hyperparameter configurations
Create a file named `dqn.yml` (the name passed to `rl_zoo3.train` via `-c` later in this notebook) with the following contents:

```
Minesweeper-v1:
  frame_stack: 1
  policy: 'CnnPolicy'
  n_timesteps: !!float 1e6
  buffer_size: 100000
  learning_rate: !!float 1e-4
  batch_size: 32
  learning_starts: 100000
  target_update_interval: 1000
  train_freq: 4
  gradient_steps: 1
  exploration_fraction: 0.1
  exploration_final_eps: 0.01
  # If True, you need to deactivate handle_timeout_termination
  # in the replay_buffer_kwargs
  optimize_memory_usage: False
```





## Custom gymnasium env

In [112]:
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from skimage.transform import resize

class MinesweeperEnvironment(gym.Env):
  """Minesweeper board exposed as a Gymnasium environment.

  Actions are flat cell indices in ``[0, board_size**2)``; index ``a`` maps to
  row ``a // board_size`` and column ``a % board_size``.

  Observations are ``(84, 84, 3)`` ``uint8`` images produced by upsampling the
  board: channel 0 holds the neighbour counts of revealed cells, channel 1 the
  revealed mask, channel 2 is always zero.

  Rewards: the number of cells newly revealed by the move, or ``-100.0`` when
  the chosen cell is already revealed or contains a bomb (the board is left
  unchanged in that case).

  The episode terminates once every non-bomb cell is revealed.

  NOTE(review): clicking a bomb is penalised but does not end the episode, and
  the flood fill only uncovers zero-valued cells (not the numbered border
  around them). Both are kept as-is here; confirm they are intended.
  """
  # Because of google colab, we cannot implement the GUI ('human' render mode)
  metadata = {"render_modes": ["console"]}

  def __init__(self, board_size=14, total_bombs=14, render_mode="console"):
    """Create a board and deal the first layout.

    Args:
      board_size: Side length of the square board; must be at least 1.
      total_bombs: Number of bombs to place; must fit on the board.
      render_mode: Only ``"console"`` is supported by ``render``.

    Raises:
      ValueError: If ``board_size`` is not positive or ``total_bombs`` does
        not fit on the board.
    """
    super().__init__()
    if board_size < 1:
      raise ValueError(f"board_size must be >= 1, got {board_size}")
    if not 0 <= total_bombs <= board_size**2:
      # Without this guard the bomb placement could never finish.
      raise ValueError(
          f"total_bombs must be between 0 and {board_size**2}, got {total_bombs}"
      )

    self.render_mode = render_mode
    self.board_size = board_size
    self.total_bombs = total_bombs

    self.visible = np.zeros((self.board_size, self.board_size), dtype=np.uint8)
    self.set_bombs()
    self.set_values()

    # Image observation (see get_state for the channel layout)
    self.observation_space = spaces.Box(
        low=0, high=255, shape=(84, 84, 3), dtype=np.uint8
    )

    # One discrete action per board cell
    self.action_space = spaces.Discrete(self.board_size**2)

    self.last_action = -1

  def set_bombs(self):
    """Place ``total_bombs`` bombs on distinct, uniformly chosen cells.

    Draws from ``self.np_random`` so that ``reset(seed=...)`` makes the
    layout reproducible.
    """
    self.bombs = np.zeros(shape=(self.board_size, self.board_size), dtype=np.uint8)
    # Sampling without replacement yields distinct cells in a single draw.
    bomb_cells = self.np_random.choice(
        self.board_size**2, size=self.total_bombs, replace=False
    )
    self.bombs.flat[bomb_cells] = 1

  def set_values(self):
    """Compute, for every cell, the bomb count of its 3x3 neighbourhood.

    The all-ones kernel includes the centre, so a bomb cell counts itself.
    """
    kernel = np.ones((3, 3))
    self.vals = convolve2d(self.bombs, kernel, mode='same').astype(np.uint8)

  def reset(self, seed=None, options=None):
    """Start a new game with a fresh bomb layout.

    Args:
      seed: Optional seed forwarded to ``gym.Env.reset`` to (re)seed
        ``self.np_random``.
      options: Unused.

    Returns:
      ``(observation, info)`` where ``info`` is an empty dict.
    """
    # Seeds self.np_random when a seed is given.
    super().reset(seed=seed)

    self.visible = np.zeros((self.board_size, self.board_size), dtype=np.uint8)
    self.set_bombs()
    self.set_values()
    self.last_action = -1

    return self.get_state(), {}

  def propogate(self, x: int, y: int):
    """Reveal the connected region of zero-valued cells containing ``(x, y)``.

    Cells that are bombs, already revealed, or have a non-zero value are
    left untouched and stop the spread. Uses an explicit stack instead of
    recursion so large boards cannot exceed the interpreter recursion limit.
    """
    pending = [(x, y)]
    while pending:
      cx, cy = pending.pop()
      if self.bombs[cx][cy] == 1 or self.visible[cx][cy] == 1:
        continue
      if self.vals[cx][cy] != 0:
        continue

      self.visible[cx][cy] = 1
      # Queue the (up to 8) in-bounds neighbours.
      for nx in range(max(cx - 1, 0), min(cx + 2, self.board_size)):
        for ny in range(max(cy - 1, 0), min(cy + 2, self.board_size)):
          if (nx, ny) != (cx, cy):
            pending.append((nx, ny))

  def step(self, action):
    """Reveal the cell addressed by ``action``.

    Args:
      action: Flat cell index in ``[0, board_size**2)``.

    Returns:
      ``(observation, reward, terminated, truncated, info)``. ``reward`` is
      the number of newly revealed cells, or ``-100.0`` for an already
      revealed cell or a bomb. ``truncated`` is always ``False`` and ``info``
      is an empty dict.
    """
    action = int(action)  # accept numpy integer scalars as well as Python ints
    x, y = divmod(action, self.board_size)

    start_visible_cells = np.sum(self.visible)
    start_teminated = bool((self.board_size**2 - start_visible_cells) == self.total_bombs)
    # Invalid move: nothing on the board changes, only the penalty is returned.
    if (self.visible[x][y] == 1) or (action == self.last_action) or (self.bombs[x][y] == 1):
      return self.get_state(), -100.0, start_teminated, False, {}

    self.last_action = action

    if self.vals[x][y] == 0:
      self.propogate(x, y)

    self.visible[x][y] = 1
    end_visible_cells = np.sum(self.visible)
    # Won when the only hidden cells left are the bombs.
    teminated = bool((self.board_size**2 - end_visible_cells) == self.total_bombs)
    reward = float(end_visible_cells) - float(start_visible_cells)
    return self.get_state(), reward, teminated, False, {}

  def get_state(self):
    """Render the board into the ``(84, 84, 3)`` ``uint8`` observation image.

    Every channel is multiplied by ``255 / 8`` before upsampling, so a
    revealed 8 maps to 255 in channel 0 while the revealed mask in channel 1
    tops out at about 31.
    """
    revealed_values = self.visible * self.vals
    empty_channel = np.zeros((self.board_size, self.board_size))
    res = (255 / 8) * np.stack((revealed_values, self.visible, empty_channel), axis=2)
    return resize(res, (84, 84, 3), anti_aliasing=False).astype(np.uint8)

  def render(self, mode="console"):
    """Print the board: revealed cells show their value, hidden cells ``*``.

    Args:
      mode: Unused; the mode given at construction (``self.render_mode``)
        decides whether rendering is supported.

    Raises:
      NotImplementedError: If ``self.render_mode`` is not ``"console"``.
    """
    if self.render_mode != "console":
      raise NotImplementedError("Render mode not supported.")

    print("Current Board:")
    for i in range(self.board_size):
      cells = []
      for j in range(self.board_size):
        if self.visible[i][j] != 1:
          cells.append("* ")
        elif self.bombs[i][j] == 1:
          cells.append("B ")
        else:
          cells.append(f"{self.vals[i][j]} ")
      print("".join(cells))
    print("\n")


from stable_baselines3.common.env_checker import check_env

env = MinesweeperEnvironment(board_size=10, total_bombs=10)

# Validate the env against the Gymnasium API, then play a few random games.
check_env(env)
# `episode` rather than `round`: a module-level `round` would shadow the
# builtin for every later cell in the kernel.
for episode in range(10):
  obs, info = env.reset()
  for _ in range(10):
      # Random action
      action = env.action_space.sample()
      obs, reward, terminated, truncated, info = env.step(action)
      if terminated:
          obs, info = env.reset()
      # env.render()
  env.reset()

  and should_run_async(code)


In [113]:
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env

# Single-environment vectorised wrapper around the Minesweeper env
vec_env = make_vec_env(MinesweeperEnvironment, n_envs=1)

# --- DQN hyperparameters ---
learning_rate = 0.1e-4          # optimiser step size (i.e. 1e-5)
buffer_size = 100000            # replay buffer capacity
learning_starts = 10000         # steps collected before the first update
batch_size = 32                 # minibatch size per update
train_freq = 4                  # update every `train_freq` steps
target_update_interval = 500    # steps between target-network updates
exploration_fraction = 0.2      # share of training over which epsilon is annealed
exploration_final_eps = 0.01    # epsilon once annealing has finished

dqn_hyperparams = dict(
    learning_rate=learning_rate,
    buffer_size=buffer_size,
    learning_starts=learning_starts,
    batch_size=batch_size,
    train_freq=train_freq,
    target_update_interval=target_update_interval,
    exploration_fraction=exploration_fraction,
    exploration_final_eps=exploration_final_eps,
)

# CnnPolicy: the env's observations are images
model = DQN("CnnPolicy", vec_env, verbose=1, **dqn_hyperparams)
model.learn(total_timesteps=100000)

Using cuda device
Wrapping the env in a VecTransposeImage.
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 742       |
|    ep_rew_mean      | -6.66e+04 |
|    exploration_rate | 0.853     |
| time/               |           |
|    episodes         | 4         |
|    fps              | 560       |
|    time_elapsed     | 5         |
|    total_timesteps  | 2966      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 967       |
|    ep_rew_mean      | -8.89e+04 |
|    exploration_rate | 0.617     |
| time/               |           |
|    episodes         | 8         |
|    fps              | 508       |
|    time_elapsed     | 15        |
|    total_timesteps  | 7734      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 1.4e+03   |
|    ep_rew_mean      | -1.32e+05 |
|    

<stable_baselines3.dqn.dqn.DQN at 0x7a33f171fd30>

In [114]:
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env

# Fresh single-environment vectorised wrapper for the PPO run
vec_env = make_vec_env(MinesweeperEnvironment, n_envs=1)

# --- PPO hyperparameters ---
learning_rate = 1e-4    # optimiser step size
n_steps = 2048          # rollout length per environment between updates
batch_size = 64         # minibatch size
n_epochs = 10           # optimisation passes over each rollout
gamma = 0.99            # discount factor
gae_lambda = 0.95       # GAE lambda (advantage-estimation variance trade-off)
clip_range = 0.2        # PPO clipping range

ppo_hyperparams = dict(
    learning_rate=learning_rate,
    n_steps=n_steps,
    batch_size=batch_size,
    n_epochs=n_epochs,
    gamma=gamma,
    gae_lambda=gae_lambda,
    clip_range=clip_range,
)

# NOTE(review): the env emits image observations and the DQN cell uses
# "CnnPolicy"; confirm that "MlpPolicy" is the intended choice here.
model = PPO("MlpPolicy", vec_env, verbose=1, **ppo_hyperparams)

# Train the model
model.learn(total_timesteps=100000)

Using cuda device
Wrapping the env in a VecTransposeImage.




----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1.02e+03  |
|    ep_rew_mean     | -9.41e+04 |
| time/              |           |
|    fps             | 305       |
|    iterations      | 1         |
|    time_elapsed    | 6         |
|    total_timesteps | 2048      |
----------------------------------


KeyboardInterrupt: 

In [95]:
# Roll the trained agent forward on the vectorised env and print each move.
obs = vec_env.reset()
n_steps = 20
# Read the board width from the wrapped env instead of hard-coding it, so the
# printed (row, col) stays correct for whatever board size vec_env was built with.
board_size = vec_env.get_attr("board_size")[0]
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=False)
    # Same flat-index -> (row, col) mapping the env uses in step()
    x, y = divmod(int(action[0]), board_size)
    print("Action: ", (x, y))
    obs, reward, done, info = vec_env.step(action)
    print("reward=", reward, "done=", done)
    vec_env.render()
    if done[0]:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break

Action:  (9, 4)
reward= [43.] done= [False]
Current Board:
0 0 * * * 0 * * * * 
0 0 * * * 0 * * * * 
0 0 * * * 0 * * * * 
* * 0 0 0 0 0 * * * 
* * 0 0 0 0 0 * * * 
* * 0 0 0 0 0 * * * 
* * 0 * * * * * 0 0 
* * 0 * * * * * 0 0 
* * 0 * * * * * 0 0 
0 0 0 0 0 0 0 0 0 0 


Action:  (2, 1)
reward= [-100.] done= [False]
Current Board:
0 0 * * * 0 * * * * 
0 0 * * * 0 * * * * 
0 0 * * * 0 * * * * 
* * 0 0 0 0 0 * * * 
* * 0 0 0 0 0 * * * 
* * 0 0 0 0 0 * * * 
* * 0 * * * * * 0 0 
* * 0 * * * * * 0 0 
* * 0 * * * * * 0 0 
0 0 0 0 0 0 0 0 0 0 


Action:  (7, 8)
reward= [-100.] done= [False]
Current Board:
0 0 * * * 0 * * * * 
0 0 * * * 0 * * * * 
0 0 * * * 0 * * * * 
* * 0 0 0 0 0 * * * 
* * 0 0 0 0 0 * * * 
* * 0 0 0 0 0 * * * 
* * 0 * * * * * 0 0 
* * 0 * * * * * 0 0 
* * 0 * * * * * 0 0 
0 0 0 0 0 0 0 0 0 0 


Action:  (2, 4)
reward= [1.] done= [False]
Current Board:
0 0 * * * 0 * * * * 
0 0 * * * 0 * * * * 
0 0 * * 1 0 * * * * 
* * 0 0 0 0 0 * * * 
* * 0 0 0 0 0 * * * 
* * 0 0 0 0 0 * * * 

In [19]:
from rl_zoo3.train import train
from gym.envs.registration import register

register(
    id='Minesweeper-v1',
    entry_point='msenv:MinesweeperEnvironment',  # Update '__main__' to the module name if this is not in your main script
    max_episode_steps=100,  # Adjust based on expected game length
)

import gym
print([k for k in gym.envs.registry.keys() if "Minesweeper" in k])

!python -m rl_zoo3.train --algo dqn --env Minesweeper-v1 -f logs/ -c dqn.yml


  and should_run_async(code)
  logger.warn(f"Overriding environment {spec.id}")


['Minesweeper-v1']
2024-10-30 01:14:03.087725: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-30 01:14:03.121048: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-30 01:14:03.130907: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  from jax import xla_computation as _xla_computation
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/r