<a href="https://colab.research.google.com/github/pj0620/google-colab-notebooks/blob/main/Minesweeper_rl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random
from scipy.signal import convolve2d
%pip install stable-baselines3[extra]

import numpy as np
import gymnasium as gym
from gymnasium import spaces

!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]

!apt-get install swig cmake ffmpeg
!pip install git+https://github.com/DLR-RM/rl-baselines3-zoo

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.3.2-py3-none-any.whl.metadata (5.1 kB)
Collecting gymnasium<0.30,>=0.28.1 (from stable-baselines3[extra])
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting shimmy~=1.3.0 (from shimmy[atari]~=1.3.0; extra == "extra"->stable-baselines3[extra])
  Downloading Shimmy-1.3.0-py3-none-any.whl.metadata (3.7 kB)
Collecting autorom~=0.6.1 (from autorom[accept-rom-license]~=0.6.1; extra == "extra"->stable-baselines3[extra])
  Downloading AutoROM-0.6.1-py3-none-any.whl.metadata (2.4 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]~=0.6.1; extra == "extra"->stable-baselines3[extra])
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdon

Collecting autorom~=0.4.2 (from autorom[accept-rom-license]~=0.4.2; extra == "accept-rom-license"->gymnasium[accept-rom-license])
  Downloading AutoROM-0.4.2-py3-none-any.whl.metadata (2.8 kB)
Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Installing collected packages: autorom
  Attempting uninstall: autorom
    Found existing installation: AutoROM 0.6.1
    Uninstalling AutoROM-0.6.1:
      Successfully uninstalled AutoROM-0.6.1
Successfully installed autorom-0.4.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 49 not upgraded.
Need to get 1,116 kB of archives.
Aft

## Hyperparameter configurations





Create a file named `dql.yml` with the following contents

```
Minesweeper-v1:
  frame_stack: 1
  policy: 'CnnPolicy'
  n_timesteps: !!float 1e6
  buffer_size: 100000
  learning_rate: !!float 1e-4
  batch_size: 32
  learning_starts: 100000
  target_update_interval: 1000
  train_freq: 4
  gradient_steps: 1
  exploration_fraction: 0.1
  exploration_final_eps: 0.01
  # If True, you need to deactivate handle_timeout_termination
  # in the replay_buffer_kwargs
  optimize_memory_usage: False
```


## Custom gymnasium env

In [9]:
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from skimage.transform import resize

CLICK_BOMB = "click-bomb"
GAME_WIN = "game-win"
GAME_LOSE = "game-lose"
CLICK_VISIBLE = "click-visible"
CLICK_GUESS = "click-guess"
CLICK_VALID = "click-valid"

DEFAULT_REWARDS = {
    CLICK_VISIBLE: -0.5,
    CLICK_BOMB: -1.,
    GAME_WIN: 1,
    CLICK_VALID: 0.3,
    CLICK_GUESS: -0.3
}

class MinesweeperEnvironment(gym.Env):
  # Because of google colab, we cannot implement the GUI ('human' render mode)
  metadata = {"render_modes": ["console"]}

  def __init__(self, board_size=9, total_bombs=8, render_mode="console",
               end_on_bomb=False, randomize_on_reset=False,
               end_on_visible_click=False, rewards=None,
               use_dict_space=False):
    super().__init__()
    self.render_mode = render_mode
    self.board_size = board_size
    self.total_bombs = total_bombs
    self.end_on_bomb = end_on_bomb
    self.randomize_on_reset = randomize_on_reset
    self.end_on_visible_click = end_on_visible_click
    self.use_dict_space = use_dict_space

    self.visible = np.zeros((self.board_size, self.board_size), dtype=np.uint8)
    self.set_bombs_vals()
    # self.set_values()

    if self.use_dict_space:
      self.observation_space = spaces.Dict(dict(
        board=spaces.Box(
          low=-1, high=1, shape=(self.board_size, self.board_size, 2)
        ))
      )
    else:
      self.observation_space = spaces.Box(
          low=-1, high=1, shape=(self.board_size, self.board_size, 2)
      )

    # Define action space
    self.action_space = spaces.Discrete(self.board_size**2)

    self.last_action = -1

    if rewards is None:
      self.rewards = DEFAULT_REWARDS
    else:
      self.rewards = rewards

  def set_bombs_vals(self):
    self.bombs = np.zeros((self.board_size, self.board_size), dtype=np.uint8)
    self.vals = np.zeros((self.board_size, self.board_size), dtype=np.uint8)
    # Generate unique random positions for bombs
    bomb_positions = np.random.choice(self.board_size * self.board_size, self.total_bombs, replace=False)

    # Convert linear indices to row, column indices
    rows, cols = np.unravel_index(bomb_positions, (self.board_size, self.board_size))
    self.bombs[rows, cols] = 1

    # Increment neighbors for each bomb position using array slicing
    for r, c in zip(rows, cols):
      # Use slicing to add 1 to all neighboring cells in `self.vals`
      self.vals[max(0, r - 1):min(self.board_size, r + 2),
                max(0, c - 1):min(self.board_size, c + 2)] += 1

  def reset(self, seed=None, options=None):
    # Reset the environment to an initial state
    self.visible = np.zeros((self.board_size, self.board_size), dtype=np.uint8)

    # same everytime
    if self.randomize_on_reset:
      self.set_bombs_vals()
    self.last_action = -1

    return self.get_state(), {}

  def propogate(self, x: int, y: int):
    # If the initial cell is a bomb or already visible, return immediately
    if self.bombs[x][y] == 1 or self.visible[x][y] == 1:
      return

    # Initialize a stack for iterative propagation
    stack = [(x, y)]

    while stack:
      cx, cy = stack.pop()

      # Skip cells already visible
      if self.visible[cx][cy] == 1:
        continue

      # Mark current cell as visible
      self.visible[cx][cy] = 1

      # Only continue to neighbors if this cell has no adjacent bombs
      if self.vals[cx][cy] == 0:
        # Add all valid neighbors to the stack
        for nx in range(max(0, cx - 1), min(self.board_size - 1, cx + 1) + 1):
          for ny in range(max(0, cy - 1), min(self.board_size - 1, cy + 1) + 1):
            # Skip the cell itself
            if (nx, ny) != (cx, cy) and self.visible[nx][ny] == 0:
              stack.append((nx, ny))


  def step(self, action):
    # Implement the logic for taking a step in the environment
    x = action // self.board_size
    y = action % self.board_size

    start_visible_cells = np.sum(self.visible)

    if self.visible[x][y] == 1:
      start_teminated = bool((self.board_size**2 - start_visible_cells) == self.total_bombs)
      if self.end_on_visible_click:
        return self.get_state(), self.rewards[CLICK_VISIBLE], True, True, {"effect": CLICK_VISIBLE}
      else:
        return self.get_state(), self.rewards[CLICK_VISIBLE], start_teminated, False, {"effect": CLICK_VISIBLE}
    elif self.bombs[x][y] == 1:
      if self.end_on_bomb:
        return self.get_state(), self.rewards[CLICK_BOMB], True, True, {"effect": CLICK_BOMB}
      else:
        return self.get_state(), self.rewards[CLICK_BOMB], False, False, {"effect": CLICK_BOMB}

    # allow first click to be guess if no other cells available
    if start_visible_cells < 2:
      is_guess_click = False
    else:
      # Set boundaries for slicing without going out of bounds
      x_min = max(0, x - 1)
      x_max = min(self.board_size - 1, x + 1)
      y_min = max(0, y - 1)
      y_max = min(self.board_size - 1, y + 1)

      # Get neighborhood slice
      neighborhood = self.visible[x_min:x_max+1, y_min:y_max+1]

      # Check for any visible cells, excluding the center cell
      is_guess_click = np.any(neighborhood)

    self.last_action = action

    if self.vals[x][y] == 0:
      self.propogate(x, y)

    self.visible[x][y] = 1
    end_visible_cells = np.sum(self.visible)
    teminated = bool((self.board_size**2 - end_visible_cells) == self.total_bombs)

    if teminated:
      return self.get_state(), self.rewards[GAME_WIN], teminated, False, {"effect": GAME_WIN}
    else:
      reward = self.rewards[CLICK_GUESS] if is_guess_click else self.rewards[CLICK_VALID]
      info = CLICK_GUESS if is_guess_click else CLICK_VALID
      return self.get_state(), reward, teminated, False, {"effect": info}

  def get_state(self):
    visible_vals = self.visible * self.vals
    visible_vals = (visible_vals.astype(np.float32) - 4) / 4
    visible_scaled = 2 * (self.visible.astype(np.float32) - 0.5)
    board = np.stack([visible_vals, visible_scaled], axis=2).astype(np.float32)

    if self.use_dict_space:
      return dict(board=board)
    else:
      return board

  def render(self, mode="console"):
    if self.render_mode != "console":
        raise NotImplementedError("Render mode not supported.")

    # Print the current visible board state
    cells_left = int(self.board_size**2 - np.sum(self.visible))
    print(f"Current Board: {cells_left - self.total_bombs} Cells Left")
    print("# " + " ".join(str(i) for i in range(self.board_size)))
    for i in range(self.board_size):
        print(f"{i} ", end="")
        row = ""
        for j in range(self.board_size):
            if self.visible[i][j] == 1:
                # If the cell is visible, show its value (number of adjacent bombs)
                if self.bombs[i][j] == 1:
                  row += f"B "
                else:
                  row += f"{self.vals[i][j]} "
            else:
                # If the cell is hidden, show an asterisk
                row += "* "
        print(row)
    # print("\n")


from stable_baselines3.common.env_checker import check_env

env = MinesweeperEnvironment()

# Testing env
check_env(env)
print("starting game")
for round in range(1):
  obs, info = env.reset()
  for _ in range(10):
      # Random action
      action = env.action_space.sample()
      obs, reward, terminated, truncated, info = env.step(action)
      if terminated:
          obs, info = env.reset()
      env.render()
      reward_pos = f"{reward * (env.board_size ** 2)} / {env.board_size ** 2}"
      print(f"action: {action} -> {(action // env.board_size, action % env.board_size)}")
      # print(f"reward: {reward if reward <= 0 else reward_pos}")
      print(f"reward: {reward}")
      print("\n")
  env.reset()

starting game
Current Board: 72 Cells Left
# 0 1 2 3 4 5 6 7 8
0 * * * * * * * * * 
1 * * * * * * * * * 
2 * * * * * * * * * 
3 * * * * * * * * * 
4 * * * * 1 * * * * 
5 * * * * * * * * * 
6 * * * * * * * * * 
7 * * * * * * * * * 
8 * * * * * * * * * 
action: 40 -> (4, 4)
reward: 0.3


Current Board: 7 Cells Left
# 0 1 2 3 4 5 6 7 8
0 0 0 0 0 1 * * * * 
1 0 0 0 0 2 * * * * 
2 0 0 0 0 2 * * 3 1 
3 1 1 0 0 2 * 3 1 0 
4 * 1 0 0 1 * 1 0 0 
5 1 1 0 0 1 1 1 0 0 
6 0 0 0 0 0 0 0 0 0 
7 1 1 1 0 0 0 0 0 0 
8 * * 1 0 0 0 0 0 0 
action: 47 -> (5, 2)
reward: 0.3


Current Board: 7 Cells Left
# 0 1 2 3 4 5 6 7 8
0 0 0 0 0 1 * * * * 
1 0 0 0 0 2 * * * * 
2 0 0 0 0 2 * * 3 1 
3 1 1 0 0 2 * 3 1 0 
4 * 1 0 0 1 * 1 0 0 
5 1 1 0 0 1 1 1 0 0 
6 0 0 0 0 0 0 0 0 0 
7 1 1 1 0 0 0 0 0 0 
8 * * 1 0 0 0 0 0 0 
action: 36 -> (4, 0)
reward: -1.0


Current Board: 7 Cells Left
# 0 1 2 3 4 5 6 7 8
0 0 0 0 0 1 * * * * 
1 0 0 0 0 2 * * * * 
2 0 0 0 0 2 * * 3 1 
3 1 1 0 0 2 * 3 1 0 
4 * 1 0 0 1 * 1 0 0 
5 1 1 0 0 1 1 1

  and should_run_async(code)


### Play minesweeper

In [22]:
env = MinesweeperEnvironment(end_on_bomb=True)

terminated = False
obs, info = env.reset()
env.render()
while not terminated:
  actions_str = input("enter row then column with a space(\"X Y\"):").split(" ")
  actions_int = [int(x) for x in actions_str]
  action = actions_int[0] + actions_int[1] * env.board_size
  obs, reward, terminated, truncated, info = env.step(action)
  print(f"reward: {reward}")
  env.render()

Current Board: 73 Cells Left
# 0 1 2 3 4 5 6 7 8
0 * * * * * * * * * 
1 * * * * * * * * * 
2 * * * * * * * * * 
3 * * * * * * * * * 
4 * * * * * * * * * 
5 * * * * * * * * * 
6 * * * * * * * * * 
7 * * * * * * * * * 
8 * * * * * * * * * 
enter row then column with a space("X Y"):0 3
reward: -0.3
Current Board: 72 Cells Left
# 0 1 2 3 4 5 6 7 8
0 * * * * * * * * * 
1 * * * * * * * * * 
2 * * * * * * * * * 
3 1 * * * * * * * * 
4 * * * * * * * * * 
5 * * * * * * * * * 
6 * * * * * * * * * 
7 * * * * * * * * * 
8 * * * * * * * * * 
enter row then column with a space("X Y"):0 4
reward: 0.3
Current Board: 71 Cells Left
# 0 1 2 3 4 5 6 7 8
0 * * * * * * * * * 
1 * * * * * * * * * 
2 * * * * * * * * * 
3 1 * * * * * * * * 
4 1 * * * * * * * * 
5 * * * * * * * * * 
6 * * * * * * * * * 
7 * * * * * * * * * 
8 * * * * * * * * * 
enter row then column with a space("X Y"):5 4
reward: -0.3
Current Board: 70 Cells Left
# 0 1 2 3 4 5 6 7 8
0 * * * * * * * * * 
1 * * * * * * * * * 
2 * * * * * * * * *

# Non-random Minesweeper

## DQN

In [15]:
from math import log

# 18418
log(0.01) / log(0.99975)

18418 / 100_000

0.18418

In [31]:
import torch as th
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space, features_dim=256):
        super(CustomCNN, self).__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[2]  # Should be 2 for (9, 9, 2) input

        # Define a custom CNN architecture
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 64, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.Conv2d(64, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            # nn.Conv2d(64, 64, kernel_size=3, padding=1),
            # nn.ReLU(),
            # nn.Conv2d(64, 64, kernel_size=3, padding=1),
            # nn.ReLU(),
            nn.Flatten()
        )

        # Calculate the output size of the CNN dynamically
        with th.no_grad():
            sample_input = th.as_tensor(observation_space.sample()[None]).float().permute(0, 3, 1, 2)
            n_flatten = self.cnn(sample_input).shape[1]
            print("Flattened output size after CNN:", n_flatten)  # Debugging statement

        # Define fully connected layers using the computed flattened size
        # self.linear = nn.Sequential(
        #     nn.Linear(n_flatten, 256),  # Use dynamically calculated n_flatten
        #     nn.ReLU(),
        #     nn.Linear(256, 128),
        #     nn.ReLU(),
        #     nn.Linear(256, features_dim)  # Output size of features_dim (256)
        # )
        self.linear = nn.Sequential(
            nn.Linear(n_flatten, features_dim),
            nn.ReLU()
        )

    def forward(self, observations):
        observations = observations.permute(0, 3, 1, 2)  # Rearrange dimensions for Conv2d
        cnn_output = self.cnn(observations)
        return self.linear(cnn_output)


  and should_run_async(code)


In [62]:
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecTransposeImage

vec_env = make_vec_env(MinesweeperEnvironment, n_envs=1, env_kwargs=dict(end_on_bomb=True))

# vec_env = make_vec_env(MinesweeperEnvironment, n_envs=1)
# vec_env = VecTransposeImage(vec_env)  # Transpose to [batch_size, channels, height, width]

# Optimized hyperparameters for DQN
learning_rate = 1e-4           # Learning rate for weight updates
buffer_size = 500_000            # Replay buffer size
learning_starts = 1000         # Steps before learning begins
batch_size = 64                # Number of samples per training update
train_freq = (4, 'step')    # Frequency of training updates
# train_freq = (1, 'episode')
target_update_interval = 750   # Steps between target network updates
exploration_fraction = 0.25    # Fraction of total timesteps for epsilon decay
exploration_final_eps = 0.01   # Final epsilon value after decay
gamma = 0.99                    # Discount factor for future rewards

#  0.99975 ** x = 0.01 => x = log(0.01) / log(0.99975)

policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=256)  # Output size of the final layer
)

# Instantiate DQN with improved hyperparameters
model = DQN(
    "CnnPolicy",
    vec_env,
    learning_rate=learning_rate,
    buffer_size=buffer_size,
    learning_starts=learning_starts,
    batch_size=batch_size,
    train_freq=train_freq,
    target_update_interval=target_update_interval,
    exploration_fraction=exploration_fraction,
    exploration_final_eps=exploration_final_eps,
    gamma=gamma,
    policy_kwargs=policy_kwargs,  # Use custom network
    verbose=1
)

# Train the model
model.learn(total_timesteps=500_000)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00342  |
|    n_updates        | 113283   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 32.4     |
|    ep_rew_mean      | 9.61     |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 18220    |
|    fps              | 462      |
|    time_elapsed     | 982      |
|    total_timesteps  | 454265   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000982 |
|    n_updates        | 113316   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 32.4     |
|    ep_rew_mean      | 9.62     |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 1

<stable_baselines3.dqn.dqn.DQN at 0x782e3aef3040>

## PPO

In [None]:
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env

vec_env = make_vec_env(MinesweeperEnvironment, n_envs=1)

# Define hyperparameters for PPO
learning_rate = 1e-4       # Learning rate for PPO
n_steps = 2048             # Number of steps to run for each environment per update
batch_size = 64            # Batch size for each update
n_epochs = 10              # Number of times to train on each batch
gamma = 0.99               # Discount factor
gae_lambda = 0.95          # GAE lambda, for variance reduction in advantage estimation
clip_range = 0.2           # Clip range for PPO, helps with stable training

# Instantiate PPO with custom hyperparameters
model = PPO(
    "CnnPolicy",           # Policy type, can try "CnnPolicy" for image-based inputs
    vec_env,
    learning_rate=learning_rate,
    n_steps=n_steps,
    batch_size=batch_size,
    n_epochs=n_epochs,
    gamma=gamma,
    gae_lambda=gae_lambda,
    clip_range=clip_range,
    verbose=1               # Verbose output
)

# Train the model
model.learn(total_timesteps=100000)

Using cuda device
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.63e+03 |
|    ep_rew_mean     | -9.6e+05 |
| time/              |          |
|    fps             | 226      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 2048     |
---------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.63e+03  |
|    ep_rew_mean          | -9.6e+05  |
| time/                   |           |
|    fps                  | 207       |
|    iterations           | 2         |
|    time_elapsed         | 19        |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 25.478064 |
|    clip_fraction        | 0.66      |
|    clip_range           | 0.2       |
|    entropy_loss         | -2.98     |
|    explained_variance   | -6.08e-06 |
|

## Evaluation

In [66]:
import torch

# Test the trained agent
# using the vecenv
vec_env2 = make_vec_env(MinesweeperEnvironment, n_envs=1, env_kwargs=dict(end_on_bomb=True))
obs = vec_env2.reset()
n_steps = 100
total_substeps = 0
for step in range(n_steps):
    # action, _ = model.predict(obs, deterministic=False)

    # Convert the observation to a tensor and ensure it's on the same device as the model
    obs_tensor = torch.tensor(obs, dtype=torch.float32).to(model.device)

    # Get Q-values directly from the model's q_net
    with torch.no_grad():
        q_values = model.q_net(obs_tensor).cpu().numpy()

    # Flatten the Q-values and get indices in descending order
    ranked_actions = q_values[0].argsort()[::-1]

    print(f"found {len(ranked_actions)} ranked actions")

    for k, action_k in enumerate(ranked_actions):
      total_substeps += 1
      print(f"trying action {k}")
      x = int(action_k) // 9
      y = int(action_k) % 9
      print("Action: ", (x, y), action_k)
      obs, reward, done, info = vec_env2.step([action_k])
      effect = info[0]["effect"]
      print("reward=", reward, "done=", done, "effect=", effect)
      if not done:
        vec_env2.render()

      if done or (effect in [CLICK_GUESS, CLICK_VALID]):
        break

    if done:
        print("Won!" if effect == GAME_WIN else "Lost :(")
        print("total_steps:", step + 1)
        print("total_substeps:", total_substeps)
        break

found 81 ranked actions
trying action 0
Action:  (6, 4) 58
reward= [-0.3] done= [False] effect= click-guess
Current Board: 72 Cells Left
# 0 1 2 3 4 5 6 7 8
0 * * * * * * * * * 
1 * * * * * * * * * 
2 * * * * * * * * * 
3 * * * * * * * * * 
4 * * * * * * * * * 
5 * * * * * * * * * 
6 * * * * 1 * * * * 
7 * * * * * * * * * 
8 * * * * * * * * * 
found 81 ranked actions
trying action 0
Action:  (1, 5) 14
reward= [-0.3] done= [False] effect= click-guess
Current Board: 71 Cells Left
# 0 1 2 3 4 5 6 7 8
0 * * * * * * * * * 
1 * * * * * 1 * * * 
2 * * * * * * * * * 
3 * * * * * * * * * 
4 * * * * * * * * * 
5 * * * * * * * * * 
6 * * * * 1 * * * * 
7 * * * * * * * * * 
8 * * * * * * * * * 
found 81 ranked actions
trying action 0
Action:  (7, 5) 68
reward= [0.3] done= [False] effect= click-valid
Current Board: 70 Cells Left
# 0 1 2 3 4 5 6 7 8
0 * * * * * * * * * 
1 * * * * * 1 * * * 
2 * * * * * * * * * 
3 * * * * * * * * * 
4 * * * * * * * * * 
5 * * * * * * * * * 
6 * * * * 1 * * * * 
7 * *

# Random Minesweeper

## Play Random map minesweeper

In [5]:
env_test = MinesweeperEnvironment(randomize_on_reset=False)

print("Bombs before")
print(env_test.bombs)

env_test.reset()

print("\n")
print("Bombs after")
print(env_test.bombs)

Bombs before
[[0 0 0 1 0 0 0 0 1]
 [0 0 0 0 0 1 0 1 1]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1]]


Bombs after
[[0 0 0 1 0 0 0 0 1]
 [0 0 0 0 0 1 0 1 1]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1]]


## PPO Solver

In [5]:
import torch as th
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecTransposeImage

class CustomCNN2(BaseFeaturesExtractor):
    def __init__(self, observation_space, features_dim=256):
        super(CustomCNN2, self).__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[2]  # Should be 2 for (9, 9, 2) input

        # Define a custom CNN architecture
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            # nn.Conv2d(32, 32, kernel_size=3, padding=1),
            # nn.ReLU(),
            # nn.Conv2d(32, 32, kernel_size=3, padding=1),
            # nn.ReLU(),
            nn.Flatten()
        )

        # Calculate the output size of the CNN dynamically
        with th.no_grad():
            sample_input = th.as_tensor(observation_space.sample()[None]).float().permute(0, 3, 1, 2)
            n_flatten = self.cnn(sample_input).shape[1]
            print("Flattened output size after CNN:", n_flatten)  # Debugging statement

        # Define fully connected layers using the computed flattened size
        # self.linear = nn.Sequential(
        #     nn.Linear(n_flatten, 256),  # Use dynamically calculated n_flatten
        #     nn.ReLU(),
        #     nn.Linear(256, 128),
        #     nn.ReLU(),
        #     nn.Linear(256, features_dim)  # Output size of features_dim (256)
        # )
        self.linear = nn.Sequential(
            nn.Linear(n_flatten, 1024),
            nn.ReLU(),
            # nn.Linear(1024, 512),
            # nn.ReLU(),
            # nn.Linear(512, 256),
            # nn.ReLU(),
            nn.Linear(1024, features_dim),
        )

    def forward(self, observations):
        observations = observations.permute(0, 3, 1, 2)  # Rearrange dimensions for Conv2d
        cnn_output = self.cnn(observations)
        return self.linear(cnn_output)

ppo_rewards = {
    CLICK_VISIBLE: 0,
    CLICK_BOMB: -2.,
    GAME_WIN: 2,
    CLICK_VALID: 0.5,
    CLICK_GUESS: 0.25
}

vec_env_rand = make_vec_env(MinesweeperEnvironment, n_envs=1, env_kwargs=dict(end_on_bomb=True, randomize_on_reset=True, end_on_visible_click=False, rewards=ppo_rewards))

# vec_env = make_vec_env(MinesweeperEnvironment, n_envs=1)
# vec_env = VecTransposeImage(vec_env)  # Transpose to [batch_size, channels, height, width]

policy_kwargs = dict(
    features_extractor_class=CustomCNN2,
    features_extractor_kwargs=dict(features_dim=256)  # Output size of the final layer
)

# Define hyperparameters for PPO
learning_rate = 1e-4       # Learning rate for PPO
n_steps = 2048             # Number of steps to run for each environment per update
batch_size = 64            # Batch size for each update
n_epochs = 10              # Number of times to train on each batch
gamma = 0.99               # Discount factor
gae_lambda = 0.95          # GAE lambda, for variance reduction in advantage estimation
clip_range = 0.2           # Clip range for PPO, helps with stable training

# Instantiate PPO with custom hyperparameters
model = PPO(
    "CnnPolicy",           # Policy type, can try "CnnPolicy" for image-based inputs
    vec_env_rand,
    learning_rate=learning_rate,
    n_steps=n_steps,
    batch_size=batch_size,
    n_epochs=n_epochs,
    gamma=gamma,
    gae_lambda=gae_lambda,
    clip_range=clip_range,
    policy_kwargs=policy_kwargs,
    verbose=1               # Verbose output
)

# Train the model
model.learn(total_timesteps=300_000)

Using cuda device
Flattened output size after CNN: 2592
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 11.3     |
|    ep_rew_mean     | -0.492   |
| time/              |          |
|    fps             | 461      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 11.1        |
|    ep_rew_mean          | -0.542      |
| time/                   |             |
|    fps                  | 394         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008461751 |
|    clip_fraction        | 0.0359      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.39       |
|    explained_v

KeyboardInterrupt: 

## HER

In [13]:
import torch as th
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3 import PPO, A2C, DQN, HerReplayBuffer
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecTransposeImage

class CustomCNN2(BaseFeaturesExtractor):
    def __init__(self, observation_space, features_dim=256):
        super(CustomCNN2, self).__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[2]  # Should be 2 for (9, 9, 2) input

        # Define a custom CNN architecture
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            # nn.Conv2d(32, 32, kernel_size=3, padding=1),
            # nn.ReLU(),
            # nn.Conv2d(32, 32, kernel_size=3, padding=1),
            # nn.ReLU(),
            nn.Flatten()
        )

        # Calculate the output size of the CNN dynamically
        with th.no_grad():
            sample_input = th.as_tensor(observation_space.sample()[None]).float().permute(0, 3, 1, 2)
            n_flatten = self.cnn(sample_input).shape[1]
            print("Flattened output size after CNN:", n_flatten)  # Debugging statement

        # Define fully connected layers using the computed flattened size
        # self.linear = nn.Sequential(
        #     nn.Linear(n_flatten, 256),  # Use dynamically calculated n_flatten
        #     nn.ReLU(),
        #     nn.Linear(256, 128),
        #     nn.ReLU(),
        #     nn.Linear(256, features_dim)  # Output size of features_dim (256)
        # )
        self.linear = nn.Sequential(
            nn.Linear(n_flatten, 1024),
            nn.ReLU(),
            # nn.Linear(1024, 512),
            # nn.ReLU(),
            # nn.Linear(512, 256),
            # nn.ReLU(),
            nn.Linear(1024, features_dim),
        )

    def forward(self, observations):
        observations = observations.permute(0, 3, 1, 2)  # Rearrange dimensions for Conv2d
        cnn_output = self.cnn(observations)
        return self.linear(cnn_output)

ppo_rewards = {
    CLICK_VISIBLE: 0,
    CLICK_BOMB: -2.,
    GAME_WIN: 2,
    CLICK_VALID: 0.5,
    CLICK_GUESS: 0.25
}

# vec_env_rand = make_vec_env(
#     MinesweeperEnvironment,
#     n_envs=1,
#     env_kwargs=dict(
#         end_on_bomb=True,
#         randomize_on_reset=True,
#         end_on_visible_click=False,
#         rewards=ppo_rewards,
#         use_dict_space=True
#     )
# )

vec_env_rand = MinesweeperEnvironment(
  end_on_bomb=True,
  randomize_on_reset=True,
  end_on_visible_click=False,
  rewards=ppo_rewards,
  use_dict_space=True
)

# vec_env = make_vec_env(MinesweeperEnvironment, n_envs=1)
# vec_env = VecTransposeImage(vec_env)  # Transpose to [batch_size, channels, height, width]

policy_kwargs = dict(
    features_extractor_class=CustomCNN2,
    features_extractor_kwargs=dict(features_dim=256)  # Output size of the final layer
)

# Instantiate PPO with custom hyperparameters
# model = PPO(
#     "CnnPolicy",           # Policy type, can try "CnnPolicy" for image-based inputs
#     vec_env_rand,
#     learning_rate=learning_rate,
#     n_steps=n_steps,
#     batch_size=batch_size,
#     n_epochs=n_epochs,
#     gamma=gamma,
#     gae_lambda=gae_lambda,
#     clip_range=clip_range,
#     policy_kwargs=policy_kwargs,
#     replay_buffer_class=HerReplayBuffer,
#     verbose=1               # Verbose output
# )

model = DQN(
    "MultiInputPolicy",
    vec_env_rand,
    replay_buffer_class=HerReplayBuffer,
    # Parameters for HER
    replay_buffer_kwargs=dict(
        n_sampled_goal=4,
        goal_selection_strategy="future",
    ),
    # policy_kwargs=policy_kwargs,
    verbose=1,
)

# Train the model
model.learn(total_timesteps=300_000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 4.5      |
|    ep_rew_mean      | -0.312   |
|    exploration_rate | 0.999    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3422     |
|    time_elapsed     | 0        |
|    total_timesteps  | 18       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 5.38     |
|    ep_rew_mean      | -0.625   |
|    exploration_rate | 0.999    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2431     |
|    time_elapsed     | 0        |
|    total_timesteps  | 43       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 5.92     |
|    ep_rew_mean      | -0.792

KeyError: 'achieved_goal'

## Evaluation on random game

In [39]:
import torch

# Test the trained agent
# using the vecenv
vec_env2_rand = make_vec_env(MinesweeperEnvironment, n_envs=1, env_kwargs=dict(end_on_bomb=True, randomize_on_reset=True))
obs = vec_env2_rand.reset()
n_steps = 1000
total_substeps = 0
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=False)
    ranked_actions = [action]

    # # Convert the observation to a tensor and ensure it's on the same device as the model
    # obs_tensor = torch.tensor(obs, dtype=torch.float32).to(model.device)

    # # Get Q-values directly from the model's q_net
    # with torch.no_grad():
    #     q_values = model.q_net(obs_tensor).cpu().numpy()

    # # Flatten the Q-values and get indices in descending order
    # ranked_actions = q_values[0].argsort()[::-1]

    print(f"found {len(ranked_actions)} ranked actions")

    for k, action_k in enumerate(ranked_actions):
      total_substeps += 1
      print(f"trying action {k}")
      x = int(action_k) // 9
      y = int(action_k) % 9
      print("Action: ", (x, y), action_k)
      obs, reward, done, info = vec_env2_rand.step(action_k)
      effect = info[0]["effect"]
      print("reward=", reward, "done=", done, "effect=", effect)
      if not done:
        vec_env2_rand.render()

      if done or (effect in [CLICK_GUESS, CLICK_VALID]):
        break

    if done:
        print("Won!" if effect == GAME_WIN else "Lost :(")
        print("total_steps:", step + 1)
        print("total_substeps:", total_substeps)
        break

found 1 ranked actions
trying action 0
Action:  (5, 7) [52]
reward= [0.3] done= [False] effect= click-valid
Current Board: 72 Cells Left
# 0 1 2 3 4 5 6 7 8
0 * * * * * * * * * 
1 * * * * * * * * * 
2 * * * * * * * * * 
3 * * * * * * * * * 
4 * * * * * * * * * 
5 * * * * * * * 2 * 
6 * * * * * * * * * 
7 * * * * * * * * * 
8 * * * * * * * * * 
found 1 ranked actions
trying action 0
Action:  (4, 5) [41]
reward= [0.3] done= [False] effect= click-valid
Current Board: 71 Cells Left
# 0 1 2 3 4 5 6 7 8
0 * * * * * * * * * 
1 * * * * * * * * * 
2 * * * * * * * * * 
3 * * * * * * * * * 
4 * * * * * 1 * * * 
5 * * * * * * * 2 * 
6 * * * * * * * * * 
7 * * * * * * * * * 
8 * * * * * * * * * 
found 1 ranked actions
trying action 0
Action:  (0, 8) [8]
reward= [0.3] done= [False] effect= click-valid
Current Board: 28 Cells Left
# 0 1 2 3 4 5 6 7 8
0 0 0 0 0 0 0 0 0 0 
1 0 0 0 0 0 0 0 0 0 
2 0 0 0 0 0 0 0 0 0 
3 0 0 1 1 1 1 2 2 1 
4 0 1 2 * * 1 * * * 
5 0 1 * * * * * 2 * 
6 2 3 * * * * * * * 
7 * *

  x = int(action_k) // 9


found 1 ranked actions
trying action 0
Action:  (7, 2) [65]
reward= [-0.3] done= [False] effect= click-guess
Current Board: 10 Cells Left
# 0 1 2 3 4 5 6 7 8
0 0 0 0 0 0 0 0 0 0 
1 0 0 0 0 0 0 0 0 0 
2 0 0 0 0 0 0 0 0 0 
3 0 0 1 1 1 1 2 2 1 
4 0 1 2 * * 1 * * * 
5 0 1 * * * * 2 2 1 
6 2 3 * * * 2 1 0 0 
7 * * 1 * 2 * 1 0 0 
8 2 2 * * 1 1 1 0 0 
found 1 ranked actions
trying action 0
Action:  (0, 3) [3]
reward= [-0.5] done= [False] effect= click-visible
Current Board: 10 Cells Left
# 0 1 2 3 4 5 6 7 8
0 0 0 0 0 0 0 0 0 0 
1 0 0 0 0 0 0 0 0 0 
2 0 0 0 0 0 0 0 0 0 
3 0 0 1 1 1 1 2 2 1 
4 0 1 2 * * 1 * * * 
5 0 1 * * * * 2 2 1 
6 2 3 * * * 2 1 0 0 
7 * * 1 * 2 * 1 0 0 
8 2 2 * * 1 1 1 0 0 
found 1 ranked actions
trying action 0
Action:  (5, 5) [50]
reward= [-0.3] done= [False] effect= click-guess
Current Board: 9 Cells Left
# 0 1 2 3 4 5 6 7 8
0 0 0 0 0 0 0 0 0 0 
1 0 0 0 0 0 0 0 0 0 
2 0 0 0 0 0 0 0 0 0 
3 0 0 1 1 1 1 2 2 1 
4 0 1 2 * * 1 * * * 
5 0 1 * * * 2 2 2 1 
6 2 3 * * * 2 1 0 0 
7

## Old RL Zoo code

In [None]:
# from rl_zoo3.train import train
# from gym.envs.registration import register

# register(
#     id='Minesweeper-v1',
#     entry_point='msenv:MinesweeperEnvironment',  # Update '__main__' to the module name if this is not in your main script
#     max_episode_steps=100,  # Adjust based on expected game length
# )

# import gym
# print([k for k in gym.envs.registry.keys() if "Minesweeper" in k])

# !python -m rl_zoo3.train --algo dqn --env Minesweeper-v1 -f logs/ -c dqn.yml
