# **download dataset**

In [None]:
# Download the bryanpark/sudoku dataset archive (sudoku.zip) with the Kaggle CLI.
!kaggle datasets download bryanpark/sudoku

Dataset URL: https://www.kaggle.com/datasets/bryanpark/sudoku
License(s): CC0-1.0
Downloading sudoku.zip to /content
 93% 63.0M/68.1M [00:00<00:00, 129MB/s]
100% 68.1M/68.1M [00:00<00:00, 123MB/s]


In [None]:
!unzip sudoku.zip

Archive:  sudoku.zip
  inflating: sudoku.csv              


In [None]:
!pip install stable-baselines3[extra]

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.4.0-py3-none-any.whl.metadata (4.5 kB)
Collecting gymnasium<1.1.0,>=0.29.1 (from stable-baselines3[extra])
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting ale-py>=0.9.0 (from stable-baselines3[extra])
  Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium<1.1.0,>=0.29.1->stable-baselines3[extra])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading stable_bas

# **PPO model**

In [None]:
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
import torch.nn as nn
from torch.cuda import is_available
import gc

class SudokuEnv(gym.Env):
    """Sudoku environment that streams puzzles from a CSV file in chunks.

    The CSV must provide ``quizzes`` and ``solutions`` columns whose cells are
    digit strings that reshape to a 9x9 board (``0`` marks an empty quiz
    cell). Puzzles are served in file order; once the file is exhausted it is
    read again from the start.

    Observation: the 9x9 board as ``float32`` (0 = empty, 1-9 = digit).
    Action: a cell index in ``[0, 80]`` (row-major). The environment itself
        writes the solution digit into the chosen cell, so the agent only
        decides *which* cell to fill.
    Reward: +1 for filling a cell, +10 more when the board equals the
        solution, -1 (and termination) when the chosen cell cannot be filled.
    """

    # Read both columns as text so every cell stays a digit string.
    _CSV_DTYPES = {"quizzes": str, "solutions": str}

    def __init__(self, data_path, batch_size=1000):
        """
        Args:
            data_path: Path of the CSV file holding the puzzles.
            batch_size: Number of CSV rows kept in memory at a time.
        """
        super().__init__()
        self.data_path = data_path
        self.batch_size = batch_size
        self.current_batch_index = 0

        # Load the first chunk eagerly so reset() can serve a puzzle right away.
        self.data_iterator = self._open_reader()
        self.current_batch = next(self.data_iterator)
        self.puzzles, self.solutions = self._parse_batch(self.current_batch)

        self.observation_space = spaces.Box(
            low=0, high=9, shape=(9, 9), dtype=np.float32
        )
        self.action_space = spaces.Discrete(81)

        self.current_puzzle = None
        self.current_solution = None
        self.steps = 0
        self.max_steps = 100

    def _open_reader(self):
        """Return a fresh chunked CSV reader positioned at the start of the file."""
        return pd.read_csv(
            self.data_path,
            chunksize=self.batch_size,
            dtype=self._CSV_DTYPES,
        )

    @staticmethod
    def _digits(column):
        """Convert a column of digit strings into a 2-D integer array."""
        return np.array([[int(ch) for ch in str(cell)] for cell in column])

    def _parse_batch(self, batch):
        """Return ``(puzzles, solutions)`` integer arrays for one CSV chunk."""
        return self._digits(batch['quizzes']), self._digits(batch['solutions'])

    def _load_next_batch(self):
        """Advance to the next CSV chunk, wrapping to the start of the file."""
        try:
            self.current_batch = next(self.data_iterator)
        except StopIteration:
            # Every chunk has been consumed: start over from the first row.
            self.data_iterator = self._open_reader()
            self.current_batch = next(self.data_iterator)

        self.puzzles, self.solutions = self._parse_batch(self.current_batch)
        self.current_batch_index = 0

        # Force garbage collection so the previous chunk is released promptly.
        gc.collect()

    def reset(self, seed=None, options=None):
        """Start a new episode on the next puzzle of the stream.

        Args:
            seed: Forwarded to ``gym.Env.reset``.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)

        # Load next batch if we've used all puzzles in the current batch.
        if self.current_batch_index >= len(self.puzzles):
            self._load_next_batch()

        self.current_puzzle = self.puzzles[self.current_batch_index].reshape(9, 9).copy()
        self.current_solution = self.solutions[self.current_batch_index].reshape(9, 9)
        self.current_batch_index += 1
        self.steps = 0

        return self.current_puzzle.astype(np.float32), {}

    def _is_valid_move(self, row, col, num):
        """Return True when ``num`` can go in the empty cell ``(row, col)``.

        A move is rejected when the cell is already filled or when ``num``
        already occurs in the same row, column or 3x3 box.
        """
        if self.current_puzzle[row, col] != 0:
            return False

        if num in self.current_puzzle[row]:
            return False

        if num in self.current_puzzle[:, col]:
            return False

        box_row, box_col = 3 * (row // 3), 3 * (col // 3)
        box = self.current_puzzle[box_row:box_row + 3, box_col:box_col + 3]
        if num in box:
            return False

        return True

    def step(self, action):
        """Fill the chosen cell with its solution digit.

        Args:
            action: Cell index in ``[0, 80]`` (row-major).

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
        """
        row, col = divmod(int(action), 9)
        number = self.current_solution[row, col]  # The digit always comes from the solution.
        self.steps += 1

        # An unfillable cell ends the episode with a penalty.
        if not self._is_valid_move(row, col, number):
            return self.current_puzzle.astype(np.float32), -1, True, False, {}

        self.current_puzzle[row, col] = number

        # The placed digit is the solution digit by construction, so an
        # accepted move is always rewarded with +1.
        reward = 1

        done = np.array_equal(self.current_puzzle, self.current_solution)
        truncated = self.steps >= self.max_steps

        if done:
            reward += 10

        return self.current_puzzle.astype(np.float32), reward, done, truncated, {}


class Network(nn.Module):
    """Small actor-critic MLP over a flattened 9x9 board.

    A shared two-layer trunk feeds a policy head (one logit per cell) and a
    value head (one scalar per board).
    """

    def __init__(self):
        super().__init__()
        self.shared_net = nn.Sequential(
            nn.Linear(81, 128),  # 9x9 flattened board
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU()
        )

        self.policy_net = nn.Sequential(
            nn.Linear(128, 81)  # Output for each cell
        )

        self.value_net = nn.Sequential(
            nn.Linear(128, 1)
        )

    def forward(self, board):
        """Compute policy logits and state values.

        Args:
            board: Tensor holding one or more boards; any shape whose element
                count is a multiple of 81, e.g. ``(batch, 9, 9)``,
                ``(batch, 81)`` or ``(9, 9)``.

        Returns:
            ``(logits, value)`` with shapes ``(batch, 81)`` and ``(batch, 1)``.
        """
        # Flatten each board to 81 features; a single board becomes a batch of one.
        features = self.shared_net(board.reshape(-1, 81).float())
        return self.policy_net(features), self.value_net(features)

def train_model(env_fn, total_timesteps=100_000):
    """Train a PPO agent in chunks, checkpointing along the way.

    Args:
        env_fn: Zero-argument callable that builds a fresh environment.
        total_timesteps: Target number of environment steps. Training stops
            once the model's step counter reaches it.

    Returns:
        The trained PPO model.
    """
    # Create environment
    env = DummyVecEnv([env_fn])

    # Hidden layer sizes of the MLP policy
    policy_kwargs = dict(
        net_arch=[64, 64]
    )

    # Training runs on CPU; the policy is a small MLP.
    device = 'cpu'

    model = PPO(
        "MlpPolicy",
        env,
        verbose=1,
        learning_rate=3e-4,
        n_steps=1024,  # Rollout length collected before each update
        batch_size=64,
        n_epochs=5,
        gamma=0.99,
        policy_kwargs=policy_kwargs,
        device=device
    )

    # Train in smaller chunks
    chunk_size = 10_000
    checkpoint_interval = 50_000
    next_checkpoint = checkpoint_interval

    while model.num_timesteps < total_timesteps:
        current_chunk = min(chunk_size, total_timesteps - model.num_timesteps)
        # reset_num_timesteps=False keeps the step counter (and the logs)
        # continuous across chunks instead of restarting from zero each time.
        model.learn(total_timesteps=current_chunk, reset_num_timesteps=False)

        # Force garbage collection
        gc.collect()

        # Save a checkpoint each time another `checkpoint_interval` steps have
        # been completed. The previous `(i + 1) % 50_000 == 0` test never fired.
        if model.num_timesteps >= next_checkpoint:
            model.save(f"ppo_sudoku_checkpoint_{model.num_timesteps}")
            while next_checkpoint <= model.num_timesteps:
                next_checkpoint += checkpoint_interval

    return model

def evaluate_model(model, env_fn, n_episodes=10):
    """
    Evaluate the model's performance while maintaining memory efficiency.

    Args:
        model: Trained PPO model
        env_fn: Function that creates a new environment
        n_episodes: Number of episodes to evaluate (must be positive)

    Returns:
        success_rate: Percentage of successfully solved puzzles
        avg_reward: Average reward across all episodes

    Raises:
        ValueError: If ``n_episodes`` is not positive.
    """
    if n_episodes <= 0:
        raise ValueError("n_episodes must be a positive integer")

    # Create a fresh environment for evaluation
    eval_env = env_fn()

    successes = 0
    total_reward = 0

    for episode in range(n_episodes):
        obs, _ = eval_env.reset()
        episode_reward = 0
        done = False
        truncated = False

        while not (done or truncated):
            # Get model's action (using deterministic policy for evaluation)
            action, _ = model.predict(obs, deterministic=True)

            # Take step in environment
            obs, reward, done, truncated, _ = eval_env.step(action)
            episode_reward += reward

        # The environment also terminates on an invalid move, so a positive
        # return does not prove the puzzle was solved: compare the final board
        # with the solution whenever the environment exposes both.
        board = getattr(eval_env, "current_puzzle", None)
        target = getattr(eval_env, "current_solution", None)
        if board is not None and target is not None:
            success = bool(np.array_equal(board, target))
        else:
            # Fallback for environments that do not expose their board.
            success = bool(done and not truncated and episode_reward > 0)
        successes += success
        total_reward += episode_reward

        print(f"Episode {episode + 1}/{n_episodes}: "
              f"Success = {success}, Reward = {episode_reward:.2f}")

        # Force garbage collection after each episode
        gc.collect()

    success_rate = (successes / n_episodes) * 100
    avg_reward = total_reward / n_episodes

    return success_rate, avg_reward

# Example usage: train a PPO agent, evaluate it, then save the final weights.
if __name__ == "__main__":
    # Environment factory: train_model and evaluate_model each call it to
    # build their own SudokuEnv instance.
    def make_env():
        """Build a SudokuEnv that reads /content/sudoku.csv 1000 rows at a time."""
        return SudokuEnv('/content/sudoku.csv', batch_size=1000)

    # Train model (default number of timesteps)
    model = train_model(make_env)

    # Evaluate model on a fresh environment (default number of episodes)
    print("\nEvaluating model performance...")
    success_rate, avg_reward = evaluate_model(model, make_env)
    print(f"\nFinal Results:")
    print(f"Success Rate: {success_rate:.2f}%")
    print(f"Average Reward: {avg_reward:.2f}")

    # Save final model
    model.save("ppo_sudoku_final")
    print("\nModel saved as 'ppo_sudoku_final'")

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1114 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 1024 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 914         |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.005044965 |
|    clip_fraction        | 0.0107      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.39       |
|    explained_variance   | -0.169      |
|    learning_rate        | 0.0003      |
|    loss                 | 1.12        |
|    n_updates            | 5           |
|    policy_gradient_loss | -0.0345     |
|    value_loss           | 2.03        |
-----------------------------------------
-----------------

# **DQN model**

In [None]:
# Import necessary libraries
import numpy as np
import gymnasium as gym
from gymnasium import spaces
import pandas as pd
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import BaseCallback
import os
from tqdm import tqdm
import time

# Define the Sudoku environment class
class SudokuEnv(gym.Env):
    """Sudoku environment in which the agent picks both a cell and a digit.

    Observation: the 9x9 board as ``float32`` (0 = empty, 1-9 = digit).
    Action: integer in ``[0, 728]`` encoding ``(row, col, number)``.
    Reward: +5 when the placed digit matches the solution, -1 otherwise;
        choosing an already filled cell gives -1 and ends the episode.
    """

    def __init__(self, df):
        """
        Args:
            df: Table with ``quizzes`` and ``solutions`` columns whose cells
                are digit strings that reshape to a 9x9 board.
        """
        super().__init__()
        self.puzzles = np.array([list(map(int, quiz)) for quiz in df['quizzes']])
        self.solutions = np.array([list(map(int, sol)) for sol in df['solutions']])
        self.action_space = spaces.Discrete(9 * 9 * 9)
        self.observation_space = spaces.Box(low=0, high=9, shape=(9, 9), dtype=np.float32)
        self.current_puzzle = None
        self.current_solution = None
        self.steps = 0
        self.max_steps = 100
        self.total_reward = 0  # Running return of the current episode

    def _action_to_coords(self, action):
        """Decode an action index into ``(row, col, number)`` with number in 1-9."""
        row = action // (9 * 9)
        col = (action % (9 * 9)) // 9
        number = (action % 9) + 1
        return row, col, number

    def reset(self, seed=None, options=None):
        """Start a new episode on a randomly drawn puzzle.

        Args:
            seed: Seeds the environment's random generator, which makes the
                puzzle sequence reproducible.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        # Draw from the environment's own generator so `seed` takes effect;
        # the global np.random state ignores it.
        idx = self.np_random.integers(len(self.puzzles))
        self.current_puzzle = self.puzzles[idx].reshape(9, 9).copy()
        self.current_solution = self.solutions[idx].reshape(9, 9)
        self.steps = 0
        self.total_reward = 0
        return self.current_puzzle.astype(np.float32), {}

    def step(self, action):
        """Place a digit and return ``(obs, reward, terminated, truncated, info)``."""
        row, col, number = self._action_to_coords(int(action))
        self.steps += 1

        # Writing into an occupied cell ends the episode with a penalty.
        if self.current_puzzle[row, col] != 0:
            return self.current_puzzle.astype(np.float32), -1, True, False, {}

        self.current_puzzle[row, col] = number

        reward = 5 if self.current_puzzle[row, col] == self.current_solution[row, col] else -1
        self.total_reward += reward

        done = np.array_equal(self.current_puzzle, self.current_solution)
        truncated = self.steps >= self.max_steps

        return self.current_puzzle.astype(np.float32), reward, done, truncated, {}

# Define a callback for training
class TrainingCallback(BaseCallback):
    """Periodic console logger for training progress.

    Every ``log_interval`` callback calls it prints the call count, the
    model's exploration rate and the first environment's ``total_reward``.
    """

    def __init__(self, total_timesteps, log_interval=100):
        super().__init__()
        self.log_interval = log_interval
        self.total_timesteps = total_timesteps

    def _on_step(self):
        """Log when a reporting step is reached; always let training continue."""
        if self.n_calls % self.log_interval != 0:
            return True

        epsilon = self.model.exploration_rate
        running_reward = self.training_env.get_attr('total_reward')[0]
        print(f"[INFO] Timestep: {self.n_calls}/{self.total_timesteps} | "
              f"Epsilon: {epsilon:.2f} | "
              f"Reward: {running_reward:.1f}")
        return True

# Function to evaluate the model
def evaluate_model(model, env, n_episodes=20):
    """Run the model greedily for ``n_episodes`` and summarise the outcome.

    Args:
        model: Object with ``predict(obs, deterministic=True)``.
        env: Environment following the Gymnasium reset/step API.
        n_episodes: Number of evaluation episodes (must be positive).

    Returns:
        ``(success_rate, avg_reward)``: percentage of solved puzzles and the
        mean episode return.

    Raises:
        ValueError: If ``n_episodes`` is not positive.
    """
    if n_episodes <= 0:
        raise ValueError("n_episodes must be a positive integer")

    successes = 0
    total_reward = 0

    for episode in range(1, n_episodes + 1):
        obs, _ = env.reset()
        done = False
        episode_reward = 0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, truncated, _ = env.step(action)
            episode_reward += reward
            done = done or truncated

        # A positive return only means some digits were right. Count the
        # episode as solved only when the final board equals the solution,
        # provided the environment exposes both.
        board = getattr(env, "current_puzzle", None)
        target = getattr(env, "current_solution", None)
        if board is not None and target is not None:
            success = bool(np.array_equal(board, target))
        else:
            # Fallback for environments that do not expose their board.
            success = episode_reward > 0
        successes += success
        total_reward += episode_reward

        print(f"[INFO] Episode: {episode}/{n_episodes} | "
              f"Success: {success} | Reward: {episode_reward:.1f}")

    success_rate = (successes / n_episodes) * 100
    avg_reward = total_reward / n_episodes
    return success_rate, avg_reward

# Function to solve and display Sudoku puzzles
def evaluate_and_solve(model, env, algorithm_name):
    """Play one greedy episode and print the board before and after.

    Args:
        model: Object with ``predict(obs, deterministic=True)``.
        env: Environment exposing ``reset``, ``step`` and ``current_puzzle``.
        algorithm_name: Label used in the printed report.
    """
    print(f"\nSolving Sudoku with {algorithm_name}...")

    observation, _ = env.reset()
    initial_board = env.current_puzzle.copy()  # Snapshot before any move is made

    collected = 0
    started = time.time()
    while True:
        action, _ = model.predict(observation, deterministic=True)
        observation, reward, terminated, truncated, _ = env.step(action)
        collected += reward
        if terminated or truncated:
            break
    elapsed = time.time() - started

    print("\nUnsolved Board:")
    print(initial_board)
    print("\nSolved Board:")
    print(env.current_puzzle)
    print(f"\nTime Taken by {algorithm_name}: {elapsed:.2f} seconds")
    print(f"Reward Achieved: {collected}\n")

# Function to train and save the DQN model
def train_and_save_model(model, env, model_type, total_timesteps=10000):
    """Train ``model`` with progress logging and save it under ``models/``.

    Args:
        model: Algorithm instance exposing ``learn`` and ``save``.
        env: Not used by this function; kept in the signature for callers.
        model_type: Label for messages; lower-cased to build the file name.
        total_timesteps: Number of training timesteps passed to ``learn``.
    """
    print(f"Training {model_type} model...")
    model.learn(
        total_timesteps=total_timesteps,
        callback=TrainingCallback(total_timesteps),
    )

    # The output directory is created on demand.
    output_dir = "models"
    os.makedirs(output_dir, exist_ok=True)
    model_path = f"{output_dir}/{model_type.lower()}_sudoku"
    model.save(model_path)
    print(f"Model trained successfully and saved at: {model_path}")

# Main function to run the program
def main():
    """Train a DQN agent on every row of sudoku.csv and print its evaluation."""
    sudoku_env = SudokuEnv(pd.read_csv('sudoku.csv'))

    agent = DQN(
        "MlpPolicy",
        sudoku_env,
        verbose=0,
        learning_rate=0.0003,
        policy_kwargs={"net_arch": [64, 64]},
    )
    train_and_save_model(agent, sudoku_env, "DQN", total_timesteps=10000)

    print("\nEvaluating DQN model...")
    success_rate, mean_reward = evaluate_model(agent, sudoku_env)
    print("DQN Evaluation Completed.")
    print(f"DQN Success Rate: {success_rate:.1f}%")
    print(f"DQN Average Reward: {mean_reward:.1f}")


if __name__ == "__main__":
    main()

Training DQN model...
[INFO] Timestep: 100/10000 | Epsilon: 0.91 | Reward: 2.0
[INFO] Timestep: 200/10000 | Epsilon: 0.81 | Reward: -2.0
[INFO] Timestep: 300/10000 | Epsilon: 0.72 | Reward: -4.0
[INFO] Timestep: 400/10000 | Epsilon: 0.62 | Reward: 0.0
[INFO] Timestep: 500/10000 | Epsilon: 0.53 | Reward: -1.0
[INFO] Timestep: 600/10000 | Epsilon: 0.43 | Reward: -1.0
[INFO] Timestep: 700/10000 | Epsilon: 0.34 | Reward: 0.0
[INFO] Timestep: 800/10000 | Epsilon: 0.24 | Reward: 0.0
[INFO] Timestep: 900/10000 | Epsilon: 0.15 | Reward: 0.0
[INFO] Timestep: 1000/10000 | Epsilon: 0.05 | Reward: -2.0
[INFO] Timestep: 1100/10000 | Epsilon: 0.05 | Reward: 0.0
[INFO] Timestep: 1200/10000 | Epsilon: 0.05 | Reward: -1.0
[INFO] Timestep: 1300/10000 | Epsilon: 0.05 | Reward: 5.0
[INFO] Timestep: 1400/10000 | Epsilon: 0.05 | Reward: 0.0
[INFO] Timestep: 1500/10000 | Epsilon: 0.05 | Reward: 0.0
[INFO] Timestep: 1600/10000 | Epsilon: 0.05 | Reward: 0.0
[INFO] Timestep: 1700/10000 | Epsilon: 0.05 | Reward:

# **NeuralLogicMachine**

In [None]:
# arXiv PDF link kept as a reference for this section.
# NOTE(review): `url` is not read by any code visible in this notebook.
url='https://arxiv.org/pdf/2307.00653'

In [None]:
import numpy as np
import pandas as pd
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
import gymnasium as gym
from gymnasium import spaces

# Load and preprocess dataset
# Read the whole CSV, then expand every digit string into a row of integers.
data = pd.read_csv('/content/sudoku.csv')
puzzles = np.array([[int(digit) for digit in quiz] for quiz in data['quizzes']])
solutions = np.array([[int(digit) for digit in answer] for answer in data['solutions']])

# Custom Gym Environment for Sudoku
class SudokuNLMEnv(gym.Env):
    """
    Custom Gym Environment for solving Sudoku puzzles using Neuro Logic Machines (NLM).

    Observation: ``float32`` array of shape ``(9, 9, 3)``. Channel 0 is indexed
        ``[row, digit - 1]``, channel 1 ``[column, digit - 1]`` and channel 2
        ``[box, digit - 1]`` (boxes numbered row-major); an entry is 1 when
        that unit already contains the digit.
    Action: integer in ``[0, 728]`` encoding ``(row, col, number)``.
    Reward: -5 for an occupied cell or a digit that conflicts with its row,
        column or box; +1 for a conflict-free placement, plus 10 when the
        digit also matches the solution.

    NOTE(review): the observation encodes which digits each unit contains but
    not which individual cells are empty.
    """
    def __init__(self, puzzles, solutions):
        """
        Args:
            puzzles: Array of flattened puzzles (81 digits each, 0 = empty).
            solutions: Array of flattened solutions aligned with ``puzzles``.
        """
        super().__init__()
        self.puzzles = puzzles
        self.solutions = solutions

        # Action space: (row, col, number) encoded as a single integer (0 to 728)
        self.action_space = spaces.Discrete(9 * 9 * 9)

        # Observation space: predicates for row, column, and submatrix
        self.observation_space = spaces.Box(low=0, high=1, shape=(9, 9, 3), dtype=np.float32)

        self.current_puzzle = None
        self.current_solution = None
        self.steps = 0
        self.max_steps = 729

    def _action_to_coords(self, action):
        """
        Convert an action into row, column, and number to place.
        """
        row = action // (9 * 9)
        col = (action % (9 * 9)) // 9
        number = (action % 9) + 1
        return row, col, number

    def _generate_predicates(self, grid):
        """
        Build the "unit contains digit" predicates for rows, columns and boxes.

        The previous implementation summed the box predicate over all digits
        and clipped it, which made the third channel 1 wherever a box held
        any digit at all. The box channel now uses the same
        ``[unit, digit - 1]`` layout as the row and column channels.

        Returns:
            ``float32`` array of shape ``(9, 9, 3)`` matching the observation space.
        """
        digits = np.arange(1, 10)
        # matches[r, c, n] is True when cell (r, c) holds digit n + 1.
        matches = grid[:, :, None] == digits
        is_row = matches.any(axis=1)   # [row, digit]
        is_col = matches.any(axis=0)   # [column, digit]

        # Regroup the grid so each row of `boxes` lists the 9 cells of one box.
        boxes = grid.reshape(3, 3, 3, 3).transpose(0, 2, 1, 3).reshape(9, 9)
        is_submat = (boxes[:, :, None] == digits).any(axis=1)  # [box, digit]

        return np.stack([is_row, is_col, is_submat], axis=-1).astype(np.float32)

    def reset(self, seed=None, options=None):
        """
        Reset the environment with a random Sudoku puzzle.

        ``seed`` seeds the environment's own generator, which makes the puzzle
        sequence reproducible; ``options`` is accepted but unused.
        """
        super().reset(seed=seed)
        # Use the environment's generator so `seed` takes effect.
        idx = self.np_random.integers(len(self.puzzles))
        self.current_puzzle = self.puzzles[idx].reshape(9, 9).copy()
        self.current_solution = self.solutions[idx].reshape(9, 9).copy()
        self.steps = 0

        return self._generate_predicates(self.current_puzzle), {}

    def step(self, action):
        """
        Perform an action in the environment.

        Returns ``(observation, reward, terminated, truncated, info)``.
        """
        row, col, number = self._action_to_coords(int(action))
        self.steps += 1
        truncated = self.steps >= self.max_steps

        # Occupied cell: penalise, but let the episode continue.
        if self.current_puzzle[row, col] != 0:
            return self._generate_predicates(self.current_puzzle), -5, False, truncated, {}

        box_row, box_col = 3 * (row // 3), 3 * (col // 3)
        valid = (
            number not in self.current_puzzle[row, :] and
            number not in self.current_puzzle[:, col] and
            number not in self.current_puzzle[box_row:box_row + 3, box_col:box_col + 3]
        )
        reward = 1 if valid else -5

        if valid:
            self.current_puzzle[row, col] = number
            if self.current_puzzle[row, col] == self.current_solution[row, col]:
                reward += 10

        done = np.array_equal(self.current_puzzle, self.current_solution)

        return self._generate_predicates(self.current_puzzle), reward, done, truncated, {}

# Custom Callback for Monitoring Training
class SudokuCallback(BaseCallback):
    """
    Custom callback for logging and monitoring training progress.

    Every 1000 callback calls it prints the mean of the ``rewards`` entry
    found in the callback's ``locals``.
    """
    def __init__(self, verbose=1):
        super().__init__(verbose)

    def _on_step(self) -> bool:
        """Log the mean reward when due; always let training continue."""
        if self.n_calls % 1000 == 0:
            step_rewards = self.locals.get('rewards')
            # `rewards` is array-like. Testing its truthiness directly treats
            # a single 0 reward as "missing" and raises for several elements,
            # so check presence and size explicitly.
            if step_rewards is not None and np.size(step_rewards) > 0:
                avg_reward = np.mean(step_rewards)
                print(f"[INFO] Step {self.n_calls}: Avg Reward = {avg_reward:.2f}")
        return True

# Evaluate the Model
def evaluate_model(model, env, n_episodes=100):
    """
    Run greedy episodes and report how often the environment terminates.

    Args:
        model: Object with ``predict(obs, deterministic=True)``.
        env: Environment following the Gymnasium reset/step API.
        n_episodes: Number of episodes to play.

    Returns:
        ``(success_rate, avg_reward)``: fraction of terminated episodes
        (0-1) and the mean episode return.
    """
    solved = 0
    reward_sum = 0

    for _ in range(n_episodes):
        observation, _info = env.reset()
        episode_return = 0

        while True:
            action, _state = model.predict(observation, deterministic=True)
            observation, reward, terminated, truncated, _info = env.step(action)
            episode_return += reward

            # Termination counts as a solved puzzle; truncation just ends the episode.
            if terminated:
                solved += 1
            if terminated or truncated:
                break

        reward_sum += episode_return

    success_rate = solved / n_episodes
    avg_reward = reward_sum / n_episodes

    print(f"Evaluation Results:\nSuccess Rate: {success_rate:.2%}\nAverage Reward: {avg_reward:.2f}")
    return success_rate, avg_reward

# Main Script
if __name__ == "__main__":
    # Train the Model: PPO with the default MlpPolicy on a single vectorised
    # environment; SudokuCallback prints progress during learning.
    env = DummyVecEnv([lambda: SudokuNLMEnv(puzzles, solutions)])
    model = PPO("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=100_000, callback=SudokuCallback())
    model.save("sudoku_nlm_model")

    # Evaluate the Model on a separate, non-vectorised environment instance
    eval_env = SudokuNLMEnv(puzzles, solutions)
    evaluate_model(model, eval_env, n_episodes=100)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    loss                 | 2.84        |
|    n_updates            | 2390        |
|    policy_gradient_loss | -0.0291     |
|    value_loss           | 8.5         |
-----------------------------------------
[INFO] Step 492000: Avg Reward = -5.00
[INFO] Step 493000: Avg Reward = -5.00
-----------------------------------------
| time/                   |             |
|    fps                  | 226         |
|    iterations           | 241         |
|    time_elapsed         | 2176        |
|    total_timesteps      | 493568      |
| train/                  |             |
|    approx_kl            | 0.020133015 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.94       |
|    explained_variance   | 0.927       |
|    learning_rate        | 0.0003      |
|    loss                 | 3.06        |
|    n_updates            | 2400        |
|    policy_gradi

In [None]:
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO

# Step 1: Load and preprocess the dataset
data = pd.read_csv('/content/sudoku.csv')
# Shuffle reproducibly and keep at most one million rows. Capping at len(data)
# keeps this valid for smaller CSVs, where asking for more rows than exist
# would raise ValueError.
data = data.sample(n=min(1_000_000, len(data)), random_state=42)

# Convert puzzles and solutions into numpy arrays
puzzles = np.array([list(map(int, puzzle)) for puzzle in data['quizzes']])
solutions = np.array([list(map(int, solution)) for solution in data['solutions']])

# Step 2: Define the custom Sudoku environment
class SudokuEnv(gym.Env):
    """Sudoku environment in which the agent picks both a cell and a digit.

    Observation: the 9x9 board as ``float32`` (0 = empty, 1-9 = digit).
    Action: integer in ``[0, 728]``; ``action // 9`` is the cell (row-major)
        and ``action % 9 + 1`` the digit.
    Reward: +5 when the placed digit matches the solution, -1 otherwise;
        writing to a prefilled cell gives -1 and ends the episode.
    """

    def __init__(self, puzzles, solutions):
        """
        Args:
            puzzles: Array of flattened puzzles (81 digits each, 0 = empty).
            solutions: Array of flattened solutions aligned with ``puzzles``.
        """
        super().__init__()
        self.puzzles = puzzles
        self.solutions = solutions
        self.action_space = spaces.Discrete(81 * 9)  # 81 cells × 9 numbers
        self.observation_space = spaces.Box(
            low=0, high=9, shape=(9, 9), dtype=np.float32
        )
        self.current_puzzle = None
        self.current_solution = None
        self.prefilled = None  # Boolean mask of the current puzzle's given cells
        self.steps = 0
        self.max_steps = 100

    def _action_to_coords(self, action):
        """Convert a discrete action into row, column, and number."""
        cell = action // 9
        number = (action % 9) + 1
        row, col = divmod(cell, 9)
        return row, col, number

    def reset(self, seed=None, options=None):
        """Start a new episode on a randomly drawn puzzle.

        Args:
            seed: Seeds the environment's random generator, which makes the
                puzzle sequence reproducible.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        # Use the environment's generator so `seed` takes effect.
        idx = self.np_random.integers(len(self.puzzles))
        self.current_puzzle = self.puzzles[idx].reshape(9, 9).copy()
        self.current_solution = self.solutions[idx].reshape(9, 9)
        # Remember which cells were given, so step() can tell them apart from
        # cells the agent filled itself.
        self.prefilled = self.current_puzzle != 0
        self.steps = 0
        return self.current_puzzle.astype(np.float32), {}

    def step(self, action):
        """Place a digit and return ``(obs, reward, terminated, truncated, info)``."""
        row, col, number = self._action_to_coords(int(action))
        self.steps += 1

        # Check if the cell is prefilled in the *current* puzzle. The previous
        # code always looked at the first puzzle of the dataset.
        if self.prefilled[row, col]:
            return self.current_puzzle.astype(np.float32), -1, True, False, {}

        # Apply the action
        self.current_puzzle[row, col] = number

        # Calculate reward
        if self.current_puzzle[row, col] == self.current_solution[row, col]:
            reward = 5  # Positive reward for correct placement
        else:
            reward = -1  # Negative reward for incorrect placement

        # Check if puzzle is solved or max steps are reached
        done = np.array_equal(self.current_puzzle, self.current_solution)
        truncated = self.steps >= self.max_steps

        return self.current_puzzle.astype(np.float32), reward, done, truncated, {}

# Step 3: Initialize the environment
env = SudokuEnv(puzzles, solutions)

# Step 4: Define the PPO model
policy_kwargs = dict(net_arch=[256, 256])  # Hidden layer sizes passed to the MLP policy
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    policy_kwargs=policy_kwargs,
)

# Step 5: Train the model
# The same `env` instance is reused by the evaluation step further down.
total_timesteps = 100_000
model.learn(total_timesteps=total_timesteps)

# Step 6: Evaluate the model
def evaluate_model(model, env, n_episodes=10):
    """Run the model greedily for ``n_episodes`` and summarise the outcome.

    Args:
        model: Object with ``predict(obs, deterministic=True)``.
        env: Environment following the Gymnasium reset/step API.
        n_episodes: Number of evaluation episodes (must be positive).

    Returns:
        ``(success_rate, avg_reward)``: percentage of solved puzzles and the
        mean episode return.

    Raises:
        ValueError: If ``n_episodes`` is not positive.
    """
    if n_episodes <= 0:
        raise ValueError("n_episodes must be a positive integer")

    successes = 0
    total_reward = 0

    for episode in range(n_episodes):
        obs, _ = env.reset()
        done = False
        episode_reward = 0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, truncated, _ = env.step(action)
            episode_reward += reward
            done = done or truncated

        # A positive return only means some digits were right. Count the
        # episode as solved only when the final board equals the solution,
        # provided the environment exposes both.
        board = getattr(env, "current_puzzle", None)
        target = getattr(env, "current_solution", None)
        if board is not None and target is not None:
            success = bool(np.array_equal(board, target))
        else:
            # Fallback for environments that do not expose their board.
            success = episode_reward > 0
        successes += success
        total_reward += episode_reward

        print(f"Episode {episode + 1}/{n_episodes}: Success = {success}, Reward = {episode_reward}")

    success_rate = (successes / n_episodes) * 100
    avg_reward = total_reward / n_episodes
    return success_rate, avg_reward

# Evaluate on the training environment with the default episode count and report.
success_rate, avg_reward = evaluate_model(model, env)
print(f"Success Rate: {success_rate:.2f}%")
print(f"Average Reward: {avg_reward:.2f}")

# Save the model (the message below names the file as ppo_sudoku.zip)
model.save("ppo_sudoku")
print("Model saved as 'ppo_sudoku.zip'")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2.38     |
|    ep_rew_mean     | -1.6     |
| time/              |          |
|    fps             | 371      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 2.31       |
|    ep_rew_mean          | -1.29      |
| time/                   |            |
|    fps                  | 224        |
|    iterations           | 2          |
|    time_elapsed         | 18         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.05113909 |
|    clip_fraction        | 0.722      |
|    clip_range           | 0.2        |
|    entropy_loss         | -6.59