### Working version: DQN agent for Sokoban

In [None]:
import gym
import gym_sokoban
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import cv2

class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        input_shape: Shape of a single observation without the batch axis;
            its first entry is used as the number of input channels.
        num_actions: Length of the output vector produced per observation.
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        in_channels = input_shape[0]
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        # The first linear layer is sized from what the conv stack actually
        # emits for this input shape rather than from a hand-derived formula.
        flat_features = self._get_conv_out(input_shape)
        self.fc1 = nn.Linear(flat_features, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def _get_conv_out(self, shape):
        """Return the element count of the conv output for one zero input of `shape`."""
        probe = torch.zeros(1, *shape)
        for layer in (self.conv1, self.conv2, self.conv3):
            probe = layer(probe)
        return probe.numel()

    def forward(self, x):
        """Map a batch of observations to one output row of size `num_actions` each."""
        for layer in (self.conv1, self.conv2, self.conv3):
            x = torch.relu(layer(x))
        flat = x.view(x.size(0), -1)  # keep the batch axis, flatten the rest
        hidden = torch.relu(self.fc1(flat))
        return self.fc2(hidden)

def preprocess_state(state):
    """Convert an environment frame into a normalized ``(1, 1, H, W)`` tensor.

    Args:
        state: Image as an array (or array-like), either ``H x W x 3`` RGB or
            already single-channel ``H x W``.

    Returns:
        A float32 ``torch.Tensor`` holding the (grayscale) pixel values divided
        by 255, with a leading batch axis and channel axis added.
    """
    state = np.asarray(state)
    # Only a 3-D array whose last axis has length 3 is treated as RGB. Checking
    # ndim as well keeps a 2-D grayscale frame that happens to be 3 pixels wide
    # from being passed to cvtColor.
    if state.ndim == 3 and state.shape[-1] == 3:
        state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
    state = state.astype(np.float32) / 255.0  # Normalize pixel values to [0, 1]
    # Prepend the channel and batch axes; the array is freshly allocated above,
    # so sharing its memory with the tensor is safe.
    return torch.from_numpy(state[np.newaxis, np.newaxis, ...])

def select_action(state, policy_net, epsilon, num_actions):
    """Pick an action epsilon-greedily.

    Draws one uniform random number; unless it exceeds `epsilon`, a uniformly
    random action index in ``range(num_actions)`` is returned. Otherwise the
    index of the largest value in ``policy_net(state)`` is returned.
    """
    explore = not (random.random() > epsilon)
    if explore:
        return random.randrange(num_actions)

    # Greedy branch: inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        q_values = policy_net(state)
    return q_values.argmax().item()

# --- Environment and networks ------------------------------------------------
env = gym.make('Sokoban-small-v1')
num_actions = env.action_space.n
input_shape = (1, *env.observation_space.shape[:2])  # Adjust to grayscale shape

# The policy network is trained; the target network is a periodically refreshed
# copy of it that is only used to compute bootstrap targets.
policy_net = DQN(input_shape, num_actions)
target_net = DQN(input_shape, num_actions)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# --- Optimisation setup and hyperparameters ----------------------------------
optimizer = optim.Adam(policy_net.parameters(), lr=0.001)
loss_fn = nn.MSELoss()  # built once instead of on every optimisation step
memory = deque(maxlen=10000)  # replay buffer; oldest transitions are dropped
batch_size = 32
gamma = 0.99  # discount factor applied to the bootstrapped next-state value
epsilon_start = 1.0
epsilon_end = 0.1
epsilon_decay = 1000  # time constant (in episodes) of the exponential decay
target_update = 10  # copy policy weights into the target net every N episodes
num_episodes = 1000

episode_rewards = []  # To track rewards per episode

# --- Training loop -----------------------------------------------------------
for episode in range(num_episodes):
    state = preprocess_state(env.reset())
    done = False
    # Epsilon decays exponentially with the episode index, from epsilon_start
    # toward epsilon_end.
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-1. * episode / epsilon_decay)
    total_reward = 0  # Track total reward for the episode

    while not done:
        action = select_action(state, policy_net, epsilon, num_actions)
        next_state, reward, done, _ = env.step(action)
        next_state = preprocess_state(next_state)
        memory.append((state, action, reward, next_state, done))

        state = next_state
        total_reward += reward

        # No learning step until the buffer can supply a full minibatch.
        if len(memory) < batch_size:
            continue

        batch = random.sample(memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.cat(states)
        next_states = torch.cat(next_states)
        # Explicit dtypes so the loss never mixes float32 predictions with
        # float64 targets, whatever scalar types the environment returns.
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # Q(s, a) for the actions that were actually taken.
        q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Targets are constants for the optimiser, so no autograd graph is
        # built through the target network. (1 - dones) removes the bootstrap
        # term for terminal transitions.
        with torch.no_grad():
            next_q_values = target_net(next_states).max(1)[0]
            expected_q_values = rewards + gamma * next_q_values * (1 - dones)

        loss = loss_fn(q_values, expected_q_values)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    episode_rewards.append(total_reward)

    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())

    if episode % 10 == 0:
        avg_reward = np.mean(episode_rewards[-10:])
        print(f'Episode {episode}, Average Reward: {avg_reward:.2f}, Epsilon: {epsilon:.2f}')

# Release any resources held by the environment now that training is over.
env.close()

# Save the trained model
torch.save(policy_net.state_dict(), 'sokoban_dqn_model.pth')
print("Training completed and model saved.")


  logger.warn(
  logger.warn(
  logger.warn(
  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):


Episode 0, Average Reward: -20.00, Epsilon: 1.00
Episode 10, Average Reward: -18.60, Epsilon: 0.99
Episode 20, Average Reward: -18.90, Epsilon: 0.98
Episode 30, Average Reward: -15.99, Epsilon: 0.97
Episode 40, Average Reward: -16.05, Epsilon: 0.96
Episode 50, Average Reward: -13.60, Epsilon: 0.96
Episode 60, Average Reward: -18.80, Epsilon: 0.95
Episode 70, Average Reward: -15.87, Epsilon: 0.94
Episode 80, Average Reward: -18.30, Epsilon: 0.93
Episode 90, Average Reward: -13.04, Epsilon: 0.92
Episode 100, Average Reward: -18.80, Epsilon: 0.91
Episode 110, Average Reward: -11.59, Epsilon: 0.91
Episode 120, Average Reward: -19.00, Epsilon: 0.90
Episode 130, Average Reward: -18.90, Epsilon: 0.89
Episode 140, Average Reward: -18.60, Epsilon: 0.88
Episode 150, Average Reward: -16.26, Epsilon: 0.87
Episode 160, Average Reward: -16.37, Epsilon: 0.87
Episode 170, Average Reward: -14.90, Epsilon: 0.86
Episode 180, Average Reward: -14.53, Epsilon: 0.85
Episode 190, Average Reward: -18.50, Epsil