### Working 

In [1]:
import gym
import gym_sokoban
import numpy as np
import random
from PIL import Image
import pickle

# Set random seeds for reproducibility: Python's `random` module and NumPy's
# global generator are both seeded explicitly.
random.seed(42)
np.random.seed(42)

# Initialize the Sokoban environment with a specific seed
env = gym.make('Sokoban-large-v1')
env.seed(42)
# The action space keeps its own random generator, which env.seed() does not
# necessarily reach. Seed it as well so that env.action_space.sample() (used
# for exploration) yields the same sequence on every run.
env.action_space.seed(42)

# Q-learning hyperparameters.
# Learning rate (alpha): step size of each Q-value update. Higher values adapt
#   quickly; lower values give more stable learning.
# Discount factor (gamma): weight given to future rewards in the update target.
#   Higher values promote long-term planning; lower values favour immediate
#   rewards.
# Exploration rate (epsilon): probability of taking a random action instead of
#   the greedy one. Higher values explore more; lower values exploit more.
# Number of episodes: how many training episodes are run. More episodes give
#   more learning opportunities at the cost of training time.
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 0.1  # Exploration rate
num_episodes = 1000  # Number of training episodes
q_table_filename = "q_table_large.pkl"  # File the Q-table is pickled to and loaded from
# Q-table: maps a state key (see get_state_key) to an array holding one
# Q-value per action. Entries are created lazily during training.
q_table = {}

def get_state_key(state):
    """Return a hashable string key for a state.

    The state is converted to nested Python lists via its ``tolist()`` method
    and the string form of that structure is used as the Q-table key.
    """
    as_lists = state.tolist()
    return str(as_lists)

def choose_action(state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    A random action is sampled from the environment's action space when a
    uniform draw falls below ``epsilon`` or when the state has no Q-table
    entry yet; otherwise the index of the highest Q-value is returned.
    """
    key = get_state_key(state)
    # Draw first so the random stream is consumed on every call.
    explore = random.uniform(0, 1) < epsilon
    if not explore and key in q_table:
        return np.argmax(q_table[key])  # Exploit learned values
    return env.action_space.sample()  # Explore action space
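# NOTE: the Q-table has already been trained. To skip retraining, load the saved pickle below (the file named by q_table_filename) instead of running this loop.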
# Training the Q-learning algorithm
for episode in range(num_episodes):
    state = env.reset()
    done = False

    while not done:
        # The action is chosen BEFORE the Q-row for this state is created, so
        # a state seen for the first time is handled by choose_action's
        # "unknown state" branch.
        state_key = get_state_key(state)
        action = choose_action(state, epsilon)

        next_state, reward, done, info = env.step(action)
        next_state_key = get_state_key(next_state)

        # Lazily create zero-initialised Q-rows: current state first, then
        # the successor state.
        for key in (state_key, next_state_key):
            if key not in q_table:
                q_table[key] = np.zeros(env.action_space.n)

        # Q-learning update: move Q(s, a) towards
        # reward + gamma * max_a' Q(s', a') by a fraction alpha.
        q_row = q_table[state_key]
        best_next_value = np.max(q_table[next_state_key])
        q_row[action] += alpha * (reward + gamma * best_next_value - q_row[action])

        state = next_state

    # Progress report every 100 episodes.
    finished = episode + 1
    if finished % 100 == 0:
        print(f"Episode {finished}/{num_episodes} completed.")

# Save the Q-table to a file (binary pickle at q_table_filename) so a later
# run can reuse it without retraining.
with open(q_table_filename, 'wb') as f:
    pickle.dump(q_table, f)
print(f"Q-table saved to {q_table_filename}")

print("Training completed.")

# Load the Q-table from the file, replacing the in-memory table.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# a Q-table file that you created yourself or otherwise trust.
with open(q_table_filename, 'rb') as f:
    q_table = pickle.load(f)
print(f"Q-table loaded from {q_table_filename}")

def save_image_as_jpg(image, filename):
    """Write an image array to ``filename`` encoded as JPEG.

    The array is wrapped with ``Image.fromarray`` and saved with the format
    given explicitly, so the encoding does not depend on the file extension.
    """
    Image.fromarray(image).save(filename, format='JPEG')
    
# Testing the Q-learning algorithm: run one episode with the greedy policy
# from the loaded Q-table, saving a rendered frame before the first step and
# after every step.
state = env.reset()
done = False
total_reward = 0
steps = 0

# Save the initial frame (index 0) before any action is taken.
img = env.render(mode='rgb_array')
save_image_as_jpg(img, f"q_table_solver_{steps}")

while not done:
    state_key = get_state_key(state)
    if state_key in q_table:
        # Greedy action for a state seen during training.
        action = np.argmax(q_table[state_key])
    else:
        # A state never visited in training has no Q-row; indexing the table
        # directly would raise KeyError. Fall back to a random action, the
        # same way choose_action treats unknown states.
        action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    total_reward += reward
    steps += 1
    img = env.render(mode='rgb_array')
    save_image_as_jpg(img, f"q_table_solver_{steps}")
    print(f"Step: {steps}, current reward {round(total_reward,2)}")

img = env.render(mode='rgb_array')
print(f"Total reward during test: {total_reward}")
# NOTE(review): this message is printed whenever `done` becomes true; confirm
# whether the environment can also end an episode without the level being
# solved (e.g. a step limit), in which case the wording is misleading.
print(f"Level Completed. Steps: {steps}")

  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):


Episode 100/1000 completed.
Episode 200/1000 completed.
Episode 300/1000 completed.
Episode 400/1000 completed.
Episode 500/1000 completed.
Episode 600/1000 completed.
Episode 700/1000 completed.
Episode 800/1000 completed.
Episode 900/1000 completed.
Episode 1000/1000 completed.
Q-table saved to q_table_large.pkl
Training completed.
Q-table loaded from q_table_large.pkl
Step: 1, current reward -0.1
Step: 2, current reward -0.2
Step: 3, current reward -0.3
Step: 4, current reward -0.4
Step: 5, current reward -0.5
Step: 6, current reward -0.6


  logger.warn(


Step: 7, current reward -0.7
Step: 8, current reward -0.8
Step: 9, current reward -0.9
Step: 10, current reward -1.0
Step: 11, current reward -1.1
Step: 12, current reward -1.2
Step: 13, current reward -1.3
Step: 14, current reward -1.4
Step: 15, current reward -1.5
Step: 16, current reward -1.6
Step: 17, current reward -1.7
Step: 18, current reward -1.8
Step: 19, current reward -1.9
Step: 20, current reward -2.0
Step: 21, current reward -2.1
Step: 22, current reward -2.2
Step: 23, current reward -2.3
Step: 24, current reward -2.4
Step: 25, current reward -2.5
Step: 26, current reward -2.6
Step: 27, current reward -2.7
Step: 28, current reward -2.8
Step: 29, current reward -2.9
Step: 30, current reward -3.0
Step: 31, current reward -3.1
Step: 32, current reward -3.2
Step: 33, current reward -3.3
Step: 34, current reward -3.4
Step: 35, current reward -3.5
Step: 36, current reward -3.6
Step: 37, current reward -3.7
Step: 38, current reward -3.8
Step: 39, current reward -3.9
Step: 40, cur