In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from PIL import Image

In [None]:
#Convolutional Q-network for stacked image frames
class DQN(nn.Module):
    """Convolutional network producing one Q-value per action.

    Three convolution layers are followed by two fully connected layers.
    The first convolution takes 4 input channels, and the first linear layer
    expects 7*7*64 flattened features, which is what the convolution stack
    yields for 84x84 inputs.
    """

    def __init__(self, action_dim):
        """Build the layers; ``action_dim`` is the size of the output layer."""
        super().__init__()
        #Feature extractor
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        #Q-value head
        self.fc1 = nn.Linear(7*7*64, 512)
        self.fc2 = nn.Linear(512, action_dim)

    def forward(self, x):
        """Return the Q-values for a batch of stacked frames.

        A singleton dimension at index 1, if present, is dropped before the
        convolutions; otherwise the input is used as given.
        """
        features = x.squeeze(1)
        for conv in (self.conv1, self.conv2, self.conv3):
            features = torch.relu(conv(features))
        #Flatten everything except the batch dimension
        flat = features.view(features.size(0), -1)
        hidden = torch.relu(self.fc1(flat))
        return self.fc2(hidden)

In [None]:
#Create the Ms. Pac-Man environment
env = gym.make('MsPacman-v4')
#Number of discrete actions; passed to DQN as the size of its output layer
action_dim = env.action_space.n

In [None]:
#Initialize the primary and target DQN networks with the same output size
dqn = DQN(action_dim)
target_dqn = DQN(action_dim)
target_dqn.load_state_dict(dqn.state_dict()) #start the target network from the primary network's weights

In [None]:
#--- Q-learning target ---
#Discount factor applied to the bootstrapped next-state value
gamma = 0.99

#--- Epsilon-greedy exploration schedule ---
#Starting exploration rate (probability of taking a random action)
epsilon = 1.0
#Multiplicative decay applied to epsilon once per episode
epsilon_decay = 0.995
#Lower bound for epsilon; decay stops once this value is reached
epsilon_min = 0.1

#--- Replay memory ---
#Stored transitions: (state, action, reward, next_state, done)
memory = []
#Maximum number of transitions kept; the oldest is dropped beyond this
max_memory = 10_000
#Number of transitions sampled per training step
batch_size = 32

#--- Training length ---
#Number of episodes to run
episodes = 500

#--- Optimization ---
#Adam over the primary network's parameters, learning rate 0.00025
optimizer = optim.Adam(dqn.parameters(), lr=2.5e-4)
#Smooth L1 loss between predicted and target Q-values
loss_fn = nn.SmoothL1Loss()

In [None]:
#Preprocess the image
def preprocess(image):
    """Convert a raw frame into a normalized 84x84 grayscale array.

    Args:
        image: array accepted by ``PIL.Image.fromarray`` (presumably the raw
            RGB frame returned by the environment; confirm against caller).

    Returns:
        ``np.ndarray`` of shape (84, 84) and dtype float32 with values in
        [0, 1].
    """
    #Grayscale ('L' mode is 8-bit), then shrink to the network's input size
    grayscale = Image.fromarray(image).convert('L').resize((84, 84))
    #Normalize in float64, then store as float32: every consumer casts to
    #float32 via torch.FloatTensor anyway, so the values seen by the network
    #are unchanged while each frame kept in the replay memory is half the size
    return (np.asarray(grayscale) / 255.0).astype(np.float32)


In [None]:
#Epsilon-greedy action selection
def select_action(state, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    With probability ``epsilon`` a random action is sampled from the
    environment's action space; otherwise the action with the highest
    Q-value predicted by the primary network is returned.

    Args:
        state: array-like observation for a single step (no batch axis).
        epsilon: probability of exploring with a random action.

    Returns:
        The chosen action.
    """
    explore = random.random() < epsilon
    if explore:
        return env.action_space.sample()

    #Add a leading batch axis and move the observation to the model's device
    batched = torch.FloatTensor(state).unsqueeze(0).to(device)
    with torch.no_grad():
        q_values = dqn(batched)
    return q_values.argmax().item()

In [None]:
#Function to train the DQN using mini-batch from the memory buffer
def train():
    """Run one optimization step on a random mini-batch from the replay memory.

    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    Each transition is a ``(state, action, reward, next_state, done)`` tuple.
    The target for the taken action is
    ``reward + gamma * max_a target_dqn(next_state)[a] * (1 - done)``,
    and the primary network is updated with ``loss_fn`` and ``optimizer``.
    """
    if len(memory) < batch_size:
        return

    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    #Go through NumPy with explicit dtypes so Python and NumPy scalars
    #(ints, floats, bools) are all converted the same way
    state_batch = torch.FloatTensor(np.array(states)).to(device)
    action_batch = torch.LongTensor(np.array(actions, dtype=np.int64)).unsqueeze(1).to(device)
    reward_batch = torch.FloatTensor(np.array(rewards, dtype=np.float32)).unsqueeze(1).to(device)
    next_state_batch = torch.FloatTensor(np.array(next_states)).to(device)
    done_batch = torch.FloatTensor(np.array(dones, dtype=np.float32)).unsqueeze(1).to(device)

    #Q-value the primary network assigns to the action that was actually taken
    q_values = dqn(state_batch).gather(1, action_batch)

    #The target is a constant for the loss, so compute it without building
    #an autograd graph through the target network
    with torch.no_grad():
        next_q_values = target_dqn(next_state_batch).max(1)[0].unsqueeze(1)
        #(1 - done) removes the bootstrap term for terminal transitions
        expected_q_values = reward_batch + gamma * next_q_values * (1 - done_batch)

    loss = loss_fn(q_values, expected_q_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
#Multi-tier reward shaping
def adjust_reward(reward, info):
    """Return ``reward`` adjusted by the event keys present in ``info``.

    Each key below that is present in ``info`` adds its amount to the reward:
    a small bonus for a regular pellet, a larger one for a power pellet, the
    largest for clearing all pellets, and a penalty for being caught.

    NOTE(review): only key membership is checked, not the associated value,
    so a key mapped to False still triggers its adjustment. Confirm that the
    environment actually emits these keys.

    Args:
        reward: reward returned by the environment for the step.
        info: mapping returned by the environment for the step.

    Returns:
        The adjusted reward.
    """
    #(info key, amount added when the key is present), applied in this order
    tier_adjustments = (
        ('pellet_captured', 1),
        ('power_pellet_captured', 30),
        ('all_pellets_captured', 100),
        ('ghost_caught', -50),
    )
    for event_key, amount in tier_adjustments:
        if event_key in info:
            reward += amount
    return reward

In [None]:
#Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#Move both networks to the selected device
dqn.to(device)
target_dqn.to(device)

In [None]:
#Frames currently stacked into the agent's observation (rebuilt every episode)
state_buffer = []

for episode in range(episodes):
    #env.reset() returns either the observation or an (observation, info) tuple
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    #Start the episode with the first frame repeated four times
    state_buffer = [preprocess(state)] * 4
    total_reward = 0

    for t in range(10000):
        stacked_state = np.stack(state_buffer, axis=0)
        action = select_action(stacked_state, epsilon)

        #env.step() returns 5 values (terminated and truncated separately)
        #or 4 values (a single done flag), depending on the API version
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, info = step_result
        else:
            next_state, reward, terminated, info = step_result
            truncated = False
        done = terminated or truncated

        #Adjust the reward based on the reward tier system
        reward = adjust_reward(reward, info)

        #Slide the frame window: drop the oldest frame, append the newest
        next_state_buffer = state_buffer[1:] + [preprocess(next_state)]

        #Store only true termination as the terminal flag, so a time-limit
        #truncation does not zero out the bootstrapped value in train()
        memory.append((stacked_state, action, reward, np.stack(next_state_buffer, axis=0), terminated))
        if len(memory) > max_memory:
            memory.pop(0)

        train()

        state_buffer = next_state_buffer
        total_reward += reward
        if done:
            break

    #Decay exploration once per episode without dropping below the floor
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    #Refresh the target network from the primary network every 10 episodes
    if episode % 10 == 0:
        target_dqn.load_state_dict(dqn.state_dict())

    print(f"Episode: {episode}, Total Reward: {total_reward}, Epsilon: {epsilon:.4f}")

env.close()

Episode: 0, Total Reward: 190.0, Epsilon: 0.9950
Episode: 1, Total Reward: 210.0, Epsilon: 0.9900
Episode: 2, Total Reward: 1100.0, Epsilon: 0.9851
Episode: 3, Total Reward: 190.0, Epsilon: 0.9801
Episode: 4, Total Reward: 260.0, Epsilon: 0.9752
Episode: 5, Total Reward: 120.0, Epsilon: 0.9704
Episode: 6, Total Reward: 170.0, Epsilon: 0.9655
Episode: 7, Total Reward: 340.0, Epsilon: 0.9607
Episode: 8, Total Reward: 230.0, Epsilon: 0.9559
Episode: 9, Total Reward: 250.0, Epsilon: 0.9511
Episode: 10, Total Reward: 100.0, Epsilon: 0.9464
Episode: 11, Total Reward: 150.0, Epsilon: 0.9416
Episode: 12, Total Reward: 240.0, Epsilon: 0.9369
Episode: 13, Total Reward: 210.0, Epsilon: 0.9322
Episode: 14, Total Reward: 200.0, Epsilon: 0.9276
Episode: 15, Total Reward: 180.0, Epsilon: 0.9229
Episode: 16, Total Reward: 190.0, Epsilon: 0.9183
Episode: 17, Total Reward: 160.0, Epsilon: 0.9137
Episode: 18, Total Reward: 200.0, Epsilon: 0.9092
Episode: 19, Total Reward: 240.0, Epsilon: 0.9046
Episode: 