In [5]:

from pyboy import PyBoy, WindowEvent
import numpy as np
import gym
class GameBoyEnv(gym.Env):
    """Gym environment that plays a Game Boy ROM through the PyBoy emulator.

    Each call to ``step`` presses one of eight buttons for a single emulator
    frame and returns the rendered screen as the observation.  The reward is
    the sum of a novelty bonus for never-before-seen frames, the change in the
    score and coin memory values, and bonuses when the world/stage values
    change.

    Args:
        rom_path: Path of the ROM file handed to ``PyBoy``.  It is remembered
            so that ``reset`` reloads the same ROM.
    """
    metadata = {'render.modes': ['human', 'rgb_array']}

    # (press, release) event pair for every discrete action index.
    _BUTTON_EVENTS = (
        (WindowEvent.PRESS_BUTTON_A, WindowEvent.RELEASE_BUTTON_A),
        (WindowEvent.PRESS_BUTTON_B, WindowEvent.RELEASE_BUTTON_B),
        (WindowEvent.PRESS_ARROW_UP, WindowEvent.RELEASE_ARROW_UP),
        (WindowEvent.PRESS_ARROW_DOWN, WindowEvent.RELEASE_ARROW_DOWN),
        (WindowEvent.PRESS_ARROW_LEFT, WindowEvent.RELEASE_ARROW_LEFT),
        (WindowEvent.PRESS_ARROW_RIGHT, WindowEvent.RELEASE_ARROW_RIGHT),
        (WindowEvent.PRESS_BUTTON_SELECT, WindowEvent.RELEASE_BUTTON_SELECT),
        (WindowEvent.PRESS_BUTTON_START, WindowEvent.RELEASE_BUTTON_START),
    )

    class FrameStorage:
        """Tracks hashes of the frames seen so far to reward novel screens."""

        def __init__(self):
            self.unique_frame_hashes = set()

        def add_frame(self, observation):
            """Record ``observation`` and return 0.1 if it was unseen, else 0."""
            frame_hash = hash(observation.tobytes())
            reward = 0.1 if frame_hash not in self.unique_frame_hashes else 0
            self.unique_frame_hashes.add(frame_hash)
            return reward

    def __init__(self, rom_path="Super Mario Land (JUE) (V1.1) [!].gb"):
        super(GameBoyEnv, self).__init__()
        # Kept so reset() can reload the same ROM instead of the default one.
        self.rom_path = rom_path
        self.action_space = gym.spaces.Discrete(len(self._BUTTON_EVENTS))
        # NOTE(review): confirm that the declared (160, 144, 3) layout matches
        # the array screen_ndarray() actually returns for this PyBoy version.
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(160, 144, 3), dtype=np.uint8)
        self._start_emulator()

    def _start_emulator(self):
        """Create a fresh emulator plus per-episode bookkeeping and boot the game."""
        self.pyboy = PyBoy(self.rom_path)
        self.frame_storage = self.FrameStorage()
        self.reset_game_state()
        self.initialize_game()

    def reset_game_state(self):
        """Zero the step counter and the cached score/coin/world/stage values."""
        self.time_steps = 0
        self.last_score = 0
        self.last_coins = 0
        self.last_world = None
        self.last_stage = None

    def initialize_game(self):
        """Run the boot frames, press START, and cache the initial world/stage."""
        self.pyboy.set_emulation_speed(0)
        for _ in range(350):
            self.pyboy.tick()
        self.pyboy.send_input(WindowEvent.PRESS_BUTTON_START)
        for _ in range(20):
            self.pyboy.tick()
        # Queue the release so START is not held down for the whole episode;
        # it takes effect on the next emulator tick.
        self.pyboy.send_input(WindowEvent.RELEASE_BUTTON_START)
        self.update_game_state()

    def update_game_state(self):
        """Cache the memory values this class uses as world and stage."""
        self.last_world = self.pyboy.get_memory_value(0x982C)
        self.last_stage = self.pyboy.get_memory_value(0x982E)

    def step(self, action):
        """Press the button for ``action`` for one frame.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``observation``
            is the screen array after the frame and ``info`` is the dict from
            ``collect_game_info``.
        """
        release_event = None
        if 0 <= action < len(self._BUTTON_EVENTS):
            press_event, release_event = self._BUTTON_EVENTS[action]
            self.pyboy.send_input(press_event)
        self.pyboy.tick()
        if release_event is not None:
            # Without a release every button ever chosen would stay pressed.
            # The release is queued and applied at the start of the next tick,
            # ahead of whatever press the next step sends.
            self.pyboy.send_input(release_event)

        observation = self.pyboy.botsupport_manager().screen().screen_ndarray()
        frame_reward = self.frame_storage.add_frame(observation)
        reward, done = self.calculate_rewards_and_check_done(frame_reward)
        info = self.collect_game_info()
        self.time_steps += 1
        return observation, reward, done, info

    def get_score(self):
        """Return the byte at 0xC0A0, which this class treats as the score."""
        return self.pyboy.get_memory_value(0xC0A0)

    def get_coins(self):
        """Return the byte at 0xFFFA, which this class treats as the coin count."""
        return self.pyboy.get_memory_value(0xFFFA)

    def get_lives(self):
        """Return the byte at 0xDA15, which this class treats as the lives left."""
        return self.pyboy.get_memory_value(0xDA15)

    def check_game_over(self):
        """Episode ends when lives read 0 or after more than 100000 steps."""
        return self.get_lives() == 0 or self.time_steps > 100000

    def calculate_rewards_and_check_done(self, frame_reward):
        """Combine all reward terms and refresh the cached game values.

        Args:
            frame_reward: Novelty bonus already computed for the current frame.

        Returns:
            Tuple ``(reward, done)``.
        """
        current_score, current_coins = self.get_score(), self.get_coins()
        score_reward = (current_score - self.last_score) / 1000
        coin_reward = (current_coins - self.last_coins) / 10

        current_world = self.pyboy.get_memory_value(0x982C)
        current_stage = self.pyboy.get_memory_value(0x982E)
        stage_reward = 50 if current_stage != self.last_stage else 0
        world_reward = 100 if current_world != self.last_world else 0

        self.last_score, self.last_coins = current_score, current_coins
        self.last_world, self.last_stage = current_world, current_stage

        reward = frame_reward + score_reward + coin_reward + stage_reward + world_reward
        done = self.check_game_over()

        return reward, done

    def collect_game_info(self):
        """Return the most recently cached score, coins, world and stage."""
        return {
            'score': self.last_score, 
            'coins': self.last_coins, 
            'world': self.last_world, 
            'stage': self.last_stage
        }

    def reset(self):
        """Restart the emulator on the same ROM and return the first screen."""
        self.pyboy.stop()
        self._start_emulator()
        return self.pyboy.botsupport_manager().screen().screen_ndarray()

    def close(self):
        """Shut the emulator down."""
        self.pyboy.stop()


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import gym

class DQN(nn.Module):
    """Fully connected Q-network: flattened observation -> 128 units -> Q-values.

    Args:
        input_shape: Shape of one observation; its element count is the
            width of the first linear layer.
        n_actions: Number of outputs (one Q-value per action).
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()
        # Number of scalar inputs once a single observation is flattened.
        self.input_dim = np.prod(input_shape)

        hidden_units = 128
        layers = [
            nn.Linear(self.input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Scale ``x`` by 1/255, flatten all but the first dimension, and score it."""
        batch = x.size(0)
        scaled = x.float() / 255.0
        return self.net(scaled.view(batch, -1))



In [7]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of ``(state, action, reward, next_state, done)``.

    Once ``capacity`` transitions are held, pushing a new one drops the oldest.
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where the two
            state entries are numpy arrays and the other three are tuples.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return np.array(states), actions, rewards, np.array(next_states), dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)


In [8]:
class Agent:
    """Epsilon-greedy DQN agent with a replay buffer and a target network.

    Args:
        env: Environment exposing ``observation_space.shape``,
            ``action_space.n``, ``action_space.sample()``, ``reset()`` and
            ``step(action)`` returning ``(next_state, reward, done, info)``.
        replay_buffer: Object with ``push``, ``sample(batch_size)`` and
            ``__len__``.
    """

    def __init__(self, env, replay_buffer):
        self.env = env
        self.replay_buffer = replay_buffer
        self.state_shape = env.observation_space.shape
        self.n_actions = env.action_space.n
        self.model = DQN(self.state_shape, self.n_actions).float()
        self.target_model = DQN(self.state_shape, self.n_actions).float()
        # Start the target network as an exact copy of the online network;
        # otherwise every update before the first periodic sync bootstraps
        # from an unrelated, randomly initialised network.
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters())
        self.loss_fn = nn.MSELoss()

        self.gamma = 0.99               # discount factor
        self.batch_size = 64            # transitions per gradient step
        self.epsilon = 1.0              # current exploration probability
        self.epsilon_decay = 0.995      # multiplicative decay applied per episode
        self.epsilon_min = 0.01         # decay stops once epsilon is at/below this
        self.update_target_every = 10   # episodes between target-network syncs

    def act(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        state_t = torch.FloatTensor(state).unsqueeze(0)
        # Action selection needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            q_values = self.model(state_t)
        return q_values.max(1)[1].item()

    def update(self):
        """Run one gradient step on a sampled minibatch.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return
        state, action, reward, next_state, done = self.replay_buffer.sample(self.batch_size)

        state = torch.FloatTensor(state)
        next_state = torch.FloatTensor(next_state)
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(done)

        # Q-value of the action that was actually taken in each transition.
        q_values = self.model(state).gather(1, action.unsqueeze(-1)).squeeze(-1)
        # The bootstrap target is a constant with respect to the online
        # network, so it is computed without tracking gradients.
        with torch.no_grad():
            next_q_values = self.target_model(next_state).max(1)[0]
            # (1 - done) removes the bootstrap term for terminal transitions.
            expected_q_values = reward + self.gamma * next_q_values * (1 - done)

        loss = self.loss_fn(q_values, expected_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self, num_episodes):
        """Play ``num_episodes`` episodes, learning after every environment step.

        After each episode the target network is synced when the episode index
        is a multiple of ``update_target_every``, epsilon is decayed while it
        is above ``epsilon_min``, and a progress line is printed.
        """
        for episode in range(num_episodes):
            state = self.env.reset()
            total_reward = 0
            done = False

            while not done:
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.push(state, action, reward, next_state, done)

                state = next_state
                total_reward += reward

                self.update()

            if episode % self.update_target_every == 0:
                self.target_model.load_state_dict(self.model.state_dict())

            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay

            print(f"Episode: {episode}, Total reward: {total_reward}, Epsilon: {self.epsilon}")

# Wire up the environment, the replay memory and the agent, then train.
env = GameBoyEnv(rom_path="Super Mario Land (JUE) (V1.1) [!].gb")
replay_buffer = ReplayBuffer(capacity=100000)
agent = Agent(env=env, replay_buffer=replay_buffer)

# Train for 1000 episodes; one progress line is printed per episode.
agent.train(num_episodes=1000)


Episode: 0, Total reward: 1.9000000000000006, Epsilon: 0.995
Episode: 1, Total reward: 0.6, Epsilon: 0.990025
Episode: 2, Total reward: 0.6, Epsilon: 0.985074875
Episode: 3, Total reward: 0.2, Epsilon: 0.9801495006250001
Episode: 4, Total reward: 0.6, Epsilon: 0.9752487531218751
Episode: 5, Total reward: 0.7999999999999999, Epsilon: 0.9703725093562657
Episode: 6, Total reward: 2.3000000000000007, Epsilon: 0.9655206468094844
Episode: 7, Total reward: 0.4, Epsilon: 0.960693043575437
Episode: 8, Total reward: 0.6, Epsilon: 0.9558895783575597
Episode: 9, Total reward: 0.5, Epsilon: 0.9511101304657719
Episode: 10, Total reward: 0.7, Epsilon: 0.946354579813443
Episode: 11, Total reward: 0.7999999999999999, Epsilon: 0.9416228069143757
Episode: 12, Total reward: 0.5, Epsilon: 0.9369146928798039
Episode: 13, Total reward: 0.4, Epsilon: 0.9322301194154049
Episode: 14, Total reward: 0.7, Epsilon: 0.9275689688183278
Episode: 15, Total reward: 1.2, Epsilon: 0.9229311239742362
Episode: 16, Total rew