# Install libraries

In [None]:
# Environment setup: pip itself, the Mario/NES emulator packages, then pinned
# packaging tools, gym, numpy and OpenCV.
# NOTE(review): pip/wheel/setuptools are pinned to older releases, presumably
# because gym==0.21.0 does not install cleanly with newer packaging tools --
# confirm before bumping any of these pins.
%pip install -q pip==23.3.2
# NOTE(review): gym_super_mario_bros and nes_py are the only unpinned packages here.
%pip install -q gym_super_mario_bros nes_py
%pip install -q wheel==0.38.4 setuptools==66.0.0 gym==0.21.0
%pip install -q numpy==1.26.4 opencv-python==4.8.0.76

In [None]:
# Install torch/torchvision/torchaudio from the PyTorch cu121 wheel index (CUDA 12.1 builds).
%pip install -q torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu121

In [1]:
import torch

# Pick the compute device once: the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was chosen (GPU 0's name when running on CUDA).
if device.type == "cuda":
    print(f"CUDA is available. Using device: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. Using CPU.")

CUDA is available. Using device: NVIDIA GeForce RTX 4060 Laptop GPU


# Setup environment and wrappers


In [2]:
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from nes_py.wrappers import JoypadSpace
from gym.wrappers import GrayScaleObservation, ResizeObservation, FrameStack
from gym import RewardWrapper, Wrapper
import numpy as np

In [3]:
class CustomRewardWrapper(RewardWrapper):
    """Reward shaping layered on top of the wrapped environment's own reward.

    On every step the reward is increased by the score gained since the
    previous step divided by 40. On the step that ends an episode, 50 is
    added when ``info["flag_get"]`` is truthy and 50 is subtracted otherwise.
    The resulting value is divided by 10 before being returned.
    """

    def __init__(self, env):
        super().__init__(env)
        # Score seen on the previous step; baseline for the per-step score delta.
        self.curr_score = 0

    def reset(self, **kwargs):
        """Clear the score baseline and reset the wrapped environment."""
        self.curr_score = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped environment and return the shaped, rescaled reward.

        Returns the usual ``(state, reward, done, info)`` tuple; only the
        reward differs from what the wrapped environment produced.
        """
        state, shaped, done, info = self.env.step(action)

        # Bonus proportional to the in-game score gained during this step.
        score_now = info["score"]
        shaped += (score_now - self.curr_score) / 40.
        self.curr_score = score_now

        if not done:
            return state, shaped / 10, done, info

        # Episode finished: reward reaching the flag, penalise any other ending.
        shaped += 50 if info["flag_get"] else -50
        return state, shaped / 10, done, info

class SkipFrame(Wrapper):
    """Repeat each action for up to ``skip`` consecutive environment steps.

    The rewards of the repeated steps are summed; the state, done flag and
    info of the last executed step are returned.

    Args:
        env: Environment to wrap.
        skip: Number of times each action is repeated. Must be at least 1.

    Raises:
        ValueError: If ``skip`` is smaller than 1. Without this check the
            loop in ``step`` would never run and ``step`` would fail with an
            UnboundLocalError on its return statement.
    """

    def __init__(self, env, skip):
        if skip < 1:
            raise ValueError(f"skip must be >= 1, got {skip}")
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Apply ``action`` repeatedly and accumulate the reward.

        Stops early as soon as the wrapped environment reports ``done`` so
        that no step is taken on a finished episode.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            state, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return state, total_reward, done, info

In [4]:
def create_env():
    """Build the SuperMarioBros-1-1-v0 environment with its preprocessing stack.

    Wrappers are applied innermost-first in the order listed below: the
    action space is restricted to SIMPLE_MOVEMENT, the reward is reshaped on
    every underlying step, each action is repeated for 4 steps, and the
    observation is converted to grayscale, resized to 84x84 and stacked over
    the 4 most recent frames.

    Returns:
        The fully wrapped environment.
    """
    # Each entry takes the environment built so far and returns it wrapped once more.
    wrapper_chain = (
        lambda inner: JoypadSpace(inner, SIMPLE_MOVEMENT),
        CustomRewardWrapper,
        lambda inner: SkipFrame(inner, skip=4),
        lambda inner: GrayScaleObservation(inner, keep_dim=True),
        lambda inner: ResizeObservation(inner, (84, 84)),
        lambda inner: FrameStack(inner, num_stack=4),
    )

    wrapped = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    for wrap in wrapper_chain:
        wrapped = wrap(wrapped)
    return wrapped

# Environment instance used by the training loop further down.
env = create_env()

# Training

In [5]:
import os
from collections import deque
from torch.utils.tensorboard import SummaryWriter
from PPO import PPO

In [6]:
# Directory the training loop writes checkpoints to, as model_<timestep>_steps.pt.
CHECKPOINT_DIR = './train/'
# Directory handed to SummaryWriter for the training metrics.
LOG_DIR = './logs/'

In [None]:
# Build the PPO agent; the implementation comes from the local PPO module.
# NOTE(review): the meaning of each hyperparameter is defined in PPO.py, which
# is not visible here -- the inline notes below are assumptions to confirm.
model = PPO(
    state_dim=(84, 84, 4),  # presumably 84x84 frames with 4 stacked -- verify the axis order PPO expects against what create_env() emits
    action_dim=len(SIMPLE_MOVEMENT),  # one action per entry of the SIMPLE_MOVEMENT list given to JoypadSpace
    lr_actor=0.0001,
    lr_critic=0.0001,
    gamma=0.9,  # presumably the discount factor -- TODO confirm
    n_epochs=10,
    eps_clip=0.2,
    gae_lambda=1.0,
    ent_coef=0.01,
    vf_coef=0.5,
    max_grad_norm=0.5,
    batch_size=64
)

In [8]:
# Training schedule: collect n_steps environment steps, run one PPO update, repeat
# until at least total_timesteps steps have been taken.
total_timesteps = 5_000_000
n_steps = 512
obs = env.reset()

timestep = 0
save_freq = 200000  # write a checkpoint every save_freq environment steps

# Rolling statistics over the 100 most recently completed episodes.
episode_rewards = deque(maxlen=100)
episode_lengths = deque(maxlen=100)
ep_reward = 0.0
ep_length = 0

# Make sure the checkpoint directory exists before the first save; model.save()
# is not guaranteed to create missing parent directories.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

writer = SummaryWriter(LOG_DIR)

try:
    while timestep < total_timesteps:
        # --- Rollout: gather n_steps transitions with the current policy. ---
        for _ in range(n_steps):
            # NOTE(review): select_action presumably records the state/action in
            # model.buffer; only reward and terminal flag are appended here.
            action = model.select_action(obs)
            next_obs, reward, done, info = env.step(action)

            model.buffer.rewards.append(float(reward))
            model.buffer.is_terminals.append(float(done))

            ep_reward += float(reward)
            ep_length += 1
            obs = next_obs
            timestep += 1

            # timestep is already >= 1 here, so the modulo test alone is enough.
            if timestep % save_freq == 0:
                model_path = os.path.join(CHECKPOINT_DIR, f'model_{timestep}_steps.pt')
                model.save(model_path)
                print(f"Model saved at {timestep:,} steps")

            if done:
                # Episode finished: record its totals and start a new one.
                episode_rewards.append(ep_reward)
                episode_lengths.append(ep_length)
                ep_reward = 0.0
                ep_length = 0
                obs = env.reset()

        # --- Learn from the collected rollout. ---
        model.update()

        # Means are NaN until the first episode has completed.
        mean_rew = float(np.mean(episode_rewards)) if len(episode_rewards) > 0 else float('nan')
        mean_len = float(np.mean(episode_lengths)) if len(episode_lengths) > 0 else float('nan')
        print(f"steps={timestep:,} | mean_reward={mean_rew:.2f} | mean_length={mean_len:.1f}")

        if len(episode_rewards) > 0:
            writer.add_scalar('rollout/ep_rew_mean', mean_rew, timestep)
            writer.add_scalar('rollout/ep_len_mean', mean_len, timestep)
        writer.add_scalar('time/steps', timestep, timestep)
        writer.flush()

finally:
    # Always release the log writer, even when training is interrupted.
    writer.close()

steps=512 | mean_reward=nan | mean_length=nan
steps=1,024 | mean_reward=nan | mean_length=nan
steps=1,536 | mean_reward=nan | mean_length=nan
steps=2,048 | mean_reward=56.95 | mean_length=1010.0
steps=2,560 | mean_reward=61.38 | mean_length=739.7
steps=3,072 | mean_reward=67.08 | mean_length=596.0
steps=3,584 | mean_reward=67.90 | mean_length=526.5
steps=4,096 | mean_reward=61.68 | mean_length=430.4
steps=4,608 | mean_reward=61.50 | mean_length=366.5
steps=5,120 | mean_reward=61.09 | mean_length=315.1
steps=5,632 | mean_reward=68.48 | mean_length=312.0
steps=6,144 | mean_reward=71.98 | mean_length=319.6
steps=6,656 | mean_reward=72.49 | mean_length=304.7
steps=7,168 | mean_reward=69.46 | mean_length=273.1
steps=7,680 | mean_reward=69.04 | mean_length=267.1
steps=8,192 | mean_reward=72.22 | mean_length=264.2
steps=8,704 | mean_reward=71.54 | mean_length=246.5
steps=9,216 | mean_reward=73.69 | mean_length=238.1
steps=9,728 | mean_reward=73.43 | mean_length=230.5
steps=10,240 | mean_rewar

# Test

In [6]:
from PPO import PPO

In [7]:
# Rebuild the agent with the same hyperparameter values used in the training
# section, then restore a saved checkpoint into it.
# NOTE(review): the values are duplicated from the training cell; keep the two
# in sync if either changes.
model = PPO(
    state_dim=(84, 84, 4), 
    action_dim=len(SIMPLE_MOVEMENT), 
    lr_actor=0.0001, 
    lr_critic=0.0001,
    gamma=0.9, 
    n_epochs=10, 
    eps_clip=0.2,
    gae_lambda=1.0, 
    ent_coef=0.01, 
    vf_coef=0.5, 
    max_grad_norm=0.5, 
    batch_size=64
)
# File name follows the model_<timestep>_steps.pt pattern written by the training loop.
model.load('./train/model_2000000_steps.pt')

In [None]:
import time

# Evaluate the loaded agent for a fixed number of rendered episodes and report
# the mean reward, mean episode length and how often the flag was reached.
env = create_env()
state = env.reset()
num_episodes = 10
total_rewards = 0.0
total_steps = 0
total_flags = 0

try:
    for episode in range(num_episodes):
        while True:
            action = model.select_action(state)
            state, reward, done, info = env.step(action)
            total_rewards += float(reward)
            total_steps += 1

            if info.get("flag_get"):
                total_flags += 1

            env.render()
            # Throttle rendering so the playback is watchable.
            time.sleep(1/80)

            if done:
                state = env.reset()
                break

    # Summary is printed only when all episodes ran to completion.
    print(f"Episodes: {num_episodes}")
    print(f"Mean reward: {total_rewards/num_episodes:.2f}")
    print(f"Mean steps: {total_steps/num_episodes:.1f}")
    print(f"Total flags: {total_flags}")

except KeyboardInterrupt:
    print("\nInterrupted by user. Closing environment...")

finally:
    # Close the environment on every exit path (normal completion, Ctrl-C, or
    # any other exception) so the emulator and render window are released.
    env.close()

Episodes: 10
Mean reward: 234.85
Mean steps: 258.8
Total flags: 6
