## Setup the StreetFighter environment

In [1]:
%pip install --upgrade pip
%pip install gym==0.21.0
%pip install gym-retro
%pip install retrowrapper
%pip install opencv-python
%pip install matplotlib
%pip install torch 
%pip install stable-baselines3[extra]
%pip install stable-baselines3
%pip install optuna
%pip install tensorboard

Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.1.2
    Uninstalling pip-22.1.2:
      Successfully uninstalled pip-22.1.2
Successfully installed pip-22.3.1
[0mNote: you may need to restart the kernel to use updated packages.
Collecting gym==0.21.0
  Downloading gym-0.21.0.tar.gz (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: gym
  Building wheel for gym (setup.py) ... [?25ldone
[?25h  Created wheel for gym: filename=gym-0.21.0-py3-none-any.whl size=1616828 sha256=b8bd5e6a1a9fd7099a25bad493c9b12311d041f1195b118e988aaaa45a3b9fa4
  Sto

In [2]:
# # Get the dependencies for the virutal display
# !apt-get install python-opengl -y
# !apt install xvfb -y
# !pip install pyvirtualdisplay
# !pip install https://github.com/pyglet/pyglet/archive/pyglet-1.5-maintenance.zip
# !apt-get install ffmpeg -y

In [3]:
# from pyvirtualdisplay import Display
# import gym
# from gym import wrappers
# from gym import envs
# import matplotlib.pyplot as plt

# # Setup the virutal display
# display = Display(visible=0,size=(600,600))
# display.start()

In [4]:
# import retro for retro games (Street Fighter)
import retro
import retrowrapper
# import time to slow down the game
import time
import os 

# After downloading the ROM for Street Fighter, we used this command in the roms folder to connect it with our gym retro environment (python -m retro.import .)
!python -m retro.import /kaggle/input/street-fighter/street_fighter_rom
# import the ROM for Street Fighter
gamename = "StreetFighterIISpecialChampionEdition-Genesis"
# env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis')
env = retrowrapper.RetroWrapper(gamename, use_restricted_actions=retro.Actions.FILTERED)

Importing StreetFighterIISpecialChampionEdition-Genesis
Imported 1 games


In [5]:
# # wrap the environment in monitor for rendering the training
# monitor_dir = os.getcwd()
# env = wrappers.Monitor(env,monitor_dir,video_callable=lambda ep_id: ep_id%1000 == 0,force=True)

### Figure out the observation and action space of the environment

In [6]:
env.observation_space

Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
 

This most likely tells us that each observation is an image of height 200, width of 256, and 3 channels of RGB

In [7]:
env.action_space
env.action_space.sample()

array([0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1], dtype=int8)

This means that we have a one-hot-encoded vector of length 12 to represent our action space. This means that we have 2^12 possible actions!

# Preprocess the environment

### Agenda:
- Shrink the images so we have less pixels
- Calculate the frame delta (to understand movement and change within the game)
- Filter the action 
- Set the reward function to the score of the game

In [8]:
# import the environment base class
from gym import Env

# import opencv to process the image
import cv2
# import numpy to work calculate the frame delta
import numpy as np
# import the space shapes for our environment
from gym.spaces import MultiBinary, Box
# import matplotlib to plot the image
from matplotlib import pyplot as plt

In [9]:
# Create custom environment
class StreetFighter(Env):
    def __init__(self):
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        # startup an instance of the game
        gamename = 'StreetFighterIISpecialChampionEdition-Genesis'
        self.game = retrowrapper.RetroWrapper(gamename, use_restricted_actions=retro.Actions.FILTERED)
    
    def step(self, action):
        # take a step (using the base environment)
        obs, reward, done, info = self.game.step(action)
        # preprocess the observation
        obs = self.preprocess(obs)

        # calculate the frame delta
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        # calculate the score delta and reshape the reward function based on the score in the environment
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info

    def reset(self):
        obs = self.game.reset()
        # preprocess the image
        obs = self.preprocess(obs)
        # initialize the previous_frame value with the first frame
        self.previous_frame = obs
        # create a default value for the score delta
        self.score = 0
        return obs
    
    def preprocess(self, observation):
        # Grayscaling 
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # resize the image
        resize = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        # add the channels value 
        channels = np.reshape(resize, (84, 84, 1))

        return channels
        
    def render(self, *args, **kwargs):
        self.game.render()

    def close(self):
        self.game.close()

In [10]:
# # Setup a game loop to see what the game looks like (testing)
# obs = env.reset()
# done = False
# # we are choosing to only play one game
# for game in range(1):
#     while not done:
#         if done:
#             obs = env.reset()
#         env.render()
#         action = env.action_space.sample()
#         obs, reward, done, info = env.step(action)
#         if reward > 0:
#             print(reward)

## Tune hyperparameters with Optuna

In [11]:
import optuna 
from stable_baselines3 import PPO
# useful for evaluting the current policy during our hyperparameter tuning
from stable_baselines3.common.evaluation import evaluate_policy
# import Monitor for logging
from stable_baselines3.common.monitor import Monitor
# import DummyVecEnv for vectorizing our environment and frame stacking
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

In [12]:
LOG_DIR = "./logs/"
OPT_DIR = "./opt/"

In [13]:
# Function to return test hyperparameters
def optimize_ppo(trial):
    return {
        "n_steps": trial.suggest_int("n_steps", 2048, 8192),
        "gamma": trial.suggest_loguniform("gamma", 0.8, 0.9999),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 1e-4),
        "clip_range": trial.suggest_uniform("clip_range", 0.1, 0.4),
        "gae_lambda": trial.suggest_uniform("gae_lambda", 0.8, 0.99),
    }

In [14]:
env.close()

In [15]:
# Setup the training loop and return the mean reward
total_steps = 100000
def train_ppo(trial):
    try:
        # setup the hyperparameters
        hyperparams = optimize_ppo(trial)
        # setup the environment
        env = StreetFighter()
        # setup the monitor (this is important since we are vectorizing the environment, because this allows us 
        # to get the mean episode reward and mean episode length)
        env = Monitor(env, LOG_DIR)
        # setup the vectorized environment
        env = DummyVecEnv([lambda: env])
        # setup the frame stacking
        env = VecFrameStack(env, n_stack=4, channels_order='last')
        # setup the model
        model = PPO("CnnPolicy", env, verbose=0, tensorboard_log=LOG_DIR, **hyperparams)
        # train the model
        model.learn(total_timesteps=total_steps)
        # evaluate the model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
        # close the environment
        env.close()

        # save the best model
        SAVE_PATH = os.path.join(OPT_DIR, "trial_{}_best_model".format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        print(e)
        return -1000

In [16]:
# NOTE that since we used a positive reward function, we are maximizing the reward
study = optuna.create_study(direction="maximize")
study.optimize(train_ppo, n_trials=10, n_jobs=1)

[32m[I 2022-12-30 00:41:42,221][0m A new study created in memory with name: no-name-d25fb59d-f24e-451a-9f36-1c64ae86bbb8[0m
  """
  
  import sys
  
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=6790 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-12-30 01:09:08,556][0m Trial 0 finished with value: 2000.0 and parameters: {'n_steps': 6790, 'gamma': 0.8143299100790934, 'learning_rate': 1.0448588909232619e-05, 'clip_range': 0.3071062374064162, 'gae_lambda': 0.9655908762120912}. Best is trial 0 with value: 2000.0.[0m
  """
  
  import sys
  
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2959 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-12-30 01:37:10,357][0m Trial 1 finished with value: 1900.0 and parameters: {'n_steps': 2959, 'gamma': 0.9145911973419378, 'learning_rate': 5.797715368664101e-05, 'clip_range': 0.26413604648

In [17]:
best_model = PPO.load(os.path.join(OPT_DIR, "trial_{}_best_model".format(study.best_trial.number)))

# Setup Callback

In [18]:
from stable_baselines3.common.callbacks import BaseCallback

In [19]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [20]:
CHECKPOINT_DIR = "./train/"
callback = TrainAndLoggingCallback(check_freq=250000, save_path=CHECKPOINT_DIR)

# Train Model

In [21]:
env.close()
# Recreate the environment
env = StreetFighter()
# setup the monitor (this is important since we are vectorizing the environment, because this allows us
# to get the mean episode reward and mean episode length)
env = Monitor(env, LOG_DIR)
# setup the vectorized environment
env = DummyVecEnv([lambda: env])
# setup the frame stacking
env = VecFrameStack(env, n_stack=4, channels_order='last')

Process Process-5:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/site-packages/retrowrapper.py", line 23, in _retrocom
    attr, args, kwargs = rx.get()
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 94, in get
    res = self._recv_bytes()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


In [22]:
# Automatically choose the params n_steps to be the nearest factor of 64
factored_steps = round(study.best_trial.params["n_steps"] / 64) * 64

In [23]:
# update the current model params for the factoring
model_params = study.best_trial.params
model_params['n_steps'] = factored_steps

model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

# update the model based on the previous trainings
# recreate the zip file for the best model so far
# import shutil
# shutil.make_archive("best_model", 'zip', "/kaggle/input/street-fighter-rom/best_model_990000")
# # load the model 
# model = PPO.load("/kaggle/working/best_model", env)

model.learn(total_timesteps=10000000, callback=callback)
env.close()

Using cpu device
Wrapping the env in a VecTransposeImage.
Logging to ./logs/PPO_4
-----------------------------
| time/              |      |
|    fps             | 177  |
|    iterations      | 1    |
|    time_elapsed    | 38   |
|    total_timesteps | 6784 |
-----------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.21e+04    |
|    ep_rew_mean          | 3.32e+04    |
| time/                   |             |
|    fps                  | 96          |
|    iterations           | 2           |
|    time_elapsed         | 140         |
|    total_timesteps      | 13568       |
| train/                  |             |
|    approx_kl            | 0.013914814 |
|    clip_fraction        | 0.0344      |
|    clip_range           | 0.307       |
|    entropy_loss         | -8.31       |
|    explained_variance   | -2.38e-07   |
|    learning_rate        | 1.04e-05    |
|    loss                 | 2.28e+03

KeyboardInterrupt: 

In [None]:
env.close()

In [None]:
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [None]:
# mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=1)
# print(mean_reward)
# env.close()

# Test the model

In [None]:
# # display the training
# from IPython.display import HTML
# from base64 import b64encode

# video = [v for v in os.listdir('./') if 'mp4' in v]
# video.sort()
# print(len(video))
# # print(video[:26])
# vid_1 = open(video[0],'rb').read()
# data_url_1 = "data:video/mp4;base64," + b64encode(vid_1).decode()
# HTML("""
# <video width=600 height=600 controls>
#       <source src="%s" type="video/mp4">
# </video>
# """ % data_url_1)

In [None]:
# # Create an HTML video frame for it if the previous video frame didn't work
# vid_2 = open(video[-1],'rb').read()
# data_url_2 = "data:video/mp4;base64," + b64encode(vid_2).decode()
# HTML("""
# <video width=600 height=600 controls>
#       <source src="%s" type="video/mp4">
# </video>
# """ % data_url_2)

In [None]:
# import time

# # code to render the agent's progress and log the rewards
# for episode in range(1): 
#     obs = env.reset()
#     done = False
#     total_reward = 0
#     while not done: 
#         action, _ = model.predict(obs)
#         obs, reward, done, info = env.step(action)
#         env.render()
#         time.sleep(0.01)
#         total_reward += reward
#     print('Total Reward for episode {} is {}'.format(total_reward, episode))
#     time.sleep(2)