## Setup the StreetFighter environment

In [22]:
# import retro for retro games (Street Fighter)
import retro
import retrowrapper
# use the time module to slow down the game if needed when viewing 
import time
import os 

# After downloading the ROM for Street Fighter, register it with gym retro by running
# `python -m retro.import .` inside the roms folder.
# !python -m retro.import ../input/street-fighter-rom
# Name of the game and the raw environment for it; this `env` is inspected in the
# next cells and closed again before the custom environments are built.
gamename = "StreetFighterIISpecialChampionEdition-Genesis"
env = retrowrapper.RetroWrapper(gamename, use_restricted_actions=retro.Actions.FILTERED)

### Figure out the observation and action space of the environment

In [23]:
# Display the observation space; the repr prints the full lower (0) and upper (255) bound arrays.
env.observation_space

Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
 

This most likely tells us that each observation is an image of height 200, width of 256, and 3 channels of RGB

In [24]:
# Only the last expression of a cell is displayed, so this first line shows nothing.
env.action_space
# Draw one random action to see what an action looks like.
env.action_space.sample()

array([0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int8)

This means that each action is a multi-binary vector of length 12: one on/off flag per button, and several flags can be set at once (the sample above has four). It is therefore not a one-hot encoding, and we have 2^12 possible actions!

# Preprocess the environment

### Agenda:
- Shrink the images so we have fewer pixels
- Calculate the frame delta (to understand movement and change within the game)
- Filter the actions
- Set the reward to the change in the game score

In [25]:
# import the environment base class
from gym import Env

# import opencv to process the image
import cv2
# import numpy to calculate the frame delta
import numpy as np
# import the space shapes for our environment
from gym.spaces import MultiBinary, Box
# import matplotlib to plot the image
from matplotlib import pyplot as plt

In [26]:
# Create custom environment
class StreetFighter(Env):
    """Gym environment that wraps Street Fighter II with preprocessed observations.

    Observations are declared as 84x84 single-channel ``uint8`` images and
    actions as 12 binary flags. ``reset`` returns the preprocessed first frame,
    while ``step`` returns the difference between the current and the previous
    preprocessed frame. The reward of a step is the change in ``info['score']``
    since the previous step.
    """

    def __init__(self):
        """Declare the observation/action spaces and start the game instance."""
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        # startup an instance of the game
        gamename = 'StreetFighterIISpecialChampionEdition-Genesis'
        self.game = retrowrapper.RetroWrapper(gamename, use_restricted_actions=retro.Actions.FILTERED)

    def step(self, action):
        """Advance the game by one step.

        ``reset`` must have been called first, because it initializes
        ``previous_frame`` and ``score``.

        Returns:
            A tuple ``(frame_delta, reward, done, info)`` where ``frame_delta``
            is the current preprocessed frame minus the previous one, ``reward``
            is the score gained during this step, and ``done`` / ``info`` are
            passed through from the underlying game.
        """
        # take a step (using the base environment)
        obs, reward, done, info = self.game.step(action)
        # preprocess the observation
        obs = self.preprocess(obs)

        # calculate the frame delta
        # NOTE(review): if the frames are uint8, this subtraction wraps modulo 256
        # instead of producing negative values. Left unchanged because it defines
        # what already-trained models observe -- confirm this is intended.
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        # the reward from the game is replaced by the score gained since the last step
        reward = info['score'] - self.score
        self.score = info['score']

        return frame_delta, reward, done, info

    def reset(self):
        """Restart the game and return the preprocessed first frame.

        Also resets the bookkeeping used by ``step``: the previous frame becomes
        the first frame and the running score becomes 0. Note that the value
        returned here is a plain frame, not a frame delta.
        """
        obs = self.game.reset()
        # preprocess the image
        obs = self.preprocess(obs)
        # initialize the previous_frame value with the first frame
        self.previous_frame = obs
        # the score delta is measured from 0 at the start of every episode
        self.score = 0
        return obs

    def preprocess(self, observation):
        """Convert a raw frame to an 84x84x1 grayscale image."""
        # Grayscaling
        # NOTE(review): retro frames are presumably RGB, in which case
        # cv2.COLOR_RGB2GRAY would be the matching flag -- confirm. Left unchanged
        # so observations stay identical for already-trained models.
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # resize the image
        resize = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        # add the channels value
        channels = np.reshape(resize, (84, 84, 1))

        return channels

    def render(self, *args, **kwargs):
        """Render the game and return the underlying environment's result.

        Positional and keyword arguments (for example a render mode) are
        forwarded to the wrapped game, and its return value is handed back so
        callers that ask for image frames receive them instead of ``None``.
        """
        return self.game.render(*args, **kwargs)

    def close(self):
        """Shut down the wrapped game instance."""
        self.game.close()

In [27]:
# # Setup a game loop to see what the game looks like (testing)
# obs = env.reset()
# done = False
# # we are choosing to only play one game
# for game in range(1):
#     while not done:
#         if done:
#             obs = env.reset()
#         env.render()
#         action = env.action_space.sample()
#         obs, reward, done, info = env.step(action)
#         if reward > 0:
#             print(reward)

## Tune hyperparameters with Optuna

In [28]:
import optuna 
from stable_baselines3 import PPO
# useful for evaluating the current policy during our hyperparameter tuning
from stable_baselines3.common.evaluation import evaluate_policy
# import Monitor for logging
from stable_baselines3.common.monitor import Monitor
# import DummyVecEnv for vectorizing our environment and frame stacking
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

In [29]:
# Folder passed to Monitor and used as the TensorBoard log directory
LOG_DIR = "./logs/"
# Folder where the model of each Optuna trial is saved
OPT_DIR = "./opt/"

In [30]:
# Function to return test hyperparameters
def optimize_ppo(trial):
    """Sample one set of PPO hyperparameters from an Optuna trial.

    Args:
        trial: The Optuna trial to draw suggestions from.

    Returns:
        A dict with the keys ``n_steps``, ``gamma``, ``learning_rate``,
        ``clip_range`` and ``gae_lambda``, ready to be passed to ``PPO`` as
        keyword arguments.
    """
    return {
        # Only multiples of 64 are sampled, which matches the rounding the
        # training cell applies to n_steps by hand (presumably PPO's mini-batch
        # size -- confirm).
        "n_steps": trial.suggest_int("n_steps", 2048, 8192, step=64),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        "gamma": trial.suggest_float("gamma", 0.8, 0.9999, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-4, log=True),
        # suggest_float replaces the deprecated suggest_uniform
        "clip_range": trial.suggest_float("clip_range", 0.1, 0.4),
        "gae_lambda": trial.suggest_float("gae_lambda", 0.8, 0.99),
    }

In [31]:
# close the raw environment from the setup cell before StreetFighter environments are created
env.close()

In [32]:
# Setup the training loop and return the mean reward
# number of environment steps each trial is trained for
total_steps = 100000
def train_ppo(trial):
    """Optuna objective: train PPO with sampled hyperparameters and score it.

    The trained model of every trial is saved to
    ``OPT_DIR/trial_<number>_best_model``.

    Args:
        trial: The Optuna trial that supplies the hyperparameters and the
            trial number used in the save path.

    Returns:
        The mean reward over 5 evaluation episodes, or ``-1000`` when anything
        in the trial raises (the error is printed and the study keeps going).
    """
    # Tracks the outermost environment created so far so it can always be closed.
    env = None
    try:
        # setup the hyperparameters
        hyperparams = optimize_ppo(trial)
        # setup the environment
        env = StreetFighter()
        # setup the monitor (this is important since we are vectorizing the environment, because this allows us
        # to get the mean episode reward and mean episode length)
        monitored_env = Monitor(env, LOG_DIR)
        env = monitored_env
        # setup the vectorized environment; the lambda captures a name that is
        # never rebound, so it does not depend on when DummyVecEnv calls it
        env = DummyVecEnv([lambda: monitored_env])
        # setup the frame stacking
        env = VecFrameStack(env, n_stack=4, channels_order='last')
        # setup the model
        model = PPO("CnnPolicy", env, verbose=0, tensorboard_log=LOG_DIR, **hyperparams)
        # train the model
        model.learn(total_timesteps=total_steps)
        # evaluate the model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)

        # save this trial's model (make sure the target folder exists first)
        os.makedirs(OPT_DIR, exist_ok=True)
        SAVE_PATH = os.path.join(OPT_DIR, "trial_{}_best_model".format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        # best effort: a failing trial is reported and scored with a sentinel value
        print(e)
        return -1000

    finally:
        # close the environment on success and on failure so a broken trial
        # does not leave a game instance running
        if env is not None:
            try:
                env.close()
            except Exception as close_error:
                print(close_error)

In [33]:
# NOTE that since we used a positive reward function, we are maximizing the reward
# study = optuna.create_study(direction="maximize")
# study.optimize(train_ppo, n_trials=100, n_jobs=1)

In [34]:
# best_model = PPO.load(os.path.join(OPT_DIR, "trial_{}_best_model".format(study.best_trial.number)))

# Setup Callback

In [35]:
from stable_baselines3.common.callbacks import BaseCallback

In [36]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model at a fixed interval during training.

    Every ``check_freq`` callback calls the current model is saved to
    ``save_path`` under the name ``best_model_<n_calls>``. The name is
    historical: every checkpoint is saved, not only the best one.

    Args:
        check_freq: Number of callback calls between two saves.
        save_path: Folder that receives the checkpoints, or ``None`` to
            disable saving.
        verbose: Verbosity level passed on to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint folder if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a checkpoint when the call counter hits a multiple of ``check_freq``.

        Always returns ``True``, so this callback never stops training itself.
        """
        # _init_callback accepts save_path=None, so it has to be tolerated here
        # too; without the check os.path.join would raise on None.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [37]:
# Folder that receives the checkpoints written by the callback
CHECKPOINT_DIR = "./train/"
# save a checkpoint every 10000 callback calls
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

# Train Model

In [38]:
# Release the environment currently bound to `env` before building the training one.
env.close()
# Recreate the environment, built inside-out:
#   StreetFighter -> Monitor (needed to get the mean episode reward and mean episode
#   length once the environment is vectorized) -> DummyVecEnv (vectorization)
#   -> VecFrameStack (stack of the last 4 frames, channels last)
env = VecFrameStack(
    DummyVecEnv([lambda: Monitor(StreetFighter(), LOG_DIR)]),
    n_stack=4,
    channels_order='last',
)

In [39]:
# Code that we originally used to train the model.
# The parameters come from the hyperparameter optimization trials; the study
# reported n_steps = 2570.949, which is rounded to the nearest multiple of 64 here.
model_params = dict(
    n_steps=40 * 64,
    gamma=0.906,
    learning_rate=2e-07,
    clip_range=0.369,
    gae_lambda=0.891,
)

model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)
# model.learn(total_timesteps=5000000, callback=callback)
# training is disabled above, so the environment is released right away
env.close()

Using cpu device
Wrapping the env in a VecTransposeImage.


In [40]:
# show the hyperparameters that were passed to PPO
model_params

{'n_steps': 2560,
 'gamma': 0.906,
 'learning_rate': 2e-07,
 'clip_range': 0.369,
 'gae_lambda': 0.891}

In [41]:
# # recreate the zip file for the best model so far
# import shutil
# shutil.make_archive("best_model", 'zip', "/kaggle/input/street-fighter-rom/best_model_5460000")
# # load the model 
# model = PPO.load("/kaggle/working/best_model")

In [42]:
# Load the model from the provided training (to save compute resources)
# custom_objects supplies learning_rate and clip_range directly at load time
# (presumably so the stored versions of these objects do not have to be
# deserialized -- TODO confirm); clip_range is given as a constant function.
custom_objects = {
        "learning_rate": 2e-07,
        "clip_range": lambda _: 0.369,
    }
# file name (without extension) of the checkpoint inside ./train/
model_version = "best_model_nonoptuna_4"
model = PPO.load(f'./train/{model_version}.zip', custom_objects=custom_objects)



In [43]:
# Environment for watching the loaded model play, built inside-out:
# StreetFighter -> Monitor -> DummyVecEnv -> VecFrameStack (4 frames, channels last)
env = VecFrameStack(
    DummyVecEnv([lambda: Monitor(StreetFighter(), LOG_DIR)]),
    4,
    channels_order='last',
)

In [44]:
# Play one episode with the loaded model and report the accumulated reward
for episode in range(1): 
    obs = env.reset()
    done = False
    total_reward = 0
    while not done: 
        # predict() returns a pair; only the action is used here
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        # time.sleep(0.01)
        # reward comes from the vectorized env, so the total ends up as a
        # one-element array (see the printed output below)
        total_reward += reward
    print('Total Reward for episode {} is {}'.format(episode, total_reward))
    # short pause once the episode is over
    time.sleep(2)



Total Reward for episode 0 is [164400.]


# Test the model

In [45]:
# import time

# # code to render the agent's progress and log the rewards
# for episode in range(1): 
#     obs = env.reset()
#     done = False
#     total_reward = 0
#     while not done: 
#         action, _ = model.predict(obs)
#         obs, reward, done, info = env.step(action)
#         env.render()
#         time.sleep(0.01)
#         total_reward += reward
#     print('Total Reward for episode {} is {}'.format(total_reward, episode))
#     time.sleep(2)

## Record a video of the current progress

In [46]:
# from stable_baselines3.common.vec_env import VecVideoRecorder
# import imageio
# from IPython.display import HTML
# from IPython import display as ipythondisplay
# import glob
# import io
# import base64
# from gym.wrappers import Monitor

# video_folder = "./logs/"
# video_length = 350

# def create_mp4(model, env):
#     model.set_env(env)
#     # Record the video starting at the first step
#     env = VecVideoRecorder(env, video_folder,
#                         record_video_trigger=lambda x: x == 0, video_length=video_length,
#                         name_prefix="ppo-sf2{}".format(gamename))

#     # update the model's env
#     model.set_env(env)
#     obs = model.env.reset()
#     for _ in range(video_length + 1):
#         action = model.predict(obs)
#         obs, _, _, _ = model.env.step(action)
#     # Save the video
#     model.env.close()


# def create_gif(model, env):
#     model.set_env(env)
#     images = []
#     obs = model.env.reset()
#     img = model.env.render()
#     # TODO the current issue is that our render method returns None (which makes sense since the return value is optional)
#     print(img)
#     for i in range(350):
#         images.append(img)
#         action, _ = model.predict(obs)
#         obs, _, _ ,_ = model.env.step(action)
#         img = model.env.render()
#         print(img)

#     imageio.mimsave(f'ppo_sf2_{model_version}.gif', [np.array(img) for i, img in enumerate(images) if i%2 == 0], fps=60)

In [47]:
# create_mp4(model, env)