# Setup StreetFighter


In [1]:
# Import retro to play Street Fighter using the rom.
import retro

# Import time to slow down game
import time

In [2]:
# See the different retro games
for retro_game in retro.data.list_games():
    if "StreetFighter" in retro_game:
        print(retro_game)

# To "install" the game into the gym-retro emulator, we need to use:
# ! python -m retro.import <path-to-rom-folder>

StreetFighterIISpecialChampionEdition-Genesis


In [3]:
# Can't have multiple retro environments open at once.
try:
    env.close()
except:
    pass

# retro.make allows us to create an environment with the retro frameworke
env = retro.make(game="StreetFighterIISpecialChampionEdition-Genesis")


In [4]:
# The game frame
env.observation_space.sample()

array([[[  7, 157, 185],
        [179, 248,  11],
        [164, 254, 157],
        ...,
        [ 49, 229, 181],
        [127, 244, 168],
        [255,  82,  76]],

       [[248, 227, 133],
        [163,  63, 169],
        [ 15, 206, 121],
        ...,
        [212, 181,  42],
        [117,  96, 219],
        [219,   5,  67]],

       [[254, 105, 206],
        [ 85,  69, 157],
        [229, 192, 175],
        ...,
        [ 98, 182, 220],
        [164,  83, 173],
        [198,  19, 180]],

       ...,

       [[120, 148, 238],
        [137,  94, 238],
        [223, 179,  55],
        ...,
        [ 92,   8, 241],
        [ 74,  57,  80],
        [ 20,  49,  38]],

       [[ 80, 233, 183],
        [ 87, 144,  59],
        [183,  53, 191],
        ...,
        [ 77,  20, 171],
        [229,  84,  81],
        [ 38,  78, 115]],

       [[ 62,  38,  30],
        [254, 248,  28],
        [150, 226, 245],
        ...,
        [186, 221, 146],
        [ 25, 198, 134],
        [ 32,  62,  81]]

In [5]:
# The action space is a 12-dimensional boolean vector representing button presses.
env.action_space.sample()

array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int8)

In [6]:
# Reset game to starting state.
obs = env.reset()

# Did we die/game over?
done = False

for game in range(1):
    while not done:
        if done:
            obs = env.reset()
        env.render()
        obs, reward, done, info = env.step(env.action_space.sample())
        # time.sleep(0.0003)
        if reward > 0:
            print(reward)

500.0
1000.0
500.0
1000.0
100.0
100.0
500.0
300.0
100.0
100.0
100.0
100.0
100.0
100.0
1000.0
1000.0
1000.0
100.0
1000.0
300.0
500.0
1000.0
500.0
500.0
500.0
1500.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
1000.0
1000.0
10000.0
300.0
500.0
100.0
500.0
500.0
1000.0
500.0
100.0
1500.0
100.0
100.0
100.0
100.0
100.0
1000.0
1000.0
1000.0
10000.0
300.0
500.0
500.0
500.0
1000.0
1500.0
1000.0
100.0
100.0
100.0
100.0
100.0
100.0
1000.0
1000.0
1000.0
1000.0
1000.0
1000.0
10000.0
100.0
500.0
400.0
500.0
100.0
100.0
100.0
100.0
3000.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
1000.0
1000.0
1000.0
1000.0
1000.0
1000.0
1000.0
300.0
500.0
500.0
100.0
100.0
300.0
300.0
400.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
1000.0
1000.0
1000.0
1000.0
1000.0
1000.0
1000.0
500.0
1000.0
300.0
500.0
100.0
100.0
1000.0
100.0
1000.0
500.0
100.0
500.0
300.0
1000.0
500

In [7]:
info

{'enemy_matches_won': 2,
 'score': 111400,
 'matches_won': 1,
 'continuetimer': 10,
 'enemy_health': 0,
 'health': 0}

# Setup Environment

### What we are going to do
1. Preprocess
   - grayscale
   - frame delta
   - resize the frame so we have less pixels
1. Filter the action parameter
1. Reward Function - set this to the score

In [4]:
# Our base environment class
from gym import Env
# The Action-Space type; the space shapes for our environment
from gym.spaces import MultiBinary, Box
# To calculate the delta
import numpy as np
# For grayscaling our frame
import cv2
# for plotting frame
from matplotlib import pyplot as plt

In [5]:
# Create custom environment
class StreetFighter(Env):
    def __init__(self):
        super().__init__()
        # Specify Action and Observation Spaces
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        # Start up an instance of the game, only uses actions that are valid for this game.
        self.game = retro.make(
            game="StreetFighterIISpecialChampionEdition-Genesis", 
            use_restricted_actions=retro.Actions.FILTERED,
        )
        
    def reset(self):
        # Reset to first frame
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs
        # Create an attribute to hold score delta
        self.score = 0
        return obs
    
    def preprocess(self, observation):
        # Grayscale the observation
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        # Resize the grayscaled observation using INTER_CUBE
        # (bicubic interpolation) which acts on 4x4 neighboring pixels and uses
        # an average of the 16 pixels to create a new interpolated pixel.
        resize = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        # Add the channels value
        channels = np.reshape(resize, (84, 84, 1))
        return channels
    
    def step(self, action):
        # Take a step
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)
        
        # Frame delta
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs
        
        # Reshape the reward function
        reward = info['score'] - self.score
        self.score = info['score']
        
        return frame_delta, reward, done, info
        
    
    def render(self, *args, **kwargs):
        self.game.render()
    
    def close(self):
        self.game.close()

In [6]:
# Can't have multiple retro environments open at once.
try:
    env.close()
except:
    pass
env = StreetFighter()

In [7]:
env.observation_space.shape

(84, 84, 1)

In [8]:
env.action_space.shape

(12,)

In [9]:
# Reset game to starting state.
obs = env.reset()

# Did we die/game over?
done = False

for game in range(1):
    while not done:
        if done:
            obs = env.reset()
        env.render()
        obs, reward, done, info = env.step(env.action_space.sample())
        time.sleep(0.0003)
        if reward > 0:
            print(reward)

500
1000
100
400
100
500
500
500
1000
300
1000
1000


# Hyperparameter Tuning

In [14]:
# The optimization framework - HPO
import optuna
# PPO algorithm for RL
from stable_baselines3 import PPO
# Allows us to test out and evaluate KPIs -- for metric calc
from stable_baselines3.common.evaluation import evaluate_policy
# sb3 monitor for logging
from stable_baselines3.common.monitor import Monitor
# vec wrappers to vectorize and framestack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

import os

In [15]:
LOG_DIR = "./logs/"
OPT_DIR = "./opt/"

In [16]:
# Function to return test hyperparams -- define the objective function
def optimize_ppo(trial):
    return {
        # Should be a factor of 64 or the batch size for the PPO algo
        "n_steps": trial.suggest_int('n_steps', 2048, 8096),
        # "n_steps": trial.suggest_int('n_steps', 2048, 8192),
        "gamma": trial.suggest_loguniform('gamma', 0.8, 0.9999),
        "learning_rate": trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        "clip_range": trial.suggest_uniform('clip_range', 0.1, 0.4),
        "gae_lambda": trial.suggest_uniform('gae_lambda', 0.8, 0.99),
    }

In [17]:
# Run a training Loop and return mean reward
def optimize_agent(trial):
    try:
        model_params = optimize_ppo(trial)
        
        # Wrapping our environment in our SB3 Monitor, VecENv, and FrameStack
        # The Monitor class is important if you're vectoring your environments
        # as this allows you to log the mean ep reward and mean ep length
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')
        
        # Unpacking the model_params we get from optuna and passing it to our PPO model.
        model = PPO("CnnPolicy", env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        model.learn(total_timesteps=30000)
        
        # Evaluate model 
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()
        
        SAVE_PATH = os.path.join(OPT_DIR, "trial_{}_best_model".format(trial.number))
        model.save(SAVE_PATH)
        
        return mean_reward
    
    except Exception as e:
        print(e)
        return -1000

In [18]:
# Creating the experiment
env.close()
study = optuna.create_study(direction="maximize")
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

[32m[I 2022-06-20 17:18:47,923][0m A new study created in memory with name: no-name-20ee88d4-09cf-42a1-8c10-9f8d74ed5e67[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=5236 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-06-20 17:26:56,323][0m Trial 0 finished with value: 1300.0 and parameters: {'n_steps': 5236, 'gamma': 0.958309011091822, 'learning_rate': 3.753492113619118e-05, 'clip_range': 0.286070488968583, 'gae_lambda': 0.8870575833616735}. Best is trial 0 with value: 1300.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=7201 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2022-06-20 17:36:39,093][0m Trial 1 finished with value: 1000.0 and parameters: {'n_steps': 7201, 'gamma': 0.8991154937370388, 'learning_rate': 1.4991000236362131e-05, 'clip_range': 0.1732461570501994, 'gae_lambda': 0.8654904394555014}. Best is tr

In [19]:
study.best_params

{'n_steps': 6122,
 'gamma': 0.9877198464226749,
 'learning_rate': 5.372243577492638e-05,
 'clip_range': 0.24512514199723107,
 'gae_lambda': 0.8443383875152853}

In [31]:
study.best_trial

FrozenTrial(number=2, values=[11200.0], datetime_start=datetime.datetime(2022, 6, 20, 17, 36, 39, 94119), datetime_complete=datetime.datetime(2022, 6, 20, 17, 45, 53, 310438), params={'n_steps': 6122, 'gamma': 0.9877198464226749, 'learning_rate': 5.372243577492638e-05, 'clip_range': 0.24512514199723107, 'gae_lambda': 0.8443383875152853}, distributions={'n_steps': IntUniformDistribution(high=8192, low=2048, step=1), 'gamma': LogUniformDistribution(high=0.9999, low=0.8), 'learning_rate': LogUniformDistribution(high=0.0001, low=1e-05), 'clip_range': UniformDistribution(high=0.4, low=0.1), 'gae_lambda': UniformDistribution(high=0.99, low=0.8)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=2, state=TrialState.COMPLETE, value=None)

In [30]:
model = PPO.load(os.path.join(OPT_DIR, "trial_{}_best_model.zip".format(study.best_trial.number)))

# Setup Callback

In [32]:
# Import base callback 
from stable_baselines3.common.callbacks import BaseCallback

In [33]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [34]:
CHECKPOINT_DIR = './train/'

In [36]:
# check_freq determines how many steps before the model is saved
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

# Train Model

In [54]:
try:
    env.close()
except:
    pass
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [56]:
model_params = study.best_params
# A consequence of not using num steps divisible by batch size (64)
model_params["n_steps"] = 7488
# You could slow down the learning rate here to improve learning

In [62]:
model = PPO("CnnPolicy", env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [63]:
model.load(os.path.join(OPT_DIR, "trial_{}_best_model.zip".format(study.best_trial.number)))

<stable_baselines3.ppo.ppo.PPO at 0x7f4f80590790>

In [64]:
model.learn(total_timesteps=30000, callback=callback)

Logging to ./logs/PPO_15
-----------------------------
| time/              |      |
|    fps             | 384  |
|    iterations      | 1    |
|    time_elapsed    | 19   |
|    total_timesteps | 7488 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 146         |
|    iterations           | 2           |
|    time_elapsed         | 102         |
|    total_timesteps      | 14976       |
| train/                  |             |
|    approx_kl            | 0.029442988 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.245       |
|    entropy_loss         | -8.3        |
|    explained_variance   | 2.75e-05    |
|    learning_rate        | 5.37e-05    |
|    loss                 | 168         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.000411   |
|    value_loss           | 9.43e+04    |
-----------------------------------------
---------

<stable_baselines3.ppo.ppo.PPO at 0x7f4e5c97c390>

# Evaluate Model

In [91]:
# model = PPO.load('./train/best_model_5460000.zip')
model = PPO.load('./opt/trial_2_best_model.zip')
# model = PPO.load('./opt/trial_24_best_model.zip')

In [None]:
mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=5)

# Test out the Model

In [92]:
obs = env.reset()

In [93]:
obs.shape

(1, 84, 84, 4)

In [94]:
# Reset game to starting state.
obs = env.reset()

# Did we die/game over?
done = False

for game in range(1):
    while not done:
        if done:
            obs = env.reset()
        env.render()
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        time.sleep(0.01)
        if reward > 0:
            print(reward)

[500.]
[1000.]
[300.]


KeyboardInterrupt: 