## Set up the StreetFighter environment

In [None]:
%pip install --upgrade pip
%pip install gym==0.21.0
%pip install gym-retro
%pip install retrowrapper
%pip install opencv-python
%pip install matplotlib
%pip install torch 
%pip install stable-baselines3[extra]
%pip install stable-baselines3
%pip install optuna
%pip install tensorboard

In [None]:
# # Get the dependencies for the virtual display
# !apt-get install python-opengl -y
# !apt install xvfb -y
# !pip install pyvirtualdisplay
# !pip install https://github.com/pyglet/pyglet/archive/pyglet-1.5-maintenance.zip
# !apt-get install ffmpeg -y

In [None]:
# from pyvirtualdisplay import Display
# import gym
# from gym import wrappers
# from gym import envs
# import matplotlib.pyplot as plt

# # Set up the virtual display
# display = Display(visible=0,size=(600,600))
# display.start()

In [None]:
# import retro for retro games (Street Fighter)
import retro
import retrowrapper
# import time to slow down the game
import time
import os 

# After downloading the Street Fighter ROM, run the import command below from inside the roms folder so that gym-retro can find it (python -m retro.import .)
!python -m retro.import .
# Game identifier passed to the emulator wrapper below
gamename = "StreetFighterIISpecialChampionEdition-Genesis"
# env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis')
# Start the game through retrowrapper with the FILTERED restricted-action setting;
# this raw environment is used below to inspect the observation and action spaces
env = retrowrapper.RetroWrapper(gamename, use_restricted_actions=retro.Actions.FILTERED)

In [None]:
# # wrap the environment in monitor for rendering the training
# monitor_dir = os.getcwd()
# env = wrappers.Monitor(env,monitor_dir,video_callable=lambda ep_id: ep_id%1000 == 0,force=True)

### Figure out the observation and action space of the environment

In [None]:
# Display the observation space reported by the raw emulator environment
env.observation_space

This most likely means that each observation is an RGB image with a height of 200 pixels, a width of 256 pixels, and 3 color channels.

In [None]:
# A cell only displays its final expression, so return the space and a random
# sample together instead of silently discarding the first one.
env.action_space, env.action_space.sample()

This means that each action is a multi-binary vector of length 12: one on/off flag per button, and several buttons can be pressed at the same time (so it is not one-hot). This gives up to 2^12 = 4096 possible button combinations!

# Preprocess the environment

### Agenda:
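- Shrink the images so we have fewer pixels to process (grayscale, 84x84)
- Calculate the frame delta (to understand movement and change within the game); this is currently disabled in `step`, which returns the preprocessed frame as-is
- Filter the action space (the emulator is started with `retro.Actions.FILTERED`)
- Set the reward function (originally the score of the game; the environment below shapes it from the health values instead)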

In [None]:
# import the environment base class
from gym import Env

# import opencv to process the image
import cv2
# import numpy to calculate the frame delta
import numpy as np
# import the space shapes for our environment
from gym.spaces import MultiBinary, Box
# import matplotlib to plot the image
from matplotlib import pyplot as plt

In [None]:
# Create custom environment
class StreetFighter(Env):
    """Gym environment for Street Fighter II with grayscale frames and a health-based reward.

    Observations are 84x84x1 ``uint8`` grayscale frames and actions are a
    ``MultiBinary(12)`` vector that is forwarded unchanged to the emulator.
    The reward returned by the emulator is discarded; ``step`` returns damage
    dealt minus damage taken, plus a bonus when a round is won.

    Args:
        game: Name of the game handed to ``retrowrapper.RetroWrapper``.
        max_health: Value both health counters are set to at the start of an
            episode and after a detected round reset.
        round_win_bonus: Reward granted on the step where a round reset is
            detected and the agent was ahead on health.
    """

    def __init__(self, game='StreetFighterIISpecialChampionEdition-Genesis',
                 max_health=176, round_win_bonus=150):
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        self.max_health = max_health
        self.round_win_bonus = round_win_bonus
        # startup an instance of the game
        self.game = retrowrapper.RetroWrapper(game, use_restricted_actions=retro.Actions.FILTERED)
        # health values seen on the previous step, used to compute per-step damage
        self.health = max_health
        self.enemy_health = max_health

    def step(self, action):
        """Advance the game by one step.

        Args:
            action: Button vector forwarded to the emulator.

        Returns:
            Tuple ``(obs, reward, done, info)`` where ``obs`` is the preprocessed
            frame (the frame delta is currently disabled), ``reward`` is the
            shaped health-based reward, and ``done``/``info`` come straight
            from the emulator.
        """
        frame, _, done, info = self.game.step(action)
        obs = self.preprocess(frame)
        reward = self._shape_reward(info)
        return obs, reward, done, info

    def _shape_reward(self, info):
        """Turn the health values in ``info`` into a reward and update the tracked health."""
        damage_dealt = self.enemy_health - info['enemy_health']
        damage_taken = self.health - info['health']

        # A health value going *up* means the round was reset, so the raw
        # differences are meaningless for this step.
        if damage_dealt < 0 or damage_taken < 0:
            # Both health bars refill on a reset regardless of who won, so the
            # bonus is only granted when the agent was ahead just before it.
            # NOTE(review): assumes the fighter with less health at the reset
            # lost the round — confirm against the game's info values.
            round_won = self.enemy_health < self.health
            self.health = self.max_health
            self.enemy_health = self.max_health
            return float(self.round_win_bonus) if round_won else 0.0

        # Normal step: remember the new health values for the next comparison
        self.health = info['health']
        self.enemy_health = info['enemy_health']
        # damage taken counts as a negative reward
        return damage_dealt - 1.0 * damage_taken

    def render(self, *args, **kwargs):
        """Render the underlying game; any arguments are accepted but ignored."""
        self.game.render()

    def reset(self):
        """Reset the game and the tracked health values, returning the first preprocessed frame."""
        obs = self.preprocess(self.game.reset())
        # kept for the (currently disabled) frame-delta computation
        self.previous_frame = obs

        self.health = self.max_health
        self.enemy_health = self.max_health

        return obs

    def preprocess(self, observation):
        """Convert a raw frame to an 84x84x1 grayscale array."""
        # NOTE(review): gym-retro frames are presumably RGB, in which case
        # COLOR_BGR2GRAY swaps the red/blue weights. Left unchanged so that
        # previously trained checkpoints see the same inputs — confirm before
        # switching to COLOR_RGB2GRAY.
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        # add the trailing channel axis expected by the observation space
        return np.reshape(resized, (84, 84, 1))

    def close(self):
        """Shut down the underlying game instance."""
        self.game.close()

In [None]:
# # Setup a game loop to see what the game looks like (testing)
# obs = env.reset()
# done = False
# # we are choosing to only play one game
# for game in range(1):
#     while not done:
#         if done:
#             obs = env.reset()
#         env.render()
#         action = env.action_space.sample()
#         obs, reward, done, info = env.step(action)
#         if reward > 0:
#             print(reward)

## Tune hyperparameters with Optuna

In [None]:
import optuna 
from stable_baselines3 import PPO
# useful for evaluating the current policy during our hyperparameter tuning
from stable_baselines3.common.evaluation import evaluate_policy
# import Monitor for logging
from stable_baselines3.common.monitor import Monitor
# import DummyVecEnv for vectorizing our environment and frame stacking
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

In [None]:
# Directory handed to Monitor and to PPO's tensorboard_log
LOG_DIR = "./logs/"
# Directory where the model of each Optuna trial is saved
OPT_DIR = "./opt/"

In [None]:
# Function to return test hyperparameters
def optimize_ppo(trial):
    """Sample one set of PPO hyperparameters from an Optuna trial.

    Args:
        trial: Optuna trial object used to draw the values.

    Returns:
        dict with the keys ``n_steps``, ``gamma``, ``learning_rate``,
        ``clip_range`` and ``gae_lambda``, ready to be unpacked into ``PPO``.
    """
    return {
        # Sample directly on the multiple-of-64 grid: the training cell rounds
        # n_steps to a multiple of 64 anyway, so this keeps the tuned value and
        # the value actually trained with identical.
        "n_steps": trial.suggest_int("n_steps", 2048, 8192, step=64),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform
        "gamma": trial.suggest_float("gamma", 0.8, 0.9999, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-4, log=True),
        # suggest_float replaces the deprecated suggest_uniform
        "clip_range": trial.suggest_float("clip_range", 0.1, 0.4),
        "gae_lambda": trial.suggest_float("gae_lambda", 0.8, 0.99),
    }

In [None]:
# Close the raw inspection environment before the tuning/training cells create StreetFighter instances
env.close()

In [None]:
# Setup the training loop and return the mean reward
# Number of timesteps each Optuna trial trains for
total_steps = 100000


def train_ppo(trial):
    """Optuna objective: train PPO with sampled hyperparameters and return its mean reward.

    Args:
        trial: Optuna trial used to sample hyperparameters and to name the saved model.

    Returns:
        Mean episode reward over 5 evaluation episodes, or -1000 if anything in
        the trial raised (the error is printed so the study can keep going).
    """
    env = None
    try:
        # setup the hyperparameters
        hyperparams = optimize_ppo(trial)
        # Monitor is given LOG_DIR as its log location, so make sure it exists
        os.makedirs(LOG_DIR, exist_ok=True)
        # `env` always refers to the outermost wrapper built so far, so the
        # `finally` clause can release the emulator no matter where we fail
        env = StreetFighter()
        # the monitor lets us read the mean episode reward/length once the
        # environment is vectorized
        env = Monitor(env, LOG_DIR)
        # bind the monitored env to its own name so the factory below does not
        # depend on `env` being rebound afterwards
        monitored_env = env
        env = DummyVecEnv([lambda: monitored_env])
        # setup the frame stacking
        env = VecFrameStack(env, n_stack=4, channels_order='last')
        # setup and train the model
        model = PPO("CnnPolicy", env, verbose=0, tensorboard_log=LOG_DIR, **hyperparams)
        model.learn(total_timesteps=total_steps)
        # evaluate the model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)

        # save this trial's model so the best one can be reloaded later
        save_path = os.path.join(OPT_DIR, "trial_{}_best_model".format(trial.number))
        model.save(save_path)

        return mean_reward

    except Exception as e:
        # Deliberately best-effort: a failed trial is reported and scored very
        # low instead of aborting the whole study.
        print("Trial {} failed: {!r}".format(trial.number, e))
        return -1000

    finally:
        # close the environment on success and on failure alike
        if env is not None:
            env.close()

In [None]:
# NOTE that since we used a positive reward function, we are maximizing the reward
# study = optuna.create_study(direction="maximize")
# study.optimize(train_ppo, n_trials=5, n_jobs=1)

In [None]:
# best_model = PPO.load(os.path.join(OPT_DIR, "trial_{}_best_model".format(study.best_trial.number)))

# Setup Callback

In [None]:
from stable_baselines3.common.callbacks import BaseCallback

In [None]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model every ``check_freq`` calls.

    Args:
        check_freq: Number of callback calls between two saves.
        save_path: Directory the snapshots are written to; ``None`` disables saving.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the snapshot directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save a snapshot named after the current call count when it is due."""
        # Mirror the None check in _init_callback: without a save path there is
        # nowhere to write, so skip saving instead of failing in os.path.join.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [None]:
# Directory where the callback writes its periodic model snapshots
CHECKPOINT_DIR = "./train/"
# Save a snapshot every 250000 callback calls
callback = TrainAndLoggingCallback(check_freq=250000, save_path=CHECKPOINT_DIR)

# Train Model

In [None]:
# Release the previous environment before starting a new game instance
env.close()
# Monitor is given LOG_DIR as its log location, so make sure the directory exists first
os.makedirs(LOG_DIR, exist_ok=True)
# Recreate the environment and wrap it in a monitor (this is important since we are
# vectorizing the environment, because it allows us to get the mean episode reward
# and mean episode length)
monitored_env = Monitor(StreetFighter(), LOG_DIR)
# setup the vectorized environment; the factory returns the explicitly named
# monitored env rather than relying on `env`, which is rebound on this very line
env = DummyVecEnv([lambda: monitored_env])
# setup the frame stacking
env = VecFrameStack(env, n_stack=4, channels_order='last')

In [None]:
# Automatically round the tuned n_steps to the nearest multiple of 64
# factored_steps = round(study.best_trial.params["n_steps"] / 64) * 64

In [None]:
# update the current model params for the factoring
# model_params = study.best_trial.params
# model_params['n_steps'] = factored_steps
import shutil

# Hyperparameters used when a fresh model has to be created
model_params = {'n_steps': 2560, 'gamma': 0.906, 'learning_rate': 2e-07, 'clip_range': 0.369, 'gae_lambda': 0.891}

# Continue from the previous training run when its checkpoint is available
model_name = "best_model_nodelta_1"
checkpoint_dir = f"/kaggle/input/street-fighter/{model_name}"
if os.path.isdir(checkpoint_dir):
    # Recreate the zip file for the best model so far. make_archive returns the
    # path of the file it wrote, so load exactly that file instead of assuming
    # the working directory is /kaggle/working.
    archive_path = shutil.make_archive("best_model", 'zip', checkpoint_dir)
    # NOTE(review): the loaded model presumably keeps the hyperparameters stored
    # in the checkpoint, so model_params is not applied on this path — confirm
    # that this is intended.
    model = PPO.load(archive_path, env)
else:
    # No checkpoint to resume from (e.g. running outside Kaggle): start fresh
    # rather than building a model only to overwrite it.
    print("Checkpoint {} not found, training a new model".format(checkpoint_dir))
    model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

model.learn(total_timesteps=10000000, callback=callback)
env.close()

In [None]:
# NOTE(review): the training cell above already closes env after model.learn;
# this second close looks redundant — confirm it is safe to call twice.
env.close()

In [None]:
# Rebuild the wrapped environment in one expression:
# StreetFighter -> Monitor -> DummyVecEnv -> stack of 4 frames (channels last)
env = VecFrameStack(
    DummyVecEnv([lambda: Monitor(StreetFighter(), LOG_DIR)]),
    4,
    channels_order='last',
)

# Test the model

In [None]:
# # display the training
# from IPython.display import HTML
# from base64 import b64encode

# video = [v for v in os.listdir('./') if 'mp4' in v]
# video.sort()
# print(len(video))
# # print(video[:26])
# vid_1 = open(video[0],'rb').read()
# data_url_1 = "data:video/mp4;base64," + b64encode(vid_1).decode()
# HTML("""
# <video width=600 height=600 controls>
#       <source src="%s" type="video/mp4">
# </video>
# """ % data_url_1)

In [None]:
# # Create an HTML video frame for it if the previous video frame didn't work
# vid_2 = open(video[-1],'rb').read()
# data_url_2 = "data:video/mp4;base64," + b64encode(vid_2).decode()
# HTML("""
# <video width=600 height=600 controls>
#       <source src="%s" type="video/mp4">
# </video>
# """ % data_url_2)

In [None]:
# import time

# # code to render the agent's progress and log the rewards
# for episode in range(1): 
#     obs = env.reset()
#     done = False
#     total_reward = 0
#     while not done: 
#         action, _ = model.predict(obs)
#         obs, reward, done, info = env.step(action)
#         env.render()
#         time.sleep(0.01)
#         total_reward += reward
#     print('Total Reward for episode {} is {}'.format(total_reward, episode))
#     time.sleep(2)