# Street Fighter Tutorial
This notebook accompanies the YouTube tutorial by <a href='https://www.youtube.com/c/NicholasRenotte'>Nicholas Renotte</a>.

# Setup StreetFighter

In [1]:
# Import retro to play Street Fighter using a ROM
import retro
# Import time to slow down game
import time

In [2]:
# Start the Street Fighter II game environment through retro
env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis')

In [3]:
# Close the game environment - important because only one can run at a time.
# NOTE(review): the next cell calls env.reset()/env.step() on this same `env`;
# when running top to bottom, confirm the environment is still usable after close().
env.close()

In [None]:
# Play the game with random actions, printing the reward of every step.
for game in range(1):
    # Start every game from a fresh state with the done flag cleared, so that
    # raising the range above 1 really plays several games.
    obs = env.reset()
    done = False
    while not done:
        env.render()
        obs, reward, done, info = env.step(env.action_space.sample())
        # Uncomment to slow the game down while watching
        # time.sleep(0.01)
        print(reward)

In [61]:
# Close the game environment used by the random-play loop
env.close()

In [None]:
# Display the info value returned by the last env.step() call of the loop above
info

# Setup Environment
## What we are going to do! FUNNN
- Observation preprocessing - grayscale (DONE), frame delta, resize the frame so we have fewer pixels (DONE)
- Filter the action - parameter DONE
- Reward function - set this to the score

In [4]:
# Import environment base class for a wrapper 
from gym import Env 
# Import the space shapes for the environment
from gym.spaces import MultiBinary, Box
# Import numpy to calculate frame delta 
import numpy as np
# Import opencv for grayscaling
import cv2
# Import matplotlib for plotting the image
from matplotlib import pyplot as plt
from collections import deque


In [5]:
# Create custom environment 
class StreetFighter(Env):
    """Gym environment wrapping the Street Fighter II retro game.

    Frames are converted to 84x84 single-channel images. ``reset`` returns the
    first preprocessed frame; ``step`` returns the difference between the
    current and the previous preprocessed frame, with the reward replaced by
    the change in ``info['score']`` since the previous step.
    """

    def __init__(self):
        super().__init__()
        # Start an instance of the game using the filtered action set
        self.game = retro.make(
            game='StreetFighterIISpecialChampionEdition-Genesis',
            use_restricted_actions=retro.Actions.FILTERED,
        )
        # Declare the action and observation spaces exposed to the agent
        self.action_space = MultiBinary(12)
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def reset(self):
        """Restart the game and return the first preprocessed frame.

        Also clears the stored score and remembers the frame so that the next
        ``step`` can compute a frame delta and a score delta.
        """
        first_frame = self.preprocess(self.game.reset())
        self.previous_frame = first_frame
        # Score seen at the previous step, used to compute the reward delta
        self.score = 0
        return first_frame

    def preprocess(self, observation):
        """Convert a raw game frame to an (84, 84, 1) grayscale array."""
        # NOTE(review): the conversion code assumes BGR channel order - confirm
        # the channel order of the frames retro returns.
        grayscale = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        shrunk = cv2.resize(grayscale, (84, 84), interpolation=cv2.INTER_CUBIC)
        # Add the trailing channel axis
        return np.reshape(shrunk, (84, 84, 1))

    def step(self, action):
        """Advance the game by one step.

        Returns a ``(frame_delta, reward, done, info)`` tuple where
        ``frame_delta`` is the current preprocessed frame minus the previous
        one and ``reward`` is the score gained since the previous step. The
        reward reported by the game itself is discarded.
        """
        raw_frame, _, done, info = self.game.step(action)
        current_frame = self.preprocess(raw_frame)

        # Swap in the new frame while keeping hold of the old one for the delta.
        # NOTE(review): if the frames are uint8 (as the declared observation
        # space suggests) this subtraction wraps modulo 256 - confirm intended.
        previous_frame, self.previous_frame = self.previous_frame, current_frame
        frame_delta = current_frame - previous_frame

        # Reward is the score gained since the previous step
        current_score = info['score']
        reward = current_score - self.score
        self.score = current_score

        return frame_delta, reward, done, info

    def render(self, *args, **kwargs):
        """Render the underlying game; any arguments are accepted and ignored."""
        self.game.render()

    def close(self):
        """Close the underlying game instance."""
        self.game.close()

# Hyperparameter tune

In [6]:
# Importing the optimzation frame - HPO
import optuna
# PPO algo for RL
from stable_baselines3 import PPO
# Bring in the eval policy method for metric calculation
from stable_baselines3.common.evaluation import evaluate_policy
# Import the sb3 monitor for logging 
from stable_baselines3.common.monitor import Monitor
# Import the vec wrappers to vectorize and frame stack
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
# Import os to deal with filepaths
import os

In [7]:
# Directory passed to Monitor and to PPO's tensorboard_log
LOG_DIR = './logs/'
# Directory where models saved during hyperparameter optimisation are written
OPT_DIR = './opt/'

In [8]:
def optimize_ppo(trial):
    """Sample one set of PPO hyperparameters from the search space.

    Args:
        trial: Optuna trial (or any object exposing ``suggest_int`` and
            ``suggest_float``) used to draw the values.

    Returns:
        dict with the keys ``n_steps``, ``gamma``, ``learning_rate``,
        ``clip_range`` and ``gae_lambda``, ready to be unpacked into ``PPO``.
    """
    return {
        # Sample n_steps in steps of 64 so the rollout size stays a multiple of 64
        'n_steps': trial.suggest_int('n_steps', 2048, 8192, step=64),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True),
        # suggest_float replaces the deprecated suggest_uniform
        'clip_range': trial.suggest_float('clip_range', 0.15, 0.2),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.99),
    }
# Train and evaluate
def optimize_agent(trial):
    """Optuna objective: train a PPO agent with sampled hyperparameters.

    Builds a fresh StreetFighter environment, trains for 75000 timesteps,
    evaluates over 10 episodes and saves the model into OPT_DIR when it beats
    the best value recorded by the study so far.

    Args:
        trial: the Optuna trial driving this run.

    Returns:
        The mean evaluation reward, or -1000 if anything in the run raised
        (the error is printed so the study can carry on with the next trial).
    """
    try:
        model_params = optimize_ppo(trial)

        # Create environment
        env = StreetFighter()
        try:
            env = Monitor(env, LOG_DIR)
            # Bind the monitored env to its own name so the factory below does
            # not depend on later rebinding of `env`.
            monitored_env = env
            env = DummyVecEnv([lambda: monitored_env])
            env = VecFrameStack(env, 4, channels_order='last')

            # Create algo (use a larger total_timesteps for a longer search)
            model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
            model.learn(total_timesteps=75000)

            # Evaluate model
            mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)
        finally:
            # Always release the emulator, even when training or evaluation
            # fails: only one game instance can run at a time, so a leaked one
            # would make every following trial fail as well.
            env.close()

        # The study has no best value until a first trial has completed;
        # reading it then raises ValueError, which must not fail the trial.
        try:
            best_so_far = trial.study.best_value
        except ValueError:
            best_so_far = None

        # Save the best model
        if best_so_far is None or mean_reward > best_so_far:
            os.makedirs(OPT_DIR, exist_ok=True)
            SAVE_PATH = os.path.join(OPT_DIR, f'trial_{trial.number}_best_model')
            model.save(SAVE_PATH)

        return mean_reward

    except Exception as e:
        # Deliberate best effort: report the failure and score the trial badly
        print(f"Trial {trial.number} failed due to: {e}")
        return -1000

In [None]:
# Create the Optuna study and run the hyperparameter search.
# NOTE(review): optimize_agent never reports intermediate values to the trial,
# so this MedianPruner presumably never prunes anything - confirm.
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(optimize_agent, n_trials=10, n_jobs=1)  # Adjust n_jobs to your resources
#study.optimize(optimize_agent, n_trials=100, n_jobs=1)

In [None]:
# Hyperparameters recorded from an earlier run, kept for reference:
#print({'n_steps': 1024, 'gamma': 0.9745575902505328, 'learning_rate': 3.677113636015385e-05, 'clip_range': 0.1518664707208924, 'gae_lambda': 0.9785415948388584})
# Display the best hyperparameters found by the study
study.best_params

In [None]:
# Display the best trial of the study
study.best_trial

In [10]:
# Load a model saved during hyperparameter optimisation.
# NOTE(review): the trial number is hard-coded - confirm that
# 'trial_4_best_model.zip' exists in OPT_DIR for your own study run.
model = PPO.load(os.path.join(OPT_DIR, 'trial_4_best_model.zip'))

# Setup Callback

In [9]:
# Import base callback 
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback

In [10]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that periodically saves the model being trained.

    Args:
        check_freq: save a checkpoint every ``check_freq`` callback calls.
        save_path: directory for the checkpoints; ``None`` disables saving.
        verbose: verbosity level forwarded to ``BaseCallback``.
        step_offset: number added to the call count in the checkpoint file
            name, so that a resumed run keeps numbering from where earlier
            training stopped. Defaults to 5460000, the value that was
            previously hard-coded.
    """

    def __init__(self, check_freq, save_path, verbose=1, step_offset=5460000):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
        self.step_offset = step_offset

    def _init_callback(self):
        # Make sure the checkpoint directory exists before training starts
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        # Skip saving when no directory was given, matching _init_callback
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(
                self.save_path,
                'best_model_{}'.format(self.n_calls + self.step_offset),
            )
            self.model.save(model_path)

        # Returning True lets training continue
        return True

In [11]:
# Directory where training checkpoints are written
CHECKPOINT_DIR = './train/'

In [12]:
# Save a checkpoint into CHECKPOINT_DIR every 10000 callback calls
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

# Train Model

In [13]:
# Close any environment that is still open before creating the training one
# (only one game instance can run at a time)
env.close()

In [14]:
# Build the training environment in one expression:
# game -> Monitor logging -> single-env vectorisation -> stack of 4 frames
env = VecFrameStack(
    DummyVecEnv([lambda: Monitor(StreetFighter(), LOG_DIR)]),
    4,
    channels_order='last',
)

In [19]:
# Hyperparameters used to build the final model. They are defined explicitly so
# this cell works on a fresh kernel without re-running the hyperparameter search.
model_params = {
    'n_steps': 7488,  # set n_steps to 7488 or another multiple of 64
    'gamma': 0.9745575902505328,
    'learning_rate': 3.677113636015385e-05,
    'clip_range': 0.1518664707208924,
    'gae_lambda': 0.9785415948388584,
}
# Alternatives tried previously, kept for reference:
#model_params['learning_rate'] = 5e-7
#model_params = {'n_steps': 5445,'gamma': 0.9697421545493266,'learning_rate': 1.5047082217749493e-05,'clip_range': 0.15583917480556153,'gae_lambda': 0.849295667177658}#study.best_params

In [None]:
# Build the PPO agent with a CNN policy on the frame-stacked environment
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

In [None]:
# Reload previous weights from HPO into the existing model.
# `load` is a classmethod that builds and returns a brand-new model, so calling
# it here and discarding the result would leave `model` untouched;
# `set_parameters` loads the saved parameters in place instead.
model.set_parameters(os.path.join(OPT_DIR, 'trial_24_best_model.zip'))

In [None]:
# Load the saved model
# model = PPO.load('./train/best_model_5460000.zip')

# Continue training for 100,000 additional timesteps
model.learn(total_timesteps=100000, callback=callback)


# Evaluate the Model

In [15]:
# Load the checkpoint to evaluate
model = PPO.load('./train/best_model_5460000.zip')

In [None]:
# Run 5 rendered evaluation episodes and keep the mean reward
mean_reward, _ = evaluate_policy(model, env, render=True, n_eval_episodes=5)

In [None]:
# Display the mean reward from the evaluation above
mean_reward

# Test out the Model

In [None]:
def get_hyperparams(model):
    """Collect a model's main PPO hyperparameters into a dict.

    Reads the listed attributes from ``model`` in order and returns them keyed
    by attribute name.
    """
    attribute_names = (
        "learning_rate",
        "gamma",
        "n_steps",
        "ent_coef",
        "clip_range",
        "vf_coef",
        "max_grad_norm",
        "gae_lambda",
    )
    return {name: getattr(model, name) for name in attribute_names}

In [62]:
# Load the trained checkpoint to inspect and play with
model = PPO.load('./train/best_model_5460000.zip')

In [None]:
# Display the hyperparameters stored on the loaded model
get_hyperparams(model)

In [71]:
# Override some hyperparameters on the loaded model.
# NOTE(review): these assignments only change attributes on the loaded object;
# whether the learning-rate schedule and the rollout buffer pick up the new
# learning_rate / n_steps depends on Stable-Baselines3 internals - confirm
# before training further, or pass the values when loading the model instead.
model.learning_rate = 5e-6
model.n_steps = 1024 
#model.gamma = 0.9
model.ent_coef = 0.001

In [19]:
# Reset the environment and keep the first observation
obs = env.reset()

In [None]:
# Display the shape of the observation returned by reset()
obs.shape

In [None]:
# Let the trained model play the game, printing the reward of every step.
for game in range(1):
    # Start every game from a fresh state with the done flag cleared, so that
    # raising the range above 1 really plays several games.
    obs = env.reset()
    done = False
    while not done:
        env.render()
        # predict returns a tuple; the action is its first element
        action = model.predict(obs)[0]
        obs, reward, done, info = env.step(action)
        # Uncomment to slow the game down while watching
        # time.sleep(0.01)
        print(reward)

In [65]:
# Close the environment used by the play loop
env.close()

In [None]:
# Display the info value returned by the last env.step() call of the loop above
info