# Street Fighter No Delta
This notebook shows how to create the environment without the delta transformation. 

## Requirements
* python 3.7.17

# Setup StreetFighter

In [1]:
!pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import retro

In [3]:
retro.data.list_games()

['1942-Nes',
 '1943-Nes',
 '3NinjasKickBack-Genesis',
 '8Eyes-Nes',
 'AaahhRealMonsters-Genesis',
 'AbadoxTheDeadlyInnerWar-Nes',
 'AcceleBrid-Snes',
 'ActRaiser2-Snes',
 'ActionPachio-Snes',
 'AddamsFamily-GameBoy',
 'AddamsFamily-Genesis',
 'AddamsFamily-Nes',
 'AddamsFamily-Sms',
 'AddamsFamily-Snes',
 'AddamsFamilyPugsleysScavengerHunt-Nes',
 'AddamsFamilyPugsleysScavengerHunt-Snes',
 'AdvancedBusterhawkGleylancer-Genesis',
 'Adventure-Atari2600',
 'AdventureIsland-GameBoy',
 'AdventureIsland3-Nes',
 'AdventureIslandII-Nes',
 'AdventuresOfBatmanAndRobin-Genesis',
 'AdventuresOfBayouBilly-Nes',
 'AdventuresOfDinoRiki-Nes',
 'AdventuresOfDrFranken-Snes',
 'AdventuresOfKidKleets-Snes',
 'AdventuresOfMightyMax-Genesis',
 'AdventuresOfMightyMax-Snes',
 'AdventuresOfRockyAndBullwinkleAndFriends-Genesis',
 'AdventuresOfRockyAndBullwinkleAndFriends-Nes',
 'AdventuresOfRockyAndBullwinkleAndFriends-Snes',
 'AdventuresOfStarSaver-GameBoy',
 'AdventuresOfYogiBear-Snes',
 'AeroFighters-Snes',
 

In [4]:
!python -m retro.import "/home/marcelo/Downloads/Street Fighter II' - Special Champion Edition (Genesis)"

Importing StreetFighterIISpecialChampionEdition-Genesis
Importing StreetFighterIISpecialChampionEdition-Genesis
Imported 2 games


In [5]:
env = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis')

In [6]:
obs = env.reset()

In [7]:
obs = env.reset()
done = True
for game in range(5):
    while not done: 
        if done: 
            obs = env.reset()
        env.render()
        obs, reward, done, info = env.step(env.action_space.sample())
        print(reward)

In [8]:
# https://wowroms.com/en/roms/sega-genesis-megadrive/download-street-fighter-ii-special-champion-edition-europe/26496.html

In [9]:
#env.close()

In [10]:
obs, reward, done, info = env.step(env.action_space.sample())

In [11]:
env.action_space.sample()

array([1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0], dtype=int8)

In [12]:
env.observation_space.sample()

array([[[ 37, 233, 238],
        [ 21, 121, 169],
        [172, 217, 244],
        ...,
        [123,  96, 149],
        [230,  72,  73],
        [ 42, 228,  37]],

       [[ 71,  64, 214],
        [251, 188, 213],
        [ 15,  51, 197],
        ...,
        [249, 243,  58],
        [ 20, 120, 253],
        [ 63, 171, 158]],

       [[246,  62,  84],
        [225, 201,  76],
        [  4, 215,  57],
        ...,
        [  8,  34,  91],
        [221, 239, 231],
        [  5,   7,  43]],

       ...,

       [[175, 250, 100],
        [ 36, 207,  45],
        [208, 243, 148],
        ...,
        [243, 197, 234],
        [ 45, 104, 124],
        [ 77, 238,  78]],

       [[161, 186, 196],
        [ 97, 119, 187],
        [141, 107, 238],
        ...,
        [120, 216, 213],
        [230, 127,  86],
        [125, 120, 150]],

       [[ 62, 123,  98],
        [ 80,  69, 114],
        [229, 185,  36],
        ...,
        [ 91, 133, 160],
        [ 79, 112,   8],
        [212,  81,  34]]

In [13]:
info['enemy_health']

176

In [14]:
info['health']

176

# Setup Environment

In [15]:
from gym import Env
from gym.spaces import Box, MultiBinary
import numpy as np
import cv2

In [16]:
class StreetFighter(Env):
    def __init__(self):
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.FILTERED)
        #self.score = 0
    
    def step(self, action):
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)
        
        # Preprocess frame from game
        frame_delta = obs 
#         - self.previous_frame
#         self.previous_frame = obs 
        
        # Shape reward
        reward = info['score'] - self.score 
        self.score = info['score']

        return frame_delta, reward, done, info 
    
    def render(self, *args, **kwargs): 
        self.game.render()
    
    def reset(self):
        self.previous_frame = np.zeros(self.game.observation_space.shape)
        
        # Frame delta
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs
        
        # Create initial variables
        self.score = 0

        return obs
    
    def preprocess(self, observation): 
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84,84), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (84,84,1))
        return state
    
    def close(self): 
        self.game.close()

In [17]:
env.close()
env = StreetFighter()

In [18]:
env.observation_space.shape

(84, 84, 1)

# Hyperparameter tune

In [19]:
#!pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio===0.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

In [20]:
#!pip install stable-baselines3[extra] optuna

In [21]:
# Import optuna for HPO
import optuna
# Import PPO for algos
from stable_baselines3 import PPO
# Evaluate Policy
from stable_baselines3.common.evaluation import evaluate_policy
# Import wrappers
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack

In [22]:
LOG_DIR = './logs/'
OPT_DIR = './opt_nodelta/'

In [23]:
# #https://github.com/araffin/rl-baselines-zoo/issues/29
def optimize_ppo(trial):
    """ Learning hyperparamters we want to optimise"""
    return {
        'n_steps': trial.suggest_int('n_steps', 2048, 8192),
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-4, log=True),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, .99)
    }

In [24]:
def optimize_agent(trial):
    try:
        model_params = optimize_ppo(trial)
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        model.learn(total_timesteps=100000)
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=20)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)
        return mean_reward
    except Exception as e: 
        return -1000

In [25]:
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=100, n_jobs=1)

[I 2024-10-31 23:07:05,714] A new study created in memory with name: no-name-35a7ef78-a00b-4f96-84c1-aef631eb1f5a
[I 2024-10-31 23:07:05,773] Trial 0 finished with value: -1000.0 and parameters: {'n_steps': 7610, 'gamma': 0.8521107522787329, 'learning_rate': 1.1058839389664145e-05, 'clip_range': 0.37805160112985914, 'gae_lambda': 0.8711108836233263}. Best is trial 0 with value: -1000.0.
[I 2024-10-31 23:07:05,828] Trial 1 finished with value: -1000.0 and parameters: {'n_steps': 3155, 'gamma': 0.8115619441047558, 'learning_rate': 1.2014301909604693e-05, 'clip_range': 0.3385917406259863, 'gae_lambda': 0.920420253435188}. Best is trial 0 with value: -1000.0.
[I 2024-10-31 23:07:05,883] Trial 2 finished with value: -1000.0 and parameters: {'n_steps': 4676, 'gamma': 0.9679503115366225, 'learning_rate': 8.062701799715265e-05, 'clip_range': 0.1728094857477646, 'gae_lambda': 0.8971613015122417}. Best is trial 0 with value: -1000.0.
[I 2024-10-31 23:07:05,949] Trial 3 finished with value: -1000

# Setup Callback

In [26]:
# Import os for file path management
import os 
# Import Base Callback for saving models
from stable_baselines3.common.callbacks import BaseCallback

In [27]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [28]:
CHECKPOINT_DIR = './train_nodelta/'

In [29]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

# Train Model

In [30]:
env.close()

In [31]:
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [32]:
model_params = {'n_steps': 2570.949, 'gamma': 0.906, 'learning_rate': 2e-07, 'clip_range': 0.369, 'gae_lambda': 0.891}
#model_params = {'n_steps': 8960, 'gamma': 0.906, 'learning_rate': 2e-03, 'clip_range': 0.369, 'gae_lambda': 0.891}
#model_params = study.best_params
#model_params = {
#    'n_steps': 2048,
#    'gamma': 0.99,
#    'learning_rate': 3e-4,
#    'clip_range': 0.2,
#    'gae_lambda': 0.95
#}
#model_params = {
#    'n_steps': 2048,
#    'gamma': 0.99,
#    'learning_rate': 1e-4,
#    'clip_range': 0.1,
#    'gae_lambda': 0.95,
#    'ent_coef': 0.01,
#    'n_epochs': 5,
#    'target_kl': 0.01
#}
# Early stopping at step 0 due to reaching max kl: 0.02

#model_params = {
#    'n_steps': 2048,
#    'gamma': 0.99,
#    'learning_rate': 5e-5,      # Taxa de aprendizado reduzida
#    'clip_range': 0.1,
#    'gae_lambda': 0.95,
#    'ent_coef': 0.01,
#    'n_epochs': 10,             # Aumentado para permitir atualizações mais suaves
#    'max_grad_norm': 0.3,       # Gradiente máximo reduzido
#    # 'target_kl': 0.05,        # Pode ajustar ou comentar para testes
#}

#model_params = study.best_params

In [33]:
model_params['n_steps'] = 40*64

In [34]:
model_params

{'n_steps': 2560,
 'gamma': 0.906,
 'learning_rate': 2e-07,
 'clip_range': 0.369,
 'gae_lambda': 0.891}

In [35]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)
#model.load('./train_nodelta_backup/best_model_5460000.zip')

Using cuda device
Wrapping the env in a VecTransposeImage.


In [36]:
model.learn(total_timesteps=100000, callback=callback)

Logging to ./logs/PPO_11
-----------------------------
| time/              |      |
|    fps             | 416  |
|    iterations      | 1    |
|    time_elapsed    | 6    |
|    total_timesteps | 2560 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 276          |
|    iterations           | 2            |
|    time_elapsed         | 18           |
|    total_timesteps      | 5120         |
| train/                  |              |
|    approx_kl            | 5.555339e-07 |
|    clip_fraction        | 0            |
|    clip_range           | 0.369        |
|    entropy_loss         | -8.32        |
|    explained_variance   | 1.91e-06     |
|    learning_rate        | 2e-07        |
|    loss                 | 1.65         |
|    n_updates            | 10           |
|    policy_gradient_loss | 5.3e-06      |
|    value_loss           | 1.39e+05     |
----------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x71b891e7d310>

In [37]:
env.close()

# Evaluate the Model

In [38]:
# verificar desepenho no tensorboard rollout/ep_rew_mean
#model = PPO.load('./train_nodelta/best_model_90000')
model = PPO.load('./train_nodelta/best_model_70000')

In [39]:
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [40]:
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [41]:
mean_reward

15800.0

In [42]:
env.close()

# Test out the Model

In [43]:
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [44]:
import time

In [45]:
for episode in range(1): 
    obs = env.reset()
    done = False
    total_reward = 0
    while not done: 
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        time.sleep(0.01)
        total_reward += reward
    print('Total Reward for episode {} is {}'.format(total_reward, episode))
    time.sleep(2)

Total Reward for episode [76500.] is 0
