In [12]:
import datetime

import gym
import stable_baselines3 as sb3
import torch
from gym.utils.env_checker import check_env as gym_check_env
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.dqn.policies import MlpPolicy

from game import Game, GameOptions
# NOTE(review): star import; presumably the source of TorchWrapper and
# TorchWrapperSettings used below -- consider importing them by name.
from wrappers import *

Example from Stable-Baselines3 (SB3): PPO on CartPole-v1

In [13]:
# Smoke test of the SB3 setup: train PPO briefly on CartPole, then roll the
# learned policy out for a fixed number of steps.
env = gym.make("CartPole-v1")

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10_000)

# Roll out through the vectorized env that SB3 wrapped around `env`
# (see the "Wrapping the env in a DummyVecEnv" log line below).
vec_env = model.get_env()
obs = vec_env.reset()
for _step in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render()
    # No manual reset on `done`: the VecEnv resets automatically.

env.close()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.9     |
|    ep_rew_mean     | 23.9     |
| time/              |          |
|    fps             | 6104     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 26.9         |
|    ep_rew_mean          | 26.9         |
| time/                   |              |
|    fps                  | 4002         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0085681155 |
|    clip_fraction        | 0.109        |
|    clip_range           | 0.2          |
|    en



Tinkering with `Game`: first, instantiate the game environment

In [14]:
# Episode configuration shared by the training cells and the play cell.
options = GameOptions()
options.max_steps = 300
options.max_projectiles_per_turret = 0
# NOTE(review): the meaning of reward type 4 is defined by GameOptions/Game,
# which is not visible in this notebook -- document it here once confirmed.
options.reward_type = 4

# Network and optimizer settings handed to the SB3 models via `policy_kwargs`.
policy_kwargs = {
    "activation_fn": torch.nn.ReLU,
    "net_arch": [128, 128],
    "optimizer_class": torch.optim.AdamW,
    "optimizer_kwargs": {"amsgrad": True},
    "normalize_images": False,
}

# Validate the bare game first, so a failure can be attributed to Game itself
# rather than to the wrapper.
raw_env = Game(render_mode=None, options=options)
gym_check_env(raw_env)

settings = TorchWrapperSettings(
    normalize=True,
    flatten_action=True,
    skip_frames=None,
    to_tensor=False,
)
env = TorchWrapper(raw_env, wrapper_settings=settings)

# Validate the wrapped env with both the gym checker and the SB3 checker.
gym_check_env(env)
check_env(env, warn=True)

  logger.warn(


In [15]:
# --- Optimisation -----------------------------------------------------------
LR = 1e-4          # learning rate
BATCH_SIZE = 128   # minibatch size; the DQN cell also uses it as `learning_starts`
GAMMA = 0.95       # discount factor

# --- Replay buffer / target network (DQN) -----------------------------------
MEM_SIZE = 10_000  # passed as `buffer_size` to DQN
TAU = 0.005        # passed as `tau` to DQN

# --- Epsilon-greedy exploration (DQN) ----------------------------------------
EPS_START = 0.95
EPS_END = 0.05
# The higher the longer it takes to decay.
# NOTE(review): EPS_DECAY is not referenced by any cell visible in this
# notebook; confirm whether it is still needed.
EPS_DECAY = 4000

In [16]:
# DQN hyperparameters gathered in one mapping so they can be read (and logged)
# as a unit. `exploration_fraction` is not set here, so the library default
# applies.
dqn_hyperparams = {
    "learning_rate": LR,
    "buffer_size": MEM_SIZE,
    "batch_size": BATCH_SIZE,
    "gamma": GAMMA,
    "tau": TAU,
    "exploration_initial_eps": EPS_START,
    "exploration_final_eps": EPS_END,
    "train_freq": 1,
    "learning_starts": BATCH_SIZE,
    "target_update_interval": 1,
    "gradient_steps": -1,
    "policy_kwargs": policy_kwargs,
}

model = DQN("MlpPolicy", env, verbose=1, **dqn_hyperparams)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [18]:
# Train the DQN agent; progress is reported through the verbose=1 log tables.
model.learn(
    total_timesteps=100_000,
    progress_bar=False,
)


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 300      |
|    ep_rew_mean      | -90.2    |
|    exploration_rate | 0.842    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 935      |
|    time_elapsed     | 1        |
|    total_timesteps  | 1200     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.24     |
|    n_updates        | 1071     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 300      |
|    ep_rew_mean      | -34.6    |
|    exploration_rate | 0.734    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 918      |
|    time_elapsed     | 2        |
|    total_timesteps  | 2400     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 2.17     |
|    n_updates      

<stable_baselines3.dqn.dqn.DQN at 0x152c20190>

: 

In [None]:
# Save the model under a timestamped name so earlier runs are not overwritten.
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model.save("sb3/dqn_{}".format(timestamp))


## Train with PPO

In [None]:
# Train a PPO agent on the same wrapped Game environment.
#
# PPO is on-policy, so the DQN-only settings used above (buffer_size, tau,
# exploration_initial_eps, exploration_final_eps, train_freq, learning_starts,
# target_update_interval, gradient_steps) are not PPO constructor arguments;
# passing them raises TypeError. Only the hyperparameters PPO defines are
# forwarded; everything else stays at the library defaults.
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=LR,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    policy_kwargs=policy_kwargs,
)
model.learn(total_timesteps=100_000)

# Save the model under a timestamped name so earlier runs are not overwritten.
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model.save("sb3/ppo_{}".format(timestamp))

# Play the game with the trained model

In [None]:
# Watch the most recently trained `model` play in a rendered window.
env = Game(render_mode="human", options=options)
env = TorchWrapper(env, wrapper_settings=settings)
try:
    obs, _ = env.reset()
    for _step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        # 5-tuple step API: the episode is over when it is terminated OR
        # truncated (e.g. a step limit). Checking only the first flag would
        # keep stepping a finished episode.
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        env.render()
        # This is a plain (non-vectorized) env, so it must be reset by hand.
        if done:
            obs, _ = env.reset()
finally:
    # Release the render window even if the loop is interrupted or raises.
    env.close()