In [1]:
import gc
import importlib
import logging
import pathlib
import sys
import time

import gym

# Repository root: the parent of the notebook's working directory. It is put
# on sys.path so the local `drl` package imported below can be resolved.
ROOT = pathlib.Path.cwd().parent
# Guarded so that re-running this cell does not stack duplicate entries on
# sys.path.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))
import drl.agents as agents
import drl.experiments as experiments
import drl.policies as policies

# Root-logger setup for the notebook: emit INFO and above, each record
# rendered as "<HH:MM:SS>: <message>".
logging.basicConfig(
    format='%(asctime)s: %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

In [2]:
def env_fn():
    """Return a newly created LunarLander-v2 environment."""
    return gym.make('LunarLander-v2')


# Agent used by the timing cells below; only the environment factory and the
# device are specified here, every other option is left to DQAgent.
agent = agents.DQAgent(device='cpu', env_fn=env_fn)

In [3]:
# Time 100k evaluation steps on a single (non-vectorised) environment.
env = env_fn()
try:
    t0 = time.perf_counter()
    experiments.evaluate_agent(agent, env, num_steps=100_000)
    # Elapsed wall-clock seconds for the evaluation call.
    print(time.perf_counter() - t0)
finally:
    # Release the environment even when evaluation fails or is interrupted,
    # matching the vectorised benchmark cells which close theirs.
    env.close()
# 57.2s on 1 env

45.744778701


In [4]:
# Time 100k evaluation steps on 32 copies of the environment stepped in the
# current process via gym's SyncVectorEnv.
# (The module is `gym`; the previous `gy.vector` raised NameError.)
env = gym.vector.SyncVectorEnv([env_fn] * 32)
try:
    t0 = time.perf_counter()
    experiments.evaluate_agent(agent, env, num_steps=100_000)
    # Elapsed wall-clock seconds for the evaluation call.
    print(time.perf_counter() - t0)
finally:
    # Always release the sub-environments, including on error or interrupt.
    env.close()
# 29.2s on 8  envs
# 18.4s on 16 envs
# 15.6s on 32 envs

6.519028534


In [5]:
# Time 100k evaluation steps on 8 copies of the environment managed by gym's
# AsyncVectorEnv.
mp_env = gym.vector.AsyncVectorEnv([env_fn] * 8)
try:
    t0 = time.perf_counter()
    experiments.evaluate_agent(agent, mp_env, num_steps=100_000)
    # Elapsed wall-clock seconds for the evaluation call.
    print(time.perf_counter() - t0)
finally:
    # Close in `finally` so the vector env is shut down even when the
    # evaluation raises or the cell is interrupted part-way through.
    mp_env.close()
# 24.2s on 8  envs
# 19.4s on 16 envs

14.129570392999995


In [6]:
# Close the async vector env again. The timing cell above already calls
# mp_env.close() as its last step, so this is presumably a manual cleanup for
# runs where that cell stopped before reaching it.
# NOTE(review): assumes close() is safe to call on an already-closed env — confirm.
mp_env.close()

In [7]:
env_fn = lambda: gym.make('LunarLander-v2')

# Probe environment: only used to read the observation and action spaces
# handed to the agent. It is closed as soon as they have been read so it does
# not linger for the whole training run; the Trainer below receives `env_fn`,
# not this instance.
env = env_fn()
try:
    observation_space = env.observation_space
    action_space = env.action_space
finally:
    env.close()

agent = agents.DQAgent(
    observation_space=observation_space,
    action_space=action_space,

    noisy=True, noisy_use_factorized=False, parametrize=False,
    # The same Boltzmann policy parameter (0.01) is used for both policies.
    behaviour_policy=policies.BoltzmannPolicy(0.01),
    target_policy=policies.BoltzmannPolicy(0.01),

    mem_size=500_000, min_history=1_000, batch_size=64,
    lr=1e-4, gamma=0.99, n_steps=1, replace_target=100,

    device='cpu',
    fname='DQAgent_model.h5',
)
# The trainer is given the environment factory together with num_envs=8 and
# multiprocessing=True (presumably it builds its own environments from
# env_fn — confirm in experiments.Trainer). Logs go under
# ROOT/logs/LunarLander/Adam_lr=1e-4.
trainer = experiments.Trainer(
    agent, env_fn,
    samples_per_update=1,
    metrics='all',
    log_dir=pathlib.Path(ROOT).joinpath(
        'logs/LunarLander/Adam_lr=1e-4'),
    num_envs=8,
    multiprocessing=True
)
# Long-running call: 10M steps requested. The captured log shows one progress
# line per 20k env steps, matching eval_freq=20_000.
trainer.train(
    num_steps=10_000_000, eval_freq=20_000, eval_steps=40_000,
    plot=False, to_csv=True
)

16:05:20: Starting training procedure from scratch.
16:06:46: env_step 20k  e=141  train_score=-233.3   eval_score=-235.9   frames=159  time_taken=73.2
16:08:30: env_step 40k  e=210  train_score=-207.8   eval_score=-187.2   frames=247  time_taken=89.6
16:10:04: env_step 60k  e=247  train_score=-205.1   eval_score=-209.6   frames=363  time_taken=77.5
16:11:54: env_step 80k  e=267  train_score=-195.0   eval_score=-164.7   frames=500  time_taken=92.7
16:13:31: env_step 100k  e=287  train_score=-185.5   eval_score=-124.9   frames=649  time_taken=80.2
16:15:11: env_step 120k  e=308  train_score=-175.1   eval_score=-148.0   frames=782  time_taken=80.9
16:16:48: env_step 140k  e=328  train_score=-166.1   eval_score=-121.6   frames=877  time_taken=80.2
16:18:32: env_step 160k  e=350  train_score=-156.1   eval_score=-164.0   frames=975  time_taken=85.9
16:20:19: env_step 180k  e=368  train_score=-152.8   eval_score=-134.0   frames=977  time_taken=85.8
16:22:05: env_step 200k  e=390  train_score

KeyboardInterrupt: 

In [8]:
# Drop the notebook's reference to the trainer so the objects it holds can be
# reclaimed.
# NOTE(review): presumably intended to release the environments left over from
# the interrupted training run — confirm Trainer cleans them up on deletion.
del trainer

In [10]:
# Force a full garbage-collection pass; the value displayed below the cell is
# gc.collect()'s return value (the number of unreachable objects it found).
gc.collect()

0