In [1]:
import gc
import importlib
import logging
import pathlib
import sys
import time

import gym

# Parent of the kernel's current working directory; it is put on sys.path
# (presumably so the `drl` imports that follow resolve — confirm the repo layout).
ROOT = pathlib.Path.cwd().parent
# Guard the append so re-running this cell does not stack duplicate entries.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))
import drl.agents as agents
import drl.envs as envs
import drl.experiments as experiments
import drl.policies as policies

# Root logger setup: INFO level, each record rendered as "HH:MM:SS: message".
logging.basicConfig(
    format='%(asctime)s: %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

In [10]:
def env_fn():
    """Create a new LunarLander-v2 environment instance."""
    return gym.make('LunarLander-v2')


# Vectorized environment built from the factory above (presumably 16 worker
# environments — confirm against drl.envs). It is closed explicitly in a
# later cell via mp_env.close().
mp_env = envs.MultiprocessVectorEnv(env_fn, 16)

# Agent used by the evaluation benchmarks; only the environment factory and
# the device are specified here.
agent = agents.DQAgent(env_fn=env_fn, device='cpu')

In [17]:
# Wall-clock benchmark of a 100k-step evaluation with no num_envs argument.
start = time.perf_counter()
experiments.evaluate_agent(agent, env_fn, num_steps=100_000)
elapsed = time.perf_counter() - start
print(elapsed)
# Recorded result: 57.2s on 1 env

56.65979389200004


In [18]:
# Wall-clock benchmark of a 100k-step evaluation with num_envs=32.
start = time.perf_counter()
experiments.evaluate_agent(agent, env_fn, num_envs=32, num_steps=100_000)
elapsed = time.perf_counter() - start
print(elapsed)
# Recorded results for different num_envs values:
#   29.2s on 8  envs
#   18.4s on 16 envs
#   15.6s on 32 envs

15.615259636000019


In [14]:
# Wall-clock benchmark of a 100k-step evaluation on the pre-built
# multiprocess vector environment (mp_env) instead of the factory.
start = time.perf_counter()
experiments.evaluate_agent(agent, mp_env, num_steps=100_000)
elapsed = time.perf_counter() - start
print(elapsed)
# Recorded results for different mp_env sizes:
#   24.2s on 8  envs
#   19.4s on 16 envs

19.183021768999993


In [None]:
# Release the vectorized environment created earlier now that the benchmarks are done.
# NOTE(review): presumably this also stops the worker processes — confirm in drl.envs.
mp_env.close()

In [None]:
def env_fn():
    """Create a new LunarLander-v2 environment instance."""
    return gym.make('LunarLander-v2')


# This environment is only needed to read the observation/action spaces for
# the agent; close it as soon as they have been read so it is not left open
# for the duration of the training run.
env = env_fn()
try:
    observation_space = env.observation_space
    action_space = env.action_space
finally:
    env.close()

# NOTE(review): the keyword meanings below are defined by drl.agents.DQAgent,
# which is not visible from this notebook — consult it before tuning.
agent = agents.DQAgent(
    observation_space=observation_space,
    action_space=action_space,

    noisy=True, noisy_use_factorized=False, parametrize=False,
    behaviour_policy=policies.BoltzmannPolicy(0.01),
    target_policy=policies.BoltzmannPolicy(0.01),

    mem_size=500_000, min_history=1_000, batch_size=64,
    lr=1e-3, gamma=0.99, n_steps=1, replace_target=100,

    device='cpu',
    fname='DQAgent_model.h5',
)

# The trainer receives the factory (not the closed `env` above) and is asked
# for 8 environments with multiprocessing enabled. ROOT is already a
# pathlib.Path, so the log directory is joined onto it directly.
trainer = experiments.Trainer(
    agent, env_fn,
    samples_per_update=1,
    metrics='all',
    log_dir=ROOT.joinpath('logs/LunarLander/Adam_lr=1e-3'),
    num_envs=8,
    multiprocessing=True
)

# Long-running cell: 10M steps requested, with an evaluation of 40k steps
# every 20k steps (see the log lines emitted below the cell).
trainer.train(
    num_steps=10_000_000, eval_freq=20_000, eval_steps=40_000,
    plot=False, to_csv=True
)

17:32:29: Starting training procedure from scratch.
17:34:03: env_step 20k  e=70  train_score=-265.4   eval_score=-121.6   frames=218  time_taken=75.3
17:35:48: env_step 40k  e=96  train_score=-214.4   eval_score=-40.9   frames=397  time_taken=88.2
17:37:25: env_step 60k  e=123  train_score=-110.3   eval_score=-115.2   frames=528  time_taken=80.1
17:39:07: env_step 80k  e=145  train_score=-72.7   eval_score=-59.7   frames=683  time_taken=83.6
17:40:53: env_step 100k  e=176  train_score=-37.1   eval_score=3.7   frames=771  time_taken=86.6
17:42:32: env_step 120k  e=207  train_score=-20.5   eval_score=26.8   frames=734  time_taken=83.7
17:44:05: env_step 140k  e=258  train_score=70.3   eval_score=163.1   frames=518  time_taken=80.6
17:45:36: env_step 160k  e=324  train_score=157.3   eval_score=198.3   frames=328  time_taken=80.6
17:47:09: env_step 180k  e=391  train_score=151.7   eval_score=96.2   frames=304  time_taken=81.1
17:48:43: env_step 200k  e=448  train_score=174.9   eval_score=

In [None]:
# Remove the `trainer` name from the notebook namespace.
# NOTE(review): presumably done so its resources can be reclaimed by the
# gc.collect() cell that follows — confirm nothing else still references it.
del trainer

In [8]:
# Force a full garbage-collection pass. As the last expression in the cell,
# its return value (the number of unreachable objects found) is displayed.
gc.collect()

0