Import the required libraries

In [7]:
from d3rlpy.datasets import get_cartpole
from d3rlpy.algos import DiscreteCQL, DQN
from d3rlpy.metrics.scorer import discounted_sum_of_advantage_scorer
from d3rlpy.metrics.scorer import evaluate_on_environment
from d3rlpy.dataset import Episode
from d3rlpy.dataset import MDPDataset

from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from sklearn.model_selection import train_test_split

import import_ipynb
import numpy as np
from random import random
from FootballEnv import FootballEnv

Helper function to create a dummy dataset

In [8]:
def create_dataset(n_steps=5000, obs_dim=5, n_actions=5, episode_length=5, seed=None):
    """Build a dummy ``MDPDataset`` filled with random transitions.

    The generated data carries no learnable signal; it is only meant to
    exercise the training pipeline end to end.

    Args:
        n_steps: total number of transitions to generate.
        obs_dim: number of features per observation. Every feature is a
            random integer in ``[0, 5)``.
        n_actions: number of discrete actions. Actions are drawn uniformly
            from ``[0, n_actions)``.
        episode_length: a terminal flag is set on every
            ``episode_length``-th step.
        seed: optional seed for NumPy's random generator. ``None`` (the
            default) produces a different dataset on every call.

    Returns:
        An ``MDPDataset`` built from the generated observations, actions,
        rewards and terminal flags.

    Raises:
        ValueError: if any of the size arguments is not a positive number.
    """
    sizes = {
        "n_steps": n_steps,
        "obs_dim": obs_dim,
        "n_actions": n_actions,
        "episode_length": episode_length,
    }
    for arg_name, value in sizes.items():
        if value <= 0:
            raise ValueError(f"{arg_name} must be positive, got {value}")

    rng = np.random.default_rng(seed)

    # Integer-valued features in [0, 5), one row per transition.
    observations = rng.integers(0, 5, size=(n_steps, obs_dim))
    # Integer action ids in [0, n_actions).
    actions = rng.integers(0, n_actions, size=n_steps)
    # Uniform rewards in [0, 1).
    rewards = rng.random(n_steps)

    # One terminal flag per transition (1-D, length n_steps): the last step of
    # every block of `episode_length` steps ends an episode.
    step_in_episode = np.arange(n_steps) % episode_length
    terminals = (step_in_episode == episode_length - 1).astype(int)
    # Close the final episode even when n_steps is not a multiple of
    # episode_length.
    terminals[-1] = 1

    return MDPDataset(
        observations,
        actions,
        rewards,
        terminals,
    )

In [9]:
# Generate the dummy transitions, then hold out 20% of the dataset for evaluation.
dataset = create_dataset()
# A fixed random_state keeps the train/test partition identical across re-runs.
train_episodes, test_episodes = train_test_split(dataset, test_size=0.2, random_state=0)

In [11]:
# Environment that the 'environment' scorer rolls the policy out in.
env = FootballEnv()

# Set up the DQN algorithm (CPU only).
dqn = DQN(use_gpu=False)

# Metrics reported in the training log at the end of each epoch.
scorers = {
    'environment': evaluate_on_environment(env),  # evaluate with Football Env
    'advantage': discounted_sum_of_advantage_scorer,  # smaller is better
    'td_error': td_error_scorer,  # smaller is better
    'value_scale': average_value_estimation_scorer,  # smaller is better
}

# Train for 5 epochs on the training split, scoring against the held-out split.
output = dqn.fit(
    train_episodes,
    eval_episodes=test_episodes,
    n_epochs=5,
    scorers=scorers,
)

2022-02-05 18:37.26 [debug    ] RoundIterator is selected.
2022-02-05 18:37.26 [info     ] Directory is created at d3rlpy_logs\DQN_20220205183726
2022-02-05 18:37.26 [debug    ] Building models...
2022-02-05 18:37.26 [debug    ] Models have been built.
2022-02-05 18:37.26 [info     ] Parameters are saved to d3rlpy_logs\DQN_20220205183726\params.json params={'action_scaler': None, 'batch_size': 32, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 0.99, 'generated_maxlen': 100000, 'learning_rate': 6.25e-05, 'n_critics': 1, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'q_func_factory': {'type': 'mean', 'params': {'bootstrap': False, 'share_encoder': False}}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'target_reduction_type': 'min', 'target_update_interval': 8000, 'use_gpu': None, 'algorithm': 'DQN'

Epoch 1/5: 100%|██████████| 100/100 [00:00<00:00, 138.70it/s, loss=0.157]


2022-02-05 18:37.28 [info     ] DQN_20220205183726: epoch=1 step=100 epoch=1 metrics={'time_sample_batch': 0.0001558065414428711, 'time_algorithm_update': 0.006684501171112061, 'loss': 0.14954970140010118, 'time_step': 0.007100090980529785, 'environment': 2.604379423843626, 'advantage': -0.2680866558483103, 'td_error': 0.4113258000879978, 'value_scale': 0.8017962565645576} step=100
2022-02-05 18:37.28 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20220205183726\model_100.pt


Epoch 2/5: 100%|██████████| 100/100 [00:00<00:00, 185.10it/s, loss=0.0803]


2022-02-05 18:37.30 [info     ] DQN_20220205183726: epoch=2 step=200 epoch=2 metrics={'time_sample_batch': 0.00015342235565185547, 'time_algorithm_update': 0.004948756694793701, 'loss': 0.08057872463017703, 'time_step': 0.00528207540512085, 'environment': 2.753932084319636, 'advantage': -0.1960573111112256, 'td_error': 0.37411797992810747, 'value_scale': 0.7767954343184829} step=200
2022-02-05 18:37.30 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20220205183726\model_200.pt


Epoch 3/5: 100%|██████████| 100/100 [00:00<00:00, 170.15it/s, loss=0.0753]


2022-02-05 18:37.32 [info     ] DQN_20220205183726: epoch=3 step=300 epoch=3 metrics={'time_sample_batch': 0.00018527984619140625, 'time_algorithm_update': 0.0053626918792724606, 'loss': 0.07587004896253348, 'time_step': 0.005757534503936767, 'environment': 2.4546655539286713, 'advantage': -0.19016045405110127, 'td_error': 0.36729825610496475, 'value_scale': 0.7781217939779163} step=300
2022-02-05 18:37.32 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20220205183726\model_300.pt


Epoch 4/5: 100%|██████████| 100/100 [00:00<00:00, 128.55it/s, loss=0.0733]


2022-02-05 18:37.34 [info     ] DQN_20220205183726: epoch=4 step=400 epoch=4 metrics={'time_sample_batch': 0.0002709245681762695, 'time_algorithm_update': 0.007178292274475097, 'loss': 0.07315244037657977, 'time_step': 0.007649121284484864, 'environment': 2.5050709269369946, 'advantage': -0.1998170799001695, 'td_error': 0.371508196808612, 'value_scale': 0.816871837452054} step=400
2022-02-05 18:37.34 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20220205183726\model_400.pt


Epoch 5/5: 100%|██████████| 100/100 [00:00<00:00, 150.94it/s, loss=0.0711]


2022-02-05 18:37.36 [info     ] DQN_20220205183726: epoch=5 step=500 epoch=5 metrics={'time_sample_batch': 0.00023967981338500976, 'time_algorithm_update': 0.006065294742584228, 'loss': 0.07150248002260923, 'time_step': 0.006505246162414551, 'environment': 2.349511271003337, 'advantage': -0.18930783787982833, 'td_error': 0.36050550349008376, 'value_scale': 0.7965316144749522} step=500
2022-02-05 18:37.36 [info     ] Model parameters are saved to d3rlpy_logs\DQN_20220205183726\model_500.pt
