In [1]:
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
import gym
import numpy as np

# Build the CartPole environment and report its observation/action spaces.
env = gym.make('CartPole-v0')
for label, space in (('observation space:', env.observation_space),
                     ('action space:', env.action_space)):
    print(label, space)

# Start an episode, draw the first frame and show the initial observation.
obs = env.reset()
env.render()
print('initial observation:', obs)

# Take one randomly sampled action and inspect what the step returns.
action = env.action_space.sample()
transition = env.step(action)
obs, r, done, info = transition
print('next observation:', obs)
print('reward:', r)
print('info:', info)

class QFunction(chainer.Chain):
    """Fully connected Q-function with two tanh hidden layers.

    Args:
        obs_size (int): input size of the first linear layer
        n_actions (int): output size of the last linear layer
        n_hidden_channels (int): width of both hidden layers
    """

    def __init__(self, obs_size, n_actions, n_hidden_channels=50):
        super().__init__()
        width = n_hidden_channels
        # Links must be created inside init_scope to be registered.
        with self.init_scope():
            self.l0 = L.Linear(obs_size, width)
            self.l1 = L.Linear(width, width)
            self.l2 = L.Linear(width, n_actions)

    def __call__(self, x, test=False):
        """Evaluate the Q-values of every discrete action for ``x``.

        Args:
            x (ndarray or chainer.Variable): An observation
            test (bool): a flag indicating whether it is in test mode;
                accepted for interface compatibility, not read here

        Returns:
            chainerrl.action_value.DiscreteActionValue: wrapper around the
            output of the last linear layer
        """
        hidden = x
        for layer in (self.l0, self.l1):
            hidden = F.tanh(layer(hidden))
        q_values = self.l2(hidden)
        return chainerrl.action_value.DiscreteActionValue(q_values)

# ========= Define Q-Function =============
# Size the network from the environment's observation and action spaces.
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
q_func = QFunction(obs_size, n_actions)
# Uncomment the next line to move the model to GPU 0
#q_func.to_gpu(0)

# Equivalent network built from ChainerRL's predefined Q-functions
# (created for reference; the agent below is given q_func).
_q_func = chainerrl.q_functions.FCStateQFunctionWithDiscreteAction(
    obs_size, n_actions, n_hidden_channels=50, n_hidden_layers=2)

# Adam optimizes q_func; eps=1e-2 is chosen for stability.
optimizer = chainer.optimizers.Adam(eps=1e-2)
optimizer.setup(q_func)

# ========= Define an Agent ============
# Discount factor applied to future rewards
gamma = 0.95

# Epsilon-greedy exploration with a fixed epsilon
explorer = chainerrl.explorers.ConstantEpsilonGreedy(
    random_action_func=env.action_space.sample, epsilon=0.3)

# DQN uses Experience Replay, so it needs a replay buffer of a given capacity.
buffer_capacity = 10 ** 6
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=buffer_capacity)


# Observations from CartPole-v0 are numpy.float64 while Chainer only accepts
# numpy.float32 by default, so a converter is passed to the agent as the
# feature extractor function phi.
def phi(x):
    """Return ``x`` as float32, avoiding a copy when none is required."""
    return x.astype(np.float32, copy=False)


# Create the agent that will interact with the environment.
agent_options = dict(
    replay_start_size=500,
    update_interval=1,
    target_update_interval=100,
    phi=phi,
)
agent = chainerrl.agents.DoubleDQN(
    q_func, optimizer, replay_buffer, gamma, explorer, **agent_options)

# ========= Start training ===========
n_episodes = 200
max_episode_len = 200
for episode in range(1, n_episodes + 1):
    obs = env.reset()
    reward = 0
    done = False
    R = 0  # return (sum of rewards) of this episode
    # t counts the time steps taken; the episode is cut at max_episode_len.
    for t in range(1, max_episode_len + 1):
        # Uncomment to watch the behaviour
        # env.render()
        action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        R += reward
        if done:
            break
    # Report every tenth episode, before the episode is closed on the agent.
    if not episode % 10:
        print('episode:', episode,
              'R:', R,
              'statistics:', agent.get_statistics())
    agent.stop_episode_and_train(obs, reward, done)
print('Finished.')
    
# ========= Start test ===========
# Run ten rendered episodes without training, each capped at 200 steps.
for i in range(10):
    obs = env.reset()
    R = 0
    for t in range(1, 200 + 1):
        env.render()
        obs, r, done, _ = env.step(agent.act(obs))
        R += r
        if done:
            break
    print('test episode:', i, 'R:', R)
    agent.stop_episode()

# =========== Saving agent =========
# Directory the agent is written to
agent_dir = 'agent'
agent.save(agent_dir)

# Uncomment to load an agent from the 'agent' directory
# agent.load('agent')


# Set up the logger to print info messages for understandability
import logging
import sys
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')

# Use the single-environment trainer: `env` is a plain gym environment and
# the keyword arguments below (notably train_max_episode_len) are those of
# train_agent_with_evaluation. The batch variant expects a vectorized
# environment and names the episode-length argument max_episode_len, so
# calling it with these arguments fails.
chainerrl.experiments.train_agent_with_evaluation(
    agent, env,
    steps=2000,                # Train the agent for 2000 steps
    eval_n_steps=None,         # We evaluate for episodes, not time
    eval_n_episodes=10,        # 10 episodes are sampled for each evaluation
    train_max_episode_len=200, # Maximum length of each episode
    eval_interval=1000,        # Evaluate the agent after every 1000 steps
    outdir='result')           # Save everything to 'result' directory


observation space: Box(4,)
action space: Discrete(2)
initial observation: [-0.00283262 -0.03591068 -0.02153046 -0.00371754]
next observation: [-0.00355084 -0.23071734 -0.02160481  0.28209526]
reward: 1.0
info: {}
episode: 10 R: 10.0 statistics: [('average_q', -0.001933498680087825), ('average_loss', 0), ('n_updates', 0)]
episode: 20 R: 12.0 statistics: [('average_q', 0.003869815193436438), ('average_loss', 0), ('n_updates', 0)]
episode: 30 R: 9.0 statistics: [('average_q', 0.007396012595715117), ('average_loss', 0), ('n_updates', 0)]
episode: 40 R: 13.0 statistics: [('average_q', 0.016009313573788958), ('average_loss', 0), ('n_updates', 0)]
episode: 50 R: 11.0 statistics: [('average_q', 0.05582418108204651), ('average_loss', 0.11702799767857335), ('n_updates', 45)]
episode: 60 R: 9.0 statistics: [('average_q', 0.1792929262324794), ('average_loss', 0.22840330179088406), ('n_updates', 147)]
episode: 70 R: 13.0 statistics: [('average_q', 0.3890097717972934), ('average_loss', 0.18895358439

"\n# Set up the logger to print info messages for understandability\nimport logging\nimport sys\nlogging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')\n\nchainerrl.experiments.train_agent_batch_with_evaluation(\n    agent, env,\n    steps=2000,                # Train the agent for 2000 steps\n    eval_n_steps=None,        # We evaluate for episodes, not time\n    eval_n_episodes=10,        # 10 episodes are sampled for each evaluation \n    train_max_episode_len=200, # Maximum length of each episode\n    eval_interval=1000,        # Evaluate the agent after every 1000 steps\n    outdir='result')           # Save everything to 'result' directory\n"