In [None]:

import numpy as np

from tensorforce.agents import *
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

# Create the OpenAI Gym 'CartPole-v0' environment wrapper, with
# visualization switched off (visualize=False).
environment = OpenAIGym('CartPole-v0', visualize=False)

# Network as list of layers
# - Embedding layer:
#   - For Gym environments utilizing a discrete observation space, an
#     "embedding" layer should be inserted at the head of the network spec.
#     Such environments are usually identified by either:
#     - class ...Env(discrete.DiscreteEnv):
#     - self.observation_space = spaces.Discrete(...)

# Note: depending on which layers come next, the embedding layer *may* need a
# flattening layer placed directly after it.

# Layer-by-layer description of the network, passed to the agent as `network`.
network_spec = [
    # Optional head for discrete observation spaces (disabled here):
    # {'type': 'embedding', 'indices': 100, 'size': 32},
    # {'type': 'flatten'},
    # Two dense layers of 24 units each.
    {'type': 'dense', 'size': 24},
    {'type': 'dense', 'size': 24},
]

# Exploration settings handed to the agent as `actions_exploration`.
# Going by the keys, epsilon is annealed from 1.0 down to 0.1 over 1000
# timesteps -- confirm the exact schedule against the tensorforce docs.
actions_exploration_spec = {
    "type": "epsilon_decay",
    "initial_epsilon": 1.0,
    "final_epsilon": 0.1,
    "timesteps": 1000,
}
        

# Memory specification handed to the agent as `memory`.
# NOTE(review): a capacity of 32 is very small for a replay memory --
# presumably no larger than a single update batch. Confirm against the
# agent's batch size default and consider a much larger buffer.
memory = {
    'type': 'replay',
    'include_next_states': True,
    'capacity': 32,
}

# Optimizer specification handed to the agent as `optimizer`:
# type 'adam' with a learning rate of 1e-3.
optimizer = {
    'type': 'adam',
    'learning_rate': 1e-3,
}

# Build the DQN agent: state/action definitions come from the environment,
# everything else from the specification objects defined earlier in the cell.
agent = DQNAgent(
    states=environment.states,
    actions=environment.actions,
    network=network_spec,
    memory=memory,
    optimizer=optimizer,
    actions_exploration=actions_exploration_spec,
)


# Create the runner from the agent and the environment built above; it is
# used below to drive training via runner.run(...).
runner = Runner(agent=agent, environment=environment)


# Callback function printing episode statistics
def episode_finished(r):
    """Report progress on every 100th episode.

    Args:
        r: Object exposing ``episode``, ``episode_timestep`` and
            ``episode_rewards`` (the runner is passed in by ``runner.run``).

    Returns:
        Always ``True``. Presumably this tells the runner to keep going;
        confirm against the tensorforce ``Runner`` documentation.
    """
    # Stay silent on episodes that are not a multiple of 100.
    if r.episode % 100 != 0:
        return True

    message = "Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
        ep=r.episode,
        ts=r.episode_timestep,
        reward=r.episode_rewards[-1],
    )
    print(message)
    return True


# Start learning. The runner is closed in a `finally` clause so that it is
# shut down even when training raises or is interrupted (for example a
# KeyboardInterrupt part-way through the 30000 episodes); previously
# close() was skipped in that case.
try:
    runner.run(episodes=30000, max_episode_timesteps=200, episode_finished=episode_finished)
finally:
    runner.close()

# Print statistics: total episode count and the mean reward over the most
# recent 100 episodes (or fewer, if fewer were recorded).
print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format(
    ep=runner.episode,
    ar=np.mean(runner.episode_rewards[-100:]))
)

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Finished episode 100 after 50 timesteps (reward: 50.0)
Finished episode 200 after 59 timesteps (reward: 59.0)
Finished episode 300 after 200 timesteps (reward: 200.0)
Finished episode 400 after 85 timesteps (reward: 85.0)
Finished episode 500 after 26 timesteps (reward: 26.0)
Finished episode 600 after 17 timesteps (reward: 17.0)
Finished episode 700 after 12 timesteps (reward: 12.0)
Finished episode 800 after 9 timesteps (reward: 9.0)
Finished episode 900 after 10 timesteps (reward: 10.0)
Finished episode 1000 after 9 timesteps (reward: 9.0)
Finished episode 1100 after 12 timesteps (reward: 12.0)
Finished episode 1200 after 10 timesteps (reward: 10.0)
Finished episode 1300 after 10 timesteps (reward: 10.0)
Finished episode 1400 after 9 timesteps (reward: 9.0)
F