In [1]:
import os
# NOTE(review): hard-coded absolute working directory. It is switched before
# `import pavlov`, presumably so the package resolves from /home -- confirm,
# and prefer a configurable path.
os.chdir('/home')

import gym
import tensorflow as tf
# NOTE(review): `optimizers` is not referenced by name in the visible cells
# (they use `tf.keras.optimizers...` instead); it may be unused.
from tensorflow.keras import optimizers
import pavlov

Using TensorFlow backend.


In [2]:
# --- CartPole-v0 with a DQN model: build everything, then run one episode ---

# Replay-buffer capacity and mini-batch size handed to the agent below.
# NOTE(review): batch_size=2 is much smaller than the 64 used by the other
# experiments in this notebook -- confirm this is intentional.
buffer_size = 10000
batch_size = 2

env = gym.make("CartPole-v0")

# Network body: a single dense layer of 128 units with ReLU activation.
base_config = dict(layer_sizes=[128], activation='relu')
topology = pavlov.models.topology.DenseTopology(**base_config)

# DQN hyperparameters, forwarded verbatim as keyword arguments.
dqn_config = dict(
    gamma=0.99,
    tau=1.0,
    optimizer=tf.keras.optimizers.Adam(lr=1e-4),
)
model = pavlov.models.DQNModel(topology, **dqn_config)

# Epsilon-greedy exploration driven by a linear decay schedule.
epsilon_schedule = pavlov.auxiliary.schedules.LinearDecaySchedule(1.0, 0.1, 500, -1)
actor = pavlov.actors.EpsilonGreedyActor(epsilon_schedule)

# No state transformations are added for this environment.
pline = pavlov.pipeline.Pipeline()

agent = pavlov.agents.Agent(
    env,
    state_pipeline=pline,
    model=model,
    actor=actor,
    buffer_size=buffer_size,
    batch_size=batch_size,
    report_frequency=10,
    warmup_length=50,
)

# Run a single episode without rendering, with logging enabled.
agent.run_episode(render=False, do_logging=True)

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [3]:
# Keep running episodes with the CartPole agent built above. The agent was
# created with report_frequency=10, which matches the summary line printed
# every 10 episodes in the output below.
agent.run_indefinitely()

End of episode 10. Last 10 episodes: Average reward: 22.9. Average duration: 22.9.
End of episode 20. Last 10 episodes: Average reward: 20.9. Average duration: 20.9.
End of episode 30. Last 10 episodes: Average reward: 20.7. Average duration: 20.7.
End of episode 40. Last 10 episodes: Average reward: 17.2. Average duration: 17.2.
End of episode 50. Last 10 episodes: Average reward: 19.4. Average duration: 19.4.
End of episode 60. Last 10 episodes: Average reward: 21.6. Average duration: 21.6.
End of episode 70. Last 10 episodes: Average reward: 23.5. Average duration: 23.5.
End of episode 80. Last 10 episodes: Average reward: 21.1. Average duration: 21.1.
End of episode 90. Last 10 episodes: Average reward: 19.4. Average duration: 19.4.
End of episode 100. Last 10 episodes: Average reward: 20.8. Average duration: 20.8.
End of episode 110. Last 10 episodes: Average reward: 21.7. Average duration: 21.7.
End of episode 120. Last 10 episodes: Average reward: 16.0. Average duration: 16.0.
E

In [4]:
# --- MountainCarContinuous-v0 with a DDPG (actor/critic) model ---
env = gym.make('MountainCarContinuous-v0')

# Network body: a single dense layer of 128 units with ReLU activation.
base_config = {
    'layer_sizes': [128],
    'activation': 'relu'
}
ddpg_config = {
    # The actor emits a continuous action, so its output activation must be
    # a bounded squashing function. 'tanh' maps each output unit into [-1, 1]
    # independently. 'softmax' is unsuitable here: it normalises across the
    # output units, so a single-unit action head would always emit exactly
    # 1.0 and pass no gradient back to the actor.
    # NOTE(review): assumes the environment's action is one-dimensional in
    # [-1, 1] (per the Gym documentation) and that pavlov forwards this
    # string to a Keras activation -- confirm against DDPGModel.
    'actor_activation': 'tanh',
    'gamma': 0.99,
    'tau': 0.1,
    # Actor and critic are deliberately given separate optimizer objects;
    # the actor one is a raw TensorFlow optimizer, the critic one is Keras.
    'actor_optimizer': tf.train.AdamOptimizer(0.0001),
    'critic_optimizer': tf.keras.optimizers.Adam(lr=0.0001)
}
topology = pavlov.models.topology.DenseTopology(**base_config)
model = pavlov.models.DDPGModel(topology, **ddpg_config)

# Exploration: same linear epsilon decay as the CartPole experiment.
# NOTE(review): epsilon-greedy on a continuous action space -- confirm how
# EpsilonGreedyActor picks its exploratory action in that case.
epsilon_schedule = pavlov.auxiliary.schedules.LinearDecaySchedule(1.0, 0.1, 500, -1)
actor = pavlov.actors.EpsilonGreedyActor(epsilon_schedule)
buffer_size = 10000
batch_size = 64

# No state transformations are added for this environment.
pline = pavlov.pipeline.Pipeline()
agent = pavlov.agents.Agent(env,
                            state_pipeline=pline,
                            model=model, actor=actor,
                            buffer_size=buffer_size, batch_size=batch_size,
                            report_frequency=1, warmup_length=50)

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [5]:
# Keep running episodes with the DDPG agent built above (report_frequency=1,
# so a summary is printed after every episode). As the output below shows,
# a SIGINT lets the current episode finish before the loop terminates.
agent.run_indefinitely()

End of episode 1. Last 1 episodes: Average reward: -32.86599878779518. Average duration: 999.0.
End of episode 2. Last 1 episodes: Average reward: -31.78039215711873. Average duration: 999.0.
End of episode 3. Last 1 episodes: Average reward: -33.89045084164827. Average duration: 999.0.
SIGINT(2) recognized; finishing current episode before terminating
End of episode 4. Last 1 episodes: Average reward: -32.85705727746956. Average duration: 999.0.
Terminating.


In [None]:
# --- Breakout-v0 with a DQN model on a preprocessed frame pipeline ---

# Replay-buffer capacity and mini-batch size handed to the agent below.
buffer_size = 10000
batch_size = 64

env = gym.make('Breakout-v0')

# Network body: a single dense layer of 128 units with ReLU activation.
topology_config = dict(layer_sizes=[128], activation='relu')
topology = pavlov.models.topology.DenseTopology(**topology_config)

# DQN hyperparameters, forwarded verbatim as keyword arguments.
dqn_config = dict(
    gamma=0.99,
    tau=1.0,
    optimizer=tf.keras.optimizers.Adam(1e-4),
)
model = pavlov.models.DQNModel(topology, **dqn_config)

# Epsilon-greedy exploration driven by a linear decay schedule.
epsilon_schedule = pavlov.auxiliary.schedules.LinearDecaySchedule(1.0, 0.1, 500, -1)
actor = pavlov.actors.EpsilonGreedyActor(epsilon_schedule)

# State preprocessing; transformations are added in the order listed.
# Judging by their names (TODO confirm against pavlov.transformations):
# greyscale conversion, resize to 84x84, max over 2 consecutive frames,
# then a stack of 4 consecutive frames.
pline = pavlov.pipeline.Pipeline()
for transformation in (
    pavlov.transformations.rgb_to_grey(),
    pavlov.transformations.downsample(new_shape=(84, 84)),
    pavlov.transformations.combine_consecutive(2, 'max'),
    pavlov.transformations.stack_consecutive(4),
):
    pline.add(transformation)

agent = pavlov.agents.Agent(
    env,
    state_pipeline=pline,
    model=model,
    actor=actor,
    buffer_size=buffer_size,
    batch_size=batch_size,
    report_frequency=1,
    warmup_length=5,
)

# Run a single episode without rendering, with logging enabled.
agent.run_episode(render=False, do_logging=True)

In [None]:
# Presumably exports episode 1 of the Breakout agent as an MP4 under
# /home/videos -- confirm the argument meaning against Agent.episode_to_mp4.
# NOTE(review): the output location is a hard-coded absolute path.
agent.episode_to_mp4(1, '/home/videos')