In [1]:
# Install the notebook's dependencies. Only TensorFlow is pinned to a version.
# NOTE(review): gym, keras and keras-rl2 are unpinned, so a fresh install may
# pull releases that do not match tensorflow==2.3.0 or the 4-value
# env.step() return unpacked later in this notebook -- consider pinning them.
!pip install tensorflow==2.3.0
!pip install gym
!pip install keras
!pip install keras-rl2



In [2]:
# NOTE(review): pyglet is presumably needed for the env.render() window used
# in later cells -- confirm against the installed gym version.
!pip install pyglet



# **Environment (MountainCar-v0) with OpenAI Gym**

In [3]:
import gym
import random

In [4]:
# Create the MountainCar-v0 environment used throughout the notebook.
env = gym.make('MountainCar-v0')
# Length of the first axis of the observation space (printed as 2 below).
states = env.observation_space.shape[0]
# Number of discrete actions (printed as 3 below).
actions = env.action_space.n

In [5]:
# Each observation has two components: the car's position and its velocity.
print('States:', states)

States: 2


In [6]:
# Action indices in MountainCar-v0:
#   0 - push left
#   1 - no push
#   2 - push right
print('actions:', actions)

actions: 3


# **Test Before Using Deep Reinforcement Learning (DRL)**

In [7]:
# Random-policy baseline: play a few episodes with uniformly random actions
# to see how an untrained agent scores in this environment.
episodes = 15 # number of episodes to play
for episode in range(1, episodes+1):
  state = env.reset()
  done = False
  score = 0

  while not done:
    env.render()
    # Sample uniformly over every action index (0 .. n-1) so that no action,
    # including the last one, is excluded from the baseline.
    action = random.randrange(env.action_space.n)
    n_state, reward, done, info = env.step(action)
    score+=reward
  print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:-200.0
Episode:2 Score:-200.0
Episode:3 Score:-200.0
Episode:4 Score:-200.0
Episode:5 Score:-200.0
Episode:6 Score:-200.0
Episode:7 Score:-200.0
Episode:8 Score:-200.0
Episode:9 Score:-200.0
Episode:10 Score:-200.0
Episode:11 Score:-200.0
Episode:12 Score:-200.0
Episode:13 Score:-200.0
Episode:14 Score:-200.0
Episode:15 Score:-200.0


# **Deep Learning Model (MLP) with Keras**

In [10]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [23]:
def build_model(states, actions):
    """Build the MLP that maps an observation to one value per action.

    Architecture:
        * Flatten over an input of shape ``(1, states)``
        * three hidden Dense layers with 128, 64 and 32 units, all ReLU
        * a linear Dense output layer with ``actions`` units

    Args:
        states: Number of components in a single observation (input size).
        actions: Number of discrete actions (output size).

    Returns:
        An uncompiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # The sizes come from the arguments rather than a global environment, so
    # the function works for any caller that passes matching dimensions.
    # The leading axis of size 1 lines up with the window_length=1 used by the
    # agent's SequentialMemory in this notebook.
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [24]:
# Instantiate the MLP and print its layer-by-layer structure and parameter counts.
model = build_model(states, actions)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 2)                 0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               384       
_________________________________________________________________
dense_9 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_10 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 99        
Total params: 10,819
Trainable params: 10,819
Non-trainable params: 0
_________________________________________________________________


# **Build Agent with Keras-RL (Deep Reinforcement Learning)**

In [25]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [26]:
def build_agent(model, actions):
    """Wrap a Keras model in a keras-rl DQN agent.

    Args:
        model: Keras model whose output layer has one unit per action.
        actions: Number of discrete actions the agent can choose from.

    Returns:
        An uncompiled ``DQNAgent`` using a Boltzmann Q policy and a sequential
        replay memory holding up to 50000 entries with a window length of 1.
    """
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    # nb_actions comes from the argument rather than a global environment, so
    # the agent always matches the model it was given.
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                  nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

# **DRL Training**

In [27]:
# Build the DQN agent around the MLP, compile it with Adam, and train it on
# the environment for 150000 steps without rendering.
dqn = build_agent(model, actions)
# `learning_rate` is the current Keras argument name; `lr` is a deprecated alias.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=150000, visualize=False, verbose=2)

Training for 150000 steps ...
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.




    200/150000: episode: 1, duration: 3.389s, episode steps: 200, steps per second:  59, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.980 [0.000, 2.000],  loss: 0.045511, mae: 0.912031, mean_q: -1.233350
    400/150000: episode: 2, duration: 1.304s, episode steps: 200, steps per second: 153, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.955 [0.000, 2.000],  loss: 0.004268, mae: 1.894394, mean_q: -2.791724
    600/150000: episode: 3, duration: 1.840s, episode steps: 200, steps per second: 109, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 1.085 [0.000, 2.000],  loss: 0.029440, mae: 2.989484, mean_q: -4.391519
    800/150000: episode: 4, duration: 2.390s, episode steps: 200, steps per second:  84, episode reward: -200.000, mean reward: -1.000 [-1.000, -1.000], mean action: 0.905 [0.000, 2.000],  loss: 0.043859, mae: 4.117525, mean_q: -6.063605
   1000/150000: episode: 5, duration: 2.367s, ep

<tensorflow.python.keras.callbacks.History at 0x7f6d9851cbe0>

# **DRL Test**

In [28]:
# Evaluate the trained agent for 100 episodes without rendering and print the
# mean of the recorded episode rewards.
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: -109.000, steps: 109
Episode 2: reward: -112.000, steps: 112
Episode 3: reward: -85.000, steps: 85
Episode 4: reward: -111.000, steps: 111
Episode 5: reward: -109.000, steps: 109
Episode 6: reward: -112.000, steps: 112
Episode 7: reward: -85.000, steps: 85
Episode 8: reward: -200.000, steps: 200
Episode 9: reward: -90.000, steps: 90
Episode 10: reward: -108.000, steps: 108
Episode 11: reward: -110.000, steps: 110
Episode 12: reward: -110.000, steps: 110
Episode 13: reward: -109.000, steps: 109
Episode 14: reward: -108.000, steps: 108
Episode 15: reward: -85.000, steps: 85
Episode 16: reward: -87.000, steps: 87
Episode 17: reward: -108.000, steps: 108
Episode 18: reward: -112.000, steps: 112
Episode 19: reward: -112.000, steps: 112
Episode 20: reward: -92.000, steps: 92
Episode 21: reward: -103.000, steps: 103
Episode 22: reward: -111.000, steps: 111
Episode 23: reward: -110.000, steps: 110
Episode 24: reward: -84.000, steps: 84
Episode 25

In [29]:
# Run 15 test episodes with rendering enabled; the returned history is discarded.
_ = dqn.test(env, nb_episodes=15, visualize=True)

Testing for 15 episodes ...
Episode 1: reward: -111.000, steps: 111
Episode 2: reward: -110.000, steps: 110
Episode 3: reward: -112.000, steps: 112
Episode 4: reward: -91.000, steps: 91
Episode 5: reward: -104.000, steps: 104
Episode 6: reward: -110.000, steps: 110
Episode 7: reward: -110.000, steps: 110
Episode 8: reward: -109.000, steps: 109
Episode 9: reward: -184.000, steps: 184
Episode 10: reward: -110.000, steps: 110
Episode 11: reward: -104.000, steps: 104
Episode 12: reward: -108.000, steps: 108
Episode 13: reward: -89.000, steps: 89
Episode 14: reward: -91.000, steps: 91
Episode 15: reward: -110.000, steps: 110


# **Saving Weights and Reloading the Agent from Disk**

In [30]:
# Save the trained agent's weights to a file, overwriting any existing file of that name.
dqn.save_weights('dqn_weights_1.h5f', overwrite=True)

In [31]:
# Drop the in-memory model, agent and environment so the following cells have
# to rebuild them and restore the weights from the saved file.
del model
del dqn
del env

In [32]:
# Recreate the environment, model and agent from scratch; the agent must be
# compiled again before the saved weights are loaded into it.
env = gym.make('MountainCar-v0')
actions = env.action_space.n
states = env.observation_space.shape[0]
model = build_model(states, actions)
dqn = build_agent(model, actions)
# `learning_rate` is the current Keras argument name; `lr` is a deprecated alias.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [34]:
# Restore the weights that were saved earlier under the same filename.
dqn.load_weights('dqn_weights_1.h5f')

In [35]:
# Run 15 rendered test episodes with the reloaded agent; the returned history is discarded.
_ = dqn.test(env, nb_episodes=15, visualize=True)

Testing for 15 episodes ...
Episode 1: reward: -110.000, steps: 110
Episode 2: reward: -85.000, steps: 85
Episode 3: reward: -92.000, steps: 92
Episode 4: reward: -108.000, steps: 108
Episode 5: reward: -112.000, steps: 112
Episode 6: reward: -103.000, steps: 103
Episode 7: reward: -115.000, steps: 115
Episode 8: reward: -85.000, steps: 85
Episode 9: reward: -110.000, steps: 110
Episode 10: reward: -88.000, steps: 88
Episode 11: reward: -111.000, steps: 111
Episode 12: reward: -109.000, steps: 109
Episode 13: reward: -110.000, steps: 110
Episode 14: reward: -112.000, steps: 112
Episode 15: reward: -110.000, steps: 110
