In [5]:
import numpy as np
import gym
import random

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

import tensorflow.keras.layers as layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

In [99]:
from gym.envs.classic_control.mountain_car import MountainCarEnv

class MountainCarModifiedReward(MountainCarEnv):
    """MountainCar environment with a shaped reward.

    On top of the reward returned by ``MountainCarEnv.step``, every step adds

        SHAPING_SCALE * (SHAPING_DISCOUNT * |v_new| - |v_old|)

    where ``v`` is the second state component (the car's velocity), plus
    ``GOAL_BONUS`` whenever the first state component (the position) is at
    least ``GOAL_POSITION``.

    The constants are class attributes so a subclass or an instance can retune
    the shaping without overriding ``step``. The defaults reproduce the values
    that were previously hard-coded in the method body.
    """

    SHAPING_SCALE = 300      # weight of the speed-change term
    # NOTE(review): the agent in this notebook is built with DQNAgent's default
    # discount factor - confirm that 0.95 here is intentional.
    SHAPING_DISCOUNT = 0.95  # factor applied to the new speed
    GOAL_BONUS = 100         # extra reward once the position threshold is reached
    GOAL_POSITION = 0.5      # position threshold that triggers the bonus

    def step(self, action: int):
        """Advance the environment one step and return the shaped reward.

        Args:
            action: index of the discrete action to apply.

        Returns:
            The ``(new_state, modified_reward, done, info)`` tuple, identical
            to the parent's result except for the reward.
        """
        # Read the old speed before the parent step replaces self.state, so the
        # value cannot be affected by any in-place update of the state.
        previous_speed = abs(self.state[1])
        new_state, reward, done, info = super().step(action)

        # Reward gaining speed in either direction.
        modified_reward = reward + self.SHAPING_SCALE * (
            self.SHAPING_DISCOUNT * abs(new_state[1]) - previous_speed
        )
        if new_state[0] >= self.GOAL_POSITION:
            modified_reward += self.GOAL_BONUS
        return new_state, modified_reward, done, info


In [100]:
# Instantiate the reward-shaped environment class directly.
env = MountainCarModifiedReward()

In [101]:
# Display the action space (the recorded output is Discrete(3)).
env.action_space

Discrete(3)

In [102]:
# Display the observation space (the recorded output is a 2-component float32 Box).
env.observation_space

Box([-1.2  -0.07], [0.6  0.07], (2,), float32)

In [103]:
def build_model(states, actions):
    """Assemble the network: two 64-unit ReLU layers and a linear output head.

    Args:
        states: size of one observation vector; the input has shape
            ``(1, states)``.
        actions: number of units in the linear output layer.

    Returns:
        An uncompiled Keras ``Model`` named ``"mountain_car_player"``.
    """
    observation = layers.Input(shape=(1, states))

    hidden = observation
    for _ in range(2):
        hidden = layers.Dense(64, activation="relu")(hidden)

    # Collapse the length-1 axis so the head outputs (batch, actions).
    flattened = layers.Flatten()(hidden)
    q_values = layers.Dense(actions, activation="linear")(flattened)
    return Model(observation, q_values, name="mountain_car_player")

In [104]:
# Size the network from the environment instead of repeating literals; for
# this environment that is 2 observation components and 3 discrete actions.
model = build_model(env.observation_space.shape[0], env.action_space.n)

In [105]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "mountain_car_player"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 1, 2)]            0         
                                                                 
 dense_27 (Dense)            (None, 1, 64)             192       
                                                                 
 dense_28 (Dense)            (None, 1, 64)             4160      
                                                                 
 flatten_9 (Flatten)         (None, 64)                0         
                                                                 
 dense_29 (Dense)            (None, 3)                 195       
                                                                 
Total params: 4,547
Trainable params: 4,547
Non-trainable params: 0
_________________________________________________________________


In [106]:
def build_agent(model, actions):
    """Wrap ``model`` in a DQN agent with Boltzmann exploration.

    Args:
        model: the Keras network the agent trains and queries.
        actions: number of discrete actions, passed on as ``nb_actions``.

    Returns:
        An uncompiled ``DQNAgent`` backed by a 50000-entry sequential replay
        memory with a window length of 1.
    """
    agent_config = dict(
        model=model,
        policy=BoltzmannQPolicy(),
        memory=SequentialMemory(limit=50000, window_length=1),
        nb_actions=actions,
        nb_steps_warmup=100,
        # NOTE(review): a value below 1 presumably selects soft target-network
        # updates in keras-rl - confirm against the installed version.
        target_model_update=1e-2,
    )
    return DQNAgent(**agent_config)

In [107]:
# Take the action count from the environment so the agent cannot drift out of
# sync with the action space (Discrete(3) for this environment).
dqn = build_agent(model, env.action_space.n)
dqn.compile(Adam(learning_rate=1e-3), metrics=["mae"])

In [108]:
# These loggers are imported from a `callbacks` module (not from `rl`).
# NOTE(review): only TrainEpisodeLogger is used in the cells shown here.
from callbacks import TrainEpisodeLogger, TrainIntervalLogger, TestLogger
# "./model/" is presumably the directory the logger writes checkpoints to
# (the training log mentions checkpoint weights) - confirm in callbacks.py.
callbacks = [TrainEpisodeLogger("./model/")]

In [109]:
# Train for 100000 environment steps, capping each episode at 1000 steps.
dqn.fit(
    env,
    nb_steps=100000,
    visualize=False,
    verbose=1,
    callbacks=callbacks,
    nb_max_episode_steps=1000,
)

Training for 100000 steps ...
Training for 100000 steps ...
Interval 1 (0 steps performed)


  updates=self.state_updates,


  998/10000 [=>............................] - ETA: 2:52 - reward: -1.1621  1000/100000: episode: 1, duration: 19.566s, episode steps: 1000, steps per second: 51, episode reward: -1160.563, mean reward: -1.161 [-2.204, -0.057], mean action: 1.009 [0.000, 2.000], loss: 0.071211, mae: 3.075225, mean_q: -4.418998
The reward is lower than the best one, checkpoint weights not updated
 2000/10000 [=====>........................] - ETA: 2:36 - reward: -1.1946  2000/100000: episode: 2, duration: 19.960s, episode steps: 1000, steps per second: 50, episode reward: -1228.691, mean reward: -1.229 [-5.744, -0.032], mean action: 1.076 [0.000, 2.000], loss: 0.102267, mae: 9.075486, mean_q: -13.351703
The reward is lower than the best one, checkpoint weights not updated
The reward is lower than the best one, checkpoint weights not updated
The reward is lower than the best one, checkpoint weights not updated
The reward is lower than the best one, checkpoint weights not updated
The reward is lower than 

<keras.callbacks.History at 0x1561cc16eb0>

In [41]:
# Run two evaluation episodes without rendering and keep the result in `scores`.
scores = dqn.test(
    env,
    nb_episodes=2,
    visualize=False,
)

Testing for 100 episodes ...


KeyboardInterrupt: 

In [None]:
# Save the agent's current weights, replacing any existing file at this path.
dqn.save_weights("./model/model.h5f", overwrite=True)

In [110]:
# Load a specific checkpoint back into the agent.
# NOTE(review): the file name appears to embed the episode reward of one
# particular training run, so a fresh run will not produce this exact file;
# point this at the checkpoint actually present under ./model/.
dqn.load_weights("./model/checkpoint_reward_-17.788243832614583.h5f")

In [None]:
# Play one episode with rendering enabled; the return value is bound to `_`
# because it is not needed.
_ = dqn.test(env, nb_episodes=1, visualize=True)