In [None]:
import time
import gym
import numpy as np
from IPython.display import clear_output
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout, Reshape
from keras.layers import Embedding, BatchNormalization
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import Policy, BoltzmannQPolicy
from rl.memory import SequentialMemory

# Print NumPy arrays in full (never summarised with "...") and round
# displayed floats to 4 decimal places.
np.set_printoptions(threshold=np.inf, precision=4)

In [None]:
# Gym environment id; also embedded in the weights file name used below.
ENV_NAME = "FrozenLake-v1"

In [None]:
class DecayEpsGreedyQPolicy(Policy):
    """Epsilon-greedy action selection with an exponentially decaying epsilon.

    On the n-th call to ``select_action`` (n starting at 0) the exploration
    probability is::

        eps = min_eps + (max_eps - min_eps) * exp(-lamb * n)

    so it starts at ``max_eps`` and approaches ``min_eps`` as more actions
    are selected.

    # Arguments
        max_eps (float): Exploration probability on the first call.
        min_eps (float): Asymptotic exploration probability.
        lamb (float): Decay rate applied per ``select_action`` call.
    """

    def __init__(self, max_eps=.1, min_eps=.05, lamb=0.001):
        super().__init__()
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.lambd = lamb
        # Number of actions selected so far; drives the decay schedule.
        self._steps = 0

    def select_action(self, q_values):
        """Return an action index for the given Q-values.

        # Arguments
            q_values (np.ndarray): 1-D array with one Q-value per action.

        # Returns
            A uniformly random action index with probability ``eps``,
            otherwise the index of the largest Q-value.
        """
        assert q_values.ndim == 1
        nb_actions = q_values.shape[0]
        eps = self.min_eps + (self.max_eps - self.min_eps) * \
            np.exp(-self.lambd * self._steps)
        self._steps += 1
        # Report the schedule every 1000 selections.
        if self._steps % 1000 == 0:
            print("Current eps:", eps)
        if np.random.uniform() < eps:
            # randint's upper bound is exclusive, so this samples from
            # [0, nb_actions - 1] exactly like the deprecated
            # random_integers(0, nb_actions - 1) did.
            action = np.random.randint(0, nb_actions)
        else:
            action = np.argmax(q_values)
        return action

In [None]:
# Grid layout, one string per row. Per the Gym FrozenLake docs the tiles are
# S = start, F = frozen (safe), H = hole, G = goal.
desc = [
    "SFFF",
    "FHFH",
    "FFFH",
    "HFFG",
]
# is_slippery=False is requested so moves are not randomised.
# NOTE(review): both `desc` and `map_name` are passed; presumably the explicit
# `desc` takes precedence inside Gym -- confirm against the FrozenLake source.
env = gym.make(
    ENV_NAME,
    desc=desc,
    map_name="4x4",
    is_slippery=False,
)

In [None]:
# Seed NumPy's global RNG; the exploration policy defined in this notebook
# draws from np.random, so this fixes its random choices.
np.random.seed(123)
# Seed the environment with the same value.
# NOTE(review): env.seed() belongs to the older Gym API -- confirm that the
# installed gym version still provides it.
env.seed(123)
# Number of discrete actions, read from the environment's action space.
nb_actions = env.action_space.n

In [None]:
def get_keras_model(action_space_shape, nb_states=16):
    """Build the Q-network: an embedding table with one row per state.

    The model is an ``Embedding`` layer followed by a ``Reshape``, so the
    row looked up for a state index is returned directly as that state's
    vector of Q-values (one entry per action).

    # Arguments
        action_space_shape (int): Number of discrete actions, i.e. the
            width of each embedding row and of the model output.
        nb_states (int): Number of discrete states (rows in the embedding
            table). Defaults to 16, the value previously hard-coded.

    # Returns
        The uncompiled ``Sequential`` model.
    """
    nb_actions = int(action_space_shape)
    model = Sequential()
    # input_length=1: each observation is a single state index.
    model.add(Embedding(nb_states, nb_actions, input_length=1))
    # Drop the length-1 sequence axis so the output is (nb_actions,).
    model.add(Reshape((nb_actions,)))
    # summary() prints by itself and returns None, so it is not wrapped in
    # print() (which would emit a stray "None" line).
    model.summary()
    return model

In [None]:
# Build the Q-network, passing the number of actions read from the environment.
model = get_keras_model(nb_actions)

In [None]:
# Replay memory holding up to 10000 entries, with a window of one observation.
memory = SequentialMemory(limit=10000, window_length=1)

# Exploration probability decays from 0.9 towards 0 with rate 1e-4 per
# selected action.
policy = DecayEpsGreedyQPolicy(max_eps=0.9, min_eps=0, lamb=1e-4)

# NOTE(review): target_model_update=1e-2 is presumably treated by keras-rl as
# a soft-update rate because it is below 1 -- confirm against DQNAgent docs.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    nb_steps_warmup=500,
    target_model_update=1e-2,
    enable_double_dqn=False,
    batch_size=512,
)
dqn.compile(Adam())

In [None]:
# Best-effort warm start: reuse previously saved weights when they can be
# loaded; otherwise print the reason and continue with the fresh model.
try:
    dqn.load_weights(f"dqn_{ENV_NAME}_weights.h5f")
except Exception as err:
    print(err)

In [None]:
# Train the agent on the environment for 2e4 steps without rendering.
# Note that nb_steps is passed as a float literal here.
dqn.fit(env, nb_steps=2e4, visualize=False, verbose=1, log_interval=10000)

In [None]:
# After training is done, save the final weights, overwriting any existing
# file with the same name.
dqn.save_weights(f"dqn_{ENV_NAME}_weights.h5f", overwrite=True)

In [None]:
# Finally, evaluate our algorithm for 20 episodes (no rendering).
dqn.test(env, nb_episodes=20, visualize=False)

In [None]:
# Play one episode with the trained agent, redrawing the board after every
# move with a one-second pause so the trajectory can be followed.
state = env.reset()
try:
    env.render()
    done = False
    while not done:
        # wait=True defers clearing until new output arrives, avoiding flicker.
        clear_output(wait=True)
        # Ask the agent for its action in the current state.
        # NOTE(review): this presumably uses the agent's test policy because
        # dqn.test() ran just before -- confirm if cells are run out of order.
        action = dqn.forward(state)
        # Apply the action and observe the resulting state.
        state, reward, done, info = env.step(action)
        env.render()

        time.sleep(1)
finally:
    # Release the environment even if the loop is interrupted or raises.
    env.close()
print("Game over")