In [None]:
import tensorflow as tf
import numpy as np
import random

In [None]:
class Grid:
    """6x5 grid world in which an agent collects three items and delivers them.

    Coordinates are ``[row, column]``. Row 0 is a wall except for the exit
    cell (0, 2), which can only be entered by moving up from (1, 2); rows 1-5
    are open. The agent starts at the depot (1, 0) and carries at most one
    item. Stepping onto an item cell with an empty inventory picks the item
    up; stepping onto the depot while carrying an item delivers it.

    Entering the exit ends the episode with reward 1.0 if all three items
    were delivered and 0.0 otherwise. The episode also ends, with reward 0.0,
    once ``MAX_STEPS`` steps have been taken.

    Actions: 0 = right, 1 = up, 2 = left, 3 = down.
    """

    START = (1, 0)                               # depot / start cell
    EXIT = (0, 2)                                # terminal cell in the wall row
    ITEM_LOCATIONS = ((1, 4), (5, 0), (5, 4))    # item cells at episode start
    MAX_STEPS = 60                               # step budget per episode

    def __init__(self):
        # Initialise the episode state so step() is usable before reset().
        self.reset()

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.items = set(self.ITEM_LOCATIONS)
        self.view = np.zeros((6, 5))
        self.current_state = list(self.START)
        self.inventory = []
        self.delivered = set()
        self.steps = 0
        return self.vectorize(self.current_state)

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done)``.

        Any value other than 0-3 leaves the agent in place but still
        consumes one step.
        """
        if action == 0:      # Right
            self.current_state[1] = min(self.current_state[1] + 1, 4)
        elif action == 1:    # Up
            if self.current_state == [1, 2]:
                # Only the cell below the exit may enter row 0.
                self.current_state[0] -= 1
            else:
                self.current_state[0] = max(self.current_state[0] - 1, 1)
        elif action == 2:    # Left
            self.current_state[1] = max(self.current_state[1] - 1, 0)
        elif action == 3:    # Down
            self.current_state[0] = min(self.current_state[0] + 1, 5)
        self.steps += 1

        position = tuple(self.current_state)

        # Pick up: only when the agent is not already carrying something.
        if position in self.items and not self.inventory:
            self.inventory.append(position)
            self.items.remove(position)

        # Drop off: carried item is delivered at the depot.
        if position == self.START and self.inventory:
            self.delivered.add(self.inventory.pop())

        observation = self.vectorize(self.current_state)

        if position == self.EXIT:
            reward = 1.0 if self.delivered == set(self.ITEM_LOCATIONS) else 0.0
            return observation, reward, True

        # ">=" keeps the episode terminated if a caller steps past the limit.
        return observation, 0.0, self.steps >= self.MAX_STEPS

    def vectorize(self, state):
        """Return a fresh 6x5 array: -1 for walls, 1 at the agent, 0 elsewhere.

        ``state`` is accepted for interface compatibility; the agent position
        is read from ``self.current_state``. A copy of the internal buffer is
        returned so that observations held by the caller (for example in a
        replay memory) are not overwritten by later steps.
        """
        self.view[0] = [-1, -1, 0, -1, -1]
        self.view[1:] = 0
        self.view[self.current_state[0]][self.current_state[1]] = 1
        return self.view.copy()

In [None]:
class hDQN:
    """Meta-controller: a DQN that chooses which of five option targets to visit.

    The network maps a flattened 6x5 grid view (30 inputs) to one Q-value per
    option. A second, slowly tracking target network supplies the bootstrap
    values used in ``replay``.
    """

    N_OPTIONS = 5              # number of network outputs / selectable options
    STATE_SIZE = 30            # flattened 6x5 view
    MEMORY_CAPACITY = 100000   # maximum number of stored transitions

    def __init__(self):
        self.memory = []
        self.epsilon = 1.0           # probability of picking a random option
        self.epsilon_decay = 0.9997  # decay factor; applied by the training loop
        self.batch_size = 64
        self.discount_rate = 0.99
        self.learning_rate = 0.001
        self.tau = 0.001             # soft-update rate of the target network
        self.model = self.build_model()
        self.compile_model(self.model)
        self.target_model = self.build_model()
        self.target_model.set_weights(self.model.get_weights())

    def build_model(self):
        """Build the (uncompiled) 30 -> 16 -> 32 -> 5 fully connected network."""
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Dense(16, input_shape=(self.STATE_SIZE, ), activation='relu'))
        model.add(tf.keras.layers.Dense(32, activation='relu'))
        model.add(tf.keras.layers.Dense(self.N_OPTIONS, activation='linear'))
        return model

    def compile_model(self, model):
        """Compile ``model`` with Huber loss and norm-clipped RMSprop, then print its summary."""
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=self.learning_rate, clipnorm=1.0)
        model.compile(loss="huber_loss", optimizer=optimizer)
        model.summary()

    def select_option(self, state):
        """Pick an option epsilon-greedily, never the one for the current cell.

        ``state`` is a 6x5 grid view. If the agent is not standing on any
        option target, all options are eligible.
        """
        current = loc_to_option(view_to_loc(state))
        if np.random.rand() < self.epsilon:
            applicable = np.arange(self.N_OPTIONS)
            if current is not None:
                applicable = np.delete(applicable, current)
            return np.random.choice(applicable)
        state = np.reshape(state, (1, self.STATE_SIZE))
        pred = self.model.predict(state, verbose=0)[0]
        if current is not None:
            # Mask the option that would keep the agent where it already is.
            pred[current] = -np.inf
        return np.argmax(pred)

    def store(self, state, option, reward, next_state, done):
        """Append a transition to the replay memory, dropping the oldest when full.

        The two states are copied so that the memory does not share storage
        with arrays the caller may later modify in place.
        """
        self.memory.append((np.copy(state), option, reward, np.copy(next_state), done))
        if len(self.memory) > self.MEMORY_CAPACITY:
            del self.memory[:-self.MEMORY_CAPACITY]

    def replay(self):
        """Fit the online network on one random mini-batch and soft-update the target.

        Does nothing while fewer than ``batch_size`` transitions are stored.
        """
        if len(self.memory) < self.batch_size:
            return

        # Each transition is (state, option, reward, next_state, done).
        batch = random.sample(self.memory, self.batch_size)

        x = [transition[0] for transition in batch]
        x = np.reshape(x, (self.batch_size, self.STATE_SIZE))
        y = self.model.predict(x, verbose=0)
        next_x = [transition[3] for transition in batch]
        next_x = np.reshape(next_x, (self.batch_size, self.STATE_SIZE))
        next_y = self.target_model.predict(next_x, verbose=0)

        # Only the output of the option actually taken gets a new target; the
        # other outputs keep the network's own prediction (zero error).
        for i, transition in enumerate(batch):
            option = transition[1]
            reward = transition[2]
            done = transition[4]
            if done:
                y[i, option] = reward
            else:
                # NOTE(review): the max runs over all outputs, including the
                # option select_option() would mask in next_state - confirm
                # this is intended.
                y[i, option] = reward + self.discount_rate * np.amax(next_y[i])

        self.model.fit(x, y, epochs=1, verbose=0)
        self.update_target_model()

    def update_target_model(self):
        """Move the target weights a fraction ``tau`` towards the online weights."""
        model_weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        blended = [
            target + self.tau * (online - target)
            for online, target in zip(model_weights, target_weights)
        ]
        self.target_model.set_weights(blended)

In [None]:
def view_to_loc(view):
    """Return ``[row, column]`` of the first cell of ``view`` that equals 1.

    "First" is in row-major order. Raises IndexError when no cell equals 1.
    """
    matches = np.where(view == 1)
    row_hits = matches[0]
    col_hits = matches[1]
    return [row_hits[0], col_hits[0]]

def option_to_loc(option):
    """Translate an option index into its target cell.

    Options 0-4 map to [1, 0], [1, 4], [5, 0], [5, 4] and [0, 2]. A new
    ``[row, column]`` list is returned on every call; any other value
    yields None.
    """
    targets = ((1, 0), (1, 4), (5, 0), (5, 4), (0, 2))
    for index, (row, col) in enumerate(targets):
        if option == index:
            return [row, col]
    return None

def loc_to_option(loc):
    """Translate a ``[row, column]`` list into the option index that targets it.

    [1, 0], [1, 4], [5, 0], [5, 4] and [0, 2] map to 0-4; any other value
    yields None.
    """
    targets = ([1, 0], [1, 4], [5, 0], [5, 4], [0, 2])
    for index, target in enumerate(targets):
        if loc == target:
            return index
    return None

def select_action(view, option):
    """Return the next primitive action that moves the agent towards ``option``.

    ``view`` is the grid observation (1 at the agent, -1 on wall cells) and
    ``option`` an index accepted by ``option_to_loc``. Action codes are
    0 = right, 1 = up, 2 = left, 3 = down. Returns None when the agent is
    already on the target cell.

    Moves are tried in the order down, right, up, left. "Up" is skipped when
    the cell above the agent is a wall: otherwise an agent heading for the
    exit (0, 2) from a column to the right of it would keep pushing against
    the wall at row 1 and never take the "left" moves that line it up with
    the exit.
    """
    row, col = view_to_loc(view)
    target_row, target_col = option_to_loc(option)
    if row < target_row:
        return 3
    if col < target_col:
        return 0
    if row > target_row and view[row - 1][col] != -1:
        return 1
    if col > target_col:
        return 2
    return None

In [None]:
def train(start, runs, episodes=20000, save_dir="./drive/My Drive/grid"):
    """Train one fresh h-DQN meta-controller per run and log the results.

    Parameters
    ----------
    start : int
        Index of the first run to execute. With 0 the result arrays start
        empty; otherwise previously saved arrays are loaded from ``save_dir``
        and training continues from run ``start``.
    runs : int
        Total number of runs; runs ``start`` .. ``runs - 1`` are executed.
    episodes : int, optional
        Episodes per run (default 20000).
    save_dir : str, optional
        Directory for the ``hdqn_rewards.npy`` / ``hdqn_steps.npy`` arrays
        and the per-run ``.h5`` model files.

    For every episode the episode reward and the cumulative number of options
    chosen so far in the run are recorded. Arrays are written every 100
    episodes and after the last episode of a run.
    """
    rewards_file = f"{save_dir}/hdqn_rewards"
    steps_file = f"{save_dir}/hdqn_steps"

    if start == 0:
        rewards = np.zeros((runs, episodes))
        terminal_steps = np.zeros((runs, episodes))
    else:
        rewards = np.load(rewards_file + ".npy")
        terminal_steps = np.load(steps_file + ".npy")

    last_episode = episodes - 1

    for run in range(start, runs):
        print(f"\nRun {run}")

        env = Grid()
        meta = hDQN()
        steps = 0  # options chosen so far in this run

        for episode in range(episodes):
            done = False
            # Snapshot the observation: the environment may hand out a buffer
            # it keeps mutating, which would silently overwrite meta_state
            # while the option is being executed.
            meta_state = np.copy(env.reset())
            episode_reward = 0

            while not done:
                option = meta.select_option(meta_state)
                target = option_to_loc(option)
                reached = view_to_loc(meta_state) == target

                steps += 1
                option_reward = 0

                # Follow the option until its target is reached or the
                # episode ends.
                state = meta_state
                while not done and not reached:
                    action = select_action(state, option)
                    next_state, reward, done = env.step(action)

                    option_reward += reward
                    episode_reward += reward

                    reached = view_to_loc(next_state) == target
                    state = np.copy(next_state)

                meta.store(meta_state, option, option_reward, state, done)
                meta_state = state

                # Learn only once the replay memory holds 2000 transitions;
                # epsilon decays per update down to a floor of 0.01.
                if len(meta.memory) >= 2000:
                    meta.replay()
                    if meta.epsilon > 0.01:
                        meta.epsilon *= meta.epsilon_decay

            rewards[run, episode] = episode_reward
            terminal_steps[run, episode] = steps

            # Periodic checkpoint, plus one after the final episode so a
            # trailing partial block of episodes is not lost.
            if episode % 100 == 99 or episode == last_episode:
                np.save(rewards_file, rewards)
                np.save(steps_file, terminal_steps)

        meta.model.save(f"{save_dir}/hdqn_{run}_{last_episode}.h5")
        meta.target_model.save(f"{save_dir}/hdqn_target_{run}_{last_episode}.h5")

In [None]:
# Run indices 0-9, starting from empty result arrays.
train(start=0, runs=10)


Run 0
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                496       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 165       
Total params: 1,205
Trainable params: 1,205
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Run 1
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 16)  