In [None]:
import tensorflow as tf
import numpy as np

In [None]:
class Grid:
    """Pick-up-and-deliver grid world on a 6 x 5 board.

    Rows 1-5 are open floor. Row 0 is a wall (-1 in the observation) except
    for the exit cell at column 2, which can only be entered by moving up
    from cell [1, 2].

    The agent starts at [1, 0]. Items lie on three cells; stepping onto one
    with an empty inventory picks it up (at most one item is carried), and
    stepping onto the drop-off cell [1, 0] while carrying delivers it.
    Stepping onto the exit ends the episode with reward 1.0 if all three
    items were delivered and 0.0 otherwise. The episode also ends, with
    reward 0.0, once the step limit is reached.

    Observations are 6 x 5 float arrays holding 1 at the agent's cell, -1 on
    the wall cells of row 0 and 0 elsewhere; items are not shown.
    """

    ITEM_CELLS = frozenset({(1, 4), (5, 0), (5, 4)})
    DROP_OFF = (1, 0)
    EXIT = (0, 2)
    MAX_STEPS = 60

    def __init__(self):
        # Initialise all episode state so step() is usable right away.
        self.reset()

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.items = set(self.ITEM_CELLS)
        self.view = np.zeros((6, 5))
        self.current_state = [1, 0]
        self.inventory = []
        self.delivered = set()
        self.steps = 0
        return self.vectorize(self.current_state)

    def step(self, action):
        """Apply one primitive move and return (observation, reward, done).

        Actions: 0 = right, 1 = up, 2 = left, 3 = down. Any other value
        leaves the agent in place but still consumes a step.
        """
        if action == 0:
            # Right
            self.current_state[1] = min(self.current_state[1] + 1, 4)
        elif action == 1:
            # Up: row 0 can only be entered from the cell below the exit.
            if self.current_state == [1, 2]:
                self.current_state[0] -= 1
            else:
                self.current_state[0] = max(self.current_state[0] - 1, 1)
        elif action == 2:
            # Left
            self.current_state[1] = max(self.current_state[1] - 1, 0)
        elif action == 3:
            # Down
            self.current_state[0] = min(self.current_state[0] + 1, 5)
        self.steps += 1

        position = tuple(self.current_state)

        # Pick up: only one item may be carried at a time.
        if position in self.items and not self.inventory:
            self.inventory.append(position)
            self.items.remove(position)

        # Drop off: the delivered item is identified by the cell it came from.
        if position == self.DROP_OFF and self.inventory:
            self.delivered.add(self.inventory.pop())

        observation = self.vectorize(self.current_state)

        if position == self.EXIT:
            reward = 1.0 if self.delivered == self.ITEM_CELLS else 0.0
            return observation, reward, True

        if self.steps >= self.MAX_STEPS:
            return observation, 0.0, True

        return observation, 0.0, False

    def vectorize(self, state):
        """Return a fresh 6 x 5 observation with the agent drawn at `state`.

        `state` is a [row, col] pair. The internal buffer `self.view` is
        refreshed as well, but a copy is returned so that observations
        collected over an episode are not overwritten by later steps.
        """
        self.view[0] = [-1, -1, 0, -1, -1]
        self.view[1:] = 0
        self.view[state[0]][state[1]] = 1
        return self.view.copy()

In [None]:
class Recurrent_hREINFORCE:
    """REINFORCE meta-controller with a recurrent (GRU) policy over 5 options.

    The policy consumes the sequence of flattened 6 x 5 observations seen so
    far in the episode (30 features per step) and emits, for every step, a
    softmax distribution over the 5 options. Transitions are buffered per
    episode and whole episodes are kept in `self.memory`, indexed by the
    order in which `store_episode` was called.
    """

    def __init__(self):
        self.memory = []
        self.states = []
        self.options = []
        self.rewards = []
        self.discount_rate = 0.99
        self.learning_rate = 0.001
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the GRU -> softmax policy network."""
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.GRU(64, input_shape=(None, 30), return_sequences=True))
        model.add(tf.keras.layers.Dense(5, activation='softmax'))
        # `learning_rate` is the supported keyword; the `lr` alias is
        # deprecated and rejected by recent Keras releases.
        model.compile(loss="categorical_crossentropy",
                      optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate,
                                                         clipnorm=1.0))
        model.summary()
        return model

    def select_option(self, state_list):
        """Sample the next option given the observations seen so far.

        The option whose target is the agent's current cell is excluded and
        the remaining probabilities are renormalised. If the remaining
        probability mass is zero the choice is uniform.
        """
        current = loc_to_option(view_to_loc(state_list[-1]))
        sequence = np.reshape(state_list, (1, len(state_list), 30))
        # Only the distribution emitted for the latest step is used.
        prob = np.array(self.model.predict(sequence)[0, -1])
        applicable = np.arange(5)
        if current is not None:
            # Choosing the option we are already standing on would be a no-op.
            applicable = np.delete(applicable, current)
            prob = np.delete(prob, current)
        total = np.sum(prob)
        if total == 0:
            return np.random.choice(applicable)
        prob /= total
        return np.random.choice(applicable, 1, p=prob)[0]

    def store_transition(self, state, option, reward):
        """Buffer one (state, option, reward) step of the current episode."""
        self.states.append(state)
        self.options.append(option)
        self.rewards.append(reward)

    def store_episode(self):
        """Move the buffered transitions into memory and start a new buffer."""
        self.memory.append((self.states, self.options, self.rewards))
        self.states = []
        self.options = []
        self.rewards = []

    def update(self, episode):
        """Run one policy-gradient step on the stored episode `episode`.

        The target for each step is zero everywhere except at the chosen
        option, where it holds the discounted return from that step onward.
        Episodes without any transition are skipped.
        """
        states, options, rewards = self.memory[episode]

        T = len(states)
        if T == 0:
            # Nothing to learn from; indexing below would fail on empty data.
            return

        # Discounted return-to-go, accumulated from the last step backwards.
        returns = np.zeros(T)
        running = 0.0
        for t in reversed(range(T)):
            running = rewards[t] + self.discount_rate * running
            returns[t] = running

        x = np.reshape(states, (1, T, 30))
        y = np.zeros((1, T, 5))
        y[0, np.arange(T), np.asarray(options)] = returns

        self.model.fit(x, y, epochs=1, verbose=0)

    def batch_update(self, start, end):
        """Call `update` on every stored episode with index in [start, end)."""
        for episode in range(start, end):
            self.update(episode)

In [None]:
def view_to_loc(view):
    """Return [row, col] of the first cell of `view` (row-major) equal to 1."""
    matches = np.argwhere(view == 1)
    first = matches[0]
    return [first[0], first[1]]

def option_to_loc(option):
    """Map an option index (0-4) to its target cell as a new [row, col] list.

    Returns None for any other value.
    """
    targets = ((1, 0), (1, 4), (5, 0), (5, 4), (0, 2))
    for index, (row, col) in enumerate(targets):
        if option == index:
            return [row, col]
    return None

def loc_to_option(loc):
    """Map a [row, col] list to the index of the option targeting that cell.

    Returns None when `loc` is not the target of any option.
    """
    targets = [[1, 0], [1, 4], [5, 0], [5, 4], [0, 2]]
    for option, target in enumerate(targets):
        if loc == target:
            return option
    return None

def select_action(view, option):
    """Return the primitive action that moves the agent toward `option`'s cell.

    Actions: 0 = right, 1 = up, 2 = left, 3 = down. Moves are tried in the
    order down, right, up, left. Returns None when the agent already stands
    on the target cell.

    Row 0 can only be entered from the cell directly below the exit, so the
    final step up into row 0 is postponed until the column matches.
    Otherwise an agent approaching the exit from a column to its right would
    keep issuing a blocked "up" move and never arrive.
    """
    row, col = view_to_loc(view)
    target_row, target_col = option_to_loc(option)

    if row < target_row:
        return 3
    if col < target_col:
        return 0
    if row > target_row:
        # Moving up within rows 1-5 is always possible; stepping into row 0
        # is only possible once the agent is in the target column.
        if row - 1 > 0 or col == target_col:
            return 1
    if col > target_col:
        return 2
    return None

In [None]:
def train(start, runs, episodes=20000, save_dir="./drive/My Drive/grid"):
    """Train one fresh agent per run and log the reward of every episode.

    Args:
        start: index of the first run to execute. When 0, a new reward log of
            shape (runs, episodes) is created; otherwise the log previously
            saved in `save_dir` is loaded and runs `start..runs-1` are
            (re)computed into it. The loaded log must be large enough for
            `runs` and `episodes`.
        runs: total number of runs; runs `start` to `runs - 1` are executed.
        episodes: number of episodes per run.
        save_dir: directory receiving `mnemonic_rewards.npy` (rewritten every
            1000 episodes) and one `mnemonic_<run>_<last episode>.h5` model
            file per finished run.
    """
    # np.save appends the ".npy" suffix itself; np.load needs it spelled out.
    rewards_path = save_dir + "/mnemonic_rewards"

    if start == 0:
        rewards = np.zeros((runs, episodes))
    else:
        rewards = np.load(rewards_path + ".npy")

    for run in range(start, runs):
        print("\nRun " + str(run))

        env = Grid()
        meta = Recurrent_hREINFORCE()

        for episode in range(episodes):
            done = False
            # Snapshot every observation: the environment may hand out the
            # same buffer on each call, which would make all stored states
            # of an episode identical to the last one.
            meta_state = np.copy(env.reset())
            episode_reward = 0

            state_list = [meta_state]

            while not done:
                option = meta.select_option(state_list)
                target = option_to_loc(option)
                reached = view_to_loc(meta_state) == target

                option_reward = 0

                # Follow the option's primitive policy until its target cell
                # is reached or the episode terminates.
                state = meta_state
                while not done and not reached:
                    action = select_action(state, option)
                    next_state, reward, done = env.step(action)
                    state = np.copy(next_state)

                    option_reward += reward
                    episode_reward += reward

                    reached = view_to_loc(state) == target

                # The transition is credited to the state in which the option
                # was chosen, with the reward summed over its execution.
                meta.store_transition(meta_state, option, option_reward)
                meta_state = state
                state_list.append(meta_state)

            meta.store_episode()
            # `meta` is new for every run, so its memory index equals `episode`.
            meta.update(episode)

            rewards[run, episode] = episode_reward

            if episode % 1000 == 999:
                np.save(rewards_path, rewards)

        meta.model.save(f"{save_dir}/mnemonic_{run}_{episodes - 1}.h5")

In [None]:
# Execute runs 0-9 from scratch (start == 0 creates a new, zero-filled reward log).
train(0, 10)


Run 0
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, None, 64)          18240     
_________________________________________________________________
dense (Dense)                (None, None, 5)           325       
Total params: 18,565
Trainable params: 18,565
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Run 1
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, None, 64)          18240     
_________________________________________________________________
dense_1 (Dense)              (None, Non