In [None]:
import tensorflow as tf
import numpy as np

In [None]:
class Corridor:
    """Seven-state corridor environment with a history-dependent reward.

    States are indexed ``0 .. N_STATES - 1`` and observed as one-hot vectors.
    Every episode starts in ``START_STATE``.  An episode ends when the agent
    reaches state 0 or when ``MAX_STEPS`` steps have been taken.  Reaching
    state 0 pays 1.0 if the agent previously moved from the second-to-last
    state into the last state at least ``REQUIRED_S6_VISITS`` times during
    the episode, and 0.01 otherwise; every other transition pays 0.0.
    """

    N_STATES = 7
    START_STATE = 3
    MAX_STEPS = 20
    REQUIRED_S6_VISITS = 2

    def __init__(self):
        # Initialise the episode bookkeeping up front so that step() works
        # even when the caller has not called reset() explicitly.
        self.reset()

    def reset(self):
        """Start a new episode and return the initial one-hot observation."""
        self.current_state = self.START_STATE
        self.steps = 0
        self.s6_visits = 0
        return self.vectorize(self.current_state)

    def step(self, action):
        """Apply ``action`` (0 = left, 1 = right) and advance one time step.

        Any other action value leaves the position unchanged but still
        consumes a step.

        Returns:
            A tuple ``(observation, reward, done)``.
        """
        last_state = self.N_STATES - 1

        if action == 0:
            # Never move below state 0: a negative index would be encoded
            # by vectorize() as the *last* state via negative indexing.
            if self.current_state > 0:
                self.current_state -= 1
        elif action == 1 and self.current_state != last_state:
            # Count only genuine entries into the last state.
            if self.current_state == last_state - 1:
                self.s6_visits += 1
            self.current_state += 1
        self.steps += 1

        observation = self.vectorize(self.current_state)

        if self.current_state == 0:
            enough_visits = self.s6_visits >= self.REQUIRED_S6_VISITS
            return observation, (1.0 if enough_visits else 0.01), True

        # ``>=`` rather than ``==`` so the cap still applies if the caller
        # keeps stepping past it.
        return observation, 0.0, self.steps >= self.MAX_STEPS

    def vectorize(self, state):
        """Return the one-hot encoding of ``state`` as a length-N_STATES array."""
        vector = np.zeros(self.N_STATES)
        vector[state] = 1
        return vector

In [None]:
class RecurrentHReinforce:
    """REINFORCE-style option selector backed by a recurrent (GRU) policy.

    The policy network maps a sequence of one-hot state observations to a
    softmax distribution over the seven candidate options at every time
    step.  Transitions are buffered per episode, finished episodes are kept
    in ``memory``, and ``update`` fits the network on one stored episode.
    """

    N_STATES = 7

    def __init__(self, new=True):
        # ``new`` is not used by this class; it is kept so that existing
        # callers passing it keep working.
        self.memory = []
        self.states = []
        self.options = []
        self.rewards = []
        self.discount_rate = 0.99
        self.learning_rate = 0.001
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the GRU policy network."""
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.GRU(
            64, input_shape=(None, self.N_STATES), return_sequences=True))
        model.add(tf.keras.layers.Dense(self.N_STATES, activation='softmax'))
        # ``learning_rate`` is the supported keyword; ``lr`` is a deprecated
        # alias that newer Keras releases reject.
        model.compile(loss="categorical_crossentropy",
                      optimizer=tf.keras.optimizers.Adam(
                          learning_rate=self.learning_rate, clipnorm=1.0))
        model.summary()
        return model

    def select_option(self, state_list):
        """Sample an option for the latest state in ``state_list``.

        The option equal to the current state is excluded; the remaining
        probabilities from the network's last time step are renormalised.
        If they sum to zero, an option is drawn uniformly instead.
        """
        current = np.argmax(state_list[-1])
        applicable = np.delete(np.arange(self.N_STATES), current)
        batch = np.reshape(state_list, (1, len(state_list), self.N_STATES))
        # verbose=0 avoids a progress bar being printed for every decision.
        last_step = self.model.predict(batch, verbose=0)[0, -1]
        prob = np.delete(last_step, current)
        total = np.sum(prob)
        if total == 0:
            return np.random.choice(applicable)
        return np.random.choice(applicable, 1, p=prob / total)[0]

    def store_transition(self, state, option, reward):
        """Append one (state, option, reward) triple to the current episode."""
        self.states.append(state)
        self.options.append(option)
        self.rewards.append(reward)

    def store_episode(self):
        """Move the buffered episode into ``memory`` and clear the buffers."""
        self.memory.append((self.states, self.options, self.rewards))
        self.states = []
        self.options = []
        self.rewards = []

    def _discounted_returns(self, rewards):
        """Return the discounted return G_t for every step of ``rewards``."""
        returns = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + self.discount_rate * running
            returns[t] = running
        return returns

    def update(self, episode):
        """Fit the policy for one epoch on the stored episode ``episode``.

        The target at each step is zero everywhere except at the chosen
        option, where it holds the discounted return, so the categorical
        cross-entropy term for that step is the return times the negative
        log-probability of the chosen option.
        """
        states, options, rewards = self.memory[episode]

        T = len(states)
        if T == 0:
            # Nothing to learn from an empty trajectory.
            return

        returns = self._discounted_returns(rewards)

        x = np.reshape(states, (1, T, self.N_STATES))
        y = np.zeros((1, T, self.N_STATES))
        y[0, np.arange(T), options] = returns

        self.model.fit(x, y, epochs=1, verbose=0)

    def batch_update(self, start, end):
        """Run ``update`` on every stored episode in ``range(start, end)``."""
        for episode in range(start, end):
            self.update(episode)

In [None]:
def select_action(state, option):
    """Return the primitive action that moves from ``state`` toward ``option``.

    ``state`` is a one-hot vector and ``option`` a target state index.
    Returns 1 (right) when the current position lies below the target and
    0 (left) otherwise.
    """
    position = np.argmax(state)
    if position < option:
        return 1
    return 0

In [None]:
def train(start, runs):
    """Train a fresh option policy on the corridor for each run.

    Runs ``start`` up to ``runs - 1`` are executed, each with a new
    environment and a new agent, for 10000 episodes.  During the first 1000
    episodes of a run options are drawn uniformly at random (excluding the
    current state); afterwards they come from the agent's policy.  The agent
    is updated after every episode.  The per-episode reward table is saved
    every 1000 episodes and the model is saved at the end of each run.

    Args:
        start: Index of the first run to execute.  When it is 0 a zeroed
            reward table is created; otherwise the table is loaded from disk.
        runs: Exclusive upper bound on the run index, and the number of rows
            in a freshly created reward table.
    """
    n_episodes = 10000
    n_random_episodes = 1000
    checkpoint_every = 1000

    rewards = (
        np.zeros((runs, n_episodes))
        if start == 0
        else np.load(f"./drive/My Drive/corridor/mnemonic_rewards.npy")
    )

    for run in range(start, runs):
        print(f"\nRun {run}")

        env, meta = Corridor(), RecurrentHReinforce()

        for episode in range(n_episodes):
            meta_state = env.reset()
            state_list = [meta_state]
            episode_reward = 0
            done = False

            while not done:
                # Pick the next option (sub-goal state) at the meta level.
                position = np.argmax(meta_state)
                if episode >= n_random_episodes:
                    option = meta.select_option(state_list)
                else:
                    candidates = np.delete(np.arange(7), position)
                    option = np.random.choice(candidates)

                # Follow the option until its target is reached or the
                # episode ends, accumulating the reward collected on the way.
                option_reward = 0
                state = meta_state
                reached = position == option
                while not (done or reached):
                    state, reward, done = env.step(select_action(state, option))
                    option_reward += reward
                    episode_reward += reward
                    reached = np.argmax(state) == option

                meta.store_transition(meta_state, option, option_reward)
                meta_state = state
                state_list.append(meta_state)

            meta.store_episode()
            meta.update(episode)
            rewards[run, episode] = episode_reward

            if (episode + 1) % checkpoint_every == 0:
                np.save(f"./drive/My Drive/corridor/mnemonic_rewards", rewards)

        meta.model.save(f"./drive/My Drive/corridor/mnemonic_{run}_{episode}.h5")

In [None]:
# Execute runs 0 through 9 from scratch.
train(start=0, runs=10)