In [1]:
import numpy as np
from agent import *
from maze import *
from neural_network import *
from hrr import *

In [2]:
# Number of training cycles
episodes = 10

# Hrr parameters
hrr_length = 1024
normalized = True

# How many steps to take before quiting
steps_till_quit = 100

# Maze parameters
size_of_maze = 5
non_obs_task_switch_rate = 12
num_non_obs_tasks = 3
num_obs_tasks = 3
goals = [[0,2,1], [1,0,2],[2,1,0]]
signals = ["red", "green", "blue"]

# random goal setting
# goals = np.stack([np.random.choice(range(size_of_maze), num_non_obs_tasks, replace=False) for _ in range(num_obs_tasks)])

# Arguments for neural network
input_size = hrr_length
output_size = 1
bias = 1
discount = 0.9
alpha = 0.01

# Reward for temporal difference learning
reward_bad = 0
reward_good = 1

# Expolration rate
e_soft = 0.01

# Threshold for non observable task switching
threshold = -0.2

# Print frequency
p_freq = 1

# Eligibility trace
eligibility = [0] * hrr_length

# Eligibility trace rate
eli_lambda = 0.9

atr = np.random.randint(0, num_non_obs_tasks + 1)
wm = "I"

# Neural network
weights = hrr(hrr_length, normalized)
bias = 0

In [3]:
agent = agent()
maze = maze(size_of_maze, non_obs_task_switch_rate, num_non_obs_tasks, num_obs_tasks, goals)
nn = NeuralNetwork(input_size, output_size, bias, discount, alpha, reward_good, reward_bad)
ltm = LTM("hrrs_1", hrr_length, normalized)

In [4]:
# Pre convolve everything
for non_obs in range(num_non_obs_tasks):
    for signal in ["I"] + signals:
        for state in range(size_of_maze):
            ltm.encode("state_" + str(state) + "*non_obs_" + str(non_obs) + "*" + str(signal) + "*rewardTkn")

I*non_obs_0
I*rewardTkn
I*state_0
non_obs_0*rewardTkn
non_obs_0*state_0
rewardTkn*state_0
I*non_obs_0*rewardTkn
I*non_obs_0*state_0
I*rewardTkn*state_0
non_obs_0*rewardTkn*state_0
I*non_obs_0*rewardTkn*state_0
I*non_obs_0
I*rewardTkn
I*state_1
non_obs_0*rewardTkn
non_obs_0*state_1
rewardTkn*state_1
I*non_obs_0*rewardTkn
I*non_obs_0*state_1
I*rewardTkn*state_1
non_obs_0*rewardTkn*state_1
I*non_obs_0*rewardTkn*state_1
I*non_obs_0
I*rewardTkn
I*state_2
non_obs_0*rewardTkn
non_obs_0*state_2
rewardTkn*state_2
I*non_obs_0*rewardTkn
I*non_obs_0*state_2
I*rewardTkn*state_2
non_obs_0*rewardTkn*state_2
I*non_obs_0*rewardTkn*state_2
I*non_obs_0
I*rewardTkn
I*state_3
non_obs_0*rewardTkn
non_obs_0*state_3
rewardTkn*state_3
I*non_obs_0*rewardTkn
I*non_obs_0*state_3
I*rewardTkn*state_3
non_obs_0*rewardTkn*state_3
I*non_obs_0*rewardTkn*state_3
I*non_obs_0
I*rewardTkn
I*state_4
non_obs_0*rewardTkn
non_obs_0*state_4
rewardTkn*state_4
I*non_obs_0*rewardTkn
I*non_obs_0*state_4
I*rewardTkn*state_4
non_obs_

In [5]:
ltm.print()

<hrr.LTM object at 0x7f5f5986aa90>
non_obs_0*red*state_4 [-0.02787226  0.01557953 -0.04249693 ...  0.01413902  0.00586151
 -0.02297537]
I*non_obs_0*rewardTkn*state_0 [ 0.12974126  0.01907249 -0.06287817 ...  0.02720602  0.00144927
  0.00577233]
blue*non_obs_1*rewardTkn*state_1 [-0.01769202  0.0129035   0.02771174 ...  0.00258259 -0.0129815
  0.02491254]
green*state_3 [-0.00748024  0.00306699 -0.00244908 ...  0.01816009  0.01993458
  0.00620681]
red*rewardTkn*state_2 [-0.00929402  0.04830762  0.00115166 ... -0.02860783  0.04217617
 -0.01738662]
blue*rewardTkn*state_1 [ 0.00500625  0.01624354 -0.01755492 ... -0.04497646  0.00965552
 -0.04933164]
non_obs_1*rewardTkn*state_1 [ 0.01753642  0.04107387  0.00621419 ...  0.01802922 -0.02490978
  0.02956483]
green*non_obs_0*state_1 [ 0.00023417  0.02830707  0.0228328  ...  0.01227618  0.05007778
 -0.00992205]
non_obs_1*red*rewardTkn*state_4 [ 0.01483585  0.0346751  -0.05427706 ...  0.00561071 -0.03190413
  0.00016848]
I*non_obs_1*rewardTkn [-0.0

In [6]:
def move_policy(moves, wms, non_obs, rand_on):
    
    val = -9999
    temp = -9999
    
    # Random move
    if((np.random.random() < e_soft) and (rand_on == 1)):
        return np.random.choice(moves), wms[1], non_obs[1]
    
    # Loops through everything possibility
    for move in moves:
        for wm in wms:
            for non_ob in non_obs:
                temp = np.dot(weights, ltm.encode("state_" + str(current) + "*non_obs_" + str(non_ob) + "*" + str(signal))) + bias
                if temp > val:
                    val = temp
                    s_move = move
                    s_wm = wm
                    
    s_non_ob = non_obs[1]
    return s_move, s_wm, s_non_ob

def context_policy(moves, atr):
    return atr[1]

In [7]:
for x in range(episodes):
    
    # Starting state
    current = random.randint(0, size_of_maze - 1)
    
    # Signal for the maze run
    signal = np.random.choice(signals)
    
    # Maze progresses
    non_obs, goal = maze.step_maze(signals.index(signal))
    
    # Reset trace
    eligibility = [0] * hrr_length
    
    for y in range(steps_till_quit):
        
        # Store info about previous state
        previous = current
        previous_state = ltm.encode(str(wm) + "*non_obs_" + str(non_obs) + "*state_" + str(previous))
        previous_value = np.dot(weights, previous_state) + bias
        
        eligibility = [x * eli_lambda for x in eligibility]
        
        left, right = agent.get_moves(4, size_of_maze)
        move, wm, atr = move_policy([left, right], [signal, wm], [0, atr], 1)
        
        # Make the move
        current = move
        current_state = ltm.encode(str(wm) + "*non_obs_" + str(non_obs) + "*state_" + str(current))
        current_value = np.dot(weights, current_state) + bias
        
        # Goal reached
        if (current == goal):
            # Get temporal difference error and update weights of neural network
            goal_hrr = ltm.encode(str(wm) + "*non_obs_" + str(non_obs) + "*state_" + str(current) + "*rewardTkn")
            goal_value = np.dot(weights, goal_hrr) + bias
            
            error = (reward_bad + discount * value - previous_value)
            eligibility = eligibility + previous_state
            weights = np.add(weights, (alpha * error * eligibility))
            
            error = reward_good - value
            eligibility = eligibility + goal_hrr
            weights = np.add(weights, (alpha * error * eligibility))
            
            break
        
        # Weight update for goal not found
        error = (reward_bad + discount * current_value) - previous_value
        eligibility = eligibility + previous_state
        if error < threshold:
                atr = context_policy([move], [0, atr])
                break
        weights = np.add(weights, (alpha * error * eligibility))
        
    if (x+1)%p_freq == 0:
        print("Episode" , x+1, "done")

blue*non_obs_3
blue*state_2
non_obs_3*state_2
blue*non_obs_3*state_2
blue*non_obs_3
blue*state_3
non_obs_3*state_3
blue*non_obs_3*state_3
blue*non_obs_3
blue*state_0
non_obs_3*state_0
blue*non_obs_3*state_0
Episode 1 done
blue*non_obs_3
blue*state_1
non_obs_3*state_1
blue*non_obs_3*state_1
Episode 2 done
green*non_obs_3
green*state_4
non_obs_3*state_4
green*non_obs_3*state_4
green*non_obs_3
green*state_3
non_obs_3*state_3
green*non_obs_3*state_3
Episode 3 done
green*non_obs_3
green*state_0
non_obs_3*state_0
green*non_obs_3*state_0
Episode 4 done
green*non_obs_3
green*state_1
non_obs_3*state_1
green*non_obs_3*state_1
Episode 5 done
Episode 6 done
Episode 7 done
Episode 8 done
non_obs_3*red
non_obs_3*state_2
red*state_2
non_obs_3*red*state_2
non_obs_3*red
non_obs_3*state_3
red*state_3
non_obs_3*red*state_3
Episode 9 done
non_obs_3*red
non_obs_3*state_0
red*state_0
non_obs_3*red*state_0
Episode 10 done


In [8]:
ltm.clean()
del ltm, agent, maze, nn