In [1]:
import numpy as np
from agent import *
from maze import *
from neural_network import *
from hrr import *

In [2]:
# Number of training cycles
episodes = 10

# HRR parameters: vector length and whether vectors are normalized
hrr_length = 1024
normalized = True

# How many steps to take before quitting an episode
steps_till_quit = 100

# Maze parameters
size_of_maze = 5
non_obs_task_switch_rate = 12
num_non_obs_tasks = 3
num_obs_tasks = 3
# Goal table; same layout as the commented-out random alternative below:
# num_obs_tasks rows, each holding num_non_obs_tasks maze positions.
goals = [[0,2,1], [1,0,2],[2,1,0]]
signals = ["red", "green", "blue"]

# random goal setting
# goals = np.stack([np.random.choice(range(size_of_maze), num_non_obs_tasks, replace=False) for _ in range(num_obs_tasks)])

# Arguments for neural network
# (`bias` is defined once, next to `weights` below; an earlier `bias = 1` here
# was always overwritten by `bias = 0` before anything read it.)
input_size = hrr_length
output_size = 1
discount = 0.9
alpha = 0.01

# Reward for temporal difference learning
reward_bad = 0
reward_good = 1

# Exploration rate (probability of a random move in the policy)
e_soft = 0.01

# Threshold for non observable task switching
threshold = -0.2

# Print frequency (report progress every p_freq episodes)
p_freq = 1

# Eligibility trace, one entry per HRR component
eligibility = [0] * hrr_length

# Eligibility trace rate
eli_lambda = 0.9

# Initial guess for the non-observable task index. np.random.randint excludes
# the upper bound, so this draws from 0 .. num_non_obs_tasks - 1, i.e. only
# indices that actually exist (the previous `+ 1` could yield num_non_obs_tasks).
atr = np.random.randint(0, num_non_obs_tasks)
# Initial working-memory content
wm = "I"

# Neural network: weight vector built by hrr(), plus a scalar bias
weights = hrr(hrr_length, normalized)
bias = 0

In [3]:
# Build the agent, maze, neural network and HRR long-term memory from the
# values set in the configuration cell.
# NOTE(review): `agent` and `maze` rebind the imported callables of the same
# name to the objects they return, so this cell presumably cannot be re-run
# without re-running the import cell first — confirm and consider renaming.
agent = agent()
maze = maze(size_of_maze, non_obs_task_switch_rate, num_non_obs_tasks, num_obs_tasks, goals)
# `bias` carries whatever the configuration cell last assigned to it.
nn = NeuralNetwork(input_size, output_size, bias, discount, alpha, reward_good, reward_bad)
ltm = LTM("hrrs_1", hrr_length, normalized)

In [4]:
# Pre-encode every (non-observable task, working-memory item, state) combination
# so the long-term memory already holds them before training starts.
# Working-memory items are the idle marker "I" followed by each signal.
for non_obs in range(num_non_obs_tasks):
    for signal in ["I", *signals]:
        for state in range(size_of_maze):
            ltm.encode("state_{}*non_obs_{}*{}".format(state, non_obs, signal))

I*non_obs_0
I*state_0
non_obs_0*state_0
I*non_obs_0*state_0
I*non_obs_0
I*state_1
non_obs_0*state_1
I*non_obs_0*state_1
I*non_obs_0
I*state_2
non_obs_0*state_2
I*non_obs_0*state_2
I*non_obs_0
I*state_3
non_obs_0*state_3
I*non_obs_0*state_3
I*non_obs_0
I*state_4
non_obs_0*state_4
I*non_obs_0*state_4
non_obs_0*red
non_obs_0*state_0
red*state_0
non_obs_0*red*state_0
non_obs_0*red
non_obs_0*state_1
red*state_1
non_obs_0*red*state_1
non_obs_0*red
non_obs_0*state_2
red*state_2
non_obs_0*red*state_2
non_obs_0*red
non_obs_0*state_3
red*state_3
non_obs_0*red*state_3
non_obs_0*red
non_obs_0*state_4
red*state_4
non_obs_0*red*state_4
green*non_obs_0
green*state_0
non_obs_0*state_0
green*non_obs_0*state_0
green*non_obs_0
green*state_1
non_obs_0*state_1
green*non_obs_0*state_1
green*non_obs_0
green*state_2
non_obs_0*state_2
green*non_obs_0*state_2
green*non_obs_0
green*state_3
non_obs_0*state_3
green*non_obs_0*state_3
green*non_obs_0
green*state_4
non_obs_0*state_4
green*non_obs_0*state_4
blue*non_o

In [5]:
# Show what the long-term memory currently holds (the recorded output lists
# each stored name next to its vector).
ltm.print()

<hrr.LTM object at 0x7fd406b33c88>
non_obs_0*red*state_4 [-0.0315872   0.00902552  0.02847763 ...  0.01411746 -0.00390782
  0.0103941 ]
non_obs_1*red*state_2 [-0.02818052 -0.00540804 -0.0016045  ...  0.0010057  -0.01267563
 -0.01347573]
blue*non_obs_1*state_4 [ 0.04980096  0.00086214 -0.03939737 ... -0.0224396   0.03622689
 -0.01051049]
green*state_3 [ 0.02731688  0.01529385 -0.0007393  ...  0.00237507 -0.03657621
 -0.00075835]
non_obs_2*red*state_4 [ 0.02523417 -0.01395139  0.05146162 ...  0.00599825  0.07517403
  0.05021077]
I*non_obs_1*state_1 [-0.00297001  0.00335334  0.01373985 ...  0.04272902  0.03173524
 -0.0159834 ]
red*state_2 [ 0.0308517  -0.00036419  0.03949805 ...  0.02522013 -0.00239037
 -0.00391244]
green*non_obs_0*state_1 [ 0.00361567 -0.04541603  0.00856334 ...  0.029412    0.00441688
  0.03047601]
non_obs_2*state_1 [ 0.01284967  0.01041931 -0.05377809 ...  0.04091686 -0.01280309
  0.02471823]
blue*state_3 [ 0.01383852  0.01576935  0.02404218 ...  0.05100238 -0.01380966

In [6]:
def policy(moves, wms, non_obs, rand_on):
    """Pick the next move and working-memory content by greedy value lookup.

    Every combination of candidate move, working-memory item and
    non-observable task index is encoded through ``ltm`` and scored as
    ``np.dot(weights, encoding) + bias``; the move and working-memory item of
    the highest-scoring combination are returned.

    Parameters
    ----------
    moves : sequence
        Candidate next states.
    wms : sequence
        Candidate working-memory items; needs at least two entries because
        the exploratory branch returns ``wms[1]``.
    non_obs : sequence
        Candidate non-observable task indices; needs at least two entries
        because ``non_obs[1]`` is always the one returned.
    rand_on : int
        ``1`` enables epsilon-soft exploration, any other value disables it.

    Returns
    -------
    tuple
        ``(move, wm, non_ob)``.

    Notes
    -----
    Reads the notebook globals ``weights``, ``bias``, ``ltm`` and ``e_soft``.
    """
    # Exploration: with probability e_soft take a uniformly random move.
    # The random draw happens first so the RNG stream does not depend on rand_on.
    if (np.random.random() < e_soft) and (rand_on == 1):
        return np.random.choice(moves), wms[1], non_obs[1]

    # -inf guarantees the first finite score wins; the first candidates are the
    # fallback so the result is defined even if no score compares greater.
    best_value = -np.inf
    s_move, s_wm = moves[0], wms[0]

    # Score every (move, working memory, non-observable task) combination.
    for move in moves:
        for wm in wms:
            for non_ob in non_obs:
                # The key must describe the candidate being scored; using the
                # globals `current` and `signal` here made every candidate
                # score the same, so the first one always won.
                key = "state_" + str(move) + "*non_obs_" + str(non_ob) + "*" + str(wm)
                value = np.dot(weights, ltm.encode(key)) + bias
                if value > best_value:
                    best_value = value
                    s_move = move
                    s_wm = wm

    # NOTE(review): the non-observable index is not chosen greedily; the second
    # candidate is always returned, as in the exploratory branch — confirm this
    # is intended.
    s_non_ob = non_obs[1]
    return s_move, s_wm, s_non_ob

In [7]:
# Training loop: one maze run per episode, at most steps_till_quit steps each.
for x in range(episodes):

    # Starting state, uniform over 0 .. size_of_maze - 1 (upper bound exclusive).
    # Drawn from np.random, which this notebook imports, instead of the
    # `random` module, which was only reachable through the star imports.
    current = np.random.randint(0, size_of_maze)

    # Signal for the maze run
    signal = np.random.choice(signals)

    # Maze progresses; returns the non-observable task and the goal for this run
    non_obs, goal = maze.step_maze(signals.index(signal))

    for y in range(steps_till_quit):

        # NOTE(review): current_view is computed but never read afterwards.
        current_view = ltm.encode(str(wm) + "*non_obs_" + str(non_obs) + "*state_" + str(current))

        # NOTE(review): the first argument is hard-coded to 4 rather than
        # `current` — looks like a leftover; confirm against agent.get_moves.
        left, right = agent.get_moves(4, size_of_maze)

        # Choose the move and update working memory and the task guess.
        # NOTE(review): `current` is never advanced to `move` and `goal` is
        # never checked, so each episode always runs all steps_till_quit steps.
        move, wm, atr = policy([left, right], [signal, wm], [0, atr], 1)

    # Progress report every p_freq episodes
    if (x+1)%p_freq == 0:
        print("Episode" , x+1, "done")

Episode 1 done
Episode 2 done
Episode 3 done
Episode 4 done
Episode 5 done
Episode 6 done
Episode 7 done
Episode 8 done
Episode 9 done
Episode 10 done


In [8]:
# Run the long-term memory's clean-up hook (defined in hrr.LTM, not shown
# here), then remove the notebook-level objects created during setup.
ltm.clean()
del ltm
del agent
del maze
del nn