In [None]:
import numpy as np
import random
import pylab
from agent import *
from maze import *
from neural_network import *
from hrr import *
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration. Every name below is a notebook-level global that
# later cells read directly (none of them are passed as function arguments).
# ---------------------------------------------------------------------------

# Number of training cycles (episodes)
episodes = 5000

# HRR parameters: vector length and the `normalized` flag handed to hrr() and LTM
hrr_length = 2048
normalized = True

# Maximum number of steps per episode before quitting
steps_till_quit = 30

# Maze parameters
size_of_maze = 10
non_obs_task_switch_rate = 50
num_non_obs_tasks = 1
num_obs_tasks = 1
# Multi-task configuration, currently disabled in favour of a single goal/signal:
# goals = [[0,2,1], [1,0,2],[2,1,0]]
# signals = ["red", "green", "blue"]
goals = [5]
signals = ["red"]

# Random goal setting (disabled)
# goals = np.stack([np.random.choice(range(size_of_maze), num_non_obs_tasks, replace=False) for _ in range(num_obs_tasks)])

# Arguments for the neural network: one input per HRR component, a single value output
input_size = hrr_length
output_size = 1
discount = 0.5
alpha = 0.1

# Rewards for temporal difference learning
reward_bad = 0
reward_good = 1

# Exploration rate: probability that move_policy picks a random action
e_soft = 0.02

# Threshold for non-observable task switching: a TD error below this value
# triggers a context switch in the training loop
threshold = -0.2

# Print frequency (progress is reported every `p_freq` episodes)
p_freq = 100

# Eligibility trace, one entry per HRR component
eligibility = np.zeros(hrr_length)

# Eligibility trace decay rate, applied once per step
eli_lambda = 0.5

# Initial context hypothesis (`atr`) and working-memory content (`wm`);
# both are reset at the start of every episode by the training loop.
# atr = np.random.randint(0, num_non_obs_tasks + 1)
atr = 0
wm = "I"

# Neural network: the value estimate used throughout the notebook is
# np.dot(weights, state_hrr) + bias. `bias` is only ever read, never updated.
weights = hrr(hrr_length, normalized)
bias = 0

In [None]:
# Build the simulation objects used by the rest of the notebook.
#
# `agent` and `maze` are bound to *instances* under the same names as the
# callables brought in by `from agent import *` / `from maze import *`.
# Calling `agent()` / `maze(...)` through those bare names only works the first
# time: on a second run of this cell the names already refer to instances and
# the call raises TypeError. Constructing through the module handles keeps the
# cell re-runnable while leaving the bound names unchanged for other cells.
# NOTE(review): assumes the `agent` and `maze` callables are defined in the
# modules of the same name — confirm against agent.py / maze.py.
import agent as _agent_module
import maze as _maze_module

agent = _agent_module.agent()
maze = _maze_module.maze(size_of_maze, non_obs_task_switch_rate, num_non_obs_tasks, num_obs_tasks, goals)

# `nn` is created here but the visible cells only reference it again when deleting it.
nn = NeuralNetwork(input_size, output_size, bias, discount, alpha, reward_good, reward_bad)

# Long-term memory that maps string descriptions to HRR vectors via ltm.encode().
ltm = LTM("hrrs_1_" + str(hrr_length), hrr_length, normalized)

In [None]:
# Show the long-term memory before training.
# NOTE(review): presumably prints the stored HRR entries — confirm in hrr.LTM.print.
ltm.print()

In [None]:
def move_policy(moves, wms, non_obs, rand_on):
    """Choose the next position, working-memory content and context.

    With probability ``e_soft`` (and only when ``rand_on == 1``) a random
    candidate is returned; otherwise every (move, working-memory) pair is
    scored with ``np.dot(weights, hrr) + bias`` and the best one is returned.

    External signal names ("red", "blue", "green") are always stored in
    working memory in their internalised form (``<signal> + "In"``). This
    holds for the exploratory branch as well, so both branches hand back the
    same vocabulary of working-memory tokens.

    Args:
        moves: candidate next positions.
        wms: candidate working-memory contents (current content and/or signal).
        non_obs: candidate context indices; the greedy branch uses only
            ``non_obs[0]``, the exploratory branch draws one at random.
        rand_on: pass 1 to allow exploration, anything else to force greedy.

    Returns:
        Tuple ``(move, wm, non_obs)`` describing the selected action.

    Raises:
        ValueError: if the greedy search finds no candidate to select (empty
            ``moves``/``wms`` or every value estimate is NaN).
    """
    external_signals = ("red", "blue", "green")

    def internalize(item):
        # An external signal placed into working memory becomes "<signal>In".
        return item + "In" if item in external_signals else item

    # Random move. The uniform draw happens on every call, before rand_on is
    # checked, so the random stream does not depend on rand_on.
    if((np.random.random() < e_soft) and (rand_on == 1)):
        return np.random.choice(moves), internalize(random.choice(wms)), random.choice(non_obs)

    # Greedy search over every (move, working memory) combination.
    best_value = -np.inf
    s_move = None
    s_wm = None
    for move in moves:
        for wm in wms:
            wm = internalize(wm)
            # NOTE(review): the terms are joined in a different order than in the
            # training loop (wm*non_obs*state); presumably ltm.encode is
            # order-insensitive — confirm.
            candidate = np.dot(weights, ltm.encode("state_" + str(move) + "*non_obs_" + str(non_obs[0]) + "*" + str(wm))) + bias
            # Strict '>' keeps the first candidate on ties.
            if candidate > best_value:
                best_value = candidate
                s_move = move
                s_wm = wm

    if s_move is None:
        raise ValueError("move_policy found no selectable (move, wm) candidate")

    s_non_ob = non_obs[0]
    return s_move, s_wm, s_non_ob

def context_policy(atr):
    """Return the context index that follows `atr`, wrapping around after the last task."""
    following = atr + 1
    return following % num_non_obs_tasks

In [None]:
# Training loop: one maze run per episode. The linear value function
# (np.dot(weights, state_hrr) + bias) is updated online from the TD error,
# using an eligibility trace accumulated over the HRRs of visited states.
for x in range(episodes):
    
    # Starting state: a uniformly random position in the maze
    current = random.randint(0, size_of_maze - 1)
    
    # Signal for the maze run; it is replaced by "I" at the end of the first step
    signal = np.random.choice(signals)
    
    # The maze itself is not stepped here (the call below is disabled): the
    # context is pinned to 0 and the goal to the first entry of `goals`.
#     non_obs, goal = maze.step_maze(signals.index(signal))
    non_obs = 0
    atr = 0
    goal = goals[0]
    
    # Reset trace
    eligibility *= 0.0
    
    # Working memory starts every episode holding the "I" token
    wm = "I"
    
    for y in range(steps_till_quit):
        
        # Store info about previous state; its value is estimated with the
        # weights as they are before this step's update
        previous = current
        previous_state = ltm.encode(str(wm) + "*non_obs_" + str(atr) + "*state_" + str(previous))
        previous_value = np.dot(weights, previous_state) + bias
        
        # Decay the trace once per step
        eligibility *= eli_lambda
        
        # Candidates: the two moves returned by agent.get_moves, and either the
        # current working-memory content or the signal. The final argument 1
        # enables exploration inside move_policy.
        left, right = agent.get_moves(previous, size_of_maze)
        move, wm, atr = move_policy([left, right], list(set([wm, signal])), [atr], 1)
        
        # Make the move
        current = move
        current_state = ltm.encode(str(wm) + "*non_obs_" + str(atr) + "*state_" + str(current))
        current_value = np.dot(weights, current_state) + bias
        
        # Goal reached
        if (current == goal):
            # The goal state is encoded with an additional "rewardTkn" term
            goal_hrr = ltm.encode(str(wm) + "*non_obs_" + str(atr) + "*state_" + str(current) + "*rewardTkn")
            goal_value = np.dot(weights, goal_hrr) + bias
            
            # First update: TD error for the transition previous -> goal
            error = (reward_bad + discount * goal_value - previous_value)
            eligibility = eligibility + previous_state
            weights = np.add(weights, (alpha * error * eligibility))
            
            # Second update: terminal error at the goal. `goal_value` is the
            # estimate computed before the update just above; it is not
            # re-evaluated with the new weights.
            error = reward_good - goal_value
            eligibility = eligibility + goal_hrr
            weights = np.add(weights, (alpha * error * eligibility))
            
            # Episode ends as soon as the goal is reached
            break
        
        # Weight update for goal not found
        error = (reward_bad + discount * current_value) - previous_value
        eligibility = eligibility + previous_state
        # A TD error below `threshold` switches the context hypothesis and
        # clears the trace. With the trace zeroed, the weight update below
        # leaves `weights` unchanged for this step.
        if error < threshold:
                atr = context_policy(atr)
                eligibility = eligibility * 0.0
                
        weights = np.add(weights, (alpha * error * eligibility))
        
        # The external signal is only offered on the first step of an episode
        signal = "I"
        
    # Progress report every `p_freq` episodes
    if((x+1)%p_freq == 0):
        print("Episode" , x+1, "done")

In [None]:
# Plot the learned value of every maze position, one curve per token:
# the idle token "I", each external signal, and each internalised signal.
internal = [sig + "In" for sig in signals]
position = np.arange(size_of_maze)
atr = 0


def _state_values(label):
    """Return an array with the learned value of each maze position under `label`.

    Positions listed in `goals` are encoded with the extra "rewardTkn" term,
    matching how the training loop encodes the goal state.
    """
    # A fresh array per curve, so data already handed to matplotlib is never
    # modified afterwards.
    values = np.zeros(size_of_maze)
    for state in range(size_of_maze):
        key = label + "*state_" + str(state)
        if state in goals:
            key = key + "*rewardTkn"
        values[state] = np.dot(weights, ltm.encode(key)) + bias
    return values


fig, ax = plt.subplots()
# dict.fromkeys drops repeated tokens while keeping their order, so "I" is
# drawn (and listed in the legend) exactly once.
for token in dict.fromkeys(["I"] + signals + internal):
    lab = str(token) + "*non_obs_" + str(atr)
    ax.plot(position, _state_values(lab), label=lab)
ax.set_xlabel("maze position")
ax.set_ylabel("learned value")
ax.legend(loc='upper left')
plt.show()

In [None]:
# Show the long-term memory again, now that training and plotting have encoded
# the state descriptions they use.
# NOTE(review): presumably prints the stored HRR entries — confirm in hrr.LTM.print.
ltm.print()

In [None]:
# Tear down the experiment.
# NOTE(review): ltm.clean() presumably discards the memory's stored/persisted
# HRRs — confirm in hrr.LTM.clean before relying on it.
ltm.clean()
# Remove the objects from the kernel namespace; any cell that references these
# names afterwards raises NameError until they are created again.
del ltm, agent, maze, nn