In [None]:
import numpy as np
import time
import random
import pylab
from hrr import *
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
# Number of training episodes (iterations of the outer training loop)
episodes = 10000

# HRR parameters: vector length and the flag handed to hrr() below and to LTM()
hrr_length = 2048
normalized = True

# Maximum number of steps taken in one episode before quitting
steps_till_quit = 15

# Goal state for each non-observable task (indexed by the current task number)
goals = [2, 6, 12]

# Maze parameters
size_of_maze = 15
# The hidden task (and therefore the goal) advances every this many episodes
non_obs_task_switch_rate = 100
num_non_obs_tasks = len(goals)

# Arguments for neural network
# NOTE(review): input_size and output_size are not referenced in the visible cells
input_size = hrr_length
output_size = 1
# discount scales the next state's value in the TD error; alpha scales every weight update
discount = 0.5
alpha = 0.01

# Reward for temporal difference learning
reward_bad = 0
reward_good = 1

# Exploration rate: probability that move_policy returns a random move
e_soft = 0.0001

# Threshold for non observable task switching:
# a TD error below this value makes the training loop switch context (Atr)
threshold = -0.2

# Print frequency: a progress line is printed every p_freq episodes
p_freq = 100

# Eligibility trace (one entry per HRR component)
eligibility = np.zeros(hrr_length)

# Eligibility trace rate: the trace is multiplied by this before a new HRR is added
eli_lambda = 0.01

# Neural network: weight vector created by hrr() and a constant bias
# that is added to every dot-product value estimate
# NOTE(review): hrr() presumably comes from the `hrr` star-import — confirm what it returns
weights = hrr(hrr_length, normalized)
bias = 1

# Exploration switch: move_policy only takes random moves when this equals 1
rand_on = 1

In [None]:
def get_moves(state, size_of_maze):
    """Return the ``(left, right)`` neighbours of ``state``.

    The two end states wrap around: state 0 has the last state as its left
    neighbour, and the last state has state 0 as its right neighbour.
    """
    last_state = size_of_maze - 1

    # Left edge: stepping left wraps to the far end.
    if state == 0:
        return last_state, 1

    # Right edge: stepping right wraps back to the start.
    if state == last_state:
        return last_state - 1, 0

    # Interior state: plain neighbours.
    return state - 1, state + 1

In [None]:
# Long-term memory used to encode the "State:...*Atr:..." strings into HRR vectors.
# NOTE(review): LTM presumably comes from the `hrr` star-import, and the first
# argument looks like a name/identifier for the store — confirm against hrr.LTM.
ltm = LTM("test" + str(hrr_length), hrr_length, normalized)

In [None]:
# Dump the memory before training; what is shown is defined by LTM.print in the hrr module.
ltm.print()

In [None]:
def context_policy(atr):
    """Return the context (Atr) that follows ``atr``, cycling back to 0.

    The cycle length is the module-level ``num_non_obs_tasks``.
    """
    next_atr = atr + 1
    return next_atr % num_non_obs_tasks

def build_hrr_string(state, atr):
    """Build the string that is handed to ``ltm.encode`` for a state/context pair.

    The result has the form ``State:<state>*Atr:<atr>``; both arguments are
    converted with ``str`` first, so ``state`` may already carry extra
    ``*``-joined tokens.
    """
    return "State:{}*Atr:{}".format(str(state), str(atr))
    
def move_policy(moves, atr, rand_on):
    """Choose the next state from ``moves`` under context ``atr``.

    Each candidate is scored as ``np.dot(weights, ltm.encode(...)) + bias``;
    a candidate equal to the module-level ``goal`` is encoded together with
    the ``rewardTkn`` token. The highest-scoring candidate is returned
    (the earliest one wins ties), unless exploration is enabled
    (``rand_on == 1``) and a uniform random draw falls below ``e_soft``,
    in which case a random candidate is returned.

    Parameters
    ----------
    moves : sequence
        Candidate next states; must not be empty.
    atr : int
        Context (Atr) used when building the encode string.
    rand_on : int
        Random exploration is only possible when this equals 1.

    Returns
    -------
    The selected move. If no score compares greater than the starting
    sentinel (for example because the scores are NaN), the first candidate
    is returned instead of failing on an unbound variable.

    Raises
    ------
    ValueError
        If ``moves`` is empty.
    """
    if len(moves) == 0:
        raise ValueError("moves must not be empty")

    # Start from the first candidate and -inf so a move is always defined,
    # whatever the magnitude of the scores.
    s_move = moves[0]
    val = -np.inf

    for move in moves:
        if move == goal:
            encode_str = build_hrr_string(str(move) + "*rewardTkn", atr)
        else:
            encode_str = build_hrr_string(move, atr)
        temp = np.dot(weights, ltm.encode(encode_str)) + bias
        if temp > val:
            val = temp
            s_move = move

    # Random move (the draw happens after scoring, so the order of
    # ltm.encode calls and random draws is the same as before)
    if (np.random.random_sample() < e_soft) and (rand_on == 1):
        return np.random.choice(moves)

    return s_move

def logmod(x):
    """Signed logarithmic squashing: ``sign(x) * log(1 + |x|)``.

    Uses ``np.log1p`` so that values with a very small magnitude keep their
    precision instead of collapsing to 0 when 1 is added, and ``np.abs`` so
    that sequences as well as scalars and arrays are accepted.
    """
    return np.sign(x) * np.log1p(np.abs(x))

In [None]:
# Training cell: runs `episodes` episodes of TD learning on the ring maze.
# t0 marks the start of the run (the timing cell later subtracts it from t1).
t0 = time.time()
# Index into `goals` of the currently active hidden task
non_obs = 0
# Current context (Atr); it is not reset between episodes
current_atr = 0


for x in range(episodes):
    # Starting state
    current_state = random.randint(0, size_of_maze - 1)
    # NOTE(review): `start` is not read anywhere in the visible cells
    start = current_state
    
    # Advance the hidden task every non_obs_task_switch_rate episodes.
    # This also fires at x == 0, so the first task used is index 1, not 0.
    if x%non_obs_task_switch_rate == 0:
        non_obs = (non_obs+1)%num_non_obs_tasks
    
    # `goal` is module-level state that move_policy reads as well
    goal = goals[non_obs]
    
    # Reset trace
    eligibility *= 0.0
    
    for y in range(steps_till_quit):
        # Goal reached
        if (current_state == goal):
            # The goal state is encoded together with the reward token
            encode_str = build_hrr_string(str(current_state) + "*rewardTkn", current_atr)
            goal_hrr = ltm.encode(encode_str)
            goal_value = np.dot(weights, goal_hrr) + bias  
            
            # Terminal update: the target is reward_good, with no bootstrapped next value
            error = reward_good - goal_value
            eligibility *= eli_lambda
            eligibility = eligibility + goal_hrr
            # The error is squashed with logmod before being applied
            weights = np.add(weights, (alpha * logmod(error) * eligibility))
            
            # The episode ends as soon as the goal has been handled
            break
            
        previous_state = current_state
        previous_atr = current_atr 
        
        # Value estimate of the state we are about to leave
        encode_str = build_hrr_string(previous_state, previous_atr)
        previous_state_hrr = ltm.encode(encode_str)
        previous_value = np.dot(weights, previous_state_hrr) + bias
        
        # Pick one of the two neighbouring states
        left, right = get_moves(previous_state, size_of_maze)
        move = move_policy([left, right], previous_atr, rand_on)
        
        current_state = move
        current_atr = previous_atr
        
        # Value estimate of the state just entered (with the reward token if it is the goal)
        if current_state == goal:
            encode_str = build_hrr_string(str(current_state) + "*rewardTkn", current_atr)
        else:
            encode_str = build_hrr_string(str(current_state), current_atr)
            
        current_state_hrr = ltm.encode(encode_str)
        current_value = np.dot(weights, current_state_hrr) + bias
        
        # TD error for the transition previous_state -> current_state
        error = (reward_bad + discount * current_value) - previous_value
        # NOTE(review): the trace accumulation below is commented out. In this cell
        # the trace is zeroed at the start of each episode and is otherwise only
        # filled in the goal branch (which breaks), so on this path it appears to be
        # all zeros and the two weight updates below leave `weights` unchanged —
        # confirm this is intended.
#         eligibility *= eli_lambda
#         eligibility = eligibility + previous_state_hrr
        # A sufficiently negative TD error triggers a switch to the next context
        if error < threshold:
            current_atr = context_policy(current_atr)
            weights = np.add(weights, (alpha * logmod(error) * eligibility))
            eligibility *= 0.0
        # NOTE(review): this update also runs after the branch above (there is no else),
        # where the trace has just been zeroed — confirm whether an else was intended.
        weights = np.add(weights, (alpha * error * eligibility))
        
    # Progress report every p_freq episodes
    if((x+1)%p_freq == 0):
        print("Episode" , x+1, "done")

In [None]:
def plot_value_curve(ax, atr, with_reward_token):
    """Plot the learned value of every maze state for one context on ``ax``.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    atr : int
        Context (Atr) used when encoding each state.
    with_reward_token : bool
        When True each state is encoded together with ``rewardTkn`` and the
        title/label carry the ``rewardTkn*`` prefix; otherwise the plain
        state encoding is used.
    """
    state_suffix = "*rewardTkn" if with_reward_token else ""
    text_prefix = "rewardTkn*" if with_reward_token else ""

    position = np.arange(size_of_maze)
    value = np.zeros(size_of_maze)
    for state in range(size_of_maze):
        encode_str = build_hrr_string(str(state) + state_suffix, atr)
        value[state] = np.dot(weights, ltm.encode(encode_str)) + bias

    ax.title.set_text(text_prefix + "Atr: " + str(atr))
    ax.set_ylim([-0.2, 1.2])
    ax.plot(position, value, label=text_prefix + "Atr:" + str(atr))
    # Legend sits just below the axes
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
              fancybox=True, shadow=True, ncol=1, prop={'size': 10})


# Two stacked panels per context: with the reward token first, then without
fig, axes = plt.subplots(nrows=num_non_obs_tasks * 2, ncols=1)
fig.set_figwidth(20)
fig.set_figheight(20)

for atr in range(num_non_obs_tasks):
    plot_value_curve(axes[2 * atr], atr, with_reward_token=True)
    plot_value_curve(axes[2 * atr + 1], atr, with_reward_token=False)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
# End timestamp read by the timing cell; taken after the figure is shown,
# so the reported duration includes plotting.
t1 = time.time()

In [None]:
# Elapsed wall-clock time in minutes between t0 (set at the start of the training
# cell) and t1 (set at the end of the plotting cell, after plt.show()).
total = t1-t0
print(total / 60)

In [None]:
# Dump the memory again after training; what is shown is defined by LTM.print in the hrr module.
ltm.print()

In [None]:
# Tear down the memory object, then drop the notebook's reference to it.
# NOTE(review): clean() is defined in the hrr module; presumably it discards the
# stored vectors — confirm before relying on it.
ltm.clean()
del ltm