In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import time, sys, random, pylab, os, glob, math

from math import fabs
from random import randrange
from random import choice
from IPython.display import clear_output
from sys import argv
from os import path
from bayes_opt import BayesianOptimization
from bayes_opt import JSONLogger
from bayes_opt import Events
from bayes_opt.util import load_logs

from hrr import *

# Print NumPy arrays in full rather than abbreviating long ones with "...".
np.set_printoptions(threshold=np.inf)

In [2]:
def get_opt_steps(start, goal, size_of_maze):
    """Return the fewest steps from ``start`` to ``goal`` on a ring of ``size_of_maze`` cells.

    The direct distance is used unless it is more than half the ring, in
    which case going around the other way is shorter.
    """
    direct = abs(goal - start)
    wrapped = size_of_maze - direct
    return wrapped if direct > size_of_maze / 2 else direct

In [3]:
def build_hrr_string(wm, signal, state, atr):
    """Build the ``*``-joined concept string for a (wm, signal, state, atr) combination.

    The ``WM:`` and ``Signal:`` terms are left out when their value is the
    placeholder ``"I"``; the ``State:`` and ``Atr:`` terms are always present.
    """
    terms = []
    if wm != "I":
        terms.append("WM:" + str(wm))
    if signal != "I":
        terms.append("Signal:" + str(signal))
    terms.append("State:" + str(state))
    terms.append("Atr:" + str(atr))
    return "*".join(terms)

In [4]:
def get_moves(state, size_of_maze):
    """Return the ``(left, right)`` neighbours of ``state`` in a ring-shaped maze.

    State 0 wraps left to the last cell, and the last cell wraps right to 0.
    """
    last = size_of_maze - 1
    if state == 0:
        return last, 1
    if state == last:
        return last - 1, 0
    return state - 1, state + 1

In [5]:
def move_policy(goal, moves, wms, signals, atr, rand_on, debug, weights, bias, ltm, e_soft):
    """Choose the next move and working-memory content by greedy value lookup.

    Every combination of candidate move, working-memory option (``wms`` plus
    the placeholder ``"I"``) and signal option (``signals`` plus ``"I"``) is
    encoded through ``ltm`` and scored as ``dot(weights, hrr) + bias``.  A move
    equal to ``goal`` is encoded with the reward token appended to its state.

    Parameters:
        goal: state treated as the rewarded goal.
        moves: candidate next states.
        wms: candidate working-memory contents (list).
        signals: candidate signals (list).
        atr: current context index; returned unchanged.
        rand_on: truthy to allow epsilon-soft random moves.
        debug: truthy to print every evaluated combination.
        weights, bias: linear value-function parameters.
        ltm: object whose ``encode(str)`` returns the vector for a concept string.
        e_soft: probability of taking a random move when ``rand_on`` is set.

    Returns:
        ``(move, wm, atr, random_move)``.  For a random move the tuple is
        ``(np.random.choice(moves), wms[0], atr, True)``.  Otherwise it is the
        best-scoring move with the working memory to hold: ``signal + "In"``
        when the best combination used a real signal, else its ``wm``.

    Raises:
        ValueError: if a greedy move is required but no combination produced
            a comparable value (e.g. ``moves`` is empty or every value is NaN).
    """
    # -inf rather than a magic -9999, so any real value can win; the old
    # sentinel left s_move/s_wm unbound when all values were <= -9999.
    best_val = -np.inf
    s_move = None
    s_wm = None
    found = False

    # De-duplicate while preserving order; "I" is always an option.
    wm_options = list(dict.fromkeys(wms + ["I"]))
    signal_options = list(dict.fromkeys(signals + ["I"]))

    for move in moves:
        for wm in wm_options:
            for signal in signal_options:
                if move == goal:
                    encode_str = build_hrr_string(wm, signal, str(move) + reward_tkn(), atr)
                else:
                    encode_str = build_hrr_string(wm, signal, move, atr)
                if debug:
                    print(encode_str)
                temp = np.dot(weights, ltm.encode(encode_str)) + bias
                if debug:
                    if signal != "I":
                        print("Move: {0}, WM: {1}, Signal: {2}In, Atr: {3}, Value: {4}".format(move, wm, signal, atr, temp))
                    else:
                        print("Move: {0}, WM: {1}, Signal: {2}, Atr: {3}, Value: {4}".format(move, wm, signal, atr, temp))
                if temp > best_val:
                    best_val = temp
                    s_move = move
                    found = True
                    if signal != "I":
                        s_wm = signal + "In"
                    else:
                        s_wm = wm

    # The random draw stays after the evaluation loop so the sequence of
    # ltm.encode calls and RNG draws is the same as before.
    if (np.random.random_sample() < e_soft) and rand_on:
        if debug:
            print("RANDOM MOVE")
        return (np.random.choice(moves), wms[0], atr, True)

    if not found:
        raise ValueError("move_policy: no candidate move produced a comparable value")

    return (s_move, s_wm, atr, False)

In [6]:
def logmod(x):
    """Sign-preserving log compression: ``sign(x) * log(|x| + 1)``."""
    magnitude = np.log(abs(x) + 1)
    return np.sign(x) * magnitude

In [7]:
def reward_tkn():
    """Return the suffix that is appended to a state's name to form its reward-tagged concept."""
    token = "*rewardTkn"
    return token

In [8]:
def context_policy_negative(atr, num_of_atrs):
    """Pick a context index uniformly at random from ``0 .. num_of_atrs - 1``.

    ``atr`` is accepted but not used, so the current index may be drawn again.
    """
    picked = np.random.randint(0, high=num_of_atrs)
    return picked

def context_policy_positive(wm, signal, state, atr, num_of_atrs, weights, ltm, bias):
    """Return the context index whose encoding of the given situation has the highest value.

    Each index in ``range(num_of_atrs)`` is combined with ``wm``, ``signal``
    and ``state``, encoded through ``ltm`` and scored as
    ``dot(weights, hrr) + bias``.  The first index with the strictly greatest
    score wins.

    Parameters:
        wm, signal, state: the situation to evaluate.
        atr: current context index; accepted for interface compatibility, not used.
        num_of_atrs: number of context indices to evaluate.
        weights, bias: linear value-function parameters.
        ltm: object whose ``encode(str)`` returns the vector for a concept string.

    Raises:
        ValueError: if no index produced a comparable value
            (``num_of_atrs`` is 0 or every value is NaN).
    """
    # -inf rather than a magic -9999, so any real value can win; the old
    # sentinel left s_atr unbound when all values were <= -9999.
    best_val = -np.inf
    s_atr = None
    # A separate loop name keeps the ``atr`` parameter from being shadowed.
    for candidate in range(0, num_of_atrs):
        encode_str = build_hrr_string(wm, signal, state, candidate)
        temp = np.dot(weights, ltm.encode(encode_str)) + bias
        if temp > best_val:
            best_val = temp
            s_atr = candidate
    if s_atr is None:
        raise ValueError("context_policy_positive: no context produced a comparable value")
    return s_atr

In [9]:
def update_progress(progress, episode):
    """Redraw a 50-character text progress bar for ``episode`` in the cell output.

    ``progress`` is a fraction: ints are converted to float, any other
    non-float value counts as 0, and the result is clamped to ``[0, 1]``.
    The previous cell output is cleared before the new bar is printed.
    """
    bar_length = 50

    if isinstance(progress, int):
        progress = float(progress)
    elif not isinstance(progress, float):
        progress = 0

    if progress < 0:
        progress = 0
    elif progress >= 1:
        progress = 1

    filled = int(round(bar_length * progress))
    bar = "=" * filled + "." * (bar_length - filled)

    clear_output(wait = True)
    print("Episode {0}, Progress: [{1}] {2:.1f}%".format(episode, bar, progress * 100))

In [10]:
def reset(method, seed_val, num_of_atrs, atr_values, threshold, hrr_length, ltm, weights, eligibility, reward_good, dynamic_threshold, normalized):
    """Add one context and rebuild memory and weights at a proportionally larger vector length.

    The number of contexts is incremented, every context value is reset to
    ``reward_good``, and the vector length is scaled by
    ``num_of_atrs / (num_of_atrs - 1)``.  Each key in the old memory store gets
    a vector of the new length, chosen so that the old value
    ``dot(weights, old_vector)`` is carried over:

    * ``method == 1``: draw a new weight vector, then give each key the vector
      ``pinv(weights_new) * old_value``.
    * ``method == 2``: draw a fresh vector for each key, then solve for the
      weights with the pseudo-inverse of the stacked new vectors.

    Parameters:
        method: 1 or 2, selecting the strategy above.
        seed_val: unused (the re-seeding call is disabled).
        num_of_atrs: current number of contexts (before the increment).
        atr_values: ignored; a fresh list is returned.
        threshold: returned as-is unless ``dynamic_threshold`` is set, then reset to 1.
        hrr_length: current vector length.
        ltm: current memory; must provide ``getStore()`` and ``count()``.
        weights: current weight vector.
        eligibility: ignored; a zero vector of the new length is returned.
        reward_good: value every context entry is reset to.
        dynamic_threshold: whether to reset ``threshold`` to 1.
        normalized: forwarded to the ``hrr`` / ``hrrs`` / ``LTM`` constructors.

    Returns:
        ``(num_of_atrs, atr_values, threshold, hrr_length, ltm, weights, eligibility)``
        where ``hrr_length`` is the (float) result of the scaling division.

    Raises:
        ValueError: if ``method`` is neither 1 nor 2.
    """
    # Fail early with a clear message; previously an unknown method fell
    # through to ``ltm = ltm_new`` with an unbound name.
    if method not in (1, 2):
        raise ValueError("reset: unknown method {0!r}; expected 1 or 2".format(method))

#     seed(seed_val)
    num_of_atrs += 1
    atr_values = [1 * reward_good] * num_of_atrs
    if dynamic_threshold:
        threshold = 1

    hrr_length = (num_of_atrs * hrr_length) / (num_of_atrs - 1)
    # True division yields a float; constructors get an explicit int length.
    new_length = int(hrr_length)
    store_old = ltm.getStore()

    if method == 1:
        weights_new = hrr(new_length, normalized)
        ltm_new = LTM(new_length, normalized)

        # Column pseudo-inverse of the new weights: dot(weights_new, inv * v) == v.
        inv = np.linalg.pinv(np.atleast_2d(weights_new))
        for key in store_old.keys():
            key_val = store_old[key]
            val = np.dot(weights, key_val)
            guess = np.dot(inv, val).ravel()
            ltm_new.encode_val(key, guess)

    else:
        ltm_new = LTM(new_length, normalized)

        # One fresh vector per stored key (int length, matching the other calls).
        new_hrrs = hrrs(new_length, ltm.count(), normalized)
        vals = []
        for key in store_old.keys():
            key_val = store_old[key]
            vals += [np.dot(weights, key_val)]
        # Least-squares weights that map each new vector to its old value.
        s = np.linalg.pinv(new_hrrs)
        weights_new = np.asarray(np.dot(s, np.atleast_2d(vals).T)).ravel()

        for i, key in enumerate(store_old.keys()):
            ltm_new.encode_val(key, new_hrrs[i])

    ltm = ltm_new
    weights = weights_new
    eligibility = np.zeros(new_length)

    return num_of_atrs, atr_values, threshold, hrr_length, ltm, weights, eligibility

In [11]:
def seed(seed):
    """Seed both Python's ``random`` module and NumPy's global generator with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [12]:
def start_testing():
    """Return the settings used once the evaluation phase begins.

    The result is ``(testing, rand_on, alpha, threshold_alpha, atr_alpha)``:
    testing enabled, random moves disabled, learning rate 0.01, and both the
    threshold and context-value adaptation rates set to zero.
    """
    return True, 0, 0.01, 0, 0

In [13]:
def run(method_val, episodes_val, hrr_length_val, steps_till_quit_val, non_obs_task_switch_rate_val, discount_val, alpha_val, atr_alpha_val, atr_threshold_val, threshold_alpha_val, e_soft_val, eli_lambda_val):
    """Train the agent on the ring maze and return its test-phase score.

    All arguments are continuous values (as proposed by the optimiser) and are
    converted to the discrete settings the experiment needs.

    Parameters:
        method_val: ``< 0.5`` selects reset method 1, otherwise method 2.
        episodes_val: number of episodes (floored).
        hrr_length_val: initial vector length (floored).
        steps_till_quit_val: maximum steps per episode (floored).
        non_obs_task_switch_rate_val: the hidden goal changes every this many
            episodes (floored).
        discount_val: discount applied to the next state's value.
        alpha_val: learning rate for the weight updates.
        atr_alpha_val: learning rate for the per-context running values.
        atr_threshold_val: negated; a reset is triggered when the mean context
            value falls below the negated number.
        threshold_alpha_val: learning rate for the dynamic error threshold.
        e_soft_val: probability of a random move while training.
        eli_lambda_val: decay factor of the eligibility trace.

    Returns:
        Percentage (0-100) of test-phase episodes finished in exactly the
        optimal number of steps.  Episodes that do not reach the goal count as
        non-optimal.  The test phase starts once more than 90% of the episodes
        have run.  ``0.0`` is returned if no test-phase episode was recorded.
    """
    seed_val = 1343765
    seed(seed_val)

    # method_val is a continuous search dimension; threshold it to pick a strategy.
    if method_val < 0.5:
        method = 1
    else:
        method = 2
    episodes = math.floor(episodes_val)

    hrr_length = math.floor(hrr_length_val)
    steps_till_quit = math.floor(steps_till_quit_val)
    non_obs_task_switch_rate = math.floor(non_obs_task_switch_rate_val)

    normalized = True

    signals = ["I"]
    goals = [[0], [4], [7], [10], [13]]
    size_of_maze = 20

    num_obs_tasks = len(signals)

    discount = discount_val
    alpha = alpha_val

    reward_bad = -1
    reward_good = 0

    num_of_atrs = 1
    atr_alpha = atr_alpha_val
    atr_values = (np.ones(num_of_atrs) * reward_good).tolist()
    atr_threshold = -atr_threshold_val

    threshold = 1
    threshold_alpha = threshold_alpha_val
    dynamic_threshold = True

    e_soft = e_soft_val
    rand_on = 1
    eli_lambda = eli_lambda_val

    weights = hrr(hrr_length, normalized)
    bias = 1
    eligibility = np.zeros(hrr_length)

    # Testing begins once x > episodes * percent_check / 10, i.e. after 90%.
    percent_check = 9
    non_obs = 0
    current_atr = 0
    current_wm = "I"

    changed = False
    debug = False
    testing = False
    create_plots = True
    episodic_memory = False
    step_store = []
    if create_plots:
        # NOTE(review): these diagnostic lists are filled below but never read
        # or returned by this function.
        pos_err_store = []
        neg_err_store = []
        total_error = []
        total_goal_error = []
        switch_error = []
        norm_error = []
        threshold_vals = []
    ltm = LTM(hrr_length, normalized)

    for x in range(episodes):

        current_state = random.randint(0, size_of_maze - 1)
        current_signal = np.random.choice(signals)
        eligibility *= 0.0

        if episodic_memory:
            episode_memory = []

        changed = False

        # Switch to a different hidden goal (never the current one) on schedule.
        if x%non_obs_task_switch_rate == 0:
            non_obs = choice([i for i in range(len(goals)) if i not in [non_obs]])
            changed = True
        if num_obs_tasks == 1:
            goal = goals[non_obs][0]
        else:
            goal = goals[non_obs][signals.index(current_signal)]

        steps = 0
        opt_steps = get_opt_steps(current_state, goal, size_of_maze)

        if not testing and x > ((episodes*percent_check) / 10):
            testing, rand_on, alpha, threshold_alpha, atr_alpha = start_testing()

        for y in range(steps_till_quit):
            if create_plots:
                threshold_vals += [threshold]
            if (current_state == goal):
                # Terminal update: move the goal state's value toward reward_good.
                encode_str = build_hrr_string(current_wm, current_signal, str(current_state) + reward_tkn(), current_atr)
                goal_hrr = ltm.encode(encode_str)
                goal_value = np.dot(weights, goal_hrr) + bias

                if episodic_memory:
                    episode_memory += [[current_state, goal_value, goal]]

                error = reward_good - goal_value
                eligibility *= eli_lambda
                eligibility = eligibility + goal_hrr
                weights = np.add(weights, (alpha * logmod(error) * eligibility))

                if dynamic_threshold:
                    threshold += threshold_alpha * logmod(error)

                atr_values[current_atr] += atr_alpha * logmod(error)

                if create_plots:
                    total_goal_error += [error]

                if debug:
                    print("In goal with value {0}".format(goal_value))

                break

            previous_wm = current_wm
            previous_signal = current_signal
            previous_state = current_state
            previous_atr = current_atr

            if debug:
                print("Previous WM:, {0}, Signal:, {1}, State, {2}, ATR:, {3}".format(previous_wm, previous_signal, previous_state, previous_atr))

            encode_str = build_hrr_string(previous_wm, previous_signal, previous_state, previous_atr)
            previous_state_hrr = ltm.encode(encode_str)
            previous_value = np.dot(weights, previous_state_hrr) + bias

            if debug:
                print("Started with state: {0}, State Value: {1}, WM: {2},  Atr: {3}".format(previous_state, previous_value, previous_wm, previous_atr))

            # The signal is only present on the first step of an episode.
            current_signal = "I"
            left, right = get_moves(previous_state, size_of_maze)
            if previous_signal != "I":
                previous_signal += "In"

            move, wm, current_atr, random_move = move_policy(goal, [left, right], [previous_wm, previous_signal], [current_signal], previous_atr, rand_on, debug, weights, bias, ltm, e_soft)
            steps += 1
            current_wm = wm
            current_state = move

            # An exploratory move breaks the trace.
            if random_move:
                eligibility *= 0.0

            if debug:
                print("Moves {0}, taken {1}".format([left, right], move))
                print("Current WM {0}, Current Signal {1}, Current state {2}, Current ATR {3}".format(current_wm, current_signal, current_state, current_atr))

            if current_state == goal:
                encode_str = build_hrr_string(current_wm, current_signal, str(current_state) + reward_tkn(), current_atr)
                if debug:
                    # Two arguments, so the fields are {0} and {1}; the old
                    # "{1}"/"{2}" indices raised IndexError.
                    print("In goal: WM: {0}, ATR: {1}".format(current_wm, current_atr))
            else:
                encode_str = build_hrr_string(current_wm, current_signal, current_state, current_atr)

            current_state_hrr = ltm.encode(encode_str)
            current_value = np.dot(weights, current_state_hrr) + bias

            # Temporal-difference error for the step just taken.
            sarsa_error = (reward_bad + discount * current_value) - previous_value
            eligibility *= eli_lambda
            eligibility = eligibility + previous_state_hrr
            weights = np.add(weights, (alpha * logmod(sarsa_error) * eligibility))

            atr_values[current_atr] += atr_alpha * logmod(sarsa_error)

            if dynamic_threshold:
                threshold += threshold_alpha * logmod(sarsa_error)

            if create_plots:
                total_error += [sarsa_error]
                norm_error += [sarsa_error]

            # An error outside +/-|threshold| triggers a context switch.
            if sarsa_error > fabs(threshold) or sarsa_error < -fabs(threshold):

                # Persistently poor context values: add a context and rebuild.
                if np.mean(atr_values) < atr_threshold:
                    num_of_atrs, atr_values, threshold, hrr_length, ltm, weights, eligibility = reset(method, seed_val, num_of_atrs, atr_values, threshold, hrr_length, ltm, weights, eligibility, reward_good, dynamic_threshold, normalized)

                if create_plots:
                    switch_error += [sarsa_error]
                    if testing and sarsa_error > fabs(threshold):
                        pos_err_store += [sarsa_error]
                    elif testing and sarsa_error < -fabs(threshold):
                        neg_err_store += [sarsa_error]

                if sarsa_error > fabs(threshold):
                    current_atr = context_policy_positive(current_wm, current_signal, current_state, current_atr, num_of_atrs, weights, ltm, bias)
                elif sarsa_error < -fabs(threshold):
                    current_atr = context_policy_negative(previous_atr, num_of_atrs)

                eligibility *= 0.0

                # After a goal change, measure optimality from the switch point.
                if changed:
                    steps = 0
                    opt_steps = get_opt_steps(current_state, goal, size_of_maze)

                if debug:
                    print("Changed atr from {0} to {1}".format(previous_atr, current_atr))

            if debug:
                input("")

        if testing:
            if current_state == goal:
                step_store += [steps - opt_steps]
            else:
                step_store += [steps_till_quit]

    # No test-phase episodes (e.g. very few episodes): report 0 instead of
    # dividing by zero.
    if not step_store:
        return 0.0
    return (len(step_store)-np.count_nonzero(step_store))*100.0 / len(step_store)

In [14]:
# Search range (low, high) for every keyword argument of run().
# run() thresholds method_val at 0.5, so (0.6, 0.9) always selects method 2.
pbounds = dict(
    method_val=(0.6, 0.9),
    episodes_val=(5000, 100000),
    hrr_length_val=(1024, 6144),
    steps_till_quit_val=(100, 200),
    non_obs_task_switch_rate_val=(250, 500),
    discount_val=(0.1, 0.7),
    alpha_val=(0.01, 0.2),
    atr_alpha_val=(0.0001, 0.0003),
    atr_threshold_val=(0.3, 0.5),
    threshold_alpha_val=(0.0001, 0.0002),
    e_soft_val=(0.0001, 0.0005),
    eli_lambda_val=(0, 0.2),
)

In [15]:
# Optimiser that searches the ranges in pbounds for the arguments that
# maximise the score returned by run(); random_state is fixed.
optimizer = BayesianOptimization(
    f=run,
    pbounds=pbounds,
    verbose=2,
    random_state=1,
)

In [16]:
# Warm-start the optimiser from a previously saved log, then run 10 more iterations.
# NOTE(review): the log path is absolute and machine-specific, and the logger
# subscription below is disabled, so new iterations are presumably not written
# back to the log - confirm this is intended.
# logger = JSONLogger(path="/home/nibraas/Coding/Mingo/logs/wm_logs_2_method_2.json")
# optimizer.subscribe(Events.OPTMIZATION_STEP, logger)
load_logs(optimizer, logs=["/home/nibraas/Coding/Mingo/logs/wm_logs_2_method_2.json"])
optimizer.maximize(n_iter=10)

|   iter    |  target   | alpha_val | atr_al... | atr_th... | discou... | e_soft... | eli_la... | episod... | hrr_le... | method... | non_ob... | steps_... | thresh... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------


KeyboardInterrupt: 

In [None]:
# Best target seen so far, together with the parameters that produced it.
optimizer.max

In [None]:
# List every evaluated point (target and parameters) in evaluation order.
for i, res in enumerate(optimizer.res):
    print("Iteration {}: \n\t{}".format(i, res))

In [None]:
# Saved copy of an `optimizer.max` result, kept for reference:
# {'target': 93.7748344370861,
#  'params': {'alpha_val': 0.1198577662209101,
#   'atr_alpha_val': 0.0001730660106626197,
#   'atr_threshold_val': 0.45555336687067005,
#   'discount_val': 0.5828716209960245,
#   'e_soft_val': 0.0005,
#   'eli_lambda_val': 0.13055952287831313,
#   'episodes_val': 7559.054944609811,
#   'hrr_length_val': 4696.480303672614,
#   'method_val': 0.9655270732767963,
#   'non_obs_task_switch_rate_val': 372.6904809738739,
#   'steps_till_quit_val': 159.86389051414127,
#   'threshold_alpha_val': 0.0001401220235475343}}