# Vagueness, repeated
This notebook contains a task very similar to the one outlines in `vagueness.ipynb`, but with a large-scale generator to test a variety of parameter values systematically.

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import imageio.v2 as imageio
from IPython.display import HTML
import os
import time
from itertools import product
from tqdm.notebook import tqdm

In [4]:
import matplotlib
matplotlib.use('Agg')

# Setup

In [None]:
# rewards are multiplied by the output of this function
def reward_dist(n: int) -> float:
    # Gaussian with FWHM of 2.
    return np.exp(- (n**2) / (4 / np.log(2)))

In [None]:
class World:
    def __init__(self, n_states: int, 
                 n_signals: int, n_actions: int, 
                 reward_mod=(1,1), seed: int = 0) -> None:
        self.setup = (n_signals, n_actions)
        self.pos, self.neg = reward_mod
        self.positive, self.negative = reward_mod
        self.n_states = n_states
        self.state = 0
        self.random = np.random.RandomState(seed)

    def get_state(self) -> int:
        self.state = self.random.randint(self.n_states)
        return self.state

    def evaluate(self, action: int) -> int:
        step = self.n_states / self.setup[0]
        correct = self.state // step
        return self.pos if action == correct else -self.neg


In [None]:
class Sender:
    def __init__(self, n_stimuli: int, n_signals: int, q_not: float = 1e-6) -> None:
        # n_stimuli: number of possible states in the world,
        #            each corresponding to a stimulus
        # n_signals: number of signals that can be sent in response,
        #            usually equal to the number of states in the world
        # q_not:     initial signal propensity values. Final value of null signal.
        self.n_signals = n_signals + 1      # +1 here represents null signal.
        self.signal_weights = np.zeros((self.n_signals, n_stimuli))
        self.signal_weights.fill(q_not)
        self.last_situation = (0, 0)

    def get_signal(self, stimulus: int) -> int:
        # exponential calculation
        num = np.exp(self.signal_weights[:, stimulus])
        den = np.sum(np.exp(self.signal_weights[:, stimulus]))
        probabilities = num / den
        signal = np.random.choice(self.n_signals, p=probabilities)
        if signal == self.n_signals-1:
            # null action
            return -1
        self.last_situation = (stimulus, signal)
        return signal

    def update(self, reward: int) -> None:
        # I am capping weight values at 308 due to overflow errors.
        stimulus, signal = self.last_situation
        self.signal_weights[signal, stimulus] += reward

        # after updating the first weight, we must reinforce the surrouding weights
        # using a gaussian distribution with a height of 1 and a width of 2
        # so that stimulus+2 and stimulus-2 are updated with 1/2 the reward.
        for i in range(1, 4):
            r = reward * reward_dist(i)

            # reward right
            if stimulus + i < self.signal_weights.shape[1]:
                q_last = self.signal_weights[signal, stimulus + i]
                self.signal_weights[signal, stimulus +
                                    i] = min(q_last + r, 308)

            # reward left
            if stimulus - i >= 0:
                q_last = self.signal_weights[signal, stimulus - i]
                self.signal_weights[signal, stimulus -
                                    i] = min(q_last + r, 308)


In [None]:
class Receiver:
    def __init__(self, n_signals, n_actions, q_not: float = 1e-6) -> None:
        # n_signals: number of signals that can be sent in response,
        #            usually equal to the number of states in the world
        # n_actions: number of actions that can be taken in response,
        #            usually equal to the number of states in the world
        # q_not:     initial action propensity value
        self.n_actions = n_actions
        self.action_weights = np.zeros((n_signals, n_actions))
        self.action_weights.fill(q_not)
        self.last_situation = (0, 0)

    def get_action(self, signal: int) -> int:
        # exponential calculation
        num = np.exp(self.action_weights[signal, :])
        den = np.sum(np.exp(self.action_weights[signal, :]))
        probabilities = num / den
        action = np.random.choice(self.n_actions, p=probabilities)
        self.last_situation = (signal, action)

        return action
    
    def update(self, reward: int) -> None:
        signal, action = self.last_situation
        q_last = self.action_weights[signal, action]
        self.action_weights[signal, action] = min(q_last + reward, 308)


In [None]:
class History:
    def __init__(self, epochs, states, signals, actions):
        self.send_hist = np.zeros((epochs // 25, signals+1, states))
        self.reci_hist = np.zeros((epochs // 25, signals, actions))
        self.epochs = epochs
        self.ep = 0
        # TODO: Genralize functions to work mid-run

    def add(self, send_weights, reci_weights):
        self.send_hist[self.ep] = send_weights
        self.reci_hist[self.ep] = reci_weights
        self.ep += 1

    def make_gif(self, fps, seed, filename_base): 
        for i in range(epochs // 25):
            fig, axs = plt.subplots(1, 2, figsize=(12, 5))
            plt.tight_layout(pad=3)

            sns.heatmap(
                np.exp(self.send_hist[i]) /
                np.exp(self.send_hist[i]).sum(axis=0),
                square=True, cbar=False, annot=True, fmt='.1f', ax=axs[0])
            axs[0].set_ylabel('messages')
            axs[0].set_xlabel('world states')
            axs[0].set_title(f'Sender\'s weights')

            sns.heatmap(
                np.exp(self.reci_hist[i]) /
                np.exp(self.reci_hist[i]).sum(axis=0),
                square=True, cbar=False, annot=True, fmt='.2f', ax=axs[1])
            axs[1].set_xlabel('actions')
            axs[1].set_ylabel('messages')
            axs[1].set_title(f'Receiver\'s weights')
            
            
            fig.suptitle(f'Rollout {i*25}')
            plt.savefig(f"images/game_{i*25}.png")
            plt.close(fig)

        images = []
        for filename in [f'images/game_{j*25}.png' for j in range(epochs // 25)]:
            images.append(imageio.imread(filename))
        if not os.path.exists(f'gifs/{seed}'):
            os.mkdir(f'gifs/{seed}')
        imageio.mimsave(f'gifs/{seed}/{filename_base}.gif', images, fps=fps)
        # no return

    def make_graph(self, seed):
        fig, axs = plt.subplots(2, 3, figsize=(10, 8), sharey=True)

        ran = range(0, self.epochs, 25)
        axs[0, 0].plot(ran, self.send_hist[:, 0, 0], label='action 0')
        axs[0, 0].plot(ran, self.send_hist[:, 1, 0], label='action 1')
        axs[0, 0].plot(ran, self.send_hist[:, 2, 0], label='null action')
        axs[0, 0].set_title('state 0')

        axs[0, 1].plot(ran, self.send_hist[:, 0, 5], label='action 0')
        axs[0, 1].plot(ran, self.send_hist[:, 1, 5], label='action 1')
        axs[0, 1].plot(ran, self.send_hist[:, 2, 5], label='null action')
        axs[0, 1].set_title('state 5')

        axs[0, 2].plot(ran, self.send_hist[:, 0, 9], label='action 0')
        axs[0, 2].plot(ran, self.send_hist[:, 1, 9], label='action 1')
        axs[0, 2].plot(ran, self.send_hist[:, 2, 9], label='null action')
        axs[0, 2].set_title('state 9')

        axs[1, 0].plot(ran, self.send_hist[:, 0, 10], label='action 0')
        axs[1, 0].plot(ran, self.send_hist[:, 1, 10], label='action 1')
        axs[1, 0].plot(ran, self.send_hist[:, 2, 10], label='null action')
        axs[1, 0].set_title('state 10')

        axs[1, 1].plot(ran, self.send_hist[:, 0, 14], label='action 0')
        axs[1, 1].plot(ran, self.send_hist[:, 1, 14], label='action 1')
        axs[1, 1].plot(ran, self.send_hist[:, 2, 14], label='null action')
        axs[1, 1].set_title('state 14')

        axs[1, 2].plot(ran, self.send_hist[:, 0, 19], label='action 0')
        axs[1, 2].plot(ran, self.send_hist[:, 1, 19], label='action 1')
        axs[1, 2].plot(ran, self.send_hist[:, 2, 19], label='null action')
        axs[1, 2].set_title('state 19')

        fig.suptitle(f'Sum action propensities over {self.epochs} epochs')
        fig.savefig(f'gifs/{seed}/weights.png')
        plt.close(fig)


## Experiment


In [None]:
multipliers = range(1, 6) # 1, 2, 3, 4, 5
negatives = [m*(10**x) for m in multipliers for x in range(-3, 1)]
weights = [m*(10**x) for m in multipliers for x in range(-6, 3)]

In [None]:
# constants
positive = 0.01          # reward for correct action
epochs = 20_000          # Number of epochs to train for
world_states = 20        # number of world states. evenly split among signals
signals = 2              # number of signals sender can send (not including null)
actions = 2              # number of actions reciever can respond with
gif_fps = 10             # frames per second for gif

# world states should be evenly divisible by action and signals
assert world_states % signals == world_states % actions == 0

In [None]:
for i, (negative, weight) in tqdm(enumerate(product(negatives, weights)), total=len(negatives)*len(weights)):
    rew = (positive, negative)
    seed = int(time.time())
    S = Sender(world_states, signals, weight)
    R = Receiver(signals, actions, weight)
    W = World(world_states, signals, actions, rew, seed)
    H = History(epochs, world_states, signals, actions)
    past_rewards = slow = 0
    for epoch in range(epochs):
        stimulus = W.get_state()
        signal = S.get_signal(stimulus)
        if signal != -1:
            action = R.get_action(signal)
            reward = W.evaluate(action)
            past_rewards += reward
            S.update(reward)
            R.update(reward)
        # else null action

        if epoch % 25 == 0:
            # save history
            H.add(S.signal_weights, R.action_weights)
            

        if epoch % 100 == 0:
            # print(f'Epoch {epoch}, last 100 epochs reward: {past_rewards/100:e}')
            slow = past_rewards / 100
            past_rewards = 0

    # pdb.set_trace()
    # now we decide if this is a run to flag
    if ((np.argmax(S.signal_weights[:, 9]) == 2 or 
        np.argmax(S.signal_weights[:, 10]) == 2 or
        slow < 0) and (np.argmax(S.signal_weights[:, 5]) != 2 or 
        np.argmax(S.signal_weights[:, 14]) != 2)):
        H.make_gif(gif_fps, seed, f'{world_states}-{actions}-{signals}-game')
        H.make_graph(seed)
        with open(f'gifs/{seed}/params.txt', 'w') as f:
            f.write(f'negative: {negative},\ninitial weight: {weight}\nfinal average reward: {slow}')
        print(f"Saved with negative: {negative} and initial weight: {weight}")

## Analysis
This analysis assumes the kernel has been reset, or that History objects are otherwise not available - I will only examine the contents of `params.txt` files left in seed folders in the same directory as this notebook, rather than the specified save location above.

In [21]:
initial_weights = []
final_rewards = []
negatives = []

files = os.listdir("./")

for folder in files:
    if not os.path.isdir("./" + folder):
        continue

    with open("./" + folder + "/params.txt", "r") as f:
        params = f.read().splitlines()
        initial_weights.append(float(params[1].split(": ")[1]))
        final_rewards.append(float(params[2].split(": ")[1]))
        negatives.append(float(params[0].strip(',').split(": ")[1]))

In [6]:
set(negatives)

{0.02, 0.03, 0.04, 0.05}

In [7]:
set(sorted(initial_weights))

{3e-06,
 4.9999999999999996e-06,
 1e-05,
 3.0000000000000004e-05,
 4e-05,
 5e-05,
 0.0001,
 0.0002,
 0.00030000000000000003,
 0.0004,
 0.0005,
 0.002,
 0.003,
 0.004,
 0.005,
 0.01,
 0.02,
 0.03,
 0.05,
 0.1,
 0.2,
 0.30000000000000004,
 0.4,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 10.0,
 40.0,
 50.0}

In [25]:
print("Rewards over last 100 epochs")
print(f"Max: {max(final_rewards):.2e}, Seed {files[np.argmax(final_rewards)]}")
print(f"Min: {min(final_rewards):.2e}, Seed {files[np.argmin(final_rewards)]}")
print(f"Difference: {max(final_rewards) - min(final_rewards):.2e}")

Rewards over last 100 epochs
Max: 9.50e-03, Seed 1680917024
Min: 3.00e-04, Seed 1680927428
Difference: 9.20e-03


In [46]:
print(f"{np.average(final_rewards):.2e}")

5.17e-03


In [45]:
# all seeds scoring between 0.00516 and 0.00517 - about average
np.array(files)[list(set(np.where(np.array(final_rewards) > 0.00516)[0]) - set(np.where(np.array(final_rewards) < 0.00517)[0]))]

array(['1680925080', '1680926085', '1680911029', '1680911531',
       '1680912522', '1680913006', '1680913981', '1680914954',
       '1680915992', '1680917024', '1680917538', '1680918059',
       '1680919607', '1680921491', '1680923564'], dtype='<U16')

All of these seeds have a negative score of 0.03 or 0.04. Initial weights range from 3e-6 to 40 for 0.04 and 5e-5 to 5 for 0.03.
Given this ratio, I would then expect to see the occurrence of this case broadly in this setup. While close to the desired behavior, all runs highlighted above as close to the average still appear to be trending towards the correct signal in the borderline states, 9 and 10.