# Investigating Vagueness
Can we create the conditions for a signaling system to learn to send the null signal in border states?

In [17]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import imageio.v2 as imageio
from IPython.display import HTML
import os

In [2]:
def reward_dist(n: int) -> float:
    return np.exp(- (n**2) / (4 / np.log(2)))

In [3]:
def early_stop(epochs, rewards, threshold=0.95):
    return np.sum(rewards[-epochs:]) / epochs > threshold

In [4]:
def make_gif(filename_base, epochs, seed, fps=10):
    images = []
    nm = filename_base.split('-')[-1]
    for filename in [f'images/{nm}_{i}.png' for i in range(epochs) if i % 25 == 0]:
        images.append(imageio.imread(filename))
    if not os.path.exists(f'gifs/{seed}'):
        os.mkdir(f'gifs/{seed}')
    imageio.mimsave(f'gifs/{seed}/{filename_base}.gif', images, fps=fps)
    display(HTML('<img src="{}">'.format(f'{filename_base}.gif')))

This next block is optional, but will make numpy warnings throw like errors that can be caught. I have used this mainly for debugging unexpected instances of infinity or NaN in Sender or Reciever weight calcultation, as `np.exp(309)` or `1e309` both overflow to infinity. 

In [5]:
import warnings
np.seterr(all='warn')
warnings.filterwarnings('error')

## World Setup
Initially inspired by https://tomekkorbak.com/2019/10/08/lewis-signaling-games/.

In [6]:
class World:
    def __init__(self, n_states: int, 
                 n_signals: int, n_actions: int, 
                 reward_mod=(1,1), seed: int = 0) -> None:
        self.setup = (n_signals, n_actions)
        self.pos, self.neg = reward_mod
        self.positive, self.negative = reward_mod
        self.n_states = n_states
        self.state = 0
        self.random = np.random.RandomState(seed)

    def get_state(self) -> int:
        self.state = self.random.randint(self.n_states)
        return self.state

    def evaluate(self, action: int) -> int:
        step = self.n_states / self.setup[0]
        correct = self.state // step
        return self.pos if action == correct else self.neg


In [7]:
class Sender:
    def __init__(self, n_stimuli: int, n_signals: int, q_not: float = 1e-6) -> None:
        # n_stimuli: number of possible states in the world,
        #            each corresponding to a stimulus
        # n_signals: number of signals that can be sent in response,
        #            usually equal to the number of states in the world
        # q_not:     initial signal propensity values. Final value of null signal.
        self.n_signals = n_signals + 1      # +1 here represents null signal.
        self.signal_weights = np.zeros((self.n_signals, n_stimuli))
        self.signal_weights.fill(q_not)
        self.last_situation = (0, 0)

    def get_signal(self, stimulus: int) -> int:
        # exponential calculation
        num = np.exp(self.signal_weights[:, stimulus])
        den = np.sum(np.exp(self.signal_weights[:, stimulus]))
        probabilities = num / den
        signal = np.random.choice(self.n_signals, p=probabilities)
        if signal == self.n_signals-1:
            # null action
            return -1
        self.last_situation = (stimulus, signal)
        return signal

    def update(self, reward: int) -> None:
        # I am capping weight values at 308 due to overflow errors.
        stimulus, signal = self.last_situation
        self.signal_weights[signal, stimulus] += reward

        # after updating the first weight, we must reinforce the surrouding weights
        # using a gaussian distribution with a height of 1 and a width of 2
        # so that stimulus+2 and stimulus-2 are updated with 1/2 the reward.
        for i in range(1, 4):
            r = reward * reward_dist(i)

            # reward right
            if stimulus + i < self.signal_weights.shape[1]:
                q_last = self.signal_weights[signal, stimulus + i]
                self.signal_weights[signal, stimulus +
                                    i] = min(q_last + r, 308)

            # reward left
            if stimulus - i >= 0:
                q_last = self.signal_weights[signal, stimulus - i]
                self.signal_weights[signal, stimulus -
                                    i] = min(q_last + r, 308)

In [8]:
class Receiver:
    def __init__(self, n_signals, n_actions, q_not: float = 1e-6) -> None:
        # n_signals: number of signals that can be sent in response,
        #            usually equal to the number of states in the world
        # n_actions: number of actions that can be taken in response,
        #            usually equal to the number of states in the world
        # q_not:     initial action propensity value
        self.n_actions = n_actions
        self.action_weights = np.zeros((n_signals, n_actions))
        self.action_weights.fill(q_not)
        self.last_situation = (0, 0)

    def get_action(self, signal: int) -> int:
        # exponential calculation
        num = np.exp(self.action_weights[signal, :])
        den = np.sum(np.exp(self.action_weights[signal, :]))
        probabilities = num / den
        action = np.random.choice(self.n_actions, p=probabilities)
        self.last_situation = (signal, action)

        return action
    
    def update(self, reward: int) -> None:
        signal, action = self.last_situation
        q_last = self.action_weights[signal, action]
        self.action_weights[signal, action] = min(q_last + reward, 308)

## Experiment Setup

In [13]:
# constants
positive_reward = 1   # "Full reward" for a success in some state. Modifier.
negative_reward = -1  # "Full reward" for a failure in some state. Modifier.
epochs = 4_000       # Number of epochs to train for
seed = 0              # Random Number Generator seed. numpy algorithm.
world_states = 20     # number of world states. evenly split among signals
signals = 2           # number of signals sender can send (not including null)
actions = 2           # number of actions reciever can respond with
initial_q = 1         # initial propensities. final null action score.
gif_fps = 10          # frames per second for gif

rew = (positive_reward, negative_reward)
# world states should be evenly divisible by action and signals
assert world_states % signals == world_states % actions == 0

In [14]:
# intialize world
S = Sender(world_states, signals, initial_q)
R = Receiver(signals, actions, initial_q)
W = World(world_states, signals, actions, rew, seed)

In [15]:
# conduct experiment loop
past_rewards = 0
for epoch in range(epochs):
    stimulus = W.get_state()
    signal = S.get_signal(stimulus)
    if signal != -1:
        action = R.get_action(signal)
        reward = W.evaluate(action)
        past_rewards += reward
        S.update(reward)
        R.update(reward)
    # else null action

    if epoch % 25 == 0:
        plt.tight_layout(pad=0)
        plot = sns.heatmap(
            np.exp(R.action_weights) /
            np.exp(R.action_weights).sum(axis=0),
            square=True, cbar=False, annot=True, fmt='.1f'
        ).get_figure()
        plt.xlabel('actions')
        plt.ylabel('messages')
        plt.title(f'Receiver\'s weights, rollout {epoch}')
        plt.savefig(f"images/receiver_{epoch}.png")
        plt.clf()
        
        
        plot = sns.heatmap(
            np.exp(S.signal_weights) /
            np.exp(S.signal_weights).sum(axis=0),
            square=True, cbar=False, annot=True, fmt='.1f'
        ).get_figure()
        plt.ylabel('messages')
        plt.xlabel('world states')
        plt.title(f'Sender\'s weights, rollout {epoch}')
        plt.savefig(f"images/sender_{epoch}.png")
        plt.clf()

    if epoch % 100 == 0:
        print(f'Epoch {epoch}, last 100 epochs reward: {past_rewards/100}')
        past_rewards = 0

    # TODO: reshape this early stop mechanism
    # if early_stop(epochs, past_rewards):
    #     break

Epoch 0, last 100 epochs reward: 0.01
Epoch 100, last 100 epochs reward: 0.54
Epoch 200, last 100 epochs reward: 1.0
Epoch 300, last 100 epochs reward: 1.0
Epoch 400, last 100 epochs reward: 1.0
Epoch 500, last 100 epochs reward: 1.0
Epoch 600, last 100 epochs reward: 1.0
Epoch 700, last 100 epochs reward: 1.0
Epoch 800, last 100 epochs reward: 1.0
Epoch 900, last 100 epochs reward: 1.0
Epoch 1000, last 100 epochs reward: 1.0
Epoch 1100, last 100 epochs reward: 1.0
Epoch 1200, last 100 epochs reward: 1.0
Epoch 1300, last 100 epochs reward: 1.0
Epoch 1400, last 100 epochs reward: 1.0
Epoch 1500, last 100 epochs reward: 1.0
Epoch 1600, last 100 epochs reward: 1.0
Epoch 1700, last 100 epochs reward: 1.0
Epoch 1800, last 100 epochs reward: 1.0
Epoch 1900, last 100 epochs reward: 1.0
Epoch 2000, last 100 epochs reward: 1.0
Epoch 2100, last 100 epochs reward: 1.0
Epoch 2200, last 100 epochs reward: 1.0
Epoch 2300, last 100 epochs reward: 1.0
Epoch 2400, last 100 epochs reward: 1.0
Epoch 2500

<Figure size 640x480 with 0 Axes>

In [18]:
make_gif(f'{world_states}-{actions}-{signals}-sender', epochs, seed, gif_fps)

In [19]:
make_gif(f'{world_states}-{actions}-{signals}-receiver', epochs, seed, gif_fps)

In [20]:
print("Observation to message mapping:")
print(S.signal_weights.argmax(0))
print("Message to action mapping:")
print(R.action_weights.argmax(1))

Observation to message mapping:
[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
Message to action mapping:
[1 0]
