In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
# Seed NumPy's global RNG: every agent below samples its actions through
# np.random.choice, so without a seed no figure is reproducible under
# Restart Kernel -> Run All.
SEED = 0
np.random.seed(SEED)

# Default figure size (inches), then a seaborn whitegrid theme with visible
# tick marks and enlarged title/label/tick/legend fonts.
plt.rcParams['figure.figsize'] = (12, 8)
sb.set(
    rc={
        "xtick.bottom": True,
        "ytick.left": True,
        'axes.titlesize': 21,
        'axes.labelsize': 20,
        'xtick.labelsize': 18,
        'ytick.labelsize': 18,
        'legend.fontsize': 16,
    },
    style='whitegrid',
)

In [None]:
from games import rock_paper_scissors, chicken_or_dare

In [None]:
class MWagent:
    """Exp3-style multiplicative-weights agent for a repeated game.

    The agent keeps one weight per action. Actions are sampled from the
    normalized weights mixed with a uniform distribution (mixing rate
    ``gamma``); after each round the weight of the played action is boosted
    by an importance-weighted estimate of its score.

    Attributes:
        num_actions: number of available actions.
        weights: current weight vector (renormalized to sum to 1 after every update).
        gamma: exploration / learning-rate parameter.
        action_space: array of action indices ``0 .. num_actions - 1``.
    """

    def __init__(self, n_actions, gamma):
        self.gamma = gamma
        self.num_actions = n_actions
        self.action_space = np.arange(self.num_actions)
        self.weights = np.ones(n_actions)

    def _sampling_probs(self):
        """Return the action distribution: normalized weights mixed with uniform."""
        return (1 - self.gamma) * self.weights / np.sum(self.weights) + self.gamma / self.num_actions

    def get_action(self):
        """Sample one action index from the current mixed distribution."""
        return np.random.choice(self.action_space, p=self._sampling_probs())

    def update_weights(self, action, score):
        """Update the weights after playing ``action`` and observing ``score``.

        Only the played action gets a non-zero estimate: its score divided by
        the probability with which it was sampled.
        """
        prob_of_played = self._sampling_probs()[action]
        estimate = np.zeros(self.num_actions)
        estimate[action] = score / prob_of_played
        self.weights *= np.exp(self.gamma * estimate / self.num_actions)
        # Renormalize in place so the weights stay bounded over long runs.
        self.weights /= np.sum(self.weights)

In [None]:
def simulate_game(T, agents, game):
    """Play ``game`` for ``T`` rounds between ``agents`` and record the history.

    Each round every agent picks an action, ``game`` is called with keyword
    arguments ``p1_action``, ``p2_action``, ... and must return one score per
    agent; each agent is then updated with its own action and score.

    Args:
        T: number of rounds.
        agents: sequence of agents exposing ``num_actions``, ``weights``,
            ``get_action()`` and ``update_weights(action, score)``.
        game: callable returning a sequence of per-agent scores.

    Returns:
        dict with arrays ``'weights'`` (agents x T x actions, weights after
        each update), ``'scores'`` (agents x T) and ``'actions'`` (agents x T).
    """
    num_agents = len(agents)
    history_shape = (num_agents, T)
    game_states = {
        'weights': np.ones(history_shape + (agents[0].num_actions,)),
        'scores': np.ones(history_shape),
        'actions': np.ones(history_shape),
    }
    assert len(game_states['scores']) == num_agents

    for step in range(T):
        # All agents commit to an action before anyone sees a score.
        picks = [agent.get_action() for agent in agents]
        game_states['actions'][:, step] = picks

        scores = game(**{f'p{k + 1}_action': pick for k, pick in enumerate(picks)})
        assert len(scores) == num_agents

        for k, (agent, pick) in enumerate(zip(agents, picks)):
            agent.update_weights(pick, scores[k])
            game_states['weights'][k, step, :] = agent.weights.copy()
            game_states['scores'][k, step] = scores[k]
    return game_states

# External Regret

## Rock, Paper, Scissors experiments

In [None]:
# Rock Paper Scissors self-play: two identically configured Exp3 agents
# with three actions each, run for T rounds.
gamma = 0.01
T = 1000000

p1 = MWagent(n_actions=3, gamma=gamma)
p2 = MWagent(n_actions=3, gamma=gamma)

game_states_rps_exp3 = simulate_game(T, [p1, p2], rock_paper_scissors)

In [None]:
# Player 1's recorded weight for each action at every iteration of the
# Rock Paper Scissors Exp3 run.
p1_weights = game_states_rps_exp3['weights'][0]

fig, ax = plt.subplots()
for action in range(p1_weights.shape[1]):
    ax.plot(p1_weights[:, action], label=f'p1 {action}')
ax.legend()
ax.set_title('Weights over time')
ax.set_xlabel('iteration')
ax.set_ylabel('weight value')
# plt.show() renders the figure; the previous bare plt.plot() drew nothing
# and only echoed an empty list as the cell output.
plt.show()

In [None]:
# Running mean of player 1's weights (the time-averaged strategy) in the
# Rock Paper Scissors Exp3 run.
p1_weights = game_states_rps_exp3['weights'][0]
# Take the iteration count from the recorded data rather than the global T,
# which later cells reassign.
steps = np.arange(1, p1_weights.shape[0] + 1)

fig, ax = plt.subplots()
for action, name in enumerate(['rock', 'paper', 'scissors']):
    ax.plot(np.cumsum(p1_weights[:, action]) / steps, label=name)
# ax.axhline(1/3, linestyle='dashed', label='expectation')
ax.legend()
ax.set_title('Rock Paper Scissors Cumulative Mean of Agent 1 Weights')
ax.set_xlabel('iteration')
ax.set_ylabel('mean weight value')
plt.show()

###### player 1 external regret

In [None]:
# Counterfactual payoffs: replay the Exp3 run with player 1 locked to one fixed
# action against player 2's recorded actions, and accumulate the scores.
# The opponent's actions do not depend on the fixed action, so read them once.
opp_actions = game_states_rps_exp3['actions'][1, :].astype(int)

alt_cum_scores = {}
for a in [0, 1, 2]:
    # One row of per-player scores per round (later cells index column 0 for
    # player 1, so the game presumably returns a (p1, p2) pair).
    alt_scores = np.array([rock_paper_scissors(a, opp) for opp in opp_actions])
    alt_cum_scores[a] = np.cumsum(alt_scores, axis=0)

In [None]:
# Final cumulative score player 1 (column 0) would have earned with each fixed action.
last_alt_cum_scores = [
    alt_cum_scores[fixed_action][-1, 0]
    for fixed_action in (0, 1, 2)
]

In [None]:
# Fixed action with the highest final cumulative score (best action in hindsight).
best_action = np.argmax(np.asarray(last_alt_cum_scores))

In [None]:
# External regret of player 1 = cumulative score of the best fixed action in
# hindsight minus the cumulative score player 1 actually earned.
# (Previously cumsum(1 - score) was subtracted, which is the shortfall from a
# per-round win rather than the realized payoff, so the curve was not regret.)
best_fixed_cum = alt_cum_scores[best_action][:, 0]
realized_cum = np.cumsum(game_states_rps_exp3['scores'][0, :])

fig, ax = plt.subplots()
ax.plot(best_fixed_cum - realized_cum, label='player 1')
ax.legend()
ax.set_title('Cumulative External Regret Player 1')
ax.set_xlabel('iteration')
ax.set_ylabel('regret')
plt.show()

## Chicken or Dare

In [None]:
# Chicken or Dare self-play: two identically configured Exp3 agents
# with two actions each, run for T rounds.
gamma = 0.001
T = int(1e6)

p1 = MWagent(n_actions=2, gamma=gamma)
p2 = MWagent(n_actions=2, gamma=gamma)

game_states_cd_exp3 = simulate_game(T, [p1, p2], chicken_or_dare)

In [None]:
# Player 1's recorded weight for each action at every iteration of the
# Chicken or Dare Exp3 run.
p1_weights = game_states_cd_exp3['weights'][0]

fig, ax = plt.subplots()
for action in range(p1_weights.shape[1]):
    ax.plot(p1_weights[:, action], label=f'p1 {action}')
ax.set_title('actual weights over time')
ax.set_xlabel('iteration')
ax.set_ylabel('weight value')
ax.legend()
# plt.show() renders the figure; the previous bare plt.plot() drew nothing.
plt.show()

In [None]:
# Running mean of player 1's weights (the time-averaged strategy) in the
# Chicken or Dare Exp3 run.
p1_weights = game_states_cd_exp3['weights'][0]
# Take the iteration count from the recorded data rather than the global T,
# which other cells reassign.
steps = np.arange(1, p1_weights.shape[0] + 1)

fig, ax = plt.subplots()
for action, name in enumerate(['chicken', 'dare']):
    ax.plot(np.cumsum(p1_weights[:, action]) / steps, label=name)
ax.legend()
ax.set_title('Chicken or Dare Cumulative Mean of Agent 1 Weights')
ax.set_xlabel('iteration')
ax.set_ylabel('mean weight value')
plt.show()

###### player 1 external regret

In [None]:
# Counterfactual payoffs: replay the Exp3 run with player 1 locked to one fixed
# action against player 2's recorded actions, and accumulate the scores.
# The opponent's actions do not depend on the fixed action, so read them once.
opp_actions = game_states_cd_exp3['actions'][1, :].astype(int)

alt_cum_scores = {}
for a in [0, 1]:
    # One row of per-player scores per round (later cells index column 0 for
    # player 1, so the game presumably returns a (p1, p2) pair).
    alt_scores = np.array([chicken_or_dare(a, opp) for opp in opp_actions])
    alt_cum_scores[a] = np.cumsum(alt_scores, axis=0)

In [None]:
# Final cumulative score player 1 (column 0) would have earned with each fixed action.
last_alt_cum_scores = [
    alt_cum_scores[fixed_action][-1, 0]
    for fixed_action in (0, 1)
]

In [None]:
# Fixed action with the highest final cumulative score (best action in hindsight).
best_action = np.argmax(np.asarray(last_alt_cum_scores))

In [None]:
# External regret of player 1 = cumulative score of the best fixed action in
# hindsight minus the cumulative score player 1 actually earned.
# (Previously cumsum(1 - score) was subtracted, which is not the realized
# payoff, so the plotted curve was not regret.)
best_fixed_cum = alt_cum_scores[best_action][:, 0]
realized_cum = np.cumsum(game_states_cd_exp3['scores'][0, :])

fig, ax = plt.subplots()
ax.plot(best_fixed_cum - realized_cum, label='player 1')
ax.legend()
ax.set_title('Cumulative External Regret Player 1')
ax.set_xlabel('iteration')
ax.set_ylabel('regret')
plt.show()

# Swap Regret

In [None]:
class RMagent:
    """Regret-matching agent driven by conditional (action-to-action) regrets.

    ``regrets[j, k]`` is the average, over ALL rounds played so far, of the gain
    the agent would have made by playing ``k`` in the rounds where it actually
    played ``j``. After each round the next action distribution puts
    probability ``max(regrets[j, k], 0) / mu`` on switching from the last
    action ``j`` to ``k`` and the remaining mass on repeating ``j``.

    Attributes:
        num_actions: number of available actions.
        weights: current action distribution (uniform before the first update).
        regrets: (num_actions x num_actions) average-regret matrix; the
            diagonal is always 0.
        mu: normalization constant; must exceed the sum of positive regrets in
            any row, otherwise ``update_weights`` fails its assertion.
        action_space: array of action indices ``0 .. num_actions - 1``.
        T: number of updates performed so far.
    """

    def __init__(self, n_actions, mu):
        self.num_actions = n_actions
        self.weights = np.ones(n_actions) / n_actions
        # Diagonal entries (regret of an action against itself) stay at zero.
        self.regrets = np.zeros((n_actions, n_actions))
        self.mu = mu
        self.action_space = np.arange(self.num_actions)
        self.T = 0

    def get_action(self):
        """Sample one action index from the normalized weights."""
        probs = self.weights / np.sum(self.weights)
        return np.random.choice(self.action_space, p=probs)

    def update_weights(self, action, scores):
        """Fold one round into the average regrets and recompute the weights.

        Args:
            action: index of the action that was played this round.
            scores: per-action payoffs the agent would have received this round
                (``scores[action]`` is the payoff actually obtained).

        Raises:
            AssertionError: if ``mu`` is too small for the probability of
                repeating ``action`` to be non-negative.
        """
        scores = np.asarray(scores, dtype=float)
        rounds = self.T + 1

        # Every entry is an average over all rounds, so the whole matrix is
        # rescaled each round. Previously only the played row was rescaled,
        # leaving the other rows as stale, over-weighted averages.
        self.regrets *= self.T / rounds
        # Only the played action's row gains a new term; the diagonal term
        # scores[action] - scores[action] is zero.
        self.regrets[action, :] += (scores - scores[action]) / rounds

        # Element-wise max with 0: only positive regrets attract probability.
        positive = np.fmax(self.regrets[action, :], 0)
        stay_prob = 1 - np.nansum(positive) / self.mu
        assert stay_prob >= 0, stay_prob

        self.weights[:] = positive / self.mu
        self.weights[action] = stay_prob
        self.T += 1

In [None]:
def simulate_game_rm(T, agents, game):
    """Play ``game`` for ``T`` rounds between two regret-tracking agents.

    The full payoff table is precomputed by calling ``game(i, j)`` for every
    action pair. Each round both agents act, then each is updated with its own
    action and the vector of payoffs it would have received for every one of
    its actions against the opponent's actual action.

    Args:
        T: number of rounds.
        agents: agents exposing ``num_actions``, ``weights``, ``regrets``,
            ``get_action()`` and ``update_weights(action, scores)``; only the
            first two are updated and recorded.
        game: callable ``game(i, j)`` returning the two players' scores.

    Returns:
        dict with arrays ``'weights'``, ``'scores'``, ``'actions'`` and
        ``'regrets'``, indexed by agent and then by round.
    """
    num_agents = len(agents)
    n = agents[0].num_actions
    game_states = {
        'weights': np.ones((num_agents, T, n)),
        'scores': np.ones((num_agents, T)),
        'actions': np.ones((num_agents, T)),
        'regrets': np.ones((num_agents, T, n, n)),
    }

    # assumes 2 agents for now
    scores_matrix = np.ones((n, n, 2))
    for row in range(n):
        for col in range(n):
            scores_matrix[row, col, :] = game(row, col)
    assert len(game_states['scores']) == num_agents

    for step in range(T):
        picks = {}
        for idx, agent in enumerate(agents):
            pick = agent.get_action()
            picks[f'p{idx + 1}_action'] = pick
            game_states['actions'][idx, step] = pick

        a1, a2 = picks['p1_action'], picks['p2_action']
        scores = scores_matrix[a1, a2, :]
        assert len(scores) == num_agents

        # Payoffs each player would have got for every own action, holding
        # the opponent's action fixed.
        counterfactuals = (scores_matrix[:, a2, 0], scores_matrix[a1, :, 1])
        for idx, (pick, payoffs) in enumerate(zip((a1, a2), counterfactuals)):
            agents[idx].update_weights(pick, payoffs)
            game_states['weights'][idx, step, :] = agents[idx].weights.copy()
            game_states['regrets'][idx, step, :] = agents[idx].regrets.copy()

        game_states['scores'][:, step] = scores
    return game_states

## Rock Paper Scissors

In [None]:
# Rock Paper Scissors self-play: two identically configured regret-matching
# agents with three actions each, run for T rounds.
mu = 1.0001
T = int(1e7)

p1 = RMagent(n_actions=3, mu=mu)
p2 = RMagent(n_actions=3, mu=mu)

game_states_rps_rm = simulate_game_rm(T, [p1, p2], rock_paper_scissors)

In [None]:
# Player 1's regret-matching weights over a short window (iterations 100-299)
# of the Rock Paper Scissors run.
window_start, window_stop = 100, 300
p1_window = game_states_rps_rm['weights'][0, window_start:window_stop]
# Plot against the true iteration index so the x axis matches its label
# (previously the window was drawn at positions 0..199).
iterations = np.arange(window_start, window_start + p1_window.shape[0])

fig, ax = plt.subplots()
for action in range(p1_window.shape[1]):
    ax.plot(iterations, p1_window[:, action], label=f'p1 {action}')
ax.legend()
ax.set_title('Rock Paper Scissors Agent 1 Weights')
ax.set_xlabel('iteration')
ax.set_ylabel('weight value')
plt.show()

In [None]:
# Running mean of player 1's weights (the time-averaged strategy) in the
# Rock Paper Scissors regret-matching run.
p1_weights = game_states_rps_rm['weights'][0]
# Take the iteration count from the recorded data rather than the global T,
# which other cells reassign.
steps = np.arange(1, p1_weights.shape[0] + 1)

fig, ax = plt.subplots()
for action, name in enumerate(['rock', 'paper', 'scissors']):
    ax.plot(np.cumsum(p1_weights[:, action]) / steps, label=name)
# ax.axhline(1/3, linestyle='dashed', label='expectation')
ax.legend()
ax.set_title('Rock Paper Scissors Cumulative Mean of Agent 1 Weights')
ax.set_xlabel('iteration')
ax.set_ylabel('mean weight value')
plt.show()

###### cumulative internal regret

In [None]:
# Work on a private copy so masking below leaves the recorded history intact.
regrets = np.copy(game_states_rps_rm['regrets'])

In [None]:
# Mask the self-regret (diagonal) entries for every agent and iteration so
# NaN-aware reductions skip them.
for diag_idx in range(3):
    regrets[:, :, diag_idx, diag_idx] = np.nan

In [None]:
# Cumulative swap (internal) regret of player 1. regrets[0][t, j, k] is the
# average regret for having played j instead of k, so for every played action j
# take the largest positive regret towards any alternative k, sum over j, and
# multiply by the iteration count to turn the average into a cumulative value.
# (Previously a nanmin over the played-action axis was used, which picks the
# least-regretted swap and is not a regret measure.)
p1_avg_regrets = regrets[0]
# Take the iteration count from the data rather than the global T.
steps = np.arange(1, p1_avg_regrets.shape[0] + 1)
# np.fmax ignores the NaN-masked diagonal and clips negative regrets to 0.
positive_regrets = np.fmax(p1_avg_regrets, 0)
swap_regret = np.sum(np.max(positive_regrets, axis=2), axis=1)

fig, ax = plt.subplots()
ax.plot(swap_regret * steps, label='player 1')
ax.legend()
ax.set_title('Cumulative Internal Regret Player 1')
ax.set_xlabel('iteration')
ax.set_ylabel('regret')
plt.show()

## Chicken or Dare

In [None]:
# Chicken or Dare self-play: two identically configured regret-matching
# agents with two actions each, run for T rounds.
mu = 1.1
T = int(1e4)

p1 = RMagent(n_actions=2, mu=mu)
p2 = RMagent(n_actions=2, mu=mu)

game_states_cd_rm = simulate_game_rm(T, [p1, p2], chicken_or_dare)

In [None]:
# Player 1's regret-matching weights at every iteration of the Chicken or Dare run.
p1_weights = game_states_cd_rm['weights'][0]

fig, ax = plt.subplots()
for action in range(p1_weights.shape[1]):
    ax.plot(p1_weights[:, action], label=f'p1 {action}')
ax.legend()
ax.set_title('Chicken or Dare Agent 1 Weights')
ax.set_xlabel('iteration')
ax.set_ylabel('weight value')
# plt.show() renders the figure; the previous bare plt.plot() drew nothing.
plt.show()

In [None]:
# Running mean of player 1's weights (the time-averaged strategy) in the
# Chicken or Dare regret-matching run.
p1_weights = game_states_cd_rm['weights'][0]
# Take the iteration count from the recorded data rather than the global T,
# which other cells reassign.
steps = np.arange(1, p1_weights.shape[0] + 1)

fig, ax = plt.subplots()
for action in range(p1_weights.shape[1]):
    ax.plot(np.cumsum(p1_weights[:, action]) / steps, label=f'p1 {action}')
ax.legend()
ax.set_title('Chicken or Dare Cumulative Mean of Agent 1 Weights')
ax.set_xlabel('iteration')
ax.set_ylabel('mean weight value')
plt.show()

###### cumulative internal regret

In [None]:
# Work on a private copy so masking below leaves the recorded history intact.
regrets = np.copy(game_states_cd_rm['regrets'])

In [None]:
# Mask the self-regret (diagonal) entries for every agent and iteration so
# NaN-aware reductions skip them.
for diag_idx in range(2):
    regrets[:, :, diag_idx, diag_idx] = np.nan

In [None]:
# Cumulative swap (internal) regret of player 1. regrets[0][t, j, k] is the
# average regret for having played j instead of k, so for every played action j
# take the largest positive regret towards any alternative k, sum over j, and
# multiply by the iteration count to turn the average into a cumulative value.
# (Previously a nanmin over the played-action axis was used, which picks the
# least-regretted swap and is not a regret measure.)
p1_avg_regrets = regrets[0]
# Take the iteration count from the data rather than the global T.
steps = np.arange(1, p1_avg_regrets.shape[0] + 1)
# np.fmax ignores the NaN-masked diagonal and clips negative regrets to 0.
positive_regrets = np.fmax(p1_avg_regrets, 0)
swap_regret = np.sum(np.max(positive_regrets, axis=2), axis=1)

fig, ax = plt.subplots()
ax.plot(swap_regret * steps, label='player 1')
ax.legend()
ax.set_title('Cumulative Internal Regret Player 1')
ax.set_xlabel('iteration')
ax.set_ylabel('regret')
plt.show()

TODO

- cumulative regret vs time plot
    - plot function $C\sqrt{T}$ (cumulative)
    - choose large $C$ so that value is always below
    - Keegan thinks for simple games it will be much better than $\sqrt{T}$
- hyperparameters for rock paper scissors
- calculate the rolling-average strategy over time (save the strategy at every time step)
    - want this to converge
- run for $\gg$ 100k iterations

for regret, any accumulation, take max of 0 and that value (strictly positive deltas)

plots to show in report
1. *RPS avg policy over time for exp3 for player 1 (3 lines)
1. *chicken avg policy over time for exp3 for player 1 (2 lines)
1. cumulative external and swap (internal) regret (both games on same plot) so four lines
1. RPS avg policy over time for Regret Matching for player 1 (3 lines)
1. chicken avg policy over time for Regret Matching for player 1 (2 lines)