In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
# Seed NumPy's global RNG: the agents defined in this notebook draw their
# actions with np.random.choice, so without a seed every Restart & Run All
# produces different trajectories and figures.
SEED = 0
np.random.seed(SEED)

# Default figure size (inches) for every plot in the notebook.
plt.rcParams['figure.figsize'] = np.array([12, 8])

# Seaborn theme: white grid background, visible tick marks on the bottom/left
# axes, and enlarged title/label/tick/legend fonts.
sb.set(rc={"xtick.bottom" : True,
           "ytick.left" : True,
           'axes.titlesize': 21,
           'axes.labelsize': 20,
           'xtick.labelsize': 18,
           'ytick.labelsize': 18,
           'legend.fontsize': 16,
          }, style='whitegrid')

In [None]:
from games import rock_paper_scissors, chicken_or_dare

In [None]:
class MWagent:
    """Multiplicative-weights agent with uniform exploration (Exp3-style update).

    Attributes:
        num_actions: number of available actions.
        weights: one weight per action; renormalised to sum to 1 after every
            call to ``update_weights``.
        gamma: mixing coefficient between the weight-proportional distribution
            and the uniform distribution; it also scales the update step.
        action_space: array ``[0, ..., num_actions - 1]`` that actions are
            sampled from.
    """

    def __init__(self, n_actions, gamma):
        self.num_actions = n_actions
        self.weights = np.ones(n_actions)
        self.gamma = gamma
        self.action_space = np.arange(self.num_actions)

    def _action_probs(self):
        """Return the sampling distribution over actions.

        It is the normalised weights mixed with the uniform distribution:
        ``(1 - gamma) * w / sum(w) + gamma / num_actions``.
        """
        normalised = self.weights / np.sum(self.weights)
        return (1 - self.gamma) * normalised + self.gamma / self.num_actions

    def get_action(self):
        """Sample one action index from the current mixed distribution."""
        return np.random.choice(self.action_space, p=self._action_probs())

    def update_weights(self, action, score):
        """Update the weights after playing ``action`` and observing ``score``.

        Only the played action gets a non-zero estimate: its score divided by
        the probability with which it was sampled (importance weighting).
        """
        probs = self._action_probs()
        x_hat = np.zeros(self.num_actions)
        x_hat[action] = score / probs[action]
        self.weights *= np.exp(self.gamma * x_hat / self.num_actions)
        # Renormalise so the weights stay bounded over long runs.
        self.weights /= np.sum(self.weights)

In [None]:
def simulate_game(T, agents, game):
    """Play ``game`` for ``T`` rounds and record the trajectory.

    Args:
        T: number of rounds.
        agents: sequence of agents exposing ``num_actions``, ``weights``,
            ``get_action()`` and ``update_weights(action, score)``.
        game: callable invoked with keyword arguments ``p1_action``,
            ``p2_action``, ...; it must return one score per agent.

    Returns:
        dict with arrays ``'weights'`` (agents, T, actions), ``'scores'``
        (agents, T) and ``'actions'`` (agents, T).
    """
    n_players = len(agents)
    n_actions = agents[0].num_actions
    game_states = {
        'weights': np.ones((n_players, T, n_actions)),
        'scores': np.ones((n_players, T)),
        'actions': np.ones((n_players, T)),
    }
    assert game_states['scores'].shape[0] == n_players

    # Keyword names under which each agent's action is passed to ``game``.
    keys = [f'p{k + 1}_action' for k in range(n_players)]

    for step in range(T):
        # Every agent picks its action before any score is revealed.
        chosen = {}
        for k, player in enumerate(agents):
            chosen[keys[k]] = player.get_action()
            game_states['actions'][k, step] = chosen[keys[k]]

        payoffs = game(**chosen)
        assert len(payoffs) == n_players

        # Feed each agent its own score and snapshot its updated weights.
        for k, player in enumerate(agents):
            player.update_weights(chosen[keys[k]], payoffs[k])
            game_states['weights'][k, step, :] = player.weights.copy()
            game_states['scores'][k, step] = payoffs[k]
    return game_states

# External Regret

## Rock, Paper, Scissors experiments

In [None]:
# Two identical multiplicative-weights agents play Rock Paper Scissors
# against each other for T rounds.
gamma = 0.01
T = int(1e6)

p1, p2 = (MWagent(n_actions=3, gamma=gamma) for _ in range(2))

game_states_rps_exp3 = simulate_game(T, [p1, p2], rock_paper_scissors)

In [None]:
# Player 1's weight for each of the three actions after every round.
plt.figure()
for action in range(3):
    plt.plot(game_states_rps_exp3['weights'][0, :, action], label=f'p1 {action}')
plt.legend()
plt.title('Weights over time')
plt.xlabel('iteration')
plt.ylabel('weight value')
plt.plot()

In [None]:
# Running average of player 1's weights: cumulative sum divided by the
# 1-based round index.
plt.figure()
rounds = np.arange(1, T + 1)
for idx, name in enumerate(['rock', 'paper', 'scissors']):
    plt.plot(np.cumsum(game_states_rps_exp3['weights'][0, :, idx]) / rounds, label=name)
#plt.axhline(1/3, linestyle='dashed', label='expectation')
plt.legend()
plt.title('Rock Paper Scissors Cumulative Mean of Agent 1 Weights')
plt.xlabel('iteration')
plt.ylabel('mean weight value')
plt.plot()

###### player 1 external regret

In [None]:
# Counterfactual scores: what the game would have returned each round had
# player 1 always played the fixed action ``a`` against player 2's recorded
# actions.
# Iterating over the recorded actions (instead of a hard-coded 1000000) keeps
# this cell correct when T is changed.
opp_actions = game_states_rps_exp3['actions'][1, :].astype(int)
alt_scores_rps = {}
for a in [0, 1, 2]:
    per_round = [rock_paper_scissors(a, opp) for opp in opp_actions]
    # Transposed so that row 0 is indexed by round (used as [0, :] downstream).
    alt_scores_rps[a] = np.array(per_round).T

In [None]:
# Total counterfactual score of player 1 for each fixed action.
last_alt_cum_scores = [alt_scores_rps[fixed][0, :].sum() for fixed in (0, 1, 2)]

In [None]:
# Best fixed action in hindsight: the one with the largest total score.
best_action_rps = np.asarray(last_alt_cum_scores).argmax()

In [None]:
# Cumulative external regret, counting only rounds where the best fixed
# action would have scored more (negative differences are clipped to 0).
plt.figure()
rps_round_regret = alt_scores_rps[best_action_rps][0, :] - game_states_rps_exp3['scores'][0, :]
plt.plot(np.cumsum(np.fmax(rps_round_regret, 0)), label='Rock Paper Scissors External')
plt.legend()
plt.title('Cumulative External Regret Player 1')
plt.xlabel('iteration')
plt.ylabel('regret')
plt.plot()

In [None]:
# Cumulative signed external regret: best fixed action's score minus the
# score player 1 actually obtained, summed over rounds.
plt.figure()
rps_signed_regret = alt_scores_rps[best_action_rps][0, :] - game_states_rps_exp3['scores'][0, :]
plt.plot(np.cumsum(rps_signed_regret), label='Rock Paper Scissors External')
plt.legend()
plt.title('Cumulative External Regret Player 1')
plt.xlabel('iteration')
plt.ylabel('regret')
plt.plot()

## Chicken or Dare

In [None]:
# Two identical multiplicative-weights agents play Chicken or Dare
# (two actions each) for T rounds.
gamma = 0.001
T = int(1e6)

p1, p2 = (MWagent(n_actions=2, gamma=gamma) for _ in range(2))

game_states_cd_exp3 = simulate_game(T, [p1, p2], chicken_or_dare)

In [None]:
# Player 1's weight for each of the two actions after every round.
plt.figure()
for action in (0, 1):
    plt.plot(game_states_cd_exp3['weights'][0, :, action], label=f'p1 {action}')
plt.title('actual weights over time')
plt.legend()
plt.plot()

In [None]:
# Running average of player 1's weights: cumulative sum divided by the
# 1-based round index.
plt.figure()
rounds = np.arange(1, T + 1)
for idx, name in enumerate(['dare', 'chicken']):
    plt.plot(np.cumsum(game_states_cd_exp3['weights'][0, :, idx]) / rounds, label=name)
plt.legend()
plt.title('Chicken or Dare Cumulative Mean of Agent 1 Weights')
plt.xlabel('iteration')
plt.ylabel('mean weight value')
plt.plot()

###### player 1 external regret

In [None]:
# Counterfactual scores: what the game would have returned each round had
# player 1 always played the fixed action ``a`` against player 2's recorded
# actions.
# Iterating over the recorded actions (instead of a hard-coded int(1e6)) keeps
# this cell correct when T is changed.
opp_actions = game_states_cd_exp3['actions'][1, :].astype(int)
alt_scores_cd = {}
for a in [0, 1]:
    per_round = [chicken_or_dare(a, opp) for opp in opp_actions]
    # Transposed so that row 0 is indexed by round (used as [0, :] downstream).
    alt_scores_cd[a] = np.array(per_round).T

In [None]:
# Total counterfactual score of player 1 for each fixed action.
last_alt_cum_scores = [alt_scores_cd[fixed][0, :].sum() for fixed in (0, 1)]

In [None]:
# Best fixed action in hindsight: the one with the largest total score.
best_action_cd = np.asarray(last_alt_cum_scores).argmax()

In [None]:
# Cumulative external regret, counting only rounds where the best fixed
# action would have scored more (negative differences are clipped to 0).
# Uses alt_scores_cd: the name `alt_scores` is never defined in this notebook
# and raised NameError on a fresh kernel.
plt.figure()
plt.plot(np.cumsum(np.fmax(alt_scores_cd[best_action_cd][0, :] - game_states_cd_exp3['scores'][0, :], 0)), label='Chicken or Dare External')
plt.legend()
plt.title('Cumulative External Regret Player 1')
plt.xlabel('iteration')
plt.ylabel('regret')
plt.plot()

In [None]:
# Cumulative signed external regret: best fixed action's score minus the
# score player 1 actually obtained, summed over rounds.
plt.figure()
cd_signed_regret = alt_scores_cd[best_action_cd][0, :] - game_states_cd_exp3['scores'][0, :]
plt.plot(np.cumsum(cd_signed_regret), label='Chicken or Dare External')
plt.legend()
plt.title('Cumulative External Regret Player 1')
plt.xlabel('iteration')
plt.ylabel('regret')
plt.plot()

# Swap Regret

In [None]:
class RMagent:
    """Agent whose action distribution is derived from pairwise swap regrets.

    Attributes:
        num_actions: number of available actions.
        weights: current sampling weights, one per action (start uniform).
        regrets: ``regrets[a, b]`` is the running estimate of the gain from
            having played ``b`` instead of ``a``; the diagonal is never written.
        mu: divisor applied to regrets when they are turned into weights.
        action_space: array ``[0, ..., num_actions - 1]``.
        T: number of completed updates.
    """

    def __init__(self, n_actions, mu):
        self.num_actions = n_actions
        self.mu = mu
        self.T = 0
        self.action_space = np.array(range(self.num_actions))
        self.weights = np.ones(n_actions) / n_actions
        # Square table; entries [a, a] stay at zero.
        self.regrets = np.zeros((n_actions, n_actions))

    def get_action(self):
        """Sample an action with probability proportional to ``weights``."""
        total = np.sum(self.weights)
        return np.random.choice(self.action_space, p=self.weights / total)

    def update_weights(self, action, scores):
        """Fold one round into the regret table and rebuild the weights.

        Args:
            action: index of the action that was actually played.
            scores: indexable holding, for every action, the score it would
                have earned this round.

        Raises:
            AssertionError: if any resulting weight is not >= 0 (in that case
                ``T`` is not incremented).
        """
        # Running-mean coefficients over the T updates seen so far.
        keep = self.T / (self.T + 1)
        step = 1 / (self.T + 1)
        for swap in range(self.num_actions):
            if swap != action:
                self.regrets[action, swap] = keep * self.regrets[action, swap] + step * (scores[swap] - scores[action])

        for a in range(self.num_actions):
            if a != action:
                # Mass moved to `a` is its positive regret scaled by 1/mu.
                self.weights[a] = np.maximum(self.regrets[action, a], 0) / self.mu
            else:
                # The played action keeps whatever mass the positive regrets
                # of its row leave over.
                self.weights[a] = 1 - np.nansum(np.fmax(self.regrets[action, :], 0)) / self.mu
            assert(self.weights[a] >= 0), self.weights[a]
        self.T += 1

In [None]:
def simulate_game_rm(T, agents, game):
    """Play a two-player ``game`` for ``T`` rounds with regret-tracking agents.

    Args:
        T: number of rounds.
        agents: two agents exposing ``num_actions``, ``weights``, ``regrets``,
            ``get_action()`` and ``update_weights(action, scores)``.
        game: callable ``game(i, j)`` returning the pair of scores for player 1
            playing ``i`` and player 2 playing ``j``.

    Returns:
        dict with arrays ``'weights'`` (agents, T, actions), ``'scores'``
        (agents, T), ``'actions'`` (agents, T) and ``'regrets'``
        (agents, T, actions, actions).
    """
    n_players = len(agents)
    n_actions = agents[0].num_actions
    game_states = {
        'weights': np.ones((n_players, T, n_actions)),
        'scores': np.ones((n_players, T)),
        'actions': np.ones((n_players, T)),
        'regrets': np.ones((n_players, T, n_actions, n_actions)),
    }

    # Tabulate the game once; assumes 2 agents for now.
    scores_matrix = np.ones((n_actions, n_actions, 2))
    for row in range(n_actions):
        for col in range(n_actions):
            scores_matrix[row, col, :] = game(row, col)

    assert game_states['scores'].shape[0] == n_players

    for step in range(T):
        agent_actions = {}
        for k, player in enumerate(agents):
            key = f'p{k + 1}_action'
            agent_actions[key] = player.get_action()
            game_states['actions'][k, step] = agent_actions[key]

        a1 = agent_actions['p1_action']
        a2 = agent_actions['p2_action']
        scores = scores_matrix[a1, a2, :]
        assert len(scores) == n_players

        # Each player is shown the score of every one of its own actions
        # against the opponent's realised action.
        counterfactuals = (
            (0, a1, scores_matrix[:, a2, 0]),
            (1, a2, scores_matrix[a1, :, 1]),
        )
        for k, played, alternatives in counterfactuals:
            agents[k].update_weights(played, alternatives)
            game_states['weights'][k, step, :] = agents[k].weights.copy()
            game_states['regrets'][k, step, :] = agents[k].regrets.copy()

        game_states['scores'][:, step] = scores
    return game_states

## Rock Paper Scissors

In [None]:
mu = 0.5
T = int(2e6)

# RMagent.update_weights asserts that every weight stays non-negative.
# Whenever a run trips that assertion, restart it with fresh agents until one
# run completes. `i` counts the failed attempts and is printed at the start of
# each attempt.
# (The agents formerly built before the loop were overwritten on the first
# iteration, so that dead construction has been removed.)
i = 0
passed = False
while not passed:
    try:
        p1 = RMagent(
            n_actions=3,
            mu=mu,
        )
        p2 = RMagent(
            n_actions=3,
            mu=mu,
        )
        print(i)
        game_states_rps_rm2 = simulate_game_rm(T, [p1, p2], rock_paper_scissors)
        passed = True
    except AssertionError:
        i += 1
        continue

In [None]:
# Player 1's weight for each of the three actions after every round.
plt.figure()
for action in range(3):
    plt.plot(game_states_rps_rm2['weights'][0, :, action], label=f'p1 {action}')
plt.legend()
plt.title('Rock Paper Scissors Agent 1 Weights')
plt.xlabel('iteration')
plt.ylabel('weight value')
plt.plot()

In [None]:
# Running average of player 1's weights: cumulative sum divided by the
# 1-based round index.
plt.figure()
rounds = np.arange(1, T + 1)
for idx, name in enumerate(['rock', 'paper', 'scissors']):
    plt.plot(np.cumsum(game_states_rps_rm2['weights'][0, :, idx]) / rounds, label=name)
#plt.axhline(1/3, linestyle='dashed', label='expectation')
plt.legend()
plt.title('Rock Paper Scissors Cumulative Mean of Agent 1 Weights')
plt.xlabel('iteration')
plt.ylabel('mean weight value')
plt.plot()

###### internal regret positive

In [None]:
# Use the run produced above (game_states_rps_rm2); `game_states_rps_rm` is
# never defined in this notebook and raised NameError on a fresh kernel.
regrets = game_states_rps_rm2['regrets'].copy()

# Swapping an action with itself carries no regret.
regrets[:, :, 0, 0] = 0
regrets[:, :, 1, 1] = 0
regrets[:, :, 2, 2] = 0

regrets_copy = regrets.copy()

# Scale player 1's recorded regret table at round t by (t + 1), in one
# vectorised step instead of a per-round Python loop.
regrets_copy[0] *= np.arange(1, regrets_copy.shape[1] + 1)[:, None, None]

# Round-to-round change of the scaled regrets, shape (T - 1, 3, 3).
reg_diff = np.diff(regrets_copy[0, :, :, :], axis=0)

# Keep only the increases.
reg_diff_pos = np.fmax(reg_diff, 0)

# For every round and every played action, the swap with the largest regret.
reg_argmax = np.nanargmax(regrets[0, :, :, :], 2)

# Pick, for each row, the increment that belongs to that row's best swap.
# reg_argmax has one more round than reg_diff_pos, so drop its last entry.
# Result shape: (T - 1, 3, 1).
rps_rm_pos_reg = np.take_along_axis(reg_diff_pos, reg_argmax[:-1, :, None], axis=2)

In [None]:
# Sum the per-action increments of each round, then accumulate over rounds.
plt.figure()
rps_pos_per_round = np.sum(rps_rm_pos_reg, 1).reshape((-1, ))
plt.plot(np.cumsum(rps_pos_per_round), label='Rock Paper Scissors Internal')
plt.legend()
plt.title('Cumulative Internal Regret Player 1')
plt.xlabel('iteration')
plt.ylabel('regret')
plt.plot()

###### internal regret regular

In [None]:
regrets = game_states_rps_rm2['regrets'].copy()

# Swapping an action with itself carries no regret.
for k in range(3):
    regrets[:, :, k, k] = 0

regrets_copy = regrets.copy()

# Scale player 1's recorded regret table at round t by (t + 1).
regrets_copy[0] *= np.arange(1, regrets_copy.shape[1] + 1).reshape(-1, 1, 1)

# Round-to-round change of the scaled regrets (signed).
reg_diff = np.diff(regrets_copy[0], axis=0)

# For every round and every played action, the swap with the largest regret.
reg_argmax = np.nanargmax(regrets[0], axis=2)

# For each round keep, per row, the increment of that row's best swap;
# zip stops at the shorter input (reg_diff), giving shape (T - 1, 3, 1).
row_index = np.arange(3)[:, None]
rps_rm_reg = np.array([
    step_diff[row_index, best_swap[:, None]]
    for step_diff, best_swap in zip(reg_diff, reg_argmax)
])

In [None]:
# Sum the per-action increments of each round, then accumulate over rounds.
plt.figure()
rps_per_round = np.sum(rps_rm_reg, 1).reshape((-1, ))
plt.plot(np.cumsum(rps_per_round), label='Rock Paper Scissors Internal')
plt.legend()
plt.title('Cumulative Internal Regret Player 1')
plt.xlabel('iteration')
plt.ylabel('regret')
plt.plot()

## Chicken or Dare

In [None]:
# Two identical RMagents play Chicken or Dare (two actions each) for T rounds.
mu = 1.001
T = int(1e4)

p1, p2 = (RMagent(n_actions=2, mu=mu) for _ in range(2))

game_states_cd_rm = simulate_game_rm(T, [p1, p2], chicken_or_dare)

In [None]:
# Player 1's weight for each of the two actions after every round.
plt.figure()
for action in (0, 1):
    plt.plot(game_states_cd_rm['weights'][0, :, action], label=f'p1 {action}')
plt.legend()
plt.title('Chicken or Dare Agent 1 Weights')
plt.xlabel('iteration')
plt.ylabel('weight value')
plt.plot()

In [None]:
# Running average of player 1's weights: cumulative sum divided by the
# 1-based round index. The number of rounds is read from the recorded
# trajectory rather than hard-coded as 10000, so the cell stays valid when T
# is changed.
plt.figure()
n_rounds = game_states_cd_rm['weights'].shape[1]
round_index = np.arange(1, n_rounds + 1)
plt.plot(np.cumsum(game_states_cd_rm['weights'][0, :, 0]) / round_index, label='dare')
plt.plot(np.cumsum(game_states_cd_rm['weights'][0, :, 1]) / round_index, label='chicken')
plt.legend()
plt.title('Chicken or Dare Cumulative Mean of Agent 1 Weights')
plt.xlabel('iteration')
plt.ylabel('mean weight value')
plt.plot()

###### internal regret positive

In [None]:
# Work on a copy so the recorded trajectory is left untouched.
regrets = np.copy(game_states_cd_rm['regrets'])

In [None]:
# Swapping an action with itself carries no regret.
for k in range(2):
    regrets[:, :, k, k] = 0

In [None]:
# Second copy that the next step scales in place.
regrets_copy = np.copy(regrets)

In [None]:
# Scale player 1's recorded regret table at round t by (t + 1).
regrets_copy[0] *= np.arange(1, regrets_copy.shape[1] + 1).reshape(-1, 1, 1)

In [None]:
# Round-to-round change of player 1's scaled regrets.
reg_diff = np.diff(regrets_copy[0], n=1, axis=0)

In [None]:
# Keep only the increases: negative round-to-round changes become 0.
reg_diff_pos = np.fmax(reg_diff, 0)

In [None]:
# For every round and every played action, the swap with the largest regret.
reg_argmax = np.nanargmax(regrets_copy[0], axis=2)

In [None]:
# For each round keep, per row, the increment of that row's best swap;
# zip stops at the shorter input (reg_diff_pos), giving shape (T - 1, 2, 1).
row_index = np.arange(2)[:, None]
cd_rm_pos_reg = np.array([
    step_diff[row_index, best_swap[:, None]]
    for step_diff, best_swap in zip(reg_diff_pos, reg_argmax)
])

In [None]:
# Sum the per-action increments of each round, then accumulate over rounds.
plt.figure()
cd_pos_per_round = np.sum(cd_rm_pos_reg, 1).reshape((-1, ))
plt.plot(np.cumsum(cd_pos_per_round), label='Chicken or Dare Internal')
plt.legend()
plt.title('Cumulative Internal Regret Player 1')
plt.xlabel('iteration')
plt.ylabel('regret')
plt.plot()

###### internal regret regular

In [None]:
regrets = game_states_cd_rm['regrets'].copy()

# Swapping an action with itself carries no regret.
for k in range(2):
    regrets[:, :, k, k] = 0

regrets_copy = regrets.copy()

# Scale player 1's recorded regret table at round t by (t + 1).
regrets_copy[0] *= np.arange(1, regrets_copy.shape[1] + 1).reshape(-1, 1, 1)

# Round-to-round change of the scaled regrets (signed).
reg_diff = np.diff(regrets_copy[0], axis=0)

# For every round and every played action, the swap with the largest regret.
reg_argmax = np.nanargmax(regrets_copy[0], axis=2)

# For each round keep, per row, the increment of that row's best swap;
# zip stops at the shorter input (reg_diff), giving shape (T - 1, 2, 1).
row_index = np.arange(2)[:, None]
cd_rm_reg = np.array([
    step_diff[row_index, best_swap[:, None]]
    for step_diff, best_swap in zip(reg_diff, reg_argmax)
])

In [None]:
# Sum the per-action increments of each round, then accumulate over rounds.
plt.figure()
cd_per_round = np.sum(cd_rm_reg, 1).reshape((-1, ))
plt.plot(np.cumsum(cd_per_round), label='Chicken or Dare Internal')
plt.legend()
plt.title('Cumulative Internal Regret Player 1')
plt.xlabel('iteration')
plt.ylabel('regret')
plt.plot()

# REGRET PLOT

## positive

In [None]:
# All four positive-part regret curves on one figure. The runs have different
# lengths, so each x-axis is mapped onto [0, 1] (fraction of its own horizon)
# and each curve is multiplied by min_T / (its own length).
fig = plt.figure()
cd_rm_T = cd_rm_pos_reg.shape[0]
rps_rm_T = rps_rm_pos_reg.shape[0]
cd_exp3_T = alt_scores_cd[best_action_cd].shape[1]
rps_exp3_T = alt_scores_rps[best_action_rps].shape[1]
min_T = np.min([cd_rm_T, rps_rm_T, cd_exp3_T, rps_exp3_T])
plt.plot(np.linspace(0, 1, cd_rm_T), min_T / cd_rm_T * np.cumsum(np.sum(cd_rm_pos_reg, 1).reshape((-1, ))), label='Chicken or Dare Internal')
plt.plot(np.linspace(0, 1, rps_rm_T), min_T / rps_rm_T * np.cumsum(np.sum(rps_rm_pos_reg, 1).reshape((-1, ))), label='Rock Paper Scissors Internal')
plt.plot(np.linspace(0, 1, cd_exp3_T), min_T / cd_exp3_T * np.cumsum(np.fmax(alt_scores_cd[best_action_cd][0, :] - game_states_cd_exp3['scores'][0, :], 0)), label='Chicken or Dare External')
plt.plot(np.linspace(0, 1, rps_exp3_T), min_T / rps_exp3_T * np.cumsum(np.fmax(alt_scores_rps[best_action_rps][0, :] - game_states_rps_exp3['scores'][0, :], 0)), label='Rock Paper Scissors External')
ax = fig.get_axes()
# Fix the tick locations before labelling them. Calling set_xticklabels on
# auto-located ticks makes the position of 'T' depend on how many ticks the
# locator happens to create, and triggers a matplotlib warning.
ax[0].set_xticks(np.linspace(0, 1, 6))
ax[0].set_xticklabels(['', '', '', '', '', 'T'])
plt.legend()
plt.yscale("log")
plt.title('Cumulative Regret')
plt.xlabel('iteration')
plt.ylabel('regret (log scale)')
plt.plot()

## normal

In [None]:
# Quick look at the signed Chicken or Dare external regret on its own.
cd_external_regret = alt_scores_cd[best_action_cd][0, :] - game_states_cd_exp3['scores'][0, :]
plt.plot(np.cumsum(cd_external_regret), label='Chicken or Dare External')

In [None]:
# Signed external-regret curves of both games against a C * sqrt(T)
# reference curve (C = 4000 here). Each x-axis is mapped onto [0, 1] and each
# curve is multiplied by min_T / (its own length).
fig = plt.figure()
#cd_rm_T = cd_rm_reg.shape[0]
#rps_rm_T = rps_rm_reg.shape[0]
cd_exp3_T = alt_scores_cd[best_action_cd].shape[1]
rps_exp3_T = alt_scores_rps[best_action_rps].shape[1]
min_T = np.min([cd_exp3_T, rps_exp3_T])#cd_rm_T, rps_rm_T, 
#plt.plot(np.linspace(0, 1, cd_rm_T), min_T / cd_rm_T * np.cumsum(np.sum(cd_rm_reg, 1).reshape((-1, ))), label='Chicken or Dare Internal')
#plt.plot(np.linspace(0, 1, rps_rm_T), min_T / rps_rm_T * np.cumsum(np.sum(rps_rm_reg, 1).reshape((-1, ))), label='Rock Paper Scissors Internal')
plt.plot(np.linspace(0, 1, cd_exp3_T), min_T / cd_exp3_T * np.cumsum(alt_scores_cd[best_action_cd][0, :] - game_states_cd_exp3['scores'][0, :]), label='Chicken or Dare External')
plt.plot(np.linspace(0, 1, rps_exp3_T), min_T / rps_exp3_T * np.cumsum(alt_scores_rps[best_action_rps][0, :] - game_states_rps_exp3['scores'][0, :]), label='Rock Paper Scissors External')
plt.plot(np.linspace(0, 1, 1000), 4000 * np.sqrt(np.linspace(0, 1, 1000)), label=r'$C\sqrt{T}$')
ax = fig.get_axes()
# Fix the tick locations before labelling them. Calling set_xticklabels on
# auto-located ticks makes the position of 'T' depend on how many ticks the
# locator happens to create, and triggers a matplotlib warning.
ax[0].set_xticks(np.linspace(0, 1, 6))
ax[0].set_xticklabels(['', '', '', '', '', 'T'])
plt.legend()
plt.title('Cumulative Regret')
plt.xlabel('iteration')
plt.ylabel('regret')
plt.plot()