In [24]:
%load_ext autoreload
%autoreload 2
%matplotlib inline



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import sys
import glob
import pandas as pd
import os
import seaborn as sns
import numpy as np
import torch.nn as nn

from tqdm import tqdm
from statsmodels.distributions.empirical_distribution import ECDF
from collections import defaultdict
# import pickle
import re
import json
from pathlib import Path

from open_spiel.python.algorithms.exploitability import nash_conv, best_response
from open_spiel.python.examples.ubc_plotting_utils import *
from open_spiel.python.examples.ubc_cma import analyze_checkpoint

import dill as pickle
import types

from auctions.webutils import *

os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

output_notebook()

game = pyspiel.load_game('python_clock_auction', dict(filename='/apps/open_spiel/configs/small_sats_complete_symmetric.json'))


# Functions

In [26]:
def get_opening_probabilities(agent, precision=2):
    # TODO: assumes agent is player 0
    e = EnvParams(num_envs=1, sync=False, normalize_rewards=False).make_env(game)
    e.reset()

    if agent.player_id == 1:
        e.step([1])

    step_output = agent.step(e.get_time_step(), is_evaluation=True)
    probs = step_output.probs
    return np.round(probs, precision)

In [27]:
def compute_approx_kl(logratio, ratio):
    with torch.no_grad():
        # calculate approx_kl http://joschu.net/blog/kl-approx.html
        approx_kl = ((ratio - 1) - logratio).mean()
        return approx_kl

def learn(self, time_step, replace_idx=None, verbose=True):
    if replace_idx is None:
        replace_idx = {}
    
    next_obs = torch.Tensor(np.array([ts.observations['info_state'][self.player_id] for ts in time_step])).reshape((-1,) + self.input_shape).to(self.device)

    # bootstrap value if not done
    with torch.no_grad():
        next_value = self.get_value(next_obs).reshape(1, -1)
        # advantages = torch.zeros_like(self.rewards).to(self.device)
        # lastgaelam = 0
        # for t in reversed(range(self.steps_per_batch)):
        #     nextvalues = next_value if t == self.steps_per_batch - 1 else self.values[t + 1]
        #     nextnonterminal = 1.0 - self.dones[t]
        #     delta = self.rewards[t] + nextvalues * nextnonterminal - self.values[t]
        #     advantages[t] = lastgaelam = delta + self.gae_lambda * nextnonterminal * lastgaelam
        # returns = advantages + self.values
        
        returns = torch.zeros_like(self.rewards).to(self.device)
        for t in reversed(range(self.steps_per_batch)):
            next_return = next_value if t == self.steps_per_batch - 1 else returns[t + 1]
            nextnonterminal = 1.0 - self.dones[t]
            returns[t] = self.rewards[t] + self.gamma * nextnonterminal * next_return
        advantages = returns - self.values

    # flatten the batch
    b_legal_actions_mask = self.legal_actions_mask.reshape((-1, self.num_actions))
    b_obs = self.obs.reshape((-1,) + self.input_shape)
    b_logprobs = self.logprobs.reshape(-1)
    b_actions = self.actions.reshape(-1)
    b_probs = self.probs.reshape((-1, self.num_actions))
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = self.values.reshape(-1)

    # Optimizing the policy and value network
    b_inds = np.arange(self.batch_size)
    b_inds = np.array([(replace_idx[i] if i in replace_idx else i) for i in b_inds])

    clipfracs = []
    should_early_stop = False
    for epoch in range(self.update_epochs):
        if verbose:
            print('Epoch: ', epoch)
        if should_early_stop:
            break
        np.random.shuffle(b_inds)
        for start in range(0, self.batch_size, self.minibatch_size):
            end = start + self.minibatch_size
            mb_inds = b_inds[start:end]

            _, newlogprob, entropy, newvalue, probs = self.get_action_and_value(b_obs[mb_inds], legal_actions_mask=b_legal_actions_mask[mb_inds], action=b_actions.long()[mb_inds])
            logratio = newlogprob - b_logprobs[mb_inds]
            ratio = logratio.exp()

            approx_kl = compute_approx_kl(logratio, ratio)
            with torch.no_grad():
                old_approx_kl = (-logratio).mean()
                clipfracs += [((ratio - 1.0).abs() > self.clip_coef).float().mean().item()]

            mb_advantages = b_advantages[mb_inds]
            if self.normalize_advantages:
                mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            # Policy loss
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - self.clip_coef, 1 + self.clip_coef)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()
            # pg_loss = pg_loss1.mean()

            # Neil loss: squared error between new and old probs
            # policy_change_loss = ((probs - b_probs[mb_inds])**2).mean()


            # Value loss
            newvalue = newvalue.view(-1)
            if self.clip_vloss:
                v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                v_clipped = b_values[mb_inds] + torch.clamp(
                    newvalue - b_values[mb_inds],
                    -self.clip_coef,
                    self.clip_coef,
                )
                v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
                
            entropy_loss = entropy.mean()
            # loss = pg_loss + self.penalty_coef * policy_change_loss - self.entropy_coef * entropy_loss + v_loss * self.value_coef
            loss = pg_loss - self.entropy_coef * entropy_loss + v_loss * self.value_coef


            self.optimizer.zero_grad()
            loss.backward()
            self.grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.max_grad_norm)
            self.optimizer.step()

            # Now take some stats over the batch. Possibly very slow. Possibly impossible with memory if batches are too big.
            _, newlogprob, entropy, newvalue, probs = self.get_action_and_value(b_obs, legal_actions_mask=b_legal_actions_mask, action=b_actions.long())
            logratio = newlogprob - b_logprobs
            old_approx_kl = (-logratio).mean() # Want to log this, the final one. Not some arbitarary metric about the minibatch
            ratio = logratio.exp()
            approx_kl = compute_approx_kl(logratio, ratio)

            # Print probabilities here
            if verbose:
                print(f"Approx KL {approx_kl}")
                print(f"Value loss {v_loss}")
                print(f'Mean value diff {torch.abs(newvalue - b_values).mean()}')
                print(f"Opening probabilities: {get_opening_probabilities(agent, 3)}")
                print(f"")
            if self.target_kl is not None:
                if approx_kl > self.target_kl:
                    should_early_stop = True
                    # TODO: Is it fair to also be stopping the value network from learning here?
                    break

    y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
    var_y = np.var(y_true)
    explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

    # compute L_\infty change in probabilities
    # TODO: split into minibatches?
    _, _, _, _, new_probs = self.get_action_and_value(b_obs, legal_actions_mask=b_legal_actions_mask, action=b_actions.long())
    with torch.no_grad():
        self.max_policy_diff = (new_probs - b_probs).abs().max().item()
        self.mean_policy_diff = (new_probs - b_probs).abs().sum(axis=1).mean().item() # Sum over the actions, mean over the batch
        self.mean_value_diff = (newvalue - b_returns[mb_inds]).mean().item() / 2




In [28]:
with open(f"/shared/pickles/ppo_agent_0_41.pkl", 'rb') as f:
    agent = pickle.load(f)
    agent.learn = types.MethodType(learn, agent)


ModuleNotFoundError: No module named '__builtin__'

Python: 3.8.2 (default, Feb 20 2022, 22:22:47) 
[GCC 7.5.0]


In [5]:
def load_agent(update_num, player_id=0):
    with open(f"/shared/pickles/ppo_agent_{player_id}_{update_num}.pkl", 'rb') as f:
        agent = pickle.load(f)
        agent.learn = types.MethodType(learn, agent)

    with open(f'/shared/pickles/time_step_{player_id}_{update_num}.pkl', 'rb') as f:
        time_step = pickle.load(f)

    return agent, time_step



In [6]:
agent, time_step = load_agent(update_num=41, player_id=0)
print(get_opening_probabilities(agent, 3))

ModuleNotFoundError: No module named '__builtin__'

In [15]:
# agent.clip_coef = 0.01

# agent.update_epochs = 1
# agent.minibatch_size = agent.batch_size

# agent.penalty_coef = 10

In [16]:
np.random.seed(1234)
agent.learn(time_step)

Epoch:  0
Approx KL 0.0023220854345709085
Value loss 0.004373705480247736
Mean value diff 0.16078387200832367
Opening probabilities: [0.    0.    0.    0.001 0.351 0.419 0.056 0.109 0.064]

Approx KL 0.006278935354202986
Value loss 0.0046813348308205605
Mean value diff 0.02245188131928444
Opening probabilities: [0.    0.    0.    0.001 0.394 0.397 0.052 0.098 0.058]

Approx KL 0.010528855957090855
Value loss 0.003777453675866127
Mean value diff 0.15443585813045502
Opening probabilities: [0.    0.    0.001 0.002 0.421 0.344 0.055 0.111 0.066]

Approx KL 0.0008966201567091048
Value loss 0.003903985256329179
Mean value diff 0.027677245438098907
Opening probabilities: [0.    0.    0.    0.001 0.323 0.439 0.057 0.11  0.07 ]

Epoch:  1
Approx KL 0.006854294333606958
Value loss 0.002895337762311101
Mean value diff 0.1371176838874817
Opening probabilities: [0.    0.    0.001 0.003 0.381 0.419 0.054 0.091 0.052]

Approx KL 0.012451764196157455
Value loss 0.00272082700394094
Mean value diff 0.03

In [35]:
features = agent.obs.reshape((128*16, 23*9))

In [40]:
np.unique(features, axis=0, return_counts=True, return_index=True, return_inverse=True)

(array([[0.5       , 0.5       , 0.5       , ..., 0.39772728, 0.52272725,
         0.64772725],
        [0.5       , 0.5       , 0.5       , ..., 0.4375    , 0.5511364 ,
         0.66477275],
        [0.5       , 0.5       , 0.5       , ..., 0.4375    , 0.5511364 ,
         0.66477275],
        ...,
        [1.        , 1.        , 1.        , ..., 0.4375    , 0.5625    ,
         0.6875    ],
        [1.        , 1.        , 1.        , ..., 0.4375    , 0.5625    ,
         0.6875    ],
        [1.        , 1.        , 1.        , ..., 0.48125   , 0.61875   ,
         0.75625   ]], dtype=float32),
 array([  12,   25,  289,   26,  114,    2,  736,  708, 1313,  616, 1553,
        1233,   60, 1852,  426, 1868, 1884,   43,   75,  285,  303, 1069,
        1450, 1547, 1466,   81,   10,    5,   19,  865,   56,   44,  222,
         142,  291,  742, 1848,  402, 1271, 1001,  322,  329,  256, 1531,
        1609, 1454, 1168,  343,  874,   49,  275,  726,  102, 1217,  429,
         207,  613, 1787

In [43]:
unique_features, indices, inverse, counts = np.unique(features, axis=0, return_counts=True, return_index=True, return_inverse=True)

In [8]:
update_num = 41
player_id = 0
agent, time_step = load_agent(update_num=update_num, player_id=player_id)
print('Initial:', get_opening_probabilities(agent, 3))

mode_index = indices[np.argmax(counts)]
for i, count in enumerate(counts):
    if count < 10:
        replace_dict = {idx: mode_index for idx in np.where(inverse == i)[0]}
        agent, time_step = load_agent(update_num=update_num, player_id=player_id)
        
        np.random.seed(1234)
        agent.learn(time_step, replace_dict, verbose=False)

        print(f'{i:3d} ({count}):', get_opening_probabilities(agent, 3))


Initial: [0.    0.    0.    0.001 0.335 0.43  0.053 0.113 0.068]


NameError: name 'indices' is not defined

In [20]:
def learn_batch_setup(self, time_step):
    next_obs = torch.Tensor(np.array([ts.observations['info_state'][self.player_id] for ts in time_step])).reshape((-1,) + self.input_shape).to(self.device)

    # bootstrap value if not done
    with torch.no_grad():
        next_value = self.get_value(next_obs).reshape(1, -1)
        # advantages = torch.zeros_like(self.rewards).to(self.device)
        # lastgaelam = 0
        # for t in reversed(range(self.steps_per_batch)):
        #     nextvalues = next_value if t == self.steps_per_batch - 1 else self.values[t + 1]
        #     nextnonterminal = 1.0 - self.dones[t]
        #     delta = self.rewards[t] + nextvalues * nextnonterminal - self.values[t]
        #     advantages[t] = lastgaelam = delta + self.gae_lambda * nextnonterminal * lastgaelam
        # returns = advantages + self.values
        
        returns = torch.zeros_like(self.rewards).to(self.device)
        for t in reversed(range(self.steps_per_batch)):
            next_return = next_value if t == self.steps_per_batch - 1 else returns[t + 1]
            nextnonterminal = 1.0 - self.dones[t]
            returns[t] = self.rewards[t] + self.gamma * nextnonterminal * next_return
        advantages = returns - self.values

    # flatten the batch
    b_legal_actions_mask = self.legal_actions_mask.reshape((-1, self.num_actions))
    b_obs = self.obs.reshape((-1,) + self.input_shape)
    b_logprobs = self.logprobs.reshape(-1)
    b_actions = self.actions.reshape(-1)
    b_probs = self.probs.reshape((-1, self.num_actions))
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = self.values.reshape(-1)

    return {
        'b_legal_actions_mask': b_legal_actions_mask,
        'b_obs': b_obs,
        'b_logprobs': b_logprobs,
        'b_actions': b_actions,
        'b_probs': b_probs,
        'b_advantages': b_advantages,
        'b_returns': b_returns,
        'b_values': b_values,
    }

In [21]:
agent, time_step = load_agent(update_num=update_num, player_id=player_id)
agent.learn = types.MethodType(learn_batch_setup, agent)

batch_data = agent.learn(time_step)

In [22]:
df = pd.DataFrame.from_dict({k: batch_data[k].tolist() for k in batch_data})

In [23]:
df['b_advantages'].describe()

count    2048.000000
mean       -0.081769
std         0.043474
min        -0.439254
25%        -0.099874
50%        -0.077146
75%        -0.052944
max         0.122829
Name: b_advantages, dtype: float64

In [25]:
(df['b_advantages'] > 0).sum()

26

In [24]:
df.to_csv(f'/shared/pickles/batch_data_{player_id}_{update_num}_no_gae.csv', index=False)

In [31]:
update_num = 41
player_id = 0

agent, time_step = load_agent(update_num=update_num, player_id=player_id)
agent.learn = types.MethodType(learn, agent)

features = agent.obs.reshape((128*16, 23*9))
unique_features, indices, inverse, counts = np.unique(features, axis=0, return_counts=True, return_index=True, return_inverse=True)

mode_index = inverse[indices[np.argmax(counts)]]
opening_indices = np.where(inverse == mode_index)[0]

In [32]:
np.random.seed(1234)
replace_index = {idx: np.random.choice(opening_indices) for idx in np.where(inverse != mode_index)[0]}
agent.clip_coef = 0.5
agent.learn(time_step, replace_index)

Epoch:  0
Approx KL 0.004109615460038185
Value loss 0.00462238397449255
Mean value diff 0.17810019850730896
Opening probabilities: [0.    0.    0.    0.002 0.395 0.394 0.051 0.097 0.061]

Approx KL 0.0151225496083498
Value loss 0.029047949239611626
Mean value diff 0.0975572019815445
Opening probabilities: [0.    0.    0.001 0.003 0.451 0.349 0.054 0.088 0.055]

Approx KL 0.05426757037639618
Value loss 0.027956552803516388
Mean value diff 0.226043701171875
Opening probabilities: [0.    0.    0.001 0.006 0.556 0.27  0.051 0.068 0.049]

Approx KL 0.09363732486963272
Value loss 0.02631310373544693
Mean value diff 0.105606809258461
Opening probabilities: [0.    0.    0.002 0.016 0.612 0.223 0.05  0.055 0.042]

Epoch:  1
Approx KL 0.11969111114740372
Value loss 0.019174430519342422
Mean value diff 0.25056275725364685
Opening probabilities: [0.    0.001 0.012 0.101 0.544 0.161 0.06  0.07  0.052]

Approx KL 0.10825421661138535
Value loss 0.04401135444641113
Mean value diff 0.07464641332626343


In [134]:
len(list(map(str, features)))

2048

In [135]:
len(set(map(str, features)))

74

In [138]:
pd.value_counts(list(map(str, features)))

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,\n        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n        0.0000, 0.0000, 0.0000, 0.5000, 0.5000, 0.5000, 1.0000, 1.0000, 1.0000,\n        0.0000, 0.5000, 1.0000, 0.0000, 0.5000, 1.0000, 0.0000, 0.5000, 1.0000,\n        0.0000, 0.1818, 0.3636, 0.3182, 0.5000, 0.6818, 0.6364, 0.8182, 1.0000,\n        0.0000, 0.1565, 0.3054, 0.2739, 0.5125, 0.6898, 0.5344, 0.8227, 1.0000,\n        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,\n        0.0000, 0.0000, 0.00

In [None]:
features