In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline



In [14]:
import sys
import glob
import pandas as pd
import os
import seaborn as sns
import numpy as np

from tqdm import tqdm
from statsmodels.distributions.empirical_distribution import ECDF
from collections import defaultdict
import pickle
import re
import json
from pathlib import Path

from open_spiel.python.algorithms.exploitability import nash_conv, best_response
from open_spiel.python.examples.ubc_plotting_utils import *
from open_spiel.python.examples.ubc_cma import analyze_checkpoint


from auctions.webutils import *

# Set the DJANGO_ALLOW_ASYNC_UNSAFE flag for this process.
# NOTE(review): presumably needed because the kernel runs inside an event loop
# and Django would otherwise refuse synchronous ORM calls there -- confirm.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# NOTE(review): output_notebook is not imported by name; it presumably arrives
# through one of the star imports above -- confirm.
output_notebook()

In [3]:
# Load the clock-auction game from its JSON config; get_opening_probabilities
# reads this module-level `game`.
# NOTE(review): `pyspiel` is not imported by name before this cell (an explicit
# import only appears much further down); it presumably arrives through a star
# import -- confirm, or import it explicitly in the import cell.
game = pyspiel.load_game('python_clock_auction', dict(filename='/apps/open_spiel/configs/small_sats_complete_symmetric.json'))

def get_opening_probabilities(agent, precision=2):
    """Return the agent's rounded action probabilities at its first decision.

    A fresh single-environment instance of the module-level ``game`` is built
    and reset, the agent is queried once in evaluation mode, and the ``probs``
    field of its step output is rounded.

    Args:
        agent: object exposing ``player_id`` and
            ``step(time_step, is_evaluation=...)``.
        precision: number of decimals passed to ``np.round``.

    Returns:
        ``np.round`` of the ``probs`` reported by the agent.
    """
    env_params = EnvParams(num_envs=1, sync=False, normalize_rewards=False)
    env = env_params.make_env(game)
    env.reset()

    # For player 1 the environment is stepped once with action 1 before the
    # agent is queried.
    # NOTE(review): presumably this moves play past player 0's turn; other
    # player ids get no special handling -- confirm.
    needs_advance = agent.player_id == 1
    if needs_advance:
        env.step([1])

    opening_time_step = env.get_time_step()
    return np.round(agent.step(opening_time_step, is_evaluation=True).probs, precision)

In [4]:

import torch.nn as nn

def compute_approx_kl(logratio, ratio):
    """Estimate the KL divergence as ``mean((ratio - 1) - logratio)``.

    The estimator is the one described at http://joschu.net/blog/kl-approx.html.
    It is evaluated with gradient tracking disabled, so the result carries no
    autograd history.

    Args:
        logratio: tensor of log-probability differences.
        ratio: tensor of the corresponding probability ratios.

    Returns:
        A scalar tensor holding the mean of the element-wise estimator.
    """
    with torch.no_grad():
        return torch.mean(ratio - 1 - logratio)

def learn(self, time_step):
    """Run one PPO update on the rollout stored on ``self`` (verbose debugging variant).

    Advantages are accumulated backwards over ``self.steps_per_batch`` steps,
    bootstrapping from the value of the observations in ``time_step``. The
    networks are then optimised for up to ``self.update_epochs`` epochs of
    shuffled minibatches using the clipped surrogate policy loss, an entropy
    bonus and an optionally clipped value loss. After every optimiser step the
    whole batch is re-evaluated, diagnostics are printed, and training stops
    once the full-batch approximate KL exceeds ``self.target_kl`` (when set).

    Args:
        time_step: iterable of time steps; for each one the info state of
            ``self.player_id`` is read from ``observations['info_state']`` to
            bootstrap the value after the last stored step.

    Side effects:
        Steps ``self.optimizer`` and sets ``self.grad_norm``,
        ``self.max_policy_diff``, ``self.mean_policy_diff`` and
        ``self.mean_value_diff``. Prints diagnostics to stdout.
    """
    next_obs = torch.Tensor(np.array([ts.observations['info_state'][self.player_id] for ts in time_step])).reshape((-1,) + self.input_shape).to(self.device)

    # bootstrap value if not done
    with torch.no_grad():
        next_value = self.get_value(next_obs).reshape(1, -1)
        advantages = torch.zeros_like(self.rewards).to(self.device)
        lastgaelam = 0
        for t in reversed(range(self.steps_per_batch)):
            nextvalues = next_value if t == self.steps_per_batch - 1 else self.values[t + 1]
            nextnonterminal = 1.0 - self.dones[t]
            # Note: neither recursion below applies a discount factor.
            delta = self.rewards[t] + nextvalues * nextnonterminal - self.values[t]
            advantages[t] = lastgaelam = delta + self.gae_lambda * nextnonterminal * lastgaelam
        returns = advantages + self.values

    # flatten the batch
    b_legal_actions_mask = self.legal_actions_mask.reshape((-1, self.num_actions))
    b_obs = self.obs.reshape((-1,) + self.input_shape)
    b_logprobs = self.logprobs.reshape(-1)
    # Cast once here instead of on every forward pass.
    b_actions = self.actions.reshape(-1).long()
    b_probs = self.probs.reshape((-1, self.num_actions))
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = self.values.reshape(-1)

    # Optimizing the policy and value network
    b_inds = np.arange(self.batch_size)
    should_early_stop = False
    for epoch in range(self.update_epochs):
        print('Epoch: ', epoch)
        if should_early_stop:
            break
        np.random.shuffle(b_inds)
        for start in range(0, self.batch_size, self.minibatch_size):
            end = start + self.minibatch_size
            mb_inds = b_inds[start:end]

            _, newlogprob, entropy, newvalue, _ = self.get_action_and_value(b_obs[mb_inds], legal_actions_mask=b_legal_actions_mask[mb_inds], action=b_actions[mb_inds])
            ratio = (newlogprob - b_logprobs[mb_inds]).exp()

            mb_advantages = b_advantages[mb_inds]
            if self.normalize_advantages:
                mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            # Policy loss: pessimistic maximum of the unclipped and clipped surrogates.
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - self.clip_coef, 1 + self.clip_coef)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()

            # Neil loss: squared error between new and old probs (currently disabled)
            # policy_change_loss = ((probs - b_probs[mb_inds])**2).mean()

            # Value loss
            newvalue = newvalue.view(-1)
            if self.clip_vloss:
                v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                v_clipped = b_values[mb_inds] + torch.clamp(
                    newvalue - b_values[mb_inds],
                    -self.clip_coef,
                    self.clip_coef,
                )
                v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

            entropy_loss = entropy.mean()
            # loss = pg_loss + self.penalty_coef * policy_change_loss - self.entropy_coef * entropy_loss + v_loss * self.value_coef
            loss = pg_loss - self.entropy_coef * entropy_loss + v_loss * self.value_coef

            self.optimizer.zero_grad()
            loss.backward()
            self.grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.max_grad_norm)
            self.optimizer.step()

            # Full-batch statistics after the step. Possibly very slow, and
            # possibly impossible memory-wise if batches are too big. They are
            # diagnostics only, so no autograd graph is built for them.
            with torch.no_grad():
                _, full_logprob, _, full_value, _ = self.get_action_and_value(b_obs, legal_actions_mask=b_legal_actions_mask, action=b_actions)
                full_logratio = full_logprob - b_logprobs
                approx_kl = compute_approx_kl(full_logratio, full_logratio.exp())
                # Flatten before subtracting so the comparison with the flat
                # b_values is element-wise rather than a broadcast.
                mean_abs_value_change = torch.abs(full_value.reshape(-1) - b_values).mean()

            print(f"Approx KL {approx_kl}")
            print(f"Value loss {v_loss}")
            print(f'Mean value diff {mean_abs_value_change}')
            # Report the agent being trained, not whatever `agent` is in the notebook namespace.
            print(f"Opening probabilities: {get_opening_probabilities(self, 3)}")
            print(f"")
            if self.target_kl is not None:
                if approx_kl > self.target_kl:
                    should_early_stop = True
                    # TODO: Is it fair to also be stopping the value network from learning here?
                    break

    # compute L_infinity change in probabilities
    # TODO: split into minibatches?
    with torch.no_grad():
        # Re-evaluate the whole batch here so these summaries do not depend on
        # variables left over from the training loop.
        _, _, _, final_value, new_probs = self.get_action_and_value(b_obs, legal_actions_mask=b_legal_actions_mask, action=b_actions)
        prob_change = (new_probs - b_probs).abs()
        self.max_policy_diff = prob_change.max().item()
        self.mean_policy_diff = prob_change.sum(axis=1).mean().item() # Sum over the actions, mean over the batch
        # Full-batch predictions against full-batch returns (same length on both sides).
        self.mean_value_diff = (final_value.reshape(-1) - b_returns).mean().item() / 2




In [5]:
import dill as pickle
import types

# Which saved snapshot to replay; both file names below are built from these.
player_id = 0
update_num = 41

# `pickle` is dill in this cell (see the import just above).
# NOTE(review): unpickling can execute arbitrary code from the file -- only
# load snapshots from a trusted location.
with open(f"/shared/pickles/ppo_agent_{player_id}_{update_num}.pkl", 'rb') as f:
    agent = pickle.load(f)
    # Bind the instrumented `learn` defined in this notebook to the loaded agent,
    # replacing whatever `learn` it was pickled with.
    agent.learn = types.MethodType(learn, agent)


# Opening probabilities of the agent as loaded, before any further update.
print(get_opening_probabilities(agent, 3))

# The time steps saved alongside the agent; passed to agent.learn() further down.
with open(f'/shared/pickles/time_step_{player_id}_{update_num}.pkl', 'rb') as f:
    time_step = pickle.load(f)

# for update_num in range(0, 50):
#     print(update_num)
#     with open(f"/shared/pickles/ppo_agent_{player_id}_{update_num}.pkl", 'rb') as f:
#         agent = pickle.load(f)

#     print(get_opening_probabilities(agent))

#     with open(f'/shared/pickles/time_step_{player_id}_{update_num}.pkl', 'rb') as f:
#         time_step = pickle.load(f)

[0.    0.    0.    0.001 0.335 0.43  0.053 0.113 0.068]


In [None]:
# Baseline: opening probabilities before agent.learn() is re-run below.
print(get_opening_probabilities(agent, 3))

In [None]:
# Optional hyper-parameter overrides for the replayed update (currently disabled).
# agent.clip_coef = 0.01

# agent.update_epochs = 1
# agent.minibatch_size = agent.batch_size

# NOTE: the `learn` defined in this notebook has its penalty term commented
# out, so as written this setting does not enter the loss.
agent.penalty_coef = 10

In [None]:
# Seed NumPy so the minibatch shuffle inside learn() is repeatable.
# NOTE(review): only NumPy is seeded here; torch-side randomness, if any, is not -- confirm.
np.random.seed(1234)
# Replay the PPO update on the saved time steps with the instrumented learn().
agent.learn(time_step)

In [None]:
# Opening probabilities after the replayed update, for comparison with the baseline printed earlier.
print(get_opening_probabilities(agent, 3))

In [None]:
# Flatten the stored observations into a 2-D array, one row per sample.
# NOTE(review): 128*16 and 23*9 are hard-coded; presumably rollout steps x
# environments and the flattened observation size -- confirm against
# agent.batch_size / agent.input_shape.
features = agent.obs.reshape((128*16, 23*9))

In [None]:
agent.values.reshape((128*16, 1))

In [None]:
# Total number of observation rows (each row rendered as a string).
len(list(map(str, features)))

In [None]:
# Number of distinct observation rows, using the string rendering as the key.
len(set(map(str, features)))

In [None]:
# How often each distinct observation row occurs (rows keyed by their string
# rendering). Series.value_counts is used because the top-level
# pd.value_counts helper is deprecated in recent pandas releases.
pd.Series(list(map(str, features))).value_counts()

In [None]:
import torch
a = torch.tensor([3,4,5])

In [None]:
import glob

In [None]:
import re

In [None]:
def update_index_of(fname):
    """Return the integer after the last '_' in ``fname``, or None.

    Every '.pkl' occurrence is stripped from that final piece before parsing.
    Names whose final piece is not an integer yield None.
    """
    suffix = fname.split('_')[-1].replace('.pkl', '')
    try:
        return int(suffix)
    except ValueError:
        # Only a failed integer parse is expected here; anything else
        # (including KeyboardInterrupt) should propagate.
        return None


# Collect every file under /shared/pickles whose trailing index is above 100.
bad = []
for fname in glob.glob('/shared/pickles/*'):
    idx = update_index_of(fname)
    if idx is not None and idx > 100:
        bad.append(fname)


In [None]:
# Delete the flagged files one at a time from Python. Unlike pasting the names
# into a shell command, this is safe for names containing spaces or shell
# metacharacters and does nothing when the list is empty.
for stale_path in bad:
    os.remove(stale_path)

In [None]:
bad

In [6]:
import pandas as pd
# Load a per-sample dump of one training batch.
# NOTE(review): the file name suggests player 1, update 39, with GAE disabled --
# inferred from the name only; confirm against the code that wrote it.
df = pd.read_csv('/shared/pickles/batch_data_1_39_no_gae.csv')

In [7]:
# Observation stored in the first row of the dump.
# NOTE(review): presumably this is the opening state of the auction -- confirm.
open_obs = df.iloc[0]['b_obs']

In [8]:
# Keep only the rows whose observation equals `open_obs`. The value is passed
# by reference (`@open_obs`) rather than pasted into the query string, so
# quotes or other special characters in it cannot break the expression.
open_df = df.query('b_obs == @open_obs')

In [9]:
# Summary of the stored value estimates over the rows sharing that observation.
open_df['b_values'].describe()

count    1007.00000
mean        0.53495
std         0.00000
min         0.53495
25%         0.53495
50%         0.53495
75%         0.53495
max         0.53495
Name: b_values, dtype: float64

In [10]:
# Summary of the returns for the same rows, to compare against the value estimates.
open_df['b_returns'].describe()

count    1007.000000
mean        0.487624
std         0.044510
min         0.050758
25%         0.480303
50%         0.503030
75%         0.503030
max         0.703030
Name: b_returns, dtype: float64

In [None]:
 df.query('b_actions == 4')['b_advantages'].describe()

In [None]:
import pyspiel
game = pyspiel.load_game('python_clock_auction', dict(filename='/apps/open_spiel/configs/small_sats_complete_symmetric.json'))


In [11]:
open_df['b_actions'].value_counts()

5.0    427
4.0    405
6.0    115
7.0     32
8.0     28
Name: b_actions, dtype: int64

In [13]:
# Mean advantage per action taken from that observation.
open_df.groupby('b_actions')['b_advantages'].mean()

b_actions
4.0   -0.035967
5.0   -0.047394
6.0   -0.062992
7.0   -0.085646
8.0   -0.102466
Name: b_advantages, dtype: float64

In [15]:
game = pyspiel.load_game('python_clock_auction', dict(filename='/apps/open_spiel/configs/small_sats_complete_symmetric.json'))


In [18]:
game.auction_params.all_bids

array([[0, 0],
       [0, 1],
       [0, 2],
       [1, 0],
       [1, 1],
       [1, 2],
       [2, 0],
       [2, 1],
       [2, 2]])