In [308]:
%load_ext autoreload
%autoreload 2
%pylab inline

import sys
import glob
import pandas as pd
import os
import seaborn as sns
# from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm
import pickle
from statsmodels.distributions.empirical_distribution import ECDF
from collections import defaultdict
import logging
from open_spiel.python.examples.ubc_mccfr_cpp_example import action_to_bids
from open_spiel.python.examples.ubc_nfsp_example import policy_from_checkpoint
from open_spiel.python.examples.ubc_br import BR_DIR, make_dqn_agent
from open_spiel.python.examples.ubc_utils import *
from open_spiel.python.examples.ubc_decorators import CachingAgentDecorator

from open_spiel.python.pytorch.ubc_nfsp import NFSP
import bokeh
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import HoverTool, ColumnDataSource, ColorBar, LogColorMapper, LinearColorMapper
from bokeh.transform import linear_cmap, log_cmap

import yaml

output_notebook()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [309]:
# Why is best response not at least discovering that it can get 0 reward?

In [310]:
# Directory of the trained experiment; the checkpoint and config.yml are read from here.
# NOTE(review): hard-coded absolute path -- adjust for your environment.
experiment_dir = '/shared/outputs/jan2_big_game_2/lstm_deep'
# Checkpoint suffix handed to policy_from_checkpoint.
checkpoint_name = 'checkpoint_3000000'
# Index of the player whose agent is replaced by a freshly built DQN agent.
br_player = 0
# Not referenced in the visible cells; presumably a reporting interval in episodes -- confirm.
report_freq = 5_000

In [332]:
# Restore the trained run from its checkpoint and pull out the pieces used below.
checkpoint_dir = os.path.join(experiment_dir, BR_DIR)
env_and_model = policy_from_checkpoint(experiment_dir, checkpoint_suffix=checkpoint_name)

game = env_and_model.game
policy = env_and_model.nfsp_policies
env = env_and_model.env
trained_agents = env_and_model.agents
game_config = env_and_model.game_config

# Load the experiment's YAML config from the experiment directory.
config_path = f'{experiment_dir}/config.yml'
with open(config_path, 'rb') as fh:
    config = yaml.load(fh, Loader=yaml.FullLoader)

# Override the episode budget for this notebook session.
num_training_episodes = 500_000
config['num_training_episodes'] = num_training_episodes

In [333]:
# One agent per seat: a new DQN agent for `br_player`, and the already-trained
# agents (wrapped in CachingAgentDecorator) for everybody else.
agents = []
for i in range(game.num_players()):
    agent = (
        make_dqn_agent(i, config, env, game, game_config)
        if i == br_player
        else CachingAgentDecorator(trained_agents[i])
    )
    agents.append(agent)

logging.info(f"Training for {num_training_episodes} episodes")
# TRAINING PHASE

In [334]:
agents[0]

<open_spiel.python.pytorch.ubc_dqn.DQN at 0x7f4dc6c1cb38>

In [335]:
import torch

In [317]:
# Total number of scalar parameters in agents[0]'s Q-network.
# Tensor.numel() gives the element count directly, without building a throwaway
# tensor from each shape, and always yields an int (even for 0-d parameters).
sum(p.numel() for p in agents[0]._q_network.parameters())

79372

In [318]:
512*128

65536

In [336]:
def check_on_q_values(agent, game):
    """Report the agent's Q-value estimate for every legal action at a fixed probe state.

    The probe state is reached from the game's initial state via
    ``.child(1).child(0)``. The state's information-state tensor is run through
    ``agent._q_network`` and each legal action's output is passed through
    ``agent.unmapRange``.

    Args:
        agent: object exposing ``_q_network`` (with ``reshape_infostate`` and
            ``prep_batch``) and ``unmapRange``.
        game: game object providing ``new_initial_state()``; also passed to
            ``get_actions`` to obtain the action descriptions.

    Returns:
        dict mapping action description -> un-mapped Q-value (0-d tensor), in
        the order of ``state.legal_actions()``.
    """
    q_network = agent._q_network
    state = game.new_initial_state().child(1).child(0)
    legal_actions = state.legal_actions()

    it = state.information_state_tensor()
    info_state = q_network.prep_batch([q_network.reshape_infostate(it)])
    q_values = q_network(info_state).detach()[0]

    # Look each description up by action id. Zipping *all* descriptions against
    # only the legal Q-values would mislabel entries whenever the legal actions
    # are a strict subset of the action space.
    # NOTE(review): assumes get_actions(game).values() is ordered by action id
    # (the same ordering the previous zip-based labelling relied on) -- confirm.
    action_names = list(get_actions(game).values())
    return {
        action_names[action]: agent.unmapRange(q_values[action])
        for action in legal_actions
    }

In [337]:
check_on_q_values(agents[br_player], game)

{'Bid for 0,0,0 licenses @ $0 with activity 0': tensor(-108.6960),
 'Bid for 0,0,1 licenses @ $25 with activity 25': tensor(-102.2518),
 'Bid for 0,1,0 licenses @ $50 with activity 50': tensor(-100.3396),
 'Bid for 0,1,1 licenses @ $75 with activity 75': tensor(-87.1786),
 'Bid for 1,0,0 licenses @ $75 with activity 75': tensor(-112.2526),
 'Bid for 1,0,1 licenses @ $100 with activity 100': tensor(-147.0244),
 'Bid for 1,1,0 licenses @ $125 with activity 125': tensor(-55.9490),
 'Bid for 1,1,1 licenses @ $150 with activity 150': tensor(-53.2103),
 'Bid for 2,0,0 licenses @ $150 with activity 150': tensor(-113.5410),
 'Bid for 2,0,1 licenses @ $175 with activity 175': tensor(-100.8988),
 'Bid for 2,1,0 licenses @ $200 with activity 200': tensor(-159.1750),
 'Bid for 2,1,1 licenses @ $225 with activity 225': tensor(-96.9344)}

In [338]:
# fill buffer with some data with opponent dropping out:
# for each of the 12 first actions, record 10 transitions in which the next
# env step is action 0.
# NOTE(review): assumes br_player moves first after env.reset() -- confirm.
for action in range(12):
    for _ in range(10):
        time_step_1 = env.reset()
        env.step([action])
        time_step_2 = env.step([0])

        agents[br_player].add_transition(
            time_step_1, action, time_step_2
        )

# switch learning settings temporarily
min_buffer_size = agents[br_player]._min_buffer_size_to_learn
batch_size = agents[br_player]._batch_size
try:
    agents[br_player]._min_buffer_size_to_learn = 1
    agents[br_player]._batch_size = 12

    # train a bunch
    for iteration in tqdm(range(3000)):
        agents[br_player].learn()
finally:
    # Always undo the temporary overrides and drop the hand-made transitions,
    # even if learn() raises or the cell is interrupted; otherwise later
    # training would silently run with batch size 12 on synthetic data.
    agents[br_player]._min_buffer_size_to_learn = min_buffer_size
    agents[br_player]._batch_size = batch_size

    # clear buffer
    agents[br_player]._replay_buffer.clear()

100%|██████████| 3000/3000 [00:09<00:00, 330.98it/s]


In [322]:
# for d in agents[br_player]._replay_buffer._data:
#     print(d.raw_reward)

In [325]:
len(agents[br_player]._replay_buffer._data)

0

In [326]:
# agents[br_player]._min_buffer_size_to_learn = 1
# agents[br_player]._batch_size = 12
# for g in agents[br_player]._optimizer.param_groups:
#     g['lr'] = 1e-2


In [327]:
# Call learn() on the best-response agent another 1000 times.
# NOTE(review): the replay buffer was shown to be empty (length 0) shortly
# before this cell ran -- confirm learn() actually performs updates here.
for iteration in range(1000):
    agents[br_player].learn()

In [339]:
check_on_q_values(agents[br_player], game)

{'Bid for 0,0,0 licenses @ $0 with activity 0': tensor(0.0282),
 'Bid for 0,0,1 licenses @ $25 with activity 25': tensor(17.5503),
 'Bid for 0,1,0 licenses @ $50 with activity 50': tensor(20.8990),
 'Bid for 0,1,1 licenses @ $75 with activity 75': tensor(42.5817),
 'Bid for 1,0,0 licenses @ $75 with activity 75': tensor(23.3687),
 'Bid for 1,0,1 licenses @ $100 with activity 100': tensor(44.4017),
 'Bid for 1,1,0 licenses @ $125 with activity 125': tensor(47.3201),
 'Bid for 1,1,1 licenses @ $150 with activity 150': tensor(70.7930),
 'Bid for 2,0,0 licenses @ $150 with activity 150': tensor(49.8554),
 'Bid for 2,0,1 licenses @ $175 with activity 175': tensor(74.3150),
 'Bid for 2,1,0 licenses @ $200 with activity 200': tensor(71.8852),
 'Bid for 2,1,1 licenses @ $225 with activity 225': tensor(96.8533)}

In [329]:
# fill buffer with some data with opponent dropping out:
# for each of the 12 first actions, record 10 transitions in which the next
# env step is action 0.
# NOTE(review): assumes br_player moves first after env.reset() -- confirm.
for action in range(12):
    for _ in range(10):
        time_step_1 = env.reset()
        env.step([action])
        time_step_2 = env.step([0])

        agents[br_player].add_transition(
            time_step_1, action, time_step_2
        )

# switch learning settings temporarily
min_buffer_size = agents[br_player]._min_buffer_size_to_learn
batch_size = agents[br_player]._batch_size
try:
    agents[br_player]._min_buffer_size_to_learn = 1
    agents[br_player]._batch_size = 12

    # train a bunch
    for iteration in tqdm(range(3000)):
        agents[br_player].learn()
finally:
    # Restore the original settings and discard the synthetic transitions no
    # matter how the training loop exits (exception, KeyboardInterrupt, ...).
    agents[br_player]._min_buffer_size_to_learn = min_buffer_size
    agents[br_player]._batch_size = batch_size

    # clear buffer
    agents[br_player]._replay_buffer.clear()


100%|██████████| 3000/3000 [00:09<00:00, 332.14it/s]


In [341]:
agents[br_player]._min_buffer_size_to_learn

5000

In [340]:
agents[br_player]._batch_size

256

In [342]:
import pprofile

# Create the output directory before the (slow) profiled run so that
# dump_stats() cannot fail at the very end because "output/" is missing.
os.makedirs("output", exist_ok=True)

profiler = pprofile.Profile()
with profiler:
    for i in tqdm(range(5000)):
        # if i % 1000 == 0:
        #     logging.info(f"----Episode {i} ---")
        #     if i > 1_000:
        #         loss = agents[br_player].loss
        #         logging.info(f"[P{br_player}] Loss: {loss}")
        #         print(check_on_q_values(agents[br_player], game))

        time_step = env.reset()

        # Play one episode; every agent except br_player is stepped with
        # is_evaluation=True.
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent = agents[player_id]
            agent_output = agent.step(time_step, is_evaluation=player_id != br_player)
            action_list = [agent_output.action]
            time_step = env.step(action_list)

        # Episode is over, step all agents with final info state.
        for player_id, agent in enumerate(agents):
            agent.step(time_step, is_evaluation=player_id != br_player)


# profiler.print_stats()
profiler.dump_stats("output/profile.txt")



100%|██████████| 5000/5000 [01:20<00:00, 62.00it/s] 


In [345]:
def sample_train(num_episodes=5000):
    """Play `num_episodes` episodes, stepping the best-response agent in training mode.

    Uses the notebook globals `env`, `agents` and `br_player`. Within each
    episode the current player's agent picks the action; every agent other than
    `br_player` is stepped with ``is_evaluation=True``. After the episode ends,
    all agents are stepped once more with the final time step.

    Args:
        num_episodes: number of episodes to play. Defaults to 5000, the value
            that was previously hard-coded, so ``sample_train()`` is unchanged.
    """
    for i in tqdm(range(num_episodes)):
        # if i % 1000 == 0:
        #     logging.info(f"----Episode {i} ---")
        #     if i > 1_000:
        #         loss = agents[br_player].loss
        #         logging.info(f"[P{br_player}] Loss: {loss}")
        #         print(check_on_q_values(agents[br_player], game))

        time_step = env.reset()

        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent = agents[player_id]
            agent_output = agent.step(time_step, is_evaluation=player_id != br_player)
            action_list = [agent_output.action]
            time_step = env.step(action_list)

        # Episode is over, step all agents with final info state.
        for player_id, agent in enumerate(agents):
            agent.step(time_step, is_evaluation=player_id != br_player)

In [346]:
import cProfile
# Profile one sample_train() run; sort='tottime' orders the report by each
# function's internal time (excluding time spent in sub-calls).
cProfile.run('sample_train()', sort='tottime')

100%|██████████| 5000/5000 [01:13<00:00, 67.98it/s] 


         3044580 function calls (3039128 primitive calls) in 73.503 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      305   27.685    0.091   27.685    0.091 {method 'run_backward' of 'torch._C._EngineBase' objects}
    90238   10.568    0.000   10.568    0.000 {built-in method tensor}
    30210    4.042    0.000    4.139    0.000 rl_environment.py:224(get_time_step)
     2379    3.046    0.001    3.046    0.001 {built-in method lstm}
    39893    2.788    0.000    4.102    0.000 __init__.py:91(__contains__)
    90238    2.242    0.000    2.242    0.000 {method 'reshape' of 'torch._C._TensorBase' objects}
    45119    2.156    0.000   16.512    0.000 ubc_rnn.py:55(reshape_infostate)
        1    1.887    1.887   73.549   73.549 <ipython-input-345-73873dd4a314>:1(sample_train)
     2379    1.827    0.001    2.133    0.001 ubc_rnn.py:114(<listcomp>)
    20105    1.722    0.000   56.860    0.003 ubc_dqn.py:279(step)
    81

In [347]:
import cProfile
# Profile one sample_train() run; sort='cumtime' orders the report by
# cumulative time (a function's own time plus everything it calls).
cProfile.run('sample_train()', sort='cumtime')

100%|██████████| 5000/5000 [01:17<00:00, 64.88it/s] 


         3162017 function calls (3156157 primitive calls) in 77.019 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   77.068   77.068 {built-in method builtins.exec}
        1    0.000    0.000   77.068   77.068 <string>:1(<module>)
        1    1.895    1.895   77.068   77.068 <ipython-input-345-73873dd4a314>:1(sample_train)
    20297    1.747    0.000   60.482    0.003 ubc_dqn.py:279(step)
      329    0.660    0.002   38.624    0.117 ubc_dqn.py:415(learn)
      329    0.003    0.000   30.326    0.092 _tensor.py:251(backward)
      329    0.004    0.000   30.323    0.092 __init__.py:69(backward)
      329   30.312    0.092   30.312    0.092 {method 'run_backward' of 'torch._C._EngineBase' objects}
    45651    2.150    0.000   16.567    0.000 ubc_rnn.py:55(reshape_infostate)
    15297    0.828    0.000   11.919    0.001 ubc_dqn.py:339(add_transition)
    91302   10.607    0.000   10.607    0.