In [1]:
from Player import Player
from Trainer import Trainer
from TetrisModel import TetrisModel
from Pretrainer import Pretrainer
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import glob
import time

In [2]:
piece_dim = 8
key_dim = 12
depth = 16
gamma = 0.99
lam = 0.95

In [3]:
# Use lambda instead of gamma to immitate shape of gae without value predictions
pretrainer = Pretrainer(gamma=lam)

In [4]:
players_data = pretrainer._load_data()

In [5]:
pretrainer._load_dset(players_data)

1.00

In [6]:
max_len = pretrainer._max_len

In [9]:
gt_dset = pretrainer._cache_dset()

Done Caching


In [7]:
agent = TetrisModel(piece_dim=piece_dim,
                    key_dim=key_dim,
                    depth=depth,
                    num_heads=4,
                    num_layers=4,
                    max_length=max_len)

In [8]:
logits, values = agent((tf.random.uniform((1, 28, 10, 1)),
                        tf.random.uniform((1, 7), minval=0, maxval=8, dtype=tf.int32),
                        tf.random.uniform((1, max_len-1), minval=0, maxval=key_dim, dtype=tf.int32)))
agent.summary(), tf.shape(logits), tf.shape(values)

Model: "tetris_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (1, 70, 16)               4800      
                                                                 
 seq_embedding (SeqEmbedding  multiple                 128       
 )                                                               
                                                                 
 seq_embedding_1 (SeqEmbeddi  multiple                 192       
 ng)                                                             
                                                                 
 pdec_0 (DecoderLayer)       multiple                  9776      
                                                                 
 pdec_1 (DecoderLayer)       multiple                  9776      
                                                                 
 pdec_2 (DecoderLayer)       multiple                 

(None,
 <tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 1, 21, 12])>,
 <tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 1, 21,  1])>)

In [9]:
agent_optimizer = keras.optimizers.Adam()
agent.compile(optimizer=agent_optimizer)

In [10]:
epochs = 10

In [11]:
agent.load_weights('agent_weights/agent')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1c86b9f8190>

In [None]:
losses, accs = pretrainer.train(agent, gt_dset, epochs)

In [None]:
plt.plot(losses)
plt.plot(accs)

In [16]:
if 'y' in input('YOU SURE?????'):
    agent.save_weights('agent_weights/agent')

YOU SURE????? y


In [12]:
%matplotlib qt

In [13]:
trainer = Trainer(model=agent,
                  seq_len=max_len,
                  gamma=gamma,
                  lam=lam,
                  max_episode_steps=500,
                  buffer_cap=2000)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmichaelsherrick[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
trainer.fill_replay_buffer()

Done filling replay buffer


In [None]:
trainer.train(gens=10000, train_steps=10, training_actor=False)

Current Gen: 1	|	Avg Reward: 0.0	|	Total Reward: 10.8	|
Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.
Current Gen: 2	|	Avg Reward: 0.0	|	Total Reward: 2.8	|
Current Gen: 3	|	Avg Reward: 0.0	|	Total Reward: 13.2	|
Current Gen: 4	|	Avg Reward: 0.0	|	Total Reward: 0.5	|
Critic Loss: 3.96	|	

In [16]:
episode_boards, episode_pieces, episode_inputs, episode_actions, episode_probs, episode_values, episode_rewards = trainer.player.run_episode(agent, 500)

In [17]:
episode_boards

<tf.Tensor: shape=(276, 28, 10, 1), dtype=float32, numpy=
array([[[[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        ...,

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]]],


       [[[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.],
         ...,
         [0.],
    

In [117]:
advantages, returns = trainer._compute_gae(episode_values, episode_rewards, gamma, lam)

In [118]:
plt.plot(returns)

[<matplotlib.lines.Line2D at 0x270a31684c0>]

In [45]:
plt.plot(episode_rewards)

[<matplotlib.lines.Line2D at 0x27069d08f10>]

In [120]:
plt.plot(advantages)

[<matplotlib.lines.Line2D at 0x2707c983e20>]

In [119]:
plt.plot(episode_values)

[<matplotlib.lines.Line2D at 0x2709f012e80>]

In [121]:
dset = trainer.replay_buffer.as_dataset(
    sample_batch_size=128,
    num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

In [122]:
for ((board_batch, piece_batch, input_batch, action_batch, old_probs, advantage_batch, return_batch), _) in dset.take(1):
    advantage_batch = (advantage_batch - tf.reduce_mean(advantage_batch)) / (tf.math.reduce_std(advantage_batch) + 1e-10)
    print(input_batch[0], action_batch[0], old_probs[0], advantage_batch[0], return_batch[0])

tf.Tensor([11  2  7  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0], shape=(22,), dtype=int32) tf.Tensor([8], shape=(1,), dtype=int32) tf.Tensor([-0.24575135], shape=(1,), dtype=float32) tf.Tensor([0.86595696], shape=(1,), dtype=float32) tf.Tensor([6.445074], shape=(1,), dtype=float32)


In [123]:
seq_inds = (tf.reduce_sum(tf.cast(input_batch != 0, tf.int32), axis=-1) - 1)
prob_inds = tf.concat([seq_inds[..., None], action_batch], axis=-1)

In [136]:
board_rep, _ = trainer.model.process_board((board_batch, piece_batch), training=False)
logits, _ = trainer.model.process_keys((board_rep, input_batch), training=True)

In [137]:
action_probs = tf.nn.log_softmax(logits, axis=-1)
                
new_probs = tf.gather_nd(action_probs,
                         prob_inds,
                         batch_dims=1)[..., None]

In [126]:
tf.exp(new_probs[0]), tf.exp(old_probs[0])

(<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.9982408], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.78211665], dtype=float32)>)

In [134]:
for _ in range(1000):
    al, cl = trainer._ppo_train_step(board_batch, piece_batch, input_batch, action_batch, old_probs, advantage_batch, return_batch, True)
    print(f'\r{al} {cl}', end='', flush=True)

-0.06052038073539734 0.0413806848227977755

In [138]:
tf.exp(new_probs[0])

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.9998647], dtype=float32)>

In [73]:
advantage_batch[0]

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.77510875], dtype=float32)>

In [75]:
return_batch[0]

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([4.9843183], dtype=float32)>

In [139]:
0.9982408 / 0.78211665

1.2763323731824403

In [25]:
logits = tf.random.uniform((32, 20, 10))

In [31]:
probs = tf.nn.log_softmax(logits, axis=-1)
probs

<tf.Tensor: shape=(32, 20, 10), dtype=float32, numpy=
array([[[-2.8474317, -2.0701065, -2.2764716, ..., -2.5013475,
         -2.259903 , -2.776818 ],
        [-2.1819293, -2.2409906, -2.6492786, ..., -2.1213238,
         -2.8047776, -2.041637 ],
        [-2.399289 , -2.585085 , -1.8766233, ..., -2.5774498,
         -2.5489883, -1.8178211],
        ...,
        [-2.076394 , -2.1778827, -2.2622313, ..., -1.9148115,
         -2.4658272, -2.5965772],
        [-2.0865836, -2.792211 , -2.810768 , ..., -1.9355419,
         -2.6346402, -2.773929 ],
        [-2.765492 , -2.6839004, -2.5387888, ..., -2.179572 ,
         -2.5808856, -1.9415395]],

       [[-2.25219  , -2.50855  , -2.1342962, ..., -2.0830631,
         -2.9577503, -2.603734 ],
        [-2.2969275, -2.3016016, -2.449213 , ..., -2.3447266,
         -2.6452994, -2.2275047],
        [-2.063203 , -2.344955 , -1.9029124, ..., -1.9917884,
         -2.5423703, -2.8087802],
        ...,
        [-2.6587498, -2.0067294, -2.1645055, ..., -2.1

In [45]:
action_batch = tf.random.uniform((32,), minval=0, maxval=10, dtype=tf.int32)
onehot_action = tf.one_hot(action_batch, depth=tf.shape(logits)[-1])
onehot_action

<tf.Tensor: shape=(32, 10), dtype=float32, numpy=
array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0.,

In [35]:
seq_inds = tf.random.uniform((32,), minval=0, maxval=10, dtype=tf.int32)
seq_inds

<tf.Tensor: shape=(32,), dtype=int32, numpy=
array([2, 5, 7, 8, 8, 1, 3, 6, 9, 1, 4, 5, 6, 7, 0, 0, 1, 3, 8, 4, 1, 2,
       7, 0, 6, 3, 1, 6, 3, 4, 0, 8])>

In [47]:
tf.gather(probs,
          seq_inds,
          batch_dims=1)

<tf.Tensor: shape=(32, 10), dtype=float32, numpy=
array([[-2.399289 , -2.585085 , -1.8766233, -2.4650073, -2.2722216,
        -2.4353032, -2.4241667, -2.5774498, -2.5489883, -1.8178211],
       [-2.4900026, -2.4219108, -2.0497487, -2.1943913, -2.0502162,
        -2.6785069, -2.9249504, -2.102774 , -2.2998223, -2.1653404],
       [-2.1992893, -1.9588264, -2.0173807, -2.2239194, -2.6462994,
        -2.351708 , -2.6388388, -2.815093 , -2.1348305, -2.39493  ],
       [-2.4035823, -2.295262 , -2.1989946, -2.3324962, -2.6579607,
        -2.6402154, -2.3755116, -2.0971327, -1.8065579, -2.5323343],
       [-2.3619146, -2.085748 , -2.506484 , -2.6952672, -2.5583549,
        -2.158152 , -1.9112043, -2.7207751, -2.152563 , -2.2110543],
       [-2.3760242, -2.6681528, -2.2393785, -2.3036046, -2.6329813,
        -2.043459 , -2.2097383, -2.088285 , -2.7038596, -2.0459404],
       [-2.5935059, -2.3720427, -2.7251616, -1.9696401, -1.9857353,
        -2.7039323, -2.0769696, -2.4702744, -2.689435 , -1.9

In [64]:
log_probs = tf.nn.log_softmax([3., 3, 3, 3])

In [65]:
tf.reduce_sum(log_probs * tf.exp(log_probs))

<tf.Tensor: shape=(), dtype=float32, numpy=-1.3862944>