In [1]:
from Player import Player
from Trainer import Trainer
from TetrisModel import TetrisModel
from Pretrainer import Pretrainer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
import pickle
import glob
import time

In [2]:
# Model dimensions handed to every TetrisModel built in this notebook.
piece_dim, key_dim, depth = 8, 12, 32

# Values forwarded to Trainer; gamma and lam are also what _compute_gae receives.
gamma, lam = 0.99, 0.95
temperature = 1.0

In [3]:
# Use lambda instead of gamma to imitate the shape of GAE without value predictions
# pretrainer = Pretrainer(gamma=lam)

In [4]:
# players_data = pretrainer._load_data()

In [5]:
# pretrainer._load_dset(players_data)

In [6]:
# Sequence length is pinned by hand while the pretrainer cells are disabled
# (the disabled alternative read it from pretrainer._max_len).
max_len = 10

In [7]:
# gt_dset = pretrainer._cache_dset()

In [8]:
# The model being trained as the policy ("agent"). out_dim is set to key_dim,
# so its output has key_dim entries along the last axis.
agent = TetrisModel(
    piece_dim=piece_dim,
    key_dim=key_dim,
    depth=depth,
    num_heads=4,
    num_layers=4,
    max_length=max_len,
    out_dim=key_dim,
)

In [44]:
# Adam at a learning rate of 1e-5, with each gradient's norm clipped to 1.0.
agent_optimizer = keras.optimizers.Adam(learning_rate=1e-5, clipnorm=1.0)
# Only the optimizer is handed to compile(); it is read back later as agent.optimizer.
agent.compile(optimizer=agent_optimizer)

In [10]:
# Run the agent once on a random batch of 32 so that summary() has something
# to report, and inspect the shapes it returns with return_scores=True.
# The three inputs are presumably board / piece-queue / key-sequence tensors
# -- TODO confirm against TetrisModel. The literal 8 matches piece_dim today.
logits, piece_scores, key_scores = agent(
    (
        tf.random.uniform((32, 28, 10, 1)),
        tf.random.uniform((32, 7), minval=0, maxval=8, dtype=tf.int32),
        tf.random.uniform((32, max_len), minval=0, maxval=key_dim, dtype=tf.int32),
    ),
    return_scores=True,
)
# summary() prints the table and contributes None to the displayed tuple.
(agent.summary(), *(tf.shape(t) for t in (logits, piece_scores, key_scores)))

Model: "tetris_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (32, 70, 32)              37312     
                                                                 
 seq_embedding (SeqEmbedding  multiple                 256       
 )                                                               
                                                                 
 seq_embedding_1 (SeqEmbeddi  multiple                 384       
 ng)                                                             
                                                                 
 piece_dec_0 (DecoderLayer)  multiple                  37984     
                                                                 
 piece_dec_1 (DecoderLayer)  multiple                  37984     
                                                                 
 piece_dec_2 (DecoderLayer)  multiple                 

(None,
 <tf.Tensor: shape=(3,), dtype=int32, numpy=array([32, 10, 12])>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 4, 32,  4,  7, 70])>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 4, 32,  4, 10,  7])>)

In [11]:
# Same architecture arguments as the agent, except out_dim=1: the critic
# produces a single number per position.
critic = TetrisModel(
    piece_dim=piece_dim,
    key_dim=key_dim,
    depth=depth,
    num_heads=4,
    num_layers=4,
    max_length=max_len,
    out_dim=1,
)

In [45]:
# Adam at a learning rate of 1e-5, with each gradient's norm clipped to 1.0.
critic_optimizer = keras.optimizers.Adam(learning_rate=1e-5, clipnorm=1.0)
# Only the optimizer is handed to compile(); it is read back later as critic.optimizer.
critic.compile(optimizer=critic_optimizer)

In [13]:
# Run the critic once on a random batch of 32 so that summary() has something
# to report, and inspect the shapes it returns with return_scores=True.
# The three inputs are presumably board / piece-queue / key-sequence tensors
# -- TODO confirm against TetrisModel. The literal 8 matches piece_dim today.
values, piece_scores, key_scores = critic(
    (
        tf.random.uniform((32, 28, 10, 1)),
        tf.random.uniform((32, 7), minval=0, maxval=8, dtype=tf.int32),
        tf.random.uniform((32, max_len), minval=0, maxval=key_dim, dtype=tf.int32),
    ),
    return_scores=True,
)
# summary() prints the table and contributes None to the displayed tuple.
(critic.summary(), *(tf.shape(t) for t in (values, piece_scores, key_scores)))

Model: "tetris_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_9 (Sequential)   (32, 70, 32)              37312     
                                                                 
 seq_embedding_2 (SeqEmbeddi  multiple                 256       
 ng)                                                             
                                                                 
 seq_embedding_3 (SeqEmbeddi  multiple                 384       
 ng)                                                             
                                                                 
 piece_dec_0 (DecoderLayer)  multiple                  37984     
                                                                 
 piece_dec_1 (DecoderLayer)  multiple                  37984     
                                                                 
 piece_dec_2 (DecoderLayer)  multiple               

(None,
 <tf.Tensor: shape=(3,), dtype=int32, numpy=array([32, 10,  1])>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 4, 32,  4,  7, 70])>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 4, 32,  4, 10,  7])>)

In [14]:
# Track the agent together with its optimizer, then load the pretrained checkpoint.
# The returned load-status object is the cell's displayed value.
agent_checkpoint = tf.train.Checkpoint(
    model=agent,
    optim=agent.optimizer,
)
agent_checkpoint.restore('agent_checkpoint/pretrained/ckpt-11')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1e19c4edd60>

In [15]:
# Track the critic together with its optimizer, then load a saved checkpoint.
# NOTE(review): the critic starts from a *finetuned* checkpoint while the agent
# starts from a *pretrained* one -- confirm this asymmetry is intentional.
critic_checkpoint = tf.train.Checkpoint(
    model=critic,
    optim=critic.optimizer,
)
critic_checkpoint.restore('critic_checkpoint/finetuned/ckpt-5')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1e2122ed8b0>

In [16]:
# A second model built with exactly the agent's arguments; it is later handed
# to Trainer as ref_model.
ref_agent = TetrisModel(
    piece_dim=piece_dim,
    key_dim=key_dim,
    depth=depth,
    num_heads=4,
    num_layers=4,
    max_length=max_len,
    out_dim=key_dim,
)

In [17]:
# Run the reference model once on a single random example (batch size 1) before
# its checkpoint is restored, and display the shapes of what it returns.
# Note: this rebinds logits / piece_scores / key_scores from the earlier cells.
logits, piece_scores, key_scores = ref_agent(
    (
        tf.random.uniform((1, 28, 10, 1)),
        tf.random.uniform((1, 7), minval=0, maxval=8, dtype=tf.int32),
        tf.random.uniform((1, max_len), minval=0, maxval=key_dim, dtype=tf.int32),
    ),
    return_scores=True,
)
tuple(tf.shape(t) for t in (logits, piece_scores, key_scores))

(<tf.Tensor: shape=(3,), dtype=int32, numpy=array([ 1, 10, 12])>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 4,  1,  4,  7, 70])>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 4,  1,  4, 10,  7])>)

In [18]:
# Load the pretrained weights into the reference model.
# This Checkpoint tracks only the model, whereas the file it reads is the one
# the agent is restored from with both model and optimizer attached. Anything
# in the file beyond the model is therefore deliberately left unmatched;
# expect_partial() declares that intentional and suppresses TensorFlow's
# "unresolved object" warnings. It returns the same load-status object, so the
# cell's displayed value is unchanged.
ref_checkpoint = tf.train.Checkpoint(model=ref_agent)
ref_checkpoint.restore('agent_checkpoint/pretrained/ckpt-11').expect_partial()

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1ded44e0520>

In [19]:
# epochs = 10

In [20]:
# actor_losses, critic_losses, accs = pretrainer.train(agent, critic, gt_dset, epochs)

In [21]:
# Select matplotlib's Qt backend so figures open in separate interactive windows
# (needs a Qt binding and a desktop session; not usable on a headless kernel).
%matplotlib qt

In [22]:
# plt.plot(actor_losses)
# plt.plot(critic_losses)
# plt.plot(accs)

In [23]:
# agent_checkpoint.save('agent_checkpoint/pretrained/ckpt')
# critic_checkpoint.save('critic_checkpoint/pretrained/ckpt')

In [24]:
# Rebind agent_checkpoint to a fresh Checkpoint over the agent and its optimizer,
# and let a manager write it under agent_checkpoint/finetuned, keeping at most
# the five most recent saves.
agent_checkpoint = tf.train.Checkpoint(
    model=agent,
    optim=agent.optimizer,
)
agent_checkpoint_manager = tf.train.CheckpointManager(
    agent_checkpoint,
    'agent_checkpoint/finetuned',
    max_to_keep=5,
)

In [25]:
# Rebind critic_checkpoint to a fresh Checkpoint over the critic and its optimizer,
# and let a manager write it under critic_checkpoint/finetuned, keeping at most
# the five most recent saves.
# NOTE(review): this is a new Checkpoint object, so its save counter presumably
# restarts; saves may then reuse ckpt-N names already present in this directory
# (the critic is restored from finetuned/ckpt-5 earlier) -- confirm.
critic_checkpoint = tf.train.Checkpoint(
    model=critic,
    optim=critic.optimizer,
)
critic_checkpoint_manager = tf.train.CheckpointManager(
    critic_checkpoint,
    'critic_checkpoint/finetuned',
    max_to_keep=5,
)

In [50]:
# Wire the three models and the hyperparameters into a Trainer.
# The trainer exposes a wandb_run attribute that is finished at the end of the
# notebook; constructing it appears to start that run -- TODO confirm.
trainer = Trainer(
    agent=agent,
    critic=critic,
    ref_model=ref_agent,
    max_len=max_len,
    gamma=gamma,
    lam=lam,
    temperature=temperature,
    max_episode_steps=100,
    buffer_cap=1000,
)

VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
critic_loss,▅▆▅▄▆▁▄▇▂▂▅▅▂█
reward,▃▃▃▇█▂▆▁▂▂▁▆▅▂
reward_per_piece,▂▃▂▇█▁▃▁▁▃▁█▃▁

0,1
critic_loss,0.04604
reward,0.59
reward_per_piece,0.00967


In [51]:
# Populate the trainer's replay buffer before training starts; the call prints
# "Done filling replay buffer" when it returns.
trainer.fill_replay_buffer()

Done filling replay buffer

In [52]:
# Fine-tune indefinitely: the loop has no stopping condition and is meant to be
# ended by interrupting the kernel. After every completed train() call both
# models are checkpointed.
try:
    while True:
        trainer.train(gens=100, train_steps=10, training_actor=True)
        agent_checkpoint_manager.save()
        critic_checkpoint_manager.save()
except KeyboardInterrupt:
    # An interrupt usually lands inside train(), i.e. after the last periodic
    # save. Persist the current weights so that partial round is not lost,
    # then re-raise so the cell still reports the interrupt.
    agent_checkpoint_manager.save()
    critic_checkpoint_manager.save()
    raise

PPO Loss: 0.02	|	KL Divergence: 0.14	|	Critic Loss: 0.10	|		

KeyboardInterrupt: 

In [53]:
# Manual save of both models (this cell was run after the training loop was
# interrupted). The path returned by the critic's save is the displayed output.
agent_checkpoint_manager.save()
critic_checkpoint_manager.save()

'critic_checkpoint/finetuned\\ckpt-4'

In [38]:
# Play one evaluation episode of at most 100 steps using the trainer's player
# and renderer. greedy=False presumably samples actions rather than taking the
# most likely one -- TODO confirm in Player.run_episode.
episode_data = trainer.player.run_episode(
    agent,
    critic,
    max_steps=100,
    greedy=False,
    renderer=trainer.renderer,
)

In [39]:
# Split the episode record into its six components, in the order run_episode
# returned them.
(
    episode_boards,
    episode_pieces,
    episode_inputs,
    episode_probs,
    episode_values,
    episode_rewards,
) = episode_data

In [40]:
# Derive advantages and returns for the episode from its values and rewards,
# using the trainer's own gamma / lam. Note this reaches into a private
# Trainer method.
episode_advantages, episode_returns = trainer._compute_gae(
    episode_values,
    episode_rewards,
    trainer.gamma,
    trainer.lam,
)

In [41]:
# Overlay the computed returns and the recorded values for the episode.
# Plot order is kept as-is because it determines the colour of each line.
fig, ax = plt.subplots()
ax.plot(episode_returns, label='Returns')
ax.plot(episode_values, label='Values')
ax.legend()
# Last expression: the sum of the episode's rewards is the cell's displayed output.
tf.reduce_sum(episode_rewards)

<tf.Tensor: shape=(), dtype=float32, numpy=1.1999998>

In [42]:
# Overlay the episode's rewards and the advantages computed from them.
# Plot order is kept as-is because it determines the colour of each line.
fig, ax = plt.subplots()
ax.plot(episode_rewards, label='Rewards')
ax.plot(episode_advantages, label='Advantages')
# Last expression: the Legend object returned here is the cell's displayed output.
ax.legend()

<matplotlib.legend.Legend at 0x1e2f18951f0>

In [33]:
# Mark the trainer's Weights & Biases run as finished; the output below shows
# the final upload together with the run history and summary.
trainer.wandb_run.finish()

VBox(children=(Label(value='0.093 MB of 0.093 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
critic_loss,▄▂▁▁▁▃█▅▂▂▂▁▃▃▂▄▄▄▄▃▂▂▁▂▁▂▂▂▂▂▂▂▂▂▂▁▁▁▂▂
entropy,▄▁▂▆▄▆▅▇▃▅▆▄▆▆▆█▆▅▇▅▃▅▃▅▆▃█▅▆▅▆▅▇▃█▅▇▇█▆
kl_div,▅▅▄▂▃▂▂▄▁▄▄▂▂▁▄▅▂▄▄▁▄▂▄▄▃█▄▃▂▇▃▆▅▃▆▆▃▂▃▄
ppo_loss,▆▅▂▇▁▅▇▂▂▄▅▃▅▅▃▆▄▇▇▅▂▅▇▇▂█▄▇▃▃▅▁▃▄▆▄█▆▂▃
reward,▄▅▆▄▇▄▄▄▆▅▄▄▅▅▅▅▃▃▆▅▄▅▅▂▃▅▇▅▅▃▃▂▁▇▄▅▅▄▅█
reward_per_piece,▃▄▅▄▃▄▆▂▄▃▆▄▂▅▄▁▂▃▅▃▅▅▆▄▄▆▄█▃▃▆▅▄▄▂▃▄▃▂▅
unclipped_proportion,▅▅▄▄▆▆▇▅▆▄█▆▄█▅▇█▆▃▅▅▃▆▃▆▅▅▄▅▆▃▁▄▅▃▅▆▄▆▄

0,1
critic_loss,0.13156
entropy,-0.2607
kl_div,0.20674
ppo_loss,0.00448
reward,4.25
reward_per_piece,0.0425
unclipped_proportion,0.82742
