# Policy Gradient in 2048
This model plays 2048 and uses the following strategy:
1. Play n games
2. Calculate mean fitness function score
3.
    * Optimize the loss for the games with an above average fitness score (assume that every action at every state was correct).
    * Optimize the negative of the loss for the games with a below average fitness score (assume that every action at every state was incorrect).
4. Repeat.

# Game imports
This is made slightly more complicated than it has to be, because the target folder has characters that are disallowed in python packages.

In [39]:
import os 
dir_path = os.path.abspath('')
dir_path = os.path.join(dir_path, '2048-python-custom-player')

import sys
sys.path.insert(0, dir_path)

import constants as c
from tie_in import TieIn # Used to launch a game of 2048

## Set game constants

In [40]:
c.GRID_LEN_X = 3
c.GRID_LEN_Y = 3
#c.PROBABILITY_4 = 0

# The model

### Imports

In [28]:
import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, Dense

### Clear previous models from memory (if needed)

In [29]:
keras.backend.clear_session()

## Instantiate the model

In [30]:
model = Sequential([Conv2D(32, kernel_size=2, input_shape=(c.GRID_LEN_X, c.GRID_LEN_Y, 1)),
                    #Conv2D(32, kernel_size=2),
                    Conv2D(64, kernel_size=2),
                    Dense(4, activation='softmax')])

#model.compile(loss=keras.losses.CategoricalCrossentropy(from_logits=False), optimizer=keras.optimizers.Adam())
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 2, 2, 32)          160       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 1, 64)          8256      
_________________________________________________________________
dense (Dense)                (None, 1, 1, 4)           260       
Total params: 8,676
Trainable params: 8,676
Non-trainable params: 0
_________________________________________________________________


# Preprocessing
Because the growth rate of the tiles in 2048 is exponential, we take log2 of all values, since linearly scaling values are more manageable. We also squish the values to be smaller in scale. For instance, 2 maps to 0.1, 1024 to 1, 8192 to 1.3, and 0 to 0.

In [31]:
import numpy as np

def preprocess(matrix):
    non_zeros = np.nonzero(matrix)
    matrix[non_zeros] = np.log2(matrix[non_zeros]) / np.log2(1024)
    return matrix

def one_hot(moves):
    move_count = moves.shape[0]
    oh = np.zeros((move_count, 4))
    oh[np.arange(move_count), moves] = 1
    return oh

## Data augmentation
Because of rotational symmetry, each game of 2048 corresponds to another set of three games (at least for square games). If we accept a move in one of the games as valid, then state and action's corresponding rotations must be equally valid. We use this argument to create more artificial games. Hopefully, this will also prevent a collapse in strategy where the model favors one corner.

In [32]:
move_rotation_lookup = { 0: 2, 1: 3, 2: 1, 3: 0} # UP->LEFT, DOWN->RIGHT, LEFT->DOWN, RIGHT->UP
def rotate_90_moves(moves):
    move_count = moves.shape[0]
    rot = np.zeros(moves.shape)
    for i in range(move_count): # TODO: this is likely an inefficient way to implement this
        rot[i] = move_rotation_lookup[rot[i]]
    return rot.astype(int)

def rotate_90_board(boards):
    return np.rot90(boards, axes=(1, 2))

# Create a custom player

In [33]:
class ModelPlayer():
    def __init__(self, model):
        self.model = model
        self.game_final = None # Contains the last state of the game
        self.last_turn = -1
    
    def game_grid_init(self, game_grid):
        pass

    # TODO: maybe randomly pick a move, basing the probabilities on the softmax?
    def play(self, game):
        if game.move_count > self.last_turn:
            self.last_turn = game.move_count
        else:
            print("ModelPlayer: stuck on turn {}".format(game.move_count))
            breakpoint()
        
        possible_directions = game.possible_directions()
        possible_directions = np.array(possible_directions)
        
        model_input = np.array(game.matrix)
        model_input = np.expand_dims(model_input, axis=-1) # Number of channels (1)
        model_input = np.expand_dims(model_input, axis=0) # Batch dimension
        
        model_output = self.model.predict(model_input)
        model_output = np.squeeze(model_output)
        
        model_mask = np.zeros(4)
        model_mask[possible_directions] = 1
        model_output *= model_mask # Only select from possible moves
        
        # Pick a move randomly based on how strongly the model suggests it
        model_output_sum = np.sum(model_output)
        
        response = None
        if model_output_sum > 0:
            model_output_cumulative = np.cumsum(model_output) / np.sum(model_output)
            cutoff_point = np.random.rand()
            response = model_output_cumulative.searchsorted(cutoff_point)
        
        # response = np.argmax(model_output) # Picks the strongest response
        if model_output_sum <= 0 or response is None or not np.any(possible_directions == response): # If we for any reason have picked an impossible move
            response = np.random.choice(possible_directions) # Choose one of the possible ones at randoma
        
        return response
    
    def sleep(self, game, render):
        if not render:
            return 0 # Don't sleep when training

        return np.log2(game.max_tile) / 30 # Go slower when it gets interesting
    
    def lost(self, game):
        self.game_final = game

# Training

## Custom training step

In [34]:
loss_fn = keras.losses.CategoricalCrossentropy(from_logits=False)
optimizer = keras.optimizers.Adam()

#@tf.function
def train_step(model, X, Y, punish=False):
    with tf.GradientTape() as tape:
        out = model(X, training=True)
        out = tf.squeeze(out)
        loss_val = loss_fn(Y, out)
        if punish:
            loss_val *= -1
            
    grads = tape.gradient(loss_val, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))

In [35]:
def fitness_function(game):
    return game.move_count + game.max_tile

## Training procedure

### Constants

In [37]:
EPOCHS = 100
BATCH_SIZE = 8
PRINT_INTERVALS = 10 # Number of seconds between progress update (minimum)

### The actual loop

In [41]:
import time
import multiprocessing
from functools import reduce

last_print_time = time.time()
history_fitness = []
history_move_count = []
history_max_tile = []

print("Starting the training process.")
for epoch in range(1, EPOCHS + 1):
    games = []
    
    for i in range(BATCH_SIZE):
        model_player = ModelPlayer(model)
        ti = TieIn(model_player, render=False, log_history=True)
        ti.start()
        games.append(model_player.game_final)
        
        cur_time = time.time()
        if cur_time - last_print_time >= PRINT_INTERVALS:
            print("Playing: Processed: {}/{} in batch, {}/{} epochs.".format(i+1, BATCH_SIZE, epoch, EPOCHS))
            last_print_time = cur_time
            
    batch_mean_fitness = map(lambda g: fitness_function(g), games)
    batch_mean_fitness = reduce(lambda sum_g, g: sum_g + g, batch_mean_fitness)
    batch_mean_fitness = float(batch_mean_fitness) / BATCH_SIZE
    
    batch_mean_move_count = map(lambda g: g.move_count, games)
    batch_mean_move_count = reduce(lambda sum_g, g: sum_g + g, batch_mean_move_count)
    batch_mean_move_count = float(batch_mean_move_count) / BATCH_SIZE
    
    batch_mean_max_tile = map(lambda g: g.max_tile, games)
    batch_mean_max_tile = reduce(lambda sum_g, g: sum_g + g, batch_mean_max_tile)
    batch_mean_max_tile = float(batch_mean_max_tile) / BATCH_SIZE
    
    print("Status: Epoch {}: Mean fitness: {}, Move count: {}, Max tile: {}".format(epoch, batch_mean_fitness, batch_mean_move_count, batch_mean_max_tile))
    last_print_time = time.time()
    history_fitness.append(batch_mean_fitness)
    history_move_count.append(batch_mean_move_count)
    history_max_tile.append(batch_mean_max_tile)
    
    good_games = filter(lambda g: fitness_function(g) >= batch_mean_fitness, games)
    bad_games = filter(lambda g: fitness_function(g) < batch_mean_fitness, games)
    
    processed_count = 0
    
    for g in good_games:
        X_base = np.array(g.board_history)
        X_base = preprocess(X_base)
        y_base = np.array(g.move_history) # Sparse encoding
        
        for rot in range(4):
            X = X_base[..., np.newaxis] # Same as expand_dims(X, axis=-1)
            Y = one_hot(y_base) # One-hot encoding

            train_step(model, X, Y, punish=False)
            
            if rot < 3:
                X_base = rotate_90_board(X_base)
                y_base = rotate_90_moves(y_base)

            if cur_time - last_print_time >= PRINT_INTERVALS:
                print("Training: Processed: {}/{} in batch, {}/{} epochs.".format(processed_count, BATCH_SIZE, epoch, EPOCHS))
                last_print_time = cur_time
        
    for g in bad_games:
        X_base = np.array(g.board_history)
        X_base = preprocess(X_base)
        y_base = np.array(g.move_history) # Sparse encoding
        
        for rot in range(4):
            X = X_base[..., np.newaxis] # Same as expand_dims(X, axis=-1)
            Y = one_hot(y_base) # One-hot encoding

            train_step(model, X, Y, punish=True)
            
            if rot < 3:
                X_base = rotate_90_board(X_base)
                y_base = rotate_90_moves(y_base)

            if cur_time - last_print_time >= PRINT_INTERVALS:
                print("Training: Processed: {}/{} in batch, {}/{} epochs.".format(processed_count, BATCH_SIZE, epoch, EPOCHS))
                last_print_time = cur_time

Starting the training process.
Status: Epoch 1: Mean fitness: 53.875, Move count: 31.875, Max tile: 22.0
Status: Epoch 2: Mean fitness: 58.0, Move count: 34.0, Max tile: 24.0
Status: Epoch 3: Mean fitness: 72.5, Move count: 38.5, Max tile: 34.0
Status: Epoch 4: Mean fitness: 64.75, Move count: 36.75, Max tile: 28.0
Status: Epoch 5: Mean fitness: 53.0, Move count: 30.0, Max tile: 23.0
Status: Epoch 6: Mean fitness: 69.5, Move count: 37.5, Max tile: 32.0
Playing: Processed: 8/8 in batch, 7/100 epochs.
Status: Epoch 7: Mean fitness: 78.125, Move count: 42.125, Max tile: 36.0
Playing: Processed: 8/8 in batch, 8/100 epochs.
Status: Epoch 8: Mean fitness: 68.875, Move count: 38.875, Max tile: 30.0
Playing: Processed: 8/8 in batch, 9/100 epochs.
Status: Epoch 9: Mean fitness: 61.875, Move count: 33.875, Max tile: 28.0
Playing: Processed: 7/8 in batch, 10/100 epochs.
Status: Epoch 10: Mean fitness: 72.625, Move count: 38.625, Max tile: 34.0
Playing: Processed: 5/8 in batch, 11/100 epochs.
Stat

Status: Epoch 77: Mean fitness: 97.375, Move count: 49.375, Max tile: 48.0
Status: Epoch 78: Mean fitness: 52.5, Move count: 29.5, Max tile: 23.0
Status: Epoch 79: Mean fitness: 60.25, Move count: 35.25, Max tile: 25.0
Playing: Processed: 7/8 in batch, 80/100 epochs.
Status: Epoch 80: Mean fitness: 83.0, Move count: 45.0, Max tile: 38.0
Status: Epoch 81: Mean fitness: 56.0, Move count: 33.0, Max tile: 23.0
Playing: Processed: 8/8 in batch, 82/100 epochs.
Status: Epoch 82: Mean fitness: 71.0, Move count: 39.0, Max tile: 32.0
Playing: Processed: 8/8 in batch, 83/100 epochs.
Status: Epoch 83: Mean fitness: 70.875, Move count: 40.875, Max tile: 30.0
Status: Epoch 84: Mean fitness: 67.125, Move count: 37.125, Max tile: 30.0
Playing: Processed: 8/8 in batch, 85/100 epochs.
Status: Epoch 85: Mean fitness: 72.625, Move count: 40.625, Max tile: 32.0
Status: Epoch 86: Mean fitness: 52.625, Move count: 29.625, Max tile: 23.0
Playing: Processed: 5/8 in batch, 87/100 epochs.
Status: Epoch 87: Mean 

# Save the model and training history

In [20]:
import pickle

model.save('policy_gradient.h5')


policy_gradient_history = { 'history_fitness': history_fitness,
                           'history_move_count': history_move_count,
                           'history_max_tile': history_max_tile }
file = open('policy_gradient_history.pickle', 'wb')
pickle.dump(policy_gradient_history, file)
file.close()

# Load the model

In [21]:
model = tf.keras.models.load_model('policy_gradient.h5')



# Test the model

In [23]:
model_player = ModelPlayer(model)
ti = TieIn(model_player, render=True, log_history=False)
ti.start()