# Training the optimal model

In [1]:
# Loading packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow
import keras
from keras.models import load_model
import os
import sys

In [2]:
# Importing local modules: put the directory two levels above the current
# working directory on the import path so the `utils` imports below resolve.
cwd = os.getcwd()
parent_directory = os.path.abspath(os.path.join(cwd, "..", ".."))
# Guard the append so that re-running this cell does not add duplicate entries.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

from utils.move_encoding import encode_move, decode_move
from utils.board_encoding import encode_board, fen_to_board

### Loading the dataset

There are two options (run only one of the two cells below — both assign `df`, so the last one executed wins):
1. Load one dataframe containing the full dataset
2. Create the dataframe from a folder that contains a set of partial dataframes

In [3]:
# 1. Loading a full dataset

# File name and relative path of the pickled dataset
data_file = "cleaned_data_1M.pkl"
data_path = os.path.join('..', '..', 'data/cleaned_data', data_file)

# Read the pickle, then keep only the raw/encoded board and move columns
df_full = pd.read_pickle(data_path)
selected_columns = ['board', 'encoded_board', 'move', 'encoded_move']
df = df_full.loc[:, selected_columns]

In [3]:
# 2. Loading multiple files

# Sub-folder of the cleaned-data directory that holds the partial dataframes
folder_name = "10M"
folder_path = "../../data/cleaned_data/" + folder_name


# Loading the full dataframe
def list_files_in_folder(folder_path):
    """Return the paths of the regular files directly inside ``folder_path``.

    ``os.listdir`` gives no ordering guarantee, so the entries are sorted by
    name to keep the result (and anything built from it, such as a
    concatenated dataframe) reproducible between runs and machines.
    Sub-directories are skipped because callers treat every returned path as
    a readable file.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        A list of ``folder_path``-prefixed file paths, sorted by file name.
        Empty if the folder contains no regular files.

    Raises:
        OSError: If ``folder_path`` does not exist or cannot be listed.
    """
    files_with_path = []
    for entry in sorted(os.listdir(folder_path)):
        full_path = os.path.join(folder_path, entry)
        if os.path.isfile(full_path):
            files_with_path.append(full_path)

    return files_with_path


# Read every partial dataframe in the folder and stack them into a single
# dataframe with a fresh consecutive index
output_files = list_files_in_folder(folder_path)
df = pd.concat(
    (pd.read_pickle(partial_path) for partial_path in output_files),
    ignore_index=True,
)

In [4]:
# Obtaining basic descriptives
obs_count = df.shape[0]
n_moves = obs_count
# dropna=False keeps NaN as its own category, like `.unique()` does
n_unique_moves = df["encoded_move"].nunique(dropna=False)
# Rows with at least one missing value in any column
n_missing_obs = int(df.isna().any(axis="columns").sum())
# Positional access: `[0]` is a label lookup and raises a KeyError whenever
# the index does not contain the label 0 (e.g. after filtering rows).
encoded_board_shape = df["encoded_board"].iloc[0].shape

display(df.head(10))
print(f"The total number of moves is: {n_moves}")
print(f"The total number of unique moves is: {n_unique_moves}")
print(f"The total number of missing observations is: {n_missing_obs}")
print(f"The shape of the encoded board array is: {encoded_board_shape}")

Unnamed: 0,board,encoded_board,move,encoded_move
0,r n b q k b n r\np p p p p p p p\n. . . . . . ...,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",d7d5,877
1,r n b q k b n r\np p p . p p p p\n. . . . . . ...,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",c2c4,731
2,r n b q k b n r\np p p . p p p p\n. . . . . . ...,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",e7e6,803
3,r n b q k b n r\np p p . . p p p\n. . . . p . ...,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",c4d5,1905
4,r n b q k b n r\np p p . . p p p\n. . . . p . ...,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",e6d5,1394
5,r n b q k b n r\np p p . . p p p\n. . . . . . ...,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",b1c3,129
6,r n b q k b n r\np p p . . p p p\n. . . . . . ...,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",g8f6,129
7,r n b q k b . r\np p p . . p p p\n. . . . . n ...,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",e2e3,876
8,r n b q k b . r\np p p . . p p p\n. . . . . n ...,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",f8d6,154
9,r n b q k . . r\np p p . . p p p\n. . . b . n ...,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",f1d3,415


The total number of moves is: 947959
The total number of unique moves is: 1818
The total number of missing observations is: 0
The shape of the encoded board array is: (8, 8, 14)


In [None]:
# Sanity check of the board encoding: across all boards the smallest value
# must be 0 and the largest value must be 1
encoded_board_max = df["encoded_board"].map(np.max).max()
encoded_board_min = df["encoded_board"].map(np.min).min()

if encoded_board_min == 0 and encoded_board_max == 1:
    print("Board is correctly encoded")
else:
    print("Check the encoded board values")

### Training the optimal model

The current optimal model consists of:
- 1 flatten input layer
- 4 dense hidden layers, using the ReLU activation function, with the number of neurons decreasing from 1500 to 500
- 1 dense output layer, using the Softmax activation function

In [None]:
# Defining the variables needed for converting the data to model input
test_size = 0.3
# Number of elements in one encoded board: 8 x 8 squares with 14 planes each,
# i.e. the (8, 8, 14) shape of the arrays in df["encoded_board"]. It must
# match exactly, otherwise the reshape below raises a ValueError.
encoded_board_length = 8 * 8 * 14
encoded_moves_length = 4672


def _flatten_boards(boards, flat_length):
    """Flatten every encoded board and stack them into one float32 matrix.

    Args:
        boards: Iterable of numpy arrays, each holding ``flat_length`` elements.
        flat_length: Number of elements of a single flattened board.

    Returns:
        A float32 array of shape ``(number of boards, flat_length)`` for a
        non-empty input.

    Raises:
        ValueError: If a board does not contain exactly ``flat_length`` elements.
    """
    return np.array(
        [board.reshape(flat_length) for board in boards], dtype="float32"
    )


# Converting data to input for the model
x = df["encoded_board"]
y = df["encoded_move"]
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=test_size, random_state=42
)

# Boards become (n_samples, encoded_board_length) float32 matrices; the
# integer move codes are cast to float32 as well.
x_train = _flatten_boards(x_train, encoded_board_length)
x_val = _flatten_boards(x_val, encoded_board_length)
y_train = y_train.to_numpy().astype("float32")
y_val = y_val.to_numpy().astype("float32")

In [5]:
# Defining, compiling, and fitting the optimal model
# Architecture: flattened board input -> four ReLU dense layers of shrinking
# width -> softmax over all encoded move classes.
hidden_layer_units = (1500, 1000, 750, 500)

model = keras.Sequential(
    [
        keras.layers.Flatten(input_shape=(encoded_board_length,)),
        *[
            keras.layers.Dense(units=n_units, activation="relu")
            for n_units in hidden_layer_units
        ],
        keras.layers.Dense(units=encoded_moves_length, activation="softmax"),
    ]
)

model.compile(
    optimizer="Adam",
    loss="SparseCategoricalCrossentropy",  # Because y is an integer representation
    metrics=["accuracy"],
)

# Validate on the hold-out set produced by train_test_split. Using
# `validation_split` here would leave x_val / y_val unused and carve a second
# validation set out of the training data.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=20,
    validation_data=(x_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x19c0e044898>

In [6]:
# Persist the trained model. The target directory is created first so that a
# missing folder does not make the save fail after a long training run.
model_path = "../saved_models/trained_model.h5"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)