# Training the model to run on Lichess data
## Some prerequisites if running on Google Colab
If you are not running on Google Colab, do not run the next two cells!

In [None]:
# Install the only dependency not available from collab directly
!pip install chess

# Get imported files from repo
!git clone -b rl-setup https://github.com/owenjaques/chessbot.git
!mv chessbot chessbot-repo
!mv chessbot-repo/src/chessbot .
!rm chessbot-repo -r

In [None]:
import os

from google.colab import drive

# Mount Google Drive and point `weights_directory` at a folder inside it
drive.mount('/content/gdrive')
weights_directory = '/content/gdrive/MyDrive/chessbot_weights/'

# Later cells write the dataset and the trained model into this folder, which
# fails if the folder does not exist yet, so create it up front
os.makedirs(weights_directory, exist_ok=True)
print(f'Saving weights to {weights_directory}')

## Get the data
The zstd compression format used for this file is really nice: you can cancel this cell whenever you want, and all the games that were downloaded up to that point will still be usable.

In [None]:
!wget https://database.lichess.org/standard/lichess_db_standard_rated_2023-02.pgn.zst

## Decompress the Data

In [None]:

# Decompress the downloaded archive; this is expected to produce the
# lichess_db_standard_rated_2023-02.pgn file that the game-replay cell opens
!pzstd -d lichess_db_standard_rated_2023-02.pgn.zst

## Play the games from the data
In this section we replay each game move by move so that every position can be translated into a model input that we can train on.

In [15]:
import chess.pgn
import numpy as np
from chessbot.model_input import ModelInput

MAX_GAMES = 100000       # cap on games read from the file, skipped games included
DISCOUNT_FACTOR = 0.95   # label decay per move, counted back from the final position
PGN_PATH = 'lichess_db_standard_rated_2023-02.pgn'

# `weights_directory` is only defined by the Colab-specific Drive cell. Default
# to the working directory so a non-Colab run does not hit a NameError at the
# very end, after the whole file has already been processed.
try:
    weights_directory
except NameError:
    weights_directory = './'


def _game_to_samples(game, result, discount_factor=DISCOUNT_FACTOR):
    """Replay one game and build its training samples.

    Args:
        game: A game as returned by `chess.pgn.read_game`.
        result: The game's `Result` header, either '1-0' or '0-1'.
        discount_factor: Base of the exponential decay applied to the labels.

    Returns:
        A tuple `(X, y)`. `X` stacks `ModelInput(board).get_input()` for the
        position reached after every mainline move. `y` holds one label per
        position: `discount_factor ** k` for a position `k` moves before the
        end, negated when the result is '0-1', then mapped from [-1, 1] to
        [0, 1]. Returns `None` when the game has no moves.
    """
    board = game.board()
    positions = []
    for move in game.mainline_moves():
        board.push(move)
        positions.append(ModelInput(board).get_input())

    # A game without moves contributes no samples; skip it rather than
    # stacking an empty array next to the other games' inputs
    if not positions:
        return None

    X = np.array(positions)

    # The final position gets magnitude 1; each earlier one is discounted once more
    moves_from_end = np.arange(len(X) - 1, -1, -1)
    y = discount_factor ** moves_from_end
    if result == '0-1':
        y = -y

    # Scale the labels to be between 0 and 1 instead of -1 and 1
    return X, (y + 1) / 2


X_all = None
y_all = None
samples = []  # one (X, y) pair per accepted game, concatenated once at the end

try:
    with open(PGN_PATH, encoding='utf-8') as pgn:
        game_count = 0
        while game_count < MAX_GAMES:
            game = chess.pgn.read_game(pgn)
            if game is None:
                # End of file
                break

            result = game.headers.get('Result')

            # Only train on games played to completion that were not draws
            if game.headers.get('Termination') == 'Normal' and result in ('1-0', '0-1'):
                print(f'Processing game {game_count}/{MAX_GAMES}', end='\r')
                game_samples = _game_to_samples(game, result)
                if game_samples is not None:
                    samples.append(game_samples)

            game_count += 1
finally:
    # Also runs when the cell is interrupted, so the games processed so far
    # stay available in `X_all` / `y_all` (the interrupt itself still propagates)
    if samples:
        X_all = np.concatenate([X for X, _ in samples])
        y_all = np.concatenate([y for _, y in samples])

if X_all is None:
    raise ValueError(f'No decisive, normally terminated games were found in {PGN_PATH}')

# Save the data
np.savez_compressed(f'{weights_directory}games_data.npz', X=X_all, y=y_all)

Processing game 494/100000

KeyboardInterrupt: 

## Our model
We will initially be using an MLP regression model set up with the default parameters from scikit-learn's MLP regression model, since that seems like a solid place to start. After some trial and error, a second Dense layer was added to the model to hopefully capture a bit more complexity.

In [None]:
from tensorflow import keras

# Two ReLU hidden layers of 128 units each, followed by a single sigmoid
# output unit
model = keras.Sequential()
for hidden_units in (128, 128):
    model.add(keras.layers.Dense(hidden_units, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Adam with a learning rate of 0.01, trained against mean squared error
adam = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=adam, loss='mse')

## Training the model

In [None]:
# Fit on every collected position (`X_all`, `y_all` and `model` come from the
# earlier cells), then write the trained model under `weights_directory`
model.fit(x=X_all, y=y_all, batch_size=128, epochs=10)
model_save_path = f'{weights_directory}lichess_trained_model'
model.save(model_save_path)