# Generating and reviewing the data

In [1]:
# Import packages
import sqlite3
import pandas as pd
import gym
import gym_chess
import chess
from gym_chess.alphazero import BoardEncoding
import numpy as np
import matplotlib.pyplot as plt
import math
import shutil
import os
import sys

In [2]:
# Importing created local modules
cwd = os.getcwd()
parent_directory = os.path.abspath(os.path.join(cwd, "..", ".."))
sys.path.append(parent_directory)

from utils.move_encoding import encode_move, decode_move
from utils.board_encoding import encode_board, fen_to_board
from utils.find_move import find_move

### Data loading settings

The dataset used is an obtained from [Lichess](https://database.lichess.org/), with +1M chess games and ~37M chess moves. The database is not included in the repository given size limitation. The features in the database are: 

- id: the unique move ID
- FEN (Forsyth-Edwards Notation): a standard notation for describing a particular board position of a chess game using a single line of text
- eval: the evaluation score of the move

Part of the database is used, to limit the running time and GPU usage. This is determined below, by setting:

- n_observations: the total number of observations (chess moves) to include
- step_size: the steps in which the data is processed, used to keep the reduce the working memory needed

In [4]:
# Variables to be set
n_observations = 10000000
step_size = 10000
folder_name = "10M"

# Creating the directories
folder_path = f"../../data/cleaned_data/{folder_name}"

""" Only exectute the below if the folder should be removed and re-created if it already exists """
# if os.path.exists(folder_path):
#     shutil.rmtree(folder_path)

os.makedirs(folder_path)
output_files = []

# Connecting to the database
database = sqlite3.connect("../../data/lichess_game_data.db")

### Processing the data

A step-wise processing of the data is executed below. In each loop, a part of the database is processed and a .pkl of the cleaned dataframe is created in the folder identified above.

Steps in the data processing are:
1. **An encoded board position is obtained from the FEN.** Following the approach in the AlphaZero paper, a board position is encoded into an (8,8,119) shaped array. 
2. **The move made for a given board position is obtained.** This is done using the current and next board FEN
3. **The move is encoded.** Following the approach in the AlphaZero paper, a move is encoded into an (4672,) shaped array


Details on the encoding can be found in the [AlphaZero paper](https://arxiv.org/abs/1712.01815) and in the board_encoding & move_encoding utility files.

In [5]:
def process_data(database, start_index, step_size, output_file):
    """
    Processes data from a database, encodes board states and moves, and saves the processed data to a file.

    Args:
        database (str or sqlite3.Connection): The database connection, containing the lichess data
        start_index (int): The starting index from which to fetch data.
        step_size (int): The number of records to fetch from the database.
        output_file (str): The path to the output file where the processed data will be saved.

    Returns:
        None

    """

    # Read the data from the database
    query = f"SELECT * FROM evaluations LIMIT {step_size} OFFSET {start_index}"
    df = pd.read_sql_query(query, database)

    # Obtain the encoded board for each observation
    df["board"] = df["fen"].apply(fen_to_board)
    df["encoded_board"] = df["board"].apply(encode_board)

    # Obtain the move that was made for each observation, given the current and next board position
    df["next_fen"] = df["fen"].shift(-1)
    df = df.dropna(subset=["next_fen"]).reset_index(drop=True)
    df["move"] = df.apply(lambda row: find_move(row["fen"], row["next_fen"]), axis=1)
    df = df.dropna(subset=["move"]).reset_index(drop=True)

    # Encode the move
    df["encoded_move"] = df.apply(
        lambda row: encode_move(row["move"], row["board"]), axis=1
    )
    df.dropna(subset=["encoded_move"], inplace=True)
    df.reset_index(inplace=True, drop=True)

    # Save the relevant dataframe
    df = df[["encoded_board", "encoded_move"]]
    df.to_pickle(output_file)

    pass

In [6]:
# Implement loop for step-wise data processing
for start_index in range(0, n_observations, step_size):
    output_file = f"{folder_path}/processed_step_{start_index}.pkl"
    process_data(database, start_index, step_size, output_file)
    output_files.append(output_file)

### Reviewing the data

In [5]:
def list_files_in_folder(folder_path):
    """
    Lists all files in the specified folder, including their full paths.

    Args:
        folder_path (str): The path to the folder whose files are to be listed.

    Returns:
        list of str: A list of full paths to the files in the specified folder.
    """

    files = os.listdir(folder_path)
    files_with_path = [os.path.join(folder_path, file) for file in files]

    return files_with_path

In [6]:
# Generate the combined dataframe
output_files = list_files_in_folder(folder_path)
final_df = pd.concat([pd.read_pickle(file) for file in output_files], ignore_index=True)

In [36]:
# Print summary information of the dataset
n_moves = len(final_df)
n_unique_moves = len(final_df["encoded_move"].unique())
n_missing_obs = len(final_df[final_df.isna().any(axis="columns")])

display(final_df.head(10))
print(f"The total number of moves is: {n_moves}")
print(f"The total number of unique moves is: {n_unique_moves}")
print(f"The total number of missing observations is: {n_missing_obs}")

Unnamed: 0,encoded_board,encoded_move
0,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",877
1,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",731
2,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",803
3,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",1905
4,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",1394
5,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",129
6,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",129
7,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",876
8,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",154
9,"[[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ...",415


The total number of moves is: 9479998
The total number of unique moves is: 1846
The total number of missing observations is: 0
