In [2]:
#Import packages
import sqlite3
import pandas as pd
import gym
import gym_chess
import chess
from gym_chess.alphazero import BoardEncoding
import numpy as np
import matplotlib.pyplot as plt
import math
import shutil



In [3]:
#Importing created modules
import os
import sys 

cwd = os.getcwd()
parent_directory = os.path.abspath(os.path.join(cwd, '..', '..'))
sys.path.append(parent_directory)

from utils.move_encoding import encode_move, decode_move
from utils.board_encoding import encode_board, fen_to_board

In [3]:
#Defining the functiong for finding moves
def find_move(fen1, fen2):
    # Create board objects from FEN positions
    # fen1 = x['fen']
    # fen2 = x['next_fen']
    
    board1 = chess.Board(fen1)
    board2 = chess.Board(fen2)

    # Find the move made between the two positions
    move = np.nan
    for possible_move in board1.legal_moves:
        # Make the move on a copy of the first board
        temp_board = board1.copy()
        temp_board.push(possible_move)
        
        # Compare the resulting FEN position with the second position
        if temp_board==board2:
            move = possible_move
            move = move.uci()
            break

    return move


In [4]:
#Variables to be set
n_observations = 10000000
step_size = 10000
folder_name = "10M"

#Creating the directories
folder_path = f"../../data/cleaned_data/{folder_name}" 

# if os.path.exists(folder_path):
#     # os.rmdir(folder_path)
#     shutil.rmtree(folder_path)
    
os.makedirs(folder_path)

#Loading the database
database = sqlite3.connect('../../data/test_data.db')
output_files = []

In [5]:
#Define the steps for processing the data
def process_data(database, start_index, step_size, output_file):
    
    query = f"SELECT * FROM evaluations LIMIT {step_size} OFFSET {start_index}"
    df = pd.read_sql_query(query, database)
    
    df['board'] = df['fen'].apply(fen_to_board)
    df['encoded_board'] = df['board'].apply(encode_board)
    
    df['next_fen'] = df['fen'].shift(-1)
    df = df.dropna(subset=['next_fen']).reset_index(drop=True)
    
    df['move'] = df.apply(lambda row: find_move(row['fen'], row['next_fen']), axis=1)
    df = df.dropna(subset=['move']).reset_index(drop=True)
    
    df['encoded_move'] = df.apply(lambda row: encode_move(row['move'], row['board']), axis=1)
    df.dropna(subset=['encoded_move'],inplace=True)
    df.reset_index(inplace=True, drop=True)
    df = df[['encoded_board', 'encoded_move']]
    
    df.to_pickle(output_file)
    

In [6]:
#Implement loop for step-wise data processing
for start_index in range(0, n_observations, step_size):
    
    output_file = f"{folder_path}/processed_step_{start_index}.pkl"
    
    # Process the current step and save to file
    process_data(database, start_index, step_size, output_file)
    output_files.append(output_file)

In [10]:
#Obtaining file list
folder_name = "10M"

#Creating the directories
folder_path = f"../../data/cleaned_data/{folder_name}" 

def list_files_in_folder(folder_path):
    # Get a list of all files in the folder
    files = os.listdir(folder_path)
    
    files_with_path = [os.path.join(folder_path, file) for file in files]
    
    return files_with_path

# Example usage:
output_files = list_files_in_folder(folder_path)
print(output_files)


['../../data/cleaned_data/10M\\processed_step_0.pkl', '../../data/cleaned_data/10M\\processed_step_10000.pkl', '../../data/cleaned_data/10M\\processed_step_100000.pkl', '../../data/cleaned_data/10M\\processed_step_1000000.pkl', '../../data/cleaned_data/10M\\processed_step_1010000.pkl', '../../data/cleaned_data/10M\\processed_step_1020000.pkl', '../../data/cleaned_data/10M\\processed_step_1030000.pkl', '../../data/cleaned_data/10M\\processed_step_1040000.pkl', '../../data/cleaned_data/10M\\processed_step_1050000.pkl', '../../data/cleaned_data/10M\\processed_step_1060000.pkl', '../../data/cleaned_data/10M\\processed_step_1070000.pkl', '../../data/cleaned_data/10M\\processed_step_1080000.pkl', '../../data/cleaned_data/10M\\processed_step_1090000.pkl', '../../data/cleaned_data/10M\\processed_step_110000.pkl', '../../data/cleaned_data/10M\\processed_step_1100000.pkl', '../../data/cleaned_data/10M\\processed_step_1110000.pkl', '../../data/cleaned_data/10M\\processed_step_1120000.pkl', '../..

In [11]:
#Generate the combined dataframe
final_df = pd.concat([pd.read_pickle(file) for file in output_files], ignore_index=True)


In [11]:
final_df.to_pickle("../../data/cleaned_data/cleaned_data_10M.pkl")

In [19]:
#Test cell to see if encoding - decoding provides the same result
# df['decoded_move'] = ""

# for i in range(len(df)):
#     df.loc[i, 'decoded_move'] = decode_move(df['encoded_move'][i])
    
# df[['move', 'encoded_move', 'decoded_move']]

Unnamed: 0,move,encoded_move,decoded_move
0,d7d5,3299,d7d5
1,c2c4,666,c2c4
2,e7e6,3372,e7e6
3,c4d5,1699,c4d5
4,e6d5,2851,e6d5
...,...,...,...
94908,g8h7,4023,g8h7
94909,c8f5,3749,c8f5
94910,g7g6,3502,g7g6
94911,f5f7,2421,f5f7
