# Initial Data Processing
---
The purpose of this notebook is to take our initial DeepMind dataset and process it, e.g., by removing board states with fewer than two moves.

We then create non-overlapping sample sets that we can use for train, evals, and test.

In [1]:
import ast
import pandas as pd

In [None]:
# Load the source CSV into a dataframe
file_path = './deepmind_data/deepmind_test_62k.csv'
df = pd.read_csv(file_path)

# These columns are read back as strings; parse each cell into a Python object
for column in ('Move', 'Win Probability'):
    df[column] = df[column].apply(ast.literal_eval)
print(f"Length of initial df:   {len(df)}")

# Keep only the rows whose 'Move' entry has at least 2 elements
has_multiple_moves = df['Move'].apply(len) >= 2
df = df[has_multiple_moves]
print(f"Length of processed df: {len(df)}")

Length of initial df:   62561
Length of processed df: 61833


In [3]:
def process_fen(fen):
    """Space out the board field of a FEN string, leaving the rest unchanged.

    The board field is everything before the first space. Each of its
    characters is separated from the next by a single space; the first space
    and everything after it are appended verbatim.

    Args:
        fen: The FEN string to process.

    Returns:
        The string with its board field spaced out. If ``fen`` contains no
        space, the whole string is treated as the board field.
    """
    # partition() yields an empty separator and remainder when no space is
    # present, so the whole input is spaced out instead of having its last
    # character split off (which is what slicing at find()'s -1 would do).
    board, separator, remainder = fen.partition(' ')
    return ' '.join(board) + separator + remainder

# Process the FEN strings
# df['FEN'] = df['FEN'].apply(process_fen)

In [None]:
# Build three sample sets (train / evals / test) and write them under ./deepmind_data/
# Shuffle all rows with a fixed seed, then renumber the index from 0
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Carve out consecutive, non-overlapping row ranges of the shuffled frame
train_df = df.iloc[0:20000]      # rows 0..19999      -> 'train'
evals_df = df.iloc[20000:21000]  # rows 20000..20999  -> 'evals'
test_df = df.iloc[21000:23000]   # rows 21000..22999  -> 'test'

# Write each subset to its own CSV file, without the index column
for subset, csv_path in (
    (train_df, './deepmind_data/train_20k.csv'),
    (evals_df, './deepmind_data/evals_1k.csv'),
    (test_df, './deepmind_data/test_2k.csv'),
):
    subset.to_csv(csv_path, index=False)