In [1]:
import numpy as np
import pandas as pd
import csv
import itertools
import torch
from torch.utils.data import DataLoader, Dataset

In [2]:
# Load the raw chess-games table. read_csv already returns a DataFrame, so
# the result does not need to be wrapped in pd.DataFrame a second time.
dataset = pd.read_csv("games.csv")

# Work on an explicit, independent copy so the filtering and column edits
# applied to `games` in later cells can never leak back into `dataset`.
games = dataset.copy()

In [3]:
# list the column names available in the freshly loaded frame
games.columns

Index(['id', 'rated', 'created_at', 'last_move_at', 'turns', 'victory_status',
       'winner', 'increment_code', 'white_id', 'white_rating', 'black_id',
       'black_rating', 'moves', 'opening_eco', 'opening_name', 'opening_ply'],
      dtype='object')

In [5]:
# show the number of rows before getting rid of games with <10 moves
len(games)

20058

In [6]:
# count the whitespace-separated moves recorded for each game and store
# the count in a helper column
games['num_moves'] = games['moves'].str.split().map(len)

In [7]:
# spot-check the move counts of the first ten games
print(games['num_moves'][:10])

0     13
1     16
2     61
3     61
4     95
5      5
6     33
7      9
8     66
9    119
Name: num_moves, dtype: int64


In [8]:
# keep only games with at least 10 moves (i.e. get rid of games with <10 moves)
games = games.query("num_moves >= 10")

In [10]:
# print the number of rows left after the short games were removed
print(len(games))

19441


In [11]:
# keep decisive games only: every row whose winner is 'draw' is discarded
games = games.loc[games['winner'].ne('draw')]

In [12]:
# show the number of rows left once the drawn games are gone
len(games)

18527

In [13]:
# discard every column that is not needed from here on, including the
# helper 'num_moves' count that was only used for filtering
games = games.drop(
    columns=[
        'victory_status',
        'opening_name',
        'id',
        'white_id',
        'black_id',
        'rated',
        'created_at',
        'last_move_at',
        'increment_code',
        'white_rating',
        'black_rating',
        'opening_eco',
        'opening_ply',
        'num_moves',
    ]
)

In [14]:
# confirm which columns are left after the drop
games.columns

Index(['turns', 'winner', 'moves'], dtype='object')

In [15]:
# The index labels below have gaps because the filtered-out rows (games with
# <10 moves, and draws) took their labels with them.
print(games.iloc[1:8]) # these are not numbered sequentially since we excluded games with <10 moves

   turns winner                                              moves
1     16  black  d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...
2     61  white  e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...
3     61  white  d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...
4     95  white  e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...
6     33  white  d4 d5 e4 dxe4 Nc3 Nf6 f3 exf3 Nxf3 Nc6 Bb5 a6 ...
8     66  black  e4 e5 Bc4 Nc6 Nf3 Nd4 d3 Nxf3+ Qxf3 Nf6 h3 Bc5...
9    119  white  e4 d5 exd5 Qxd5 Nc3 Qe5+ Be2 Na6 d4 Qf5 Bxa6 b...


In [16]:
# renumber the rows 0..n-1; drop=True discards the old index labels directly
# instead of inserting them as an extra 'index' column that then has to be
# dropped in a second step
games = games.reset_index(drop=True)

In [17]:
# same positional slice as before the index reset, for comparison
print(games.iloc[1:8]) # now they are numbered sequentially.

   turns winner                                              moves
1     16  black  d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...
2     61  white  e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...
3     61  white  d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...
4     95  white  e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...
5     33  white  d4 d5 e4 dxe4 Nc3 Nf6 f3 exf3 Nxf3 Nc6 Bb5 a6 ...
6     66  black  e4 e5 Bc4 Nc6 Nf3 Nd4 d3 Nxf3+ Qxf3 Nf6 h3 Bc5...
7    119  white  e4 d5 exd5 Qxd5 Nc3 Qe5+ Be2 Na6 d4 Qf5 Bxa6 b...


In [18]:
# Reduce every game to its first ten moves in one chained pass:
#   1. split the move string on whitespace into a list of moves,
#   2. keep at most the first ten entries of that list,
#   3. glue the survivors back together with single spaces.
games['moves'] = (
    games['moves']
    .str.split()
    .str[:10]
    .str.join(' ')
)

In [19]:
# inspect the truncated move string stored under index label 0
games['moves'][0]

'd4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+'

In [20]:
# Pull one token out of the move string at row label 0 and show its value
# and Python type.
# NOTE(review): despite the variable name, index 1 selects the SECOND token
# of the move string; index 0 would be the opening move. Confirm which one
# is intended.
first_move = games.at[0, 'moves'].split()[1]
print(first_move, type(first_move), sep='\n')

d5
<class 'str'>


In [22]:
# final check of the columns that will be written out
games.columns


Index(['turns', 'winner', 'moves'], dtype='object')

In [23]:
import pickle

# Persist the cleaned frame to disk. The with-block guarantees the file
# handle is flushed and closed even if pickling raises, instead of leaving
# an anonymous open() handle for the garbage collector to clean up.
with open("games.pkl", "wb") as pickle_file:
    pickle.dump(games, pickle_file)