This data is from https://www.kaggle.com/competitions/finding-elo/data.

Note that only the first 25000 entries can be used for training, because the last 25000 entries were reserved for the competition.

In [1]:
import pandas as pd

## Creating a pandas DataFrame from chess PGN

In [41]:
import traceback

def _parseGame(metadata: str, gamedata: str) -> list:
    '''
    Parses one game into the row [whiteElo, blackElo, whiteWins, moves].
    ---------
    metadata: the tag section of the game, one '[Name "value"]' pair per line.

    gamedata: the movetext of the game, e.g. '1. e4 e5 2. Nf3 1-0'.
    ---------
    Returns a list [int, int, int, str].
    Raises KeyError if the Result, WhiteElo or BlackElo tag is missing or the result
    is not one of "1-0", "1/2-1/2", "0-1"; raises ValueError if an Elo is not an integer.
    '''
    # Look tags up by name rather than by line position, so an extra or
    # reordered tag line cannot silently shift the values that are read.
    tags = {}
    for line in metadata.split("\n"):
        parts = line.split('"')
        if len(parts) >= 3:
            tags[parts[0].strip(" [")] = parts[1]

    white_elo = int(tags["WhiteElo"])
    black_elo = int(tags["BlackElo"])

    # 1 = white wins, 0 = draw, -1 = black wins. An unrecognised result raises
    # KeyError instead of being mislabelled as a black win.
    white_wins = {"1-0": 1, "1/2-1/2": 0, "0-1": -1}[tags["Result"]]

    # split() with no argument handles newlines and repeated spaces alike.
    # The last token is the result marker; tokens ending in "." are turn numbers.
    tokens = gamedata.split()
    moves = [token for token in tokens[:-1] if not token.endswith(".")]

    return [white_elo, black_elo, white_wins, " ".join(moves)]


def createDataframeFromPGN(filename: str, numIter: int = -1) -> pd.DataFrame:
    '''
    Creates a pandas dataframe from a file that uses chess Portable Game Notation (pgn).
    ---------
    filename: the name of the file to read into a DF.

    numIter: the number of games to read from the start of the file. The default of -1
             reads the whole file. Values larger than the number of games in the file
             are clamped to that number.
    ---------
    Returns pd.DataFrame of the chess data, with columns WhiteElo, BlackElo,
    WhiteWins (1 = white wins, 0 = draw, -1 = black wins) and Moves (the moves separated
    by spaces, without turn numbers or the result marker).

    The index of each row is the position of the game in the file. A game that cannot be
    parsed is reported (its position and the traceback are printed) and skipped, which
    leaves a gap in the index.
    '''

    # get data from file
    with open(filename, "r") as f:
        plaintext_data = f.read()

    # Sections are separated by blank lines and alternate between tag section and
    # movetext. Dropping every empty chunk (instead of calling remove("") once) works
    # whether or not the file ends with a blank line.
    stripped_chunks = [chunk.strip() for chunk in plaintext_data.split("\n\n")]
    sections = [chunk for chunk in stripped_chunks if chunk]

    # by default reads whole file; never read past the games that actually exist
    numAvailable = len(sections) // 2
    if numIter == -1:
        numDataPoints = numAvailable
    else:
        numDataPoints = min(numIter, numAvailable)

    # Collect the rows first and build the frame once: assigning df.loc[i] inside the
    # loop re-allocates the frame for every game.
    rows = []
    row_index = []

    for i in range(numDataPoints):
        try:
            rows.append(_parseGame(sections[i * 2], sections[i * 2 + 1]))
            row_index.append(i)
        except Exception:
            # best effort: report the broken game and carry on with the next one
            print(i)
            print(traceback.format_exc())
            continue

    return pd.DataFrame(rows, index=row_index,
                        columns=["WhiteElo", "BlackElo", "WhiteWins", "Moves"])


In [43]:
# data.txt holds 25000 entries that are usable for training, so parse only those.
df = createDataframeFromPGN(filename="data.txt", numIter=25000)

25000


In [44]:
# Preview the first rows to sanity-check the parsed columns.
df.head()


Unnamed: 0,WhiteElo,BlackElo,WhiteWins,Moves
0,2354,2411,0,Nf3 Nf6 c4 c5 b3 g6 Bb2 Bg7 e3 O-O Be2 b6 O-O ...
1,2523,2460,0,e4 e5 Nf3 Nf6 d4 Nxe4 Nxe5 d6 Nf3 d5 Bd3 Nd6 O-O
2,1915,1999,-1,e4 d5 exd5 Nf6 d4 Nxd5 Nf3 g6 Be2 Bg7 c4 Nb6 N...
3,2446,2191,1,c4 Nf6 Nc3 d6 d4 e5 Nf3 Nbd7 Bg5 Be7 e3 c6 Qc2...
4,2168,2075,1,e4 c5 Nf3 d6 b4 Nf6 bxc5 Nxe4 cxd6 Qb6 d4 Bg4 ...


## Combining Stockfish Data with PGN Data

In [50]:
stockfish_data = pd.read_csv("stockfish.csv")

# Column assignment aligns on the index, so row i of stockfish.csv is attached to game i.
# NOTE(review): assumes stockfish.csv lists the games in the same order as data.txt - confirm.
df['StockfishScores'] = stockfish_data['MoveScores']
df.to_csv('base_chess_data.csv')

# Keep the preview as the last expression of the cell; anywhere else its result is discarded.
df.head()

Unnamed: 0,WhiteElo,BlackElo,WhiteWins,Moves,StockfishScores
0,2354,2411,0,Nf3 Nf6 c4 c5 b3 g6 Bb2 Bg7 e3 O-O Be2 b6 O-O ...,18 17 12 8 -5 12 3 -2 22 21 20 13 8 21 11 3 -6...
1,2523,2460,0,e4 e5 Nf3 Nf6 d4 Nxe4 Nxe5 d6 Nf3 d5 Bd3 Nd6 O-O,26 44 26 18 14 34 36 31 37 35 42 52 55
2,1915,1999,-1,e4 d5 exd5 Nf6 d4 Nxd5 Nf3 g6 Be2 Bg7 c4 Nb6 N...,26 51 68 57 65 77 48 93 61 63 63 58 53 46 69 2...
3,2446,2191,1,c4 Nf6 Nc3 d6 d4 e5 Nf3 Nbd7 Bg5 Be7 e3 c6 Qc2...,2 21 5 53 35 45 37 54 10 22 8 48 30 17 13 35 -...
4,2168,2075,1,e4 c5 Nf3 d6 b4 Nf6 bxc5 Nxe4 cxd6 Qb6 d4 Bg4 ...,26 64 35 53 18 20 18 20 10 49 60 95 91 82 83 9...
