In [9]:
import pandas as pd
import numpy as np
import re
import chess.pgn
import io

### Formatting Data
Pandas dataframe. Currently, data is stored as a list of strings.

Game Dataframe:
- Player names and elos
- PGN
- Opening

Move Dataframe:
- Player name and elo
- Who is to move
- White time remaining
- Black time remaining
- White total material
- Black total material
- Current position
- Number of reasonable engine recommended moves (maybe?)
- Position complexity
- Move time (response)

In [2]:
games = pd.DataFrame(columns=["White Name", "Black Name", "ECO Opening", "White Elo", "Black Elo", "PGN", "Game Length"])

with open("games_condensed.txt", "r") as file:
    for line in file:
        line = line.split(",")

        # Don't include wacky variants like Chess960
        if "Variant" in line[3]:
            continue
        
        # Remove unnecessary features
        indices=[2, 3, 4, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 19]  # Manually found
        for i in sorted(indices, reverse=True):
            del line[i]

        # Parse strings to get valuable information
        pattern = r'\\"(.*?)\\"'  # This string pattern encloses all important info
        for i in range(0, 5):
            match = re.search(pattern, line[i])
            if match:
                line[i] = match.group(1)
        
        pattern = r'(\d+)\. '
        match = re.findall(pattern, line[5])
        line.append(int(match[-1]))
        
        if line[6] >= 5 and line[6] <= 80:
            games.loc[games.shape[0]] = line

In [3]:
games.head()

Unnamed: 0,White Name,Black Name,ECO Opening,White Elo,Black Elo,PGN,Game Length
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12
1,colinsong1,Octopus6666,B22,1596,1600,"""1. e4 {[%clk 0:03:00]} 1... c5 {[%clk 0:03:0...",36
2,TimeFish_576,kimmmanhhh,A21,1564,1522,"""1. c4 {[%clk 0:03:00]} 1... e5 {[%clk 0:03:0...",64
3,TimeFish_576,Tavusheci,A15,1548,1563,"""1. c4 {[%clk 0:03:00]} 1... c5 {[%clk 0:02:5...",27
4,Octopus6666,S11chandru,D02,1625,1607,"""1. d4 {[%clk 0:03:00]} 1... d5 {[%clk 0:03:0...",42


## PGN to FEN Files

In [6]:
PGN_lst = games['PGN']

In [7]:
games['FEN'] = 0
games.head()

Unnamed: 0,White Name,Black Name,ECO Opening,White Elo,Black Elo,PGN,Game Length,FEN
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12,0
1,colinsong1,Octopus6666,B22,1596,1600,"""1. e4 {[%clk 0:03:00]} 1... c5 {[%clk 0:03:0...",36,0
2,TimeFish_576,kimmmanhhh,A21,1564,1522,"""1. c4 {[%clk 0:03:00]} 1... e5 {[%clk 0:03:0...",64,0
3,TimeFish_576,Tavusheci,A15,1548,1563,"""1. c4 {[%clk 0:03:00]} 1... c5 {[%clk 0:02:5...",27,0
4,Octopus6666,S11chandru,D02,1625,1607,"""1. d4 {[%clk 0:03:00]} 1... d5 {[%clk 0:03:0...",42,0


## This chunk adds a complete FEN_lst for every game.

In [10]:
for i in range(len(PGN_lst)):
    this_game = chess.pgn.read_game(io.StringIO(PGN_lst[i]))
    
    # For each game, iterate through all moves and play them on a board.
    fen_lst = []
    board = this_game.board()
    for move in this_game.mainline_moves():
        board.push(move)
        fen_lst.append(board.fen())
    games['FEN'][i] = fen_lst

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games['FEN'][i] = fen_lst
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [11]:
games['FEN'][0]

['rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq - 0 1',
 'rnbqkbnr/pppppp1p/6p1/8/4P3/8/PPPP1PPP/RNBQKBNR w KQkq - 0 2',
 'rnbqkbnr/pppppp1p/6p1/8/4P3/2N5/PPPP1PPP/R1BQKBNR b KQkq - 1 2',
 'rnbqk1nr/ppppppbp/6p1/8/4P3/2N5/PPPP1PPP/R1BQKBNR w KQkq - 2 3',
 'rnbqk1nr/ppppppbp/6p1/8/2B1P3/2N5/PPPP1PPP/R1BQK1NR b KQkq - 3 3',
 'rnbqk1nr/pppp1pbp/4p1p1/8/2B1P3/2N5/PPPP1PPP/R1BQK1NR w KQkq - 0 4',
 'rnbqk1nr/pppp1pbp/4p1p1/8/2B1P3/2NP4/PPP2PPP/R1BQK1NR b KQkq - 0 4',
 'rnbqk2r/ppppnpbp/4p1p1/8/2B1P3/2NP4/PPP2PPP/R1BQK1NR w KQkq - 1 5',
 'rnbqk2r/ppppnpbp/4p1p1/8/2B1P3/2NPB3/PPP2PPP/R2QK1NR b KQkq - 2 5',
 'rnbq1rk1/ppppnpbp/4p1p1/8/2B1P3/2NPB3/PPP2PPP/R2QK1NR w KQ - 3 6',
 'rnbq1rk1/ppppnpbp/4p1p1/8/2B1P3/2NPB3/PPPQ1PPP/R3K1NR b KQ - 4 6',
 'r1bq1rk1/ppppnpbp/2n1p1p1/8/2B1P3/2NPB3/PPPQ1PPP/R3K1NR w KQ - 5 7',
 'r1bq1rk1/ppppnpbp/2n1p1p1/8/2B1P3/2NPB3/PPPQ1PPP/2KR2NR b - - 6 7',
 'r1bq1rk1/ppp1npbp/2n1p1p1/3p4/2B1P3/2NPB3/PPPQ1PPP/2KR2NR w - - 0 8',
 'r1bq1rk1/ppp1npbp/2n1p1p1/3P4/2B5/

In [12]:
# Use the explode() method to split the FEN list into separate rows
new_games = games.explode('FEN')
new_games

Unnamed: 0,White Name,Black Name,ECO Opening,White Elo,Black Elo,PGN,Game Length,FEN
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12,rnbqkbnr/pppppp1p/6p1/8/4P3/8/PPPP1PPP/RNBQKBN...
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12,rnbqkbnr/pppppp1p/6p1/8/4P3/2N5/PPPP1PPP/R1BQK...
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12,rnbqk1nr/ppppppbp/6p1/8/4P3/2N5/PPPP1PPP/R1BQK...
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12,rnbqk1nr/ppppppbp/6p1/8/2B1P3/2N5/PPPP1PPP/R1B...
...,...,...,...,...,...,...,...,...
4573,vinicius-valle,bayesschack,B23,1557,1574,"""1. e4 {[%clk 0:03:00]} 1... c5 {[%clk 0:02:5...",24,r2qr1k1/3b2b1/4pnp1/2pP2Np/7P/pP3Q2/P1B2PP1/BR...
4573,vinicius-valle,bayesschack,B23,1557,1574,"""1. e4 {[%clk 0:03:00]} 1... c5 {[%clk 0:02:5...",24,r2qr1k1/3b2b1/4p1p1/2pn2Np/7P/pP3Q2/P1B2PP1/BR...
4573,vinicius-valle,bayesschack,B23,1557,1574,"""1. e4 {[%clk 0:03:00]} 1... c5 {[%clk 0:02:5...",24,r2qr1k1/3b2b1/4p1B1/2pn2Np/7P/pP3Q2/P4PP1/BR3R...
4573,vinicius-valle,bayesschack,B23,1557,1574,"""1. e4 {[%clk 0:03:00]} 1... c5 {[%clk 0:02:5...",24,r2q1rk1/3b2b1/4p1B1/2pn2Np/7P/pP3Q2/P4PP1/BR3R...


In [13]:
new_games["Game"] = new_games.index
new_games

Unnamed: 0,White Name,Black Name,ECO Opening,White Elo,Black Elo,PGN,Game Length,FEN,Game
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,0
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12,rnbqkbnr/pppppp1p/6p1/8/4P3/8/PPPP1PPP/RNBQKBN...,0
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12,rnbqkbnr/pppppp1p/6p1/8/4P3/2N5/PPPP1PPP/R1BQK...,0
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12,rnbqk1nr/ppppppbp/6p1/8/4P3/2N5/PPPP1PPP/R1BQK...,0
0,colinsong1,TimeFish_576,B06,1604,1609,"""1. e4 {[%clk 0:03:00]} 1... g6 {[%clk 0:02:5...",12,rnbqk1nr/ppppppbp/6p1/8/2B1P3/2N5/PPPP1PPP/R1B...,0
...,...,...,...,...,...,...,...,...,...
4573,vinicius-valle,bayesschack,B23,1557,1574,"""1. e4 {[%clk 0:03:00]} 1... c5 {[%clk 0:02:5...",24,r2qr1k1/3b2b1/4pnp1/2pP2Np/7P/pP3Q2/P1B2PP1/BR...,4573
4573,vinicius-valle,bayesschack,B23,1557,1574,"""1. e4 {[%clk 0:03:00]} 1... c5 {[%clk 0:02:5...",24,r2qr1k1/3b2b1/4p1p1/2pn2Np/7P/pP3Q2/P1B2PP1/BR...,4573
4573,vinicius-valle,bayesschack,B23,1557,1574,"""1. e4 {[%clk 0:03:00]} 1... c5 {[%clk 0:02:5...",24,r2qr1k1/3b2b1/4p1B1/2pn2Np/7P/pP3Q2/P4PP1/BR3R...,4573
4573,vinicius-valle,bayesschack,B23,1557,1574,"""1. e4 {[%clk 0:03:00]} 1... c5 {[%clk 0:02:5...",24,r2q1rk1/3b2b1/4p1B1/2pn2Np/7P/pP3Q2/P4PP1/BR3R...,4573


In [14]:
#new_games.to_csv('new_games.csv', index=False)