In [1]:
from parsita import *
from parsita.util import constant
import json

In [2]:
def formatannotations(annotations):
    """Collapse a sequence of parsed annotation pairs into one dict.

    Each element of ``annotations`` is indexed as ``[key, value]``. If a key
    appears more than once, the value from the last occurrence is kept.
    """
    formatted = {}
    for pair in annotations:
        key, value = pair[0], pair[1]
        formatted[key] = value
    return formatted

def formatgame(game):
    """Label the two positional parts of a parsed game.

    ``game[0]`` is returned under ``'moves'`` and ``game[1]`` under
    ``'outcome'``.
    """
    moves, outcome = game[0], game[1]
    return dict(moves=moves, outcome=outcome)

def formatentry(entry):
    """Label the two positional parts of a parsed PGN entry.

    ``entry[0]`` is returned under ``'annotations'`` and ``entry[1]`` under
    ``'game'``.
    """
    header, body = entry[0], entry[1]
    return dict(annotations=header, game=body)

def handleoptional(optionalmove):
    """Unwrap a zero-or-one element sequence.

    Returns the first element of ``optionalmove`` when it is non-empty,
    otherwise ``None``.
    """
    if len(optionalmove) == 0:
        return None
    return optionalmove[0]

In [3]:
# Define the PGN grammar bottom-up: tokens first, then annotations, moves,
# games, entries and finally the whole file.
#
# Operator notes (Python precedence, tightest first: `>>`/`<<`, `&`, `|`, `>`).
# Per parsita's documented combinators: `a >> b` keeps b's result, `a << b`
# keeps a's result, `a & b` sequences both into a list, `a | b` is an
# alternative, and `parser > func` passes the parsed value through `func`.

# tokens
quote = lit(r'"')
# A single space or a single newline; note this is exactly one character.
whitespace = lit(' ') | lit('\n')
# Tag name: one or more characters from U+0021, U+0023-U+005A, U+005E-U+007E,
# i.e. printable ASCII without space, '"' (U+0022) and '[', '\', ']'
# (U+005B-U+005D). The range `\u0021-\u0021` is just the single character '!'.
tag = reg(r'[\u0021-\u0021\u0023-\u005A\u005E-\u007E]+')
# Tag value: like `tag` but also allows space (U+0020) and everything up to
# U+10FFFF; '"', '[', '\' and ']' are still excluded.
# NOTE(review): the `+` requires at least one character, so an empty quoted
# value ("") does not match this token - confirm that is intended.
string = reg(r'[\u0020-\u0021\u0023-\u005A\u005E-\U0010FFFF]+')

# Annotations: [Foo "Super Awesome Information"]
# Because `>>`/`<<` bind tighter than `&`, this groups as
#   ('[' >> tag << ' ') & ((quote >> string << quote) << ']')
# so the brackets, the separating space and the quotes are dropped and the
# result is the pair [tag, string].
annotation = '[' >> (tag) << ' ' & (quote >> string << quote) << ']'
# Annotations separated by single newlines, folded into a dict.
annotations = repsep(annotation, '\n') > formatannotations

# Moves are more complicated
# The character class contains neither 'O' nor '-', so it cannot consume
# castling or null-move text.
regularmove = reg(r'[a-h1-8NBRQKx\+#=]+') # Matches more than just chess moves
longcastle = reg(r'O-O-O[+#]?') # match first to avoid castle matching spuriously
castle = reg(r'O-O[+#]?')
nullmove = lit('--') # Illegal move rarely used in annotations

# `longcastle` is listed ahead of `castle` here, matching the note above.
move = regularmove | longcastle | castle | nullmove

# Build up the game
# Digits followed by '.' and exactly one whitespace character, converted to int.
movenumber = (reg(r'[0-9]+') << '.' << whitespace) > int
# A turn is [number, first move, second move]; each move must be followed by
# one whitespace character. The second move is optional and handleoptional
# turns a missing one into None.
turn = movenumber & (move << whitespace) & (opt(move << whitespace) > handleoptional)

# Result tokens that terminate the move list.
draw = lit('1/2-1/2')
white = lit('1-0')
black = lit('0-1')
outcome = draw | white | black

# Zero or more turns followed by an outcome, reshaped into
# {'moves': ..., 'outcome': ...} by formatgame.
game = (rep(turn) & outcome) > formatgame

# A PGN entry is annotations and the game
# Any run of whitespace characters after each part is consumed and discarded.
entry = ((annotations << rep(whitespace)) & (game << rep(whitespace))) > formatentry

# A file is repeated entries
file = rep(entry)

In [4]:
# Read the whole PGN file first, then run the `file` grammar over its text.
# or_die() is called on the parse result to obtain the parsed entries.
with open('twic1368.pgn', 'r') as f:
    pgntext = f.read()
parsedoutput = file.parse(pgntext).or_die()

In [5]:
# How many entries (games) did the `file` grammar parse from the input?
len(parsedoutput)

4408

In [6]:
# Structure of a parsed entry, using index 100 as an example: an 'annotations'
# dict plus a 'game' dict holding 'moves' and 'outcome'.
parsedoutput[100]

{'annotations': {'Event': 'Lozovatsky Mem A 2021',
  'Site': 'Chelyabinsk RUS',
  'Date': '2021.01.18',
  'Round': '5.18',
  'White': 'Mischuk,D',
  'Black': 'Bryakin,M',
  'Result': '1-0',
  'WhiteTitle': 'IM',
  'BlackTitle': 'IM',
  'WhiteElo': '2364',
  'BlackElo': '2448',
  'ECO': 'D15',
  'Opening': 'QGD Slav',
  'Variation': '4.Nc3',
  'WhiteFideId': '14118734',
  'BlackFideId': '4165314',
  'EventDate': '2021.01.15'},
 'game': {'moves': [[1, 'd4', 'd5'],
   [2, 'c4', 'c6'],
   [3, 'Nc3', 'Nf6'],
   [4, 'Nf3', 'a6'],
   [5, 'a4', 'e6'],
   [6, 'g3', 'c5'],
   [7, 'Bg2', 'dxc4'],
   [8, 'dxc5', 'Qxd1+'],
   [9, 'Nxd1', 'Bd7'],
   [10, 'Ne5', 'Nc6'],
   [11, 'Nxc6', 'Bxc6'],
   [12, 'Bxc6+', 'bxc6'],
   [13, 'Bd2', 'Ne4'],
   [14, 'Rc1', 'Nxd2'],
   [15, 'Kxd2', 'O-O-O+'],
   [16, 'Ke3', 'Bxc5+'],
   [17, 'Kf3', 'Kb7'],
   [18, 'Rxc4', 'Be7'],
   [19, 'Nc3', 'Rd2'],
   [20, 'Rb1', 'f5'],
   [21, 'a5', 'Rhd8'],
   [22, 'Na4', 'R8d4'],
   [23, 'b3', 'Rxc4'],
   [24, 'bxc4+', 'Kc7'],