In [35]:
# Author: Kerem Kazan
# Title: Chess Commentator Transformer - Data Mining

In [36]:
# !pip install bs4
# !pip install selenium

# Scrape Links

The data we need is not publicly available. We will have to mine it ourselves. We will scrape the links of the games from the gameknot website. Then, we will visit them individually, and pull the data we need.

In [37]:
import requests
from bs4 import BeautifulSoup
import json
import time

rootUrl="https://gameknot.com"

# The listing pages are numbered from 0; scraping stops before this index.
NUM_LIST_PAGES = 313
# Give up on a single HTTP request after this many seconds instead of hanging.
REQUEST_TIMEOUT_SECONDS = 30

# We create a state file to allow us to resume scraping - because we will often
# get rate-limited and our script will break.

state_file = "./dataset/data/state/link_state.json"
try:
  with open(state_file) as sf:
    state = json.load(sf)
except FileNotFoundError:
  # First run: nothing has been scraped yet.
  state = {}

# "last_page_index" is the last page that was scraped completely. We resume
# from that same page, so its links may be appended a second time; the
# deduplication step that follows takes care of such repeats.
last_page_index = state.get("last_page_index", 0)

data_file = "./dataset/data/gameknot/saved_links.txt"

with open(data_file, "a") as df:
  for pageIndex in range(last_page_index, NUM_LIST_PAGES):
    print(f"Processing page {pageIndex}")
    # sleep for 2 seconds to avoid being blocked
    time.sleep(2)

    pageUrl = f"{rootUrl}/list_annotated.pl?u=all&c=0&sb=0&rm=0&rn=0&rx=9999&sr=0&p={pageIndex}"
    try:
      r = requests.get(pageUrl, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as e:
      # Network failure or timeout: stop here, the state file lets us resume.
      print(f"Error on page {pageIndex}: {e}")
      break
    if r.status_code != 200:
      print(f"Error on page {pageIndex}: {r.status_code}")
      break

    soup = BeautifulSoup(r.content, 'html.parser')
    for elem in soup.find_all('tr', ["evn_list", "odd_list"]):
      listOfLinks = elem.find_all('a')
      # The game link is the second anchor of the row; skip malformed rows
      # instead of crashing the whole scrape.
      if len(listOfLinks) < 2:
        print("No game link found in row, skipping")
        continue
      link = listOfLinks[1].get('href') or ""
      if 'gm=' in link:
        # get the gm query string
        gm_query = link.split('gm=')[1]
        gm_url = f"https://gameknot.com/annotate.pl?id={gm_query}&mv=1"
        print(f"--{gm_url}")
        df.write(gm_url+"\n")
      else:
        print(f"No gm query string found for {link}")

    # Make sure the links are on disk before recording the page as done.
    df.flush()
    state["last_page_index"] = pageIndex
    with open(state_file, "w") as sf:
      json.dump(state, sf)
      
# Read every scraped link. Whitespace is stripped so that "url" and "url\n"
# compare equal, and blank lines are dropped.
with open("./dataset/data/gameknot/saved_links.txt", "r") as f:
  links = [line.strip() for line in f if line.strip()]

print("before deduplication: ", len(links))

# dict.fromkeys removes duplicates while keeping first-seen order, so the
# output file is identical from run to run (a set would scramble the order).
links = list(dict.fromkeys(links))
print("after deduplication: ", len(links))

# rewrite the file, one link per line
with open("./dataset/data/gameknot/saved_links_deduped.txt", "w") as f:
  for link in links:
    f.write(link + "\n")

Processing page 1
--https://gameknot.com/annotate.pl?id=71945&mv=1
--https://gameknot.com/annotate.pl?id=71941&mv=1
--https://gameknot.com/annotate.pl?id=71931&mv=1
--https://gameknot.com/annotate.pl?id=71925&mv=1
--https://gameknot.com/annotate.pl?id=71921&mv=1
--https://gameknot.com/annotate.pl?id=71906&mv=1
--https://gameknot.com/annotate.pl?id=71903&mv=1
--https://gameknot.com/annotate.pl?id=71901&mv=1
--https://gameknot.com/annotate.pl?id=71882&mv=1
--https://gameknot.com/annotate.pl?id=71877&mv=1
--https://gameknot.com/annotate.pl?id=71876&mv=1
--https://gameknot.com/annotate.pl?id=71872&mv=1
--https://gameknot.com/annotate.pl?id=71860&mv=1
--https://gameknot.com/annotate.pl?id=71852&mv=1
--https://gameknot.com/annotate.pl?id=71831&mv=1
--https://gameknot.com/annotate.pl?id=71823&mv=1
--https://gameknot.com/annotate.pl?id=71816&mv=1
--https://gameknot.com/annotate.pl?id=71797&mv=1
--https://gameknot.com/annotate.pl?id=71794&mv=1
--https://gameknot.com/annotate.pl?id=71760&mv=1
--

KeyboardInterrupt: 

# Pull PGNs

PGN is a standard format for representing chess games. It contains the moves of the game, and optional comments and metadata. The data we need is behind a button and a js-activated popup. We will use selenium to click the button and get the PGN for each game.

In [38]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Let's first create our helper functions:

def click_save_export_with_retry(url: str, max_retries: int = 3):
    """Fetch the PGN for `url`, retrying when an attempt fails.

    An attempt counts as failed when `click_save_export` raises or when it
    returns nothing (it reports its own errors and returns None in that case).

    Args:
        url: Address of the annotated-game page.
        max_retries: Maximum number of attempts.

    Returns:
        The PGN text, or None if every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            pgn_text = click_save_export(url)
        except Exception as e:
            print(f"Error: {e}.")
            pgn_text = None

        if pgn_text:
            return pgn_text

        # Back off before the next attempt (50s, 25s, ...); there is no point
        # in waiting after the final one.
        if attempt < max_retries - 1:
            print("Retrying...")
            time.sleep(50 / (attempt + 1))

    return None

def click_save_export(url: str):
    """Open `url` in headless Chrome and read the game's PGN from the page.

    The page is driven in three steps: click the "Save/Export" link, click the
    "Get" PGN link, then read the value of the element with id "pgn_code".

    Args:
        url: Address of the annotated-game page.

    Returns:
        The PGN text. If any step raises, the error is printed and the
        function falls through, returning None.
    """
    # Browser configuration ("--headless" can be dropped to watch the browser).
    opts = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
        opts.add_argument(flag)

    # Service() can be given an executable_path if needed.
    browser = webdriver.Chrome(service=Service(), options=opts)

    save_export_locator = (
        By.XPATH,
        '//a[@onclick="show_save_export_menu(this)" and contains(text(), "Save/Export")]',
    )
    get_pgn_locator = (
        By.XPATH,
        '//a[@onclick="show_pgn()" and contains(text(), "Get")]',
    )
    pgn_box_locator = (By.ID, "pgn_code")

    try:
        browser.get(url)

        # Every lookup below waits up to 15 seconds for its element.
        waiter = WebDriverWait(browser, 15)

        waiter.until(EC.element_to_be_clickable(save_export_locator)).click()
        print("Clicked the Save/Export button.")

        waiter.until(EC.element_to_be_clickable(get_pgn_locator)).click()
        print("Clicked the Get PGN button.")

        # The PGN is held in the value of the "pgn_code" element.
        pgn_box = waiter.until(EC.presence_of_element_located(pgn_box_locator))
        pgn_text = pgn_box.get_attribute("value")
        print("Extracted PGN text:\n")

        return pgn_text

    except Exception as e:
        print(f"Error: {e}")

    finally:
        # Always shut the browser down, whether or not the scrape succeeded.
        browser.quit()


In [39]:
# Take it for a spin:
# Fetch the PGN of a single game (id=404) to check the helper end to end.
# click_save_export prints its own error and returns None when a step fails,
# in which case this cell prints "None".

pgn_text = click_save_export("https://gameknot.com/annotate.pl?id=404&mv=1")
print(pgn_text)

Clicked the Save/Export button.
Clicked the Get PGN button.
Extracted PGN text:

[Event "Let's play chess"]
[Site "http://gameknot.com/"]
[Date "08-Jul-06"]
[Round "-"]
[White "monitache"]
[Black "abro"]
[Result "0-1"]
[WhiteElo "1543"]
[BlackElo "1595"]
[TimeControl "7 days per move"]

1. d4 Nf6 2. c4 e5 {Budapest Gambit. Sharp and full of traps.} 3. e3 {White
declines Gambit.} exd4 4. exd4 Bb4+ 5. Bd2 Bxd2+ 6. Nxd2 O-O {Rook comes
to e-file, while White King is still in center.} 7. Bd3 Re8+ 8. Ne2 {=}
Ng4 {?! This is not good move. But playing risky and unnatural moves, make
opponent astray.} 9. Nf3 Nc6 10. O-O d5 {This is good move as breaks center
and free the bishop.} 11. c5 {White's bishop is pretty strong, Black should
do something about it.} Nb4 12. Bb1 {Now a1 rook is temoparay out of play.}
b6 {Black still wants to break center, but this moves intends something
else too.} 13. a3 Ba6 {Bishop attacks knight ... fork ... and win.} 14.
Ne5 {?} Qh4 {This is not best. Best one was 

In [40]:
# Let's iterate through the links we saved earlier, and pull the PGNs:

import json
import os 

with open("./dataset/data/gameknot/saved_links.txt", "r") as f:
  # Strip the trailing newline from every line so it does not end up in the
  # URL handed to the browser. Lines are not filtered here, so indices stay
  # aligned with the index stored in the state file.
  links = [line.strip() for line in f]

print(len(links))

state_file = "./dataset/data/state/pgn_state.json"
try:
  with open(state_file) as sf:
    state = json.load(sf)
except FileNotFoundError:
  # First run: nothing has been downloaded yet.
  state = {}
last_page_index = state.get("last_page_index", 0)

pgn_dir = "./dataset/data/gameknot/pgn"
os.makedirs(pgn_dir, exist_ok=True)

print(f"Processing {len(links)} games, starting from page {last_page_index}")

for url_index in range(last_page_index, len(links)):
    url = links[url_index]
    print(f"Processing page {url_index} - {url}")

    # Blank or malformed lines carry no game id; skip them.
    if "id=" not in url:
      print(f"No game id found in '{url}', skipping")
      continue
    game_id = url.split("id=")[1].split("&")[0]

    target_file_name = f"{pgn_dir}/{game_id}.pgn"

    # check if the file already exists (also covers duplicate links)
    if os.path.exists(target_file_name):
      print(f"File {target_file_name} already exists, skipping")
      continue
    else:
      print(f"...downloading: {url}")

    pgn_text = click_save_export_with_retry(url)

    if not pgn_text:
      print(f"No PGN text found for {url}, skipping")
      continue

    # Write as UTF-8: the PGN files are read back with encoding="utf-8" later.
    with open(target_file_name, "w", encoding="utf-8") as f:
      f.write(pgn_text)
    print(f"Saved game {game_id}")

    # Record progress after every saved game so an interrupted run can resume.
    state["last_page_index"] = url_index
    with open(state_file, "w") as sf:
      json.dump(state, sf)
    


760
Processing 760 games, starting from page 5
Processing page 5 - https://gameknot.com/annotate.pl?id=72336&mv=1

File ./dataset/data/gameknot/pgn/72336.pgn already exists, skipping
Processing page 6 - https://gameknot.com/annotate.pl?id=72332&mv=1

...downloading: https://gameknot.com/annotate.pl?id=72332&mv=1

Clicked the Save/Export button.
Clicked the Get PGN button.
Extracted PGN text:

Saved game 72332
Processing page 7 - https://gameknot.com/annotate.pl?id=72330&mv=1

...downloading: https://gameknot.com/annotate.pl?id=72330&mv=1

Clicked the Save/Export button.
Clicked the Get PGN button.
Extracted PGN text:

Saved game 72330
Processing page 8 - https://gameknot.com/annotate.pl?id=72327&mv=1

...downloading: https://gameknot.com/annotate.pl?id=72327&mv=1

Clicked the Save/Export button.
Clicked the Get PGN button.
Extracted PGN text:

Saved game 72327
Processing page 9 - https://gameknot.com/annotate.pl?id=72320&mv=1

...downloading: https://gameknot.com/annotate.pl?id=72320&m

KeyboardInterrupt: 

## A quick primer on Chess notation

There are a few modern standards for Chess notation:

- FEN notation: A cryptic string that describes the board state. For example: "rnb1kbnr/ppp2ppp/8/3qp3/8/5N2/PPPP1PPP/RNBQKB1R w KQkq - 0 4". This string not only contains where each piece is, but also the side to move, castling rights, en passant target, halfmove clock, and fullmove number.

- SAN notation: Standard Algebraic Notation - a human-readable notation for moves. For example: "1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. O-O". This notation is used to describe a sequence of moves in a game: A "full move number" is written first, followed by the moves played on that turn - first white, then black. Castling is written "O-O" (kingside) or "O-O-O" (queenside).

- PGN notation: Portable Game Notation - a standard notation for games. Contains a series of SAN notations, but also contains comments.

- UCI notation: A compact notation for moves. For example: "e2e4". This notation is used to describe that a piece moved from the e2 square to the e4 square - which is likely a pawn move - but it could also be a rook move or a queen move. It all depends on the current board state.

## Now What?

Our raw dataset is in PGN format. The good news is we can use a very popular library called `python-chess` to parse PGNs, extract the moves, and convert back and forth between PGN and FEN notations. This will make our data more portable and easier to work with.

# Build CSV

We will now build a CSV file that contains the FENs and the comments for each move.

In [41]:
import chess.pgn
import glob
import csv

# We will do some light pre-processing to remove bad rows. The following notebooks will keep gradually refining the dataset. For now, we will remove known common unwanted words, emotes, non-ascii characters, and short comments. The dataset also contains non-english comments, which of course we will remove.

import re
import chess.pgn
import glob
import csv

# Directory holding the downloaded PGN files (one file per game id).
GAMES_DIR = "./dataset/data/gameknot/pgn" 

# Destination of the CSV with one row per kept (position, comment) pair.
ANNOTATED_POSITIONS_OUTPUT_CSV = "./dataset/data/gameknot_annotated_positions.csv"

# Module-level counters updated by extract_annotated_moves as it filters
# comments. They accumulate across calls and are never reset by that function.
stats = {
  'num_input': 0,  # non-empty comments seen after the first full move
  'num_comments': 0,  # comments that passed every filter and were kept
  'num_skipped_comments_with_emotes': 0,  # rejected by has_emotes
  'num_skipped_comments_too_short': 0,  # fewer than two words
  'num_skipped_comments_no_alphabetic_characters': 0,  # no letter at all
  'num_skipped_comments_non_english': 0,  # language detected as not 'en'
  'num_skipped_comments_non_ascii': 0,  # contains a non-ASCII character
}

def sanitize_comment(comment):
  """Normalise whitespace and drop leading symbol-only tokens from a comment.

  Leading tokens that contain no letter or digit at all (annotation glyphs
  such as "?!", "=" or "--") are removed. Tokens that merely carry
  punctuation ("White's", "Well,") are kept: testing the whole token with
  str.isalnum() would discard them, and could erase an entire sentence.

  Args:
    comment: Raw comment text.

  Returns:
    The comment with runs of whitespace collapsed to single spaces and the
    leading symbol-only tokens removed; "" if nothing is left.
  """
  words = comment.split()

  # Index of the first token that contains at least one letter or digit.
  first_real = 0
  while first_real < len(words) and not any(ch.isalnum() for ch in words[first_real]):
    first_real += 1

  return ' '.join(words[first_real:])

def has_emotes(text):
  """Tell whether `text` contains an emoticon or a chat-style laugh.

  Recognised are emoticons such as ":)", ";-D" or ": )", and the standalone
  words lol, lmao, lmfao, rofl and roflmao in any letter case.

  Returns:
    True if any of them occurs in `text`, otherwise False.
  """
  emoticon_patterns = (r'[:;][-~]?[)D]', r':\s*\)')
  if any(re.search(pattern, text) for pattern in emoticon_patterns):
    return True

  # Laugh words are matched on the lower-cased text, as whole words only.
  laugh = re.search(r'\b(lol|lmao|lmfao|rofl|roflmao)\b', text.lower())
  return bool(laugh)
  

def extract_annotated_moves(pgn_path, debug=False):
  """Extract the commented mainline moves of the first game in a PGN file.

  Each mainline move is replayed on a board. A move is kept when it has a
  comment, is not part of the first full move, and the sanitized comment
  passes every filter: no emotes, ASCII only, at least two words, at least
  one letter, and detected as English. The module-level `stats` counters are
  updated along the way.

  Args:
    pgn_path: Path of the PGN file. Only the first game in it is read.
    debug: When True, print the reason for every skipped comment.

  Returns:
    A list of dicts with the keys "fen_before", "fen_after", "uci",
    "comment", "is_white" ("true"/"false"), "full_move_number" and "game".
    The list is empty when the file contains no game.
  """
  from langdetect import DetectorFactory, LangDetectException, detect

  # langdetect is randomised by default; a fixed seed makes the English
  # filter give the same answer on every run.
  DetectorFactory.seed = 0

  all_data = []
  with open(pgn_path, encoding="utf-8", errors="ignore") as f:
      game = chess.pgn.read_game(f)

      # read_game returns None when the file holds no game (e.g. it is empty).
      if game is None:
        if debug:
          print("--- Skipping file: no game found: ", pgn_path)
        return all_data

      board = game.board()
      for node in game.mainline():

        # Capture the state before the move is played on the board.
        move = node.move
        fen_before = board.fen()
        uci = move.uci()
        comment = node.comment
        full_move_number = board.fullmove_number
        is_first_full_move = full_move_number == 1
        is_white_move = board.turn == chess.WHITE

        board.push(move)
        fen_after = board.fen()

        if not comment:
          continue

        # Comments on the first full move are dropped and not counted.
        if is_first_full_move:
          if debug:
            print("--- Skipping first move: ", comment)
          continue

        comment = sanitize_comment(comment)
        stats['num_input'] += 1

        if has_emotes(comment):
          stats['num_skipped_comments_with_emotes'] += 1
          if debug:
            print("--- Skipping comment: has emotes: ", comment)
          continue

        # if contains non-ascii characters, skip it
        if not comment.isascii():
          stats['num_skipped_comments_non_ascii'] += 1
          if debug:
            print("--- Skipping comment: non-ascii: ", comment)
          continue

        if len(comment.split()) < 2:
          stats['num_skipped_comments_too_short'] += 1
          if debug:
            print("--- Skipping comment: too short: ", comment)
          continue

        # if comment doesn't contain any alphabetic characters, skip it
        if not any(c.isalpha() for c in comment):
          stats['num_skipped_comments_no_alphabetic_characters'] += 1
          if debug:
            print("--- Skipping comment: no alphabetic characters: ", comment)
          continue

        # A comment langdetect cannot classify is treated as non-English
        # rather than aborting the whole extraction.
        try:
          is_english = detect(comment) == 'en'
        except LangDetectException:
          is_english = False

        if not is_english:
          stats['num_skipped_comments_non_english'] += 1
          if debug:
            print(f"--- Non-english comment: {comment}")
          continue

        all_data.append({
            "fen_before": fen_before,
            "fen_after": fen_after,
            "uci": uci,
            "comment": comment,
            "is_white": "true" if is_white_move else "false",
            "full_move_number": full_move_number,
            "game": pgn_path,
        })
        stats['num_comments'] += 1

  return all_data

In [42]:
# The cell above prepared the helper functions. Now, let's go through all the PGNs we just downloaded and prepare our csv:

# Reset the counters so that re-running this cell does not add to the totals
# of a previous run.
for stat_name in stats:
    stats[stat_name] = 0

# Get all PGN files in the directory. glob returns them in no particular
# order, so sort to make the row order of the CSV reproducible.
pgn_files = sorted(glob.glob(f"{GAMES_DIR}/*.pgn"))

# Process each file and combine results
all_moves = []

print(f"Processing {len(pgn_files)} files...")
file_count = 0
for file_count, pgn_file in enumerate(pgn_files, start=1):
    all_moves.extend(extract_annotated_moves(pgn_file))
    if file_count % 100 == 0:
      print(f"Processed {file_count}/{len(pgn_files)} files...")

print(len(all_moves))

print("Eliminated: ", stats['num_input'] - stats['num_comments'])
print("Saved: ", stats['num_comments'])


with open(ANNOTATED_POSITIONS_OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["fen_before", "fen_after", "uci", "comment", "full_move_number", "is_white", "game"])
    writer.writeheader()
    writer.writerows(all_moves)

print(f"\nSaved {len(all_moves)} rows to {ANNOTATED_POSITIONS_OUTPUT_CSV}")


Processing 17 files...
515
Eliminated:  167
Saved:  515

Saved 515 rows to ./dataset/data/gameknot_annotated_positions.csv


In [None]:
# This concludes our data-mining process.