# 3. Retrieve tournaments

Requirements

In [1]:
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


Imports

In [2]:
import os
import time

import pandas as pd
import numpy as np

import requests
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm
import logging
from bs4 import BeautifulSoup

import inflect
p = inflect.engine()

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

Logging configuration

In [3]:
# Only WARNING and above are emitted; each record is prefixed with its timestamp and severity
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.WARNING,
)
# Root logger, used by every function and cell below
logger = logging.getLogger()

Constants

In [4]:
# Data directories: defined relative to the notebook's working directory and
# created on first run if they do not exist yet
DATA_DIR = os.path.join("..", "data")
TEMP_DIR = os.path.join(DATA_DIR, "temp")
RAW_DIR = os.path.join(DATA_DIR, "raw")

for _directory in (DATA_DIR, TEMP_DIR, RAW_DIR):
    os.makedirs(_directory, exist_ok=True)

In [5]:
# Input / output files
# Ranked game list (input, read with pandas in the execution section)
TOP_GAMES_LIST_FILE = os.path.join(DATA_DIR, "boardgames_ranks.csv")
# NOTE(review): not referenced in the visible part of this notebook - confirm it is still needed
USERNAMES_FILE = os.path.join(TEMP_DIR, "usernames.json")
# Collected tournaments, one entry per game (output of the execution section)
TOURNAMENTS_FILE = os.path.join(RAW_DIR, "tournaments.json")

In [6]:
# General download parameters
BACKUP_PERIOD = 100 # Number of collected games between two saves to TOURNAMENTS_FILE
REQUEST_DELAY = 0.5    # Delay between requests, in seconds (passed to time.sleep)
MAX_RETRIES = 5 # Max number of attempts for a single page load

In [7]:
# If True, the first save replaces TOURNAMENTS_FILE and games already present in it are
# collected again; if False, already collected games are skipped and new data is merged in
OVERWRITE = False    # If True, the existing files will be overwritten

In [8]:
# Game interval definition
GAME_RANK_MIN = 1   # Rank of the first game to download
GAME_RANK_MAX = 2000    # Rank of the last game to download
GAME_NUM = GAME_RANK_MAX - GAME_RANK_MIN + 1    # Number of games to download

In [9]:
# Specific download parameters
MAX_TOURNAMENTS_NUM = 10 # Max number of tournaments to download for each game and each status

In [10]:
# URLs
BGA_BASE_URL = "https://boardgamearena.com"

Utility functions

In [11]:
def save_to_json(data, filename):
    """
    Serialize `data` as pretty-printed JSON and write it to `filename`.

    The file is created or truncated, encoded as UTF-8, indented with four
    spaces, and non-ASCII characters are written as-is rather than escaped.

    Parameters:
    data (any): JSON-serializable object to store.
    filename (str): Path of the file to write.

    Returns:
    None
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(filename, mode="w", encoding="utf-8") as json_file:
        json.dump(data, json_file, **dump_options)

In [12]:
def append_to_json(new_data, filename):
    """
    Merge `new_data` into the JSON list stored in `filename`, dropping duplicates.

    Two items are duplicates when their JSON serialization with sorted keys is
    identical. A missing file is treated as an empty list. The whole file is
    rewritten with the merged list (UTF-8, four-space indent).

        new_data (list): The items to add.
        filename (str): Path of the JSON file holding a list.
    """
    # Load what is already stored; a missing file simply means nothing yet
    try:
        with open(filename, "r", encoding="utf-8") as source:
            stored_items = json.load(source)
    except FileNotFoundError:
        stored_items = []

    # Key each item by its canonical serialization so duplicates collapse
    unique_items = {}
    for entry in stored_items + new_data:
        unique_items[json.dumps(entry, sort_keys=True)] = entry

    # Rewrite the file with the merged, de-duplicated list
    with open(filename, "w", encoding="utf-8") as target:
        json.dump(list(unique_items.values()), target, ensure_ascii=False, indent=4)

## 3.1 Functions

Normalize boardgame name to use in URL

In [13]:
def normalize_gamename(gamename):
    """
    Normalizes the name of a game by removing spaces and special characters, and converting everything to lowercase.
    Replaces standalone numbers with their corresponding words in English.

    Only words made entirely of digits are spelled out; digits embedded in a word are kept as they are.
    Relies on the module-level inflect engine `p`.

        Args:
            gamename (str): The original name of the game.

        Returns:
            str: The normalized game name, containing only lowercase alphanumeric characters.
    """
    # First pass keeps whitespace so the name can still be split into words
    cleaned = ''.join(ch for ch in gamename if ch.isalnum() or ch.isspace()).lower()
    words = [
        p.number_to_words(word) if word.isdigit() else word
        for word in cleaned.split()
    ]
    # Number words can contain spaces, hyphens and commas (e.g. "twenty-one"),
    # so filter again on alphanumerics rather than on whitespace only
    return ''.join(ch for ch in ''.join(words) if ch.isalnum())

Retrieve the BGA id of a game, if it exists

In [14]:
def find_id_from_gamename(driver, gamename, sleep_time=0.5, max_retries=5):
    """
    Retrieve the game ID from the game name using a Selenium WebDriver.

    Loads the BGA game panel page for the normalized game name and reads the
    content of the element with id "game_id".

    Args:
        driver (selenium.webdriver): The Selenium WebDriver instance used to navigate the web page.
        gamename (str): The name of the game for which the ID is to be found.
        sleep_time (float, optional): The time to wait before each page load attempt, in seconds. Default is 0.5 seconds.
        max_retries (int, optional): The maximum number of page load attempts (at least one is always made). Default is 5.
    Returns:
        str or None: The game ID if found. None if the page could not be loaded, the page
        shows the BGA fatal error block, or the id element could not be read. Failures are
        logged, never raised.
    """
    # Build the URL of the game panel page
    url = f"{BGA_BASE_URL}/gamepanel?game={normalize_gamename(gamename)}"

    # Bounded retry loop: a non-positive max_retries still gets exactly one attempt
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        # Pause before every attempt; this also throttles consecutive successful requests
        time.sleep(sleep_time)
        try:
            driver.get(url)
            break
        except Exception as e:
            logger.error(f"Exception occurred while retrieving {gamename}: {e}.")
            if attempt == attempts:
                logger.error(f"Error retrieving {gamename}. Retries exhausted.")
                return None

    try:
        # The fatal error block is what BGA renders when the game does not exist
        if driver.find_elements(By.CSS_SELECTOR, "div.fatalerror#bga_fatal_error"):
            logger.info(f'Game "{gamename}" not found.')
            return None
        # Read the id from the dedicated element
        id_span = driver.find_element(By.CSS_SELECTOR, '#game_id')
        return id_span.get_attribute('innerHTML')
    except Exception as e:
        logger.error(f'Element with CSS selector "#game_id" not found for game "{gamename}" - error {e}.')
        return None

Retrieve the list of tournament IDs for a given game, with optional filter parameters

In [15]:
def find_tournaments_by_id(
    game_id,
    ptime=0,
    prestige=0,
    ptype=0,
    players_per_match_min=0,
    players_per_match_max=0,
    gamecateg=3,
    status="future",
    tournament_i_registered=0,
    full="true",
    max_tournaments_num=np.inf, max_posts=5, sleep_time=0.5, max_retries=5
    ):
    '''
    Retrieve the ids of the tournaments listed on the BGA tournament list page for a game.

    Uses the module-level Selenium `driver` to load the page, then extracts the
    `id` query parameter from the links found under the element with id "tournament_list".

    Args:
        game_id (str or int): BGA id of the game, sent as the `game` query parameter.
        ptime, prestige, ptype, players_per_match_min, players_per_match_max, gamecateg,
        tournament_i_registered, full: Filter values forwarded unchanged as the `time`,
            `prestige`, `type`, `players_per_match_min`, `players_per_match_max`, `gamecateg`,
            `tournament_i_registered` and `full` query parameters. Their meaning is defined
            by the BGA site, not by this function.
        status (str): Tournament status filter sent as the `status` query parameter
            (the notebook uses "future", "progress" and "finished").
        max_tournaments_num (int or float): Maximum number of ids returned. Default is no limit.
        max_posts (int): Unused; kept for backward compatibility.
        sleep_time (float): Time to wait before each page load attempt, in seconds.
        max_retries (int): Maximum number of page load attempts (at least one is always made).

    Returns:
        list[str] or None: The tournament ids in page order, truncated to
        `max_tournaments_num`. None if the page could not be loaded or queried.
    '''
    # Build the URL for requesting the tournament list
    base_url = f"{BGA_BASE_URL}/tournamentlist?d"
    params = {
        "time": ptime,
        "prestige": prestige,
        "type": ptype,
        "players_per_match_min": players_per_match_min,
        "players_per_match_max": players_per_match_max,
        "gamecateg": gamecateg,
        "status": status,
        "game": game_id,
        "tournament_i_registered": tournament_i_registered,
        "full": full
    }
    url = base_url + "&" + "&".join(f"{key}={value}" for key, value in params.items())

    # Bounded retry loop: a non-positive max_retries still gets exactly one attempt
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        # Pause before every attempt; this also throttles consecutive successful requests
        time.sleep(sleep_time)
        try:
            driver.get(url)
            break
        except Exception as e:
            # Only names defined in this function may be used here: an undefined name
            # would replace the log entry with a NameError
            logger.error(f"Exception occurred while retrieving tournaments (game_id={game_id}): {e}.")
            if attempt == attempts:
                logger.error(f"Error retrieving tournaments (game_id={game_id}). Retries exhausted.")
                return None

    # Retrieve tournament list
    try:
        tournaments = driver.find_elements(By.XPATH, '//*[@id="tournament_list"]/a[*]')
    except Exception as e:
        logger.error(f'Elements under "#tournament_list" could not be retrieved for game_id={game_id} - error {e}.')
        return None

    # Retrieve their ids from the "id" query parameter of each link
    tournament_ids = []
    for tournament in tournaments:
        href = tournament.get_attribute("href")
        if not href:
            # A link without href carries no id; skip it instead of crashing
            continue
        tournament_ids.append(href.split("id=")[-1].split("&")[0])

    # Truncate only when the limit is actually exceeded (never true for the np.inf default)
    if len(tournament_ids) > max_tournaments_num:
        tournament_ids = tournament_ids[:int(max_tournaments_num)]
    return tournament_ids

Retrieve tournament data given its ID

In [16]:
def fetch_tournament_data(tournament_id, status, sleep_time=0.5, max_retries=5):
    '''
    Retrieve the details of a tournament from its BGA page.

    Uses the module-level Selenium `driver` to load the tournament page and reads the
    fields from fixed XPath locations, so it depends on the current page layout.

    Args:
        tournament_id (str or int): BGA id of the tournament.
        status (str): Status under which the tournament was listed; stored as-is in the result.
        sleep_time (float): Time to wait between failed attempts and after the page load, in seconds.
        max_retries (int): Maximum number of page load attempts (at least one is always made).

    Returns:
        dict or None: A dictionary with the keys "id", "name", "type", "type_description",
        "status", "num_participants", "min_participants", "max_participants", "starting_time",
        "location", "participants", "administrator" and "options". "location", "participants"
        and "administrator" are returned as empty placeholders. None if the page could not be
        loaded or any field could not be read.
    '''
    # Build the URL for requesting the tournament page
    url = f"{BGA_BASE_URL}/tournament?id={tournament_id}"

    # Bounded retry loop: a non-positive max_retries still gets exactly one attempt
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            driver.get(url)
            break
        except Exception as e:
            # Only names defined in this function may be used here: an undefined name
            # would replace the log entry with a NameError
            logger.error(f"Exception occurred while retrieving tournament {tournament_id}: {e}.")
            if attempt == attempts:
                logger.error(f"Error retrieving tournament {tournament_id}. Retries exhausted.")
                return None
            time.sleep(sleep_time)  # Delay before the next attempt

    time.sleep(sleep_time)  # Delay after the page load, before reading the page

    def _inner_html(xpath):
        # Inner HTML of the first element matching `xpath` on the loaded page
        return driver.find_element(By.XPATH, xpath).get_attribute('innerHTML')

    # Location of the option value inside an option row, for the rows that differ from the default
    option_value_xpaths = {
        "Reputation required": 'div[2]/span/div',
        "Time allotted to each player": 'div[2]/span/span',
    }

    try:
        tournament_details = {
            "id": tournament_id,
            "name": _inner_html('//*[@id="tournament-module"]/div[3]/div/div/div[1]/div[3]/div[1]/div[2]/div[1]'),
            "type": _inner_html('//*[@id="tournament-module"]/div[3]/div/div/div[3]/div[2]/div[1]'),
            "type_description": _inner_html('//*[@id="tournament-module"]/div[3]/div/div/div[3]/div[2]/div[2]'),
            "status": status,
            "num_participants": _inner_html('//*[@id="tournament-module"]/div[3]/div/div/div[1]/div[3]/div[2]/div[2]/div[2]/b'),
            # First space-separated token of the text
            "min_participants": _inner_html('//*[@id="tournament-module"]/div[3]/div/div/div[1]/div[3]/div[2]/div[2]/div[2]/div').split(" ")[0],
            # Third space-separated token of the text
            "max_participants": _inner_html('//*[@id="tournament-module"]/div[3]/div/div/div[1]/div[3]/div[2]/div[2]/div[2]/small').split(" ")[2],
            "starting_time": None,
            "location": {
                "state": None,
                "country": None,
                "city": None
            },
            "participants": [],
            "administrator": None,
            "options": {}
        }

        # Retrieve starting time: first and third space-separated tokens of the text.
        # The tokens are split beforehand because reusing the enclosing quote type inside
        # an f-string expression is a syntax error before Python 3.12.
        time_parts = _inner_html('//*[@id="tournament-module"]/div[3]/div/div/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]').split(" ")
        tournament_details["starting_time"] = f"{time_parts[0]} {time_parts[2]}"

        # Retrieve tournament options, one per row
        options = driver.find_elements(By.XPATH, '//*[@id="tournament-module"]/div[3]/div/div/div[3]/div[2]/div[*]')
        for option in options:
            key = option.find_element(By.XPATH, 'div[1]').get_attribute('innerHTML')
            value_xpath = option_value_xpaths.get(key, 'div[2]/span')
            tournament_details["options"][key] = option.find_element(By.XPATH, value_xpath).get_attribute('innerHTML')

    except Exception as e:
        logger.error(f"Exception occurred while retrieving tournament details: {e}")
        return None

    return tournament_details

## 3.2 Execution

Setup the driver

In [17]:
# Setup Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--lang=en")  # Set browser language to English

# Initialize the WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

Retrieve the game list from the CSV and drop the games already collected

In [19]:
# Load the ranked game list and keep the rows of the configured rank interval
top_games_df = pd.read_csv(TOP_GAMES_LIST_FILE)
games = top_games_df.loc[GAME_RANK_MIN-1:GAME_RANK_MAX-1, ['id', 'name']]

# Unless overwriting, skip the games already present in the output file
if not OVERWRITE:
    try:
        with open(TOURNAMENTS_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # Nothing collected yet: keep the full list
        pass
    else:
        collected_ids = [game["id"] for game in data]
        if collected_ids:
            games = games[~games['id'].isin(collected_ids)]
logger.warning(f"Games to collect: {len(games)}")



Initialize the flag that decides whether the first save overwrites the output file

In [19]:
# When False, the next save replaces TOURNAMENTS_FILE instead of merging into it.
# Starts as True unless OVERWRITE is set, so a normal run only ever merges.
saved_once = not OVERWRITE

In [None]:
def _save_batch(batch):
    """Persist a batch of games: replace the output file on the first write of an overwrite run, merge otherwise."""
    global saved_once
    if not saved_once:
        # First write of an overwrite run: replace whatever is on disk
        save_to_json(batch, TOURNAMENTS_FILE)
        saved_once = True
    else:
        # Merge with what is already on disk
        append_to_json(batch, TOURNAMENTS_FILE)


statuses = ["future", "progress", "finished"]
all_data = []   # Games collected since the last save
i = 0           # Number of games collected so far

try:
    for name, game_id in tqdm(zip(games["name"], games["id"]), desc="Fetching tournaments", total=games.shape[0]):
        # Resolve the BGA id from the game name. `game_id` (the id from the CSV) is kept
        # untouched because it is the id stored in the output and used to skip collected games.
        bga_game_id = find_id_from_gamename(driver, name, sleep_time=REQUEST_DELAY, max_retries=MAX_RETRIES)
        if bga_game_id is None:
            continue
        logger.warning(f'Fetching tournaments for "{name}" ({bga_game_id})')

        # Find tournament ids by BGA game id, in the different statuses
        tournaments_ids = []
        for status in statuses:
            tournaments_status_ids = find_tournaments_by_id(
                bga_game_id,
                status=status,
                ptime=4 if status == "future" else 0,
                max_tournaments_num=MAX_TOURNAMENTS_NUM, sleep_time=REQUEST_DELAY, max_retries=MAX_RETRIES
            )
            # None signals a failed lookup; treat it as "no tournaments" instead of crashing
            for tid in tournaments_status_ids or []:
                tournaments_ids.append({"id": tid, "status": status})
        logger.info(f'Fetching {len(tournaments_ids)} tournaments for "{name}" ({bga_game_id})')

        # Fetch the details of each tournament, skipping the ones that could not be read
        tournaments = []
        for tournament in tournaments_ids:
            tournament_data = fetch_tournament_data(tournament["id"], tournament["status"], sleep_time=REQUEST_DELAY, max_retries=MAX_RETRIES)
            if tournament_data:
                tournaments.append(tournament_data)
        logger.info(f'Fetched tournaments data for "{name}".')

        # Append the game data to the current batch
        game = {
            "id": game_id,
            "name": name,
            "tournaments": tournaments
        }
        all_data.append(game)

        i += 1
        if i % BACKUP_PERIOD == 0:
            _save_batch(all_data)
            logger.info(f"Saved {i} games into '{TOURNAMENTS_FILE}'")

            # Start a new batch
            all_data = []
finally:
    # Also runs when the loop is interrupted or fails, so the current batch is not lost
    if all_data:
        _save_batch(all_data)
        logger.info(f"Saved remaining {len(all_data)} games into '{TOURNAMENTS_FILE}'")

Fetching forums:   5%|▍         | 93/2000 [15:39<3:05:40,  5.84s/it]2025-01-18 02:25:13,471 - ERROR - Element with CSS selector "#game_id" not found for game "Revive" - error Message: no such element: Unable to locate element: {"method":"css selector","selector":"#game_id"}
  (Session info: chrome=131.0.6778.265); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x0036FD53+23747]
	(No symbol) [0x002F7D54]
	(No symbol) [0x001CBE53]
	(No symbol) [0x0020FCA6]
	(No symbol) [0x0020FEEB]
	(No symbol) [0x0024D852]
	(No symbol) [0x00231E44]
	(No symbol) [0x0024B41E]
	(No symbol) [0x00231B96]
	(No symbol) [0x00203F3C]
	(No symbol) [0x00204EBD]
	GetHandleVerifier [0x0064AC73+3017699]
	GetHandleVerifier [0x0065B93B+3086507]
	GetHandleVerifier [0x006540F2+3055714]
	GetHandleVerifier [0x00405AF0+637536]
	(No symbol) [0x00300A5D]
	(No symbol) [0x002FDA28]
	(No symbol) [0x00