# 1. Retrieve top games and reviews

> Nei dati scaricati mancano: 
> - date delle reviews
> - location della review
> - gli id degli utenti che hanno scritto le reviews

Requirements

In [16]:
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


Imports

In [17]:
import os
import time

import pandas as pd
import numpy as np

import requests
from lxml import html
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm
import logging
from bs4 import BeautifulSoup

Logging configuration

In [18]:
# Configure logging at WARNING level: the logger.info() progress messages used in this notebook
# stay hidden unless the level is lowered to logging.INFO.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger()  # Root logger, shared by every cell

Constants

In [19]:
# Directories (relative paths)
DATA_DIR = os.path.join("..", "data")  # Root data folder; holds the top-games CSV
TEMP_DIR = os.path.join(DATA_DIR, "temp")  # Holds the usernames file
RAW_DIR = os.path.join(DATA_DIR, "raw")  # Holds the downloaded games and reviews

In [None]:
# Create the directories; exist_ok=True makes this a no-op when they already exist
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(TEMP_DIR, exist_ok=True)
os.makedirs(RAW_DIR, exist_ok=True)

In [None]:
# Files
TOP_GAMES_LIST_FILE = os.path.join(DATA_DIR, "boardgames_ranks.csv")  # Input: CSV whose 'id' column lists the games to download
USERNAMES_FILE = os.path.join(TEMP_DIR, "usernames.json")  # Output: usernames of the reviewers
BOARD_GAMES_FILE = os.path.join(RAW_DIR, "boardgames&reviews.json")  # Output: games together with their reviews

In [31]:
# General download parameters
BACKUP_PERIOD = 200 # Number of processed games between two backups to disk
REQUEST_DELAY = 0.01    # Delay between requests, in seconds (passed to time.sleep)
MAX_RETRIES = 5 # Max number of retries for a request

In [None]:
OVERWRITE = False    # If True, the existing games file is replaced by the first save; if False, the run resumes and new games are merged into it

In [None]:
# Game interval definition (1-based, inclusive positions in the top-games CSV)
GAME_RANK_MIN = 1   # Rank of the first game to download
GAME_RANK_MAX = 2000    # Rank of the last game to download
GAME_NUM = GAME_RANK_MAX - GAME_RANK_MIN + 1    # Number of games to download

In [None]:
# Specific download parameters
REVIEWS_PER_GAME_LIMIT = 500    # Maximum number of reviews to download for each game (passed as review_limit to fetch_game_data)

In [32]:
# URLs
BGG_BASE_URL = "https://boardgamegeek.com/xmlapi2"  # Prefix of the "/thing?..." requests built by the fetch functions

Utility functions

In [33]:
def save_to_json(data, filename):
    """
    Serialize ``data`` as JSON and write it to ``filename``.

    The file is created or replaced, encoded as UTF-8, with non-ASCII
    characters written as-is and a 4-space indentation.

    Parameters:
    data (any): Any JSON-serializable object.
    filename (str): Path of the destination file.

    Returns:
    None
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        out_file.write(json.dumps(data, ensure_ascii=False, indent=4))

In [34]:
def append_to_json(new_data, filename):
    """
    Merge ``new_data`` into the JSON list stored in ``filename``, dropping duplicates.

    Two items are duplicates when their JSON serialization with sorted keys is identical.

    Args:
        new_data (list): Items to add to the file.
        filename (str): Path of the JSON file; a missing file is treated as an empty list.

    Returns:
        None
    """
    # Load what is already on disk.
    try:
        with open(filename, "r", encoding="utf-8") as src:
            stored_items = json.load(src)
    except FileNotFoundError:
        stored_items = []

    # Index every item by its canonical JSON text: the first occurrence fixes the
    # position in the output, the last occurrence supplies the stored value.
    unique_items = {}
    for entry in stored_items + new_data:
        unique_items[json.dumps(entry, sort_keys=True)] = entry

    # Rewrite the file with the merged content.
    with open(filename, "w", encoding="utf-8") as dst:
        json.dump(list(unique_items.values()), dst, ensure_ascii=False, indent=4)

## 1.1. Functions

Board game details extraction 

In [35]:
def parse_game_data(xml_root):
    """
    Parses the XML data of board games and extracts relevant information.

    Missing elements never raise: text fields fall back to "", numeric fields
    to 0 (also when the value is not a valid integer), the name to None and
    link lists to [].

    Args:
        xml_root (Element): The root element of the XML tree containing game data.

    Returns:
        list: A list of dictionaries, each containing information about a game.
            "rating" and "reviews" are placeholders (0.0 and []) to be filled by the caller.
    """

    def _text(item, tag):
        # Text of the first <tag> child, or "" when the element is missing or empty.
        node = item.find(tag)
        return node.text if node is not None and node.text is not None else ""

    def _value(item, tag, default=None):
        # "value" attribute of the first <tag> child, or `default` when unavailable.
        node = item.find(tag)
        return default if node is None else node.attrib.get("value", default)

    def _int_value(item, tag):
        # Integer "value" attribute of <tag>; 0 when missing or not a valid integer.
        try:
            return int(_value(item, tag, 0))
        except (TypeError, ValueError):
            return 0

    def _links(item, link_type):
        # "value" attributes of every <link> child of the given type.
        return [link.attrib.get("value") for link in item.findall(f"link[@type='{link_type}']")]

    games = []
    for item in xml_root.findall("item"):
        game = {
            "id": item.attrib.get("id"),  # Kept as the raw attribute string
            "name": _value(item, "name"),  # First <name> child
            "description": _text(item, "description"),
            "imageURL": _text(item, "image"),
            "rating": 0.0,  # Inserted after fetching reviews
            "yearReleased": _int_value(item, "yearpublished"),
            "minPlayers": _int_value(item, "minplayers"),
            "maxPlayers": _int_value(item, "maxplayers"),
            "minSuggAge": _int_value(item, "minage"),
            "minPlayTime": _int_value(item, "minplaytime"),
            "maxPlayTime": _int_value(item, "maxplaytime"),
            "designers": _links(item, "boardgamedesigner"),
            "artists": _links(item, "boardgameartist"),
            "publishers": _links(item, "boardgamepublisher"),
            "categories": _links(item, "boardgamecategory"),
            "mechanisms": _links(item, "boardgamemechanic"),
            "family": _links(item, "boardgamefamily"),
            "reviews": []  # Populated by fetch_reviews
        }
        games.append(game)
    return games


The short description is not included in the XML returned by the API, so we scrape the BGG website directly to retrieve it.

In [36]:
def fetch_short_description(game_id, sleep_time=1, max_retries=5):
    """
    Fetches the short description of a board game from its BoardGameGeek web page.

    The description is read from the page's <meta name="description"> tag.

    Args:
        game_id (int): The ID of the board game to fetch the description for.
        sleep_time (int, optional): The time to sleep before each attempt in seconds (default is 1).
        max_retries (int, optional): The maximum number of attempts for the request (default is 5).
            Values below 1 are treated as 1, so the request is always tried once.
    Returns:
        str: The short description of the board game if found.
        None: If the description is not found or if every attempt failed.
    """

    # URL of the game's web page (not an API endpoint).
    url = f"https://boardgamegeek.com/boardgame/{game_id}"

    # Define headers for the request to simulate a real browser and avoid being blocked.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Without a timeout a single stalled connection would block the whole download.
    request_timeout = 30  # seconds

    # A bounded loop guarantees termination even when max_retries is 0 or negative.
    attempts = max(1, max_retries)
    last_error = None
    for _ in range(attempts):
        time.sleep(sleep_time)  # Add a delay between attempts to avoid overloading the server.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as e:
            # Network error or timeout: log it and try again.
            logger.error(f"Error during retrieving short description for ID {game_id}: {e}")
            last_error = e
            continue
        if response.status_code == 200:
            break
        last_error = response.status_code
    else:
        # Every attempt failed.
        logger.error(f"Error during retrieving short description for ID {game_id}: {last_error}. No retries left.")
        return None

    # Parse the HTML content and look for the description meta tag.
    soup = BeautifulSoup(response.content, "html.parser")
    meta_tag = soup.find('meta', {'name': 'description'})
    if meta_tag and 'content' in meta_tag.attrs:
        return meta_tag['content']

    logger.warning(f"Short description not found for game ID {game_id}.")
    return None


Review extraction from a given board game review XML page

In [37]:
def fetch_reviews(game_id, page=1, sleep_time=1, max_retries=5):
    """
    Fetches one page of reviews (up to 100 comments) for a specific game from BoardGameGeek.

    Args:
        game_id (int): The ID of the game to fetch reviews for.
        page (int, optional): The page number of reviews to fetch. Defaults to 1.
        sleep_time (int, optional): The time to wait before each attempt in seconds. Defaults to 1.
        max_retries (int, optional): The maximum number of attempts in case of failure. Defaults to 5.
            Values below 1 are treated as 1, so the request is always tried once.

    Returns:
        tuple: A tuple containing two elements:
            - reviews (list of dict): A list of dictionaries, each containing 'user', 'rating', and 'comment' keys.
              'rating' is a float, or the string "N/A" when the comment carries no usable rating.
            - usernames (list of str): The unique usernames who reviewed the game, in order of first appearance.
        Both lists are empty when every attempt failed or the response is not valid XML.
    """

    # Construct the API endpoint URL for fetching reviews.
    url = f"{BGG_BASE_URL}/thing?id={game_id}&comments=1&page={page}&pagesize=100"

    # Define headers for the request to simulate a real browser and avoid being blocked.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Without a timeout a single stalled connection would block the whole download.
    request_timeout = 30  # seconds

    # A bounded loop guarantees termination even when max_retries is 0 or negative.
    attempts = max(1, max_retries)
    last_error = None
    for _ in range(attempts):
        time.sleep(sleep_time)  # Add a delay between attempts to avoid overloading the server.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as e:
            # Network error or timeout: log it and try again.
            logger.error(f"Error during retrieving reviews for ID {game_id}: {e}")
            last_error = e
            continue
        if response.status_code == 200:
            break
        last_error = response.status_code
    else:
        # Every attempt failed.
        logger.error(f"Error during retrieving reviews for ID {game_id}: {last_error}. No retries left.")
        return [], []

    # Parse the XML response content; a malformed body is reported as "no reviews".
    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        logger.error(f"Invalid XML while retrieving reviews for ID {game_id} (page {page}): {e}")
        return [], []

    # Extract individual comments from the XML.
    reviews = []
    for comment in root.findall("item/comments/comment"):
        raw_rating = comment.attrib.get("rating", "N/A")
        try:
            rating = "N/A" if raw_rating == "N/A" else float(raw_rating)
        except ValueError:
            rating = "N/A"  # Unparseable rating: treat it as missing.

        reviews.append({
            "user": comment.attrib.get("username"),
            "rating": rating,
            "comment": comment.attrib.get("value", "").strip()
        })

    # dict.fromkeys removes duplicates while keeping a deterministic order.
    usernames = list(dict.fromkeys(review["user"] for review in reviews))
    return reviews, usernames


Aggregate previous functions to retrieve all data for a given board game

In [38]:
def fetch_game_data(game_id, review_limit=np.inf, sleep_time=0.5, max_retries=5):
    """
    Fetches game data and includes reviews from BoardGameGeek.

    The game details come from the XML API, the short description from the
    game's web page and the reviews from the paginated comments endpoint.
    The game's "rating" is the mean of the numeric ratings among the kept reviews.

    Parameters:
    game_id (int): The ID of the game to fetch data for.
    review_limit (int, optional): The maximum number of reviews to keep. Defaults to np.inf.
    sleep_time (float, optional): The time to wait before each details request in seconds. Defaults to 0.5.
    max_retries (int, optional): The maximum number of attempts for each request. Defaults to 5.
        Values below 1 are treated as 1. The same limit is forwarded to the
        short-description and reviews requests.

    Returns:
    tuple: A tuple containing:
        - game_data (list): A list with the game data dictionary, or None when the
          request failed, the XML is invalid or it contains no game.
        - usernames (list): A list of unique usernames who reviewed the game.
    """

    url = f"{BGG_BASE_URL}/thing?id={game_id}&stats=1"

    # Without a timeout a single stalled connection would block the whole download.
    request_timeout = 30  # seconds

    # A bounded loop guarantees termination even when max_retries is 0 or negative.
    attempts = max(1, max_retries)
    last_error = None
    for _ in range(attempts):
        time.sleep(sleep_time)  # Delay
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.RequestException as e:
            logger.error(f"Error for ID {game_id}: {e}")
            last_error = e
            continue
        if response.status_code == 200:
            break
        last_error = response.status_code
    else:
        logger.error(f"Error for ID {game_id}: {last_error}. No retries left.")
        return None, []

    # Fetch game data
    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        logger.error(f"Invalid XML for ID {game_id}: {e}")
        return None, []
    game_data = parse_game_data(root)
    if not game_data:
        # The response contains no <item> element, so there is nothing to enrich.
        logger.warning(f"No game data found for ID {game_id}.")
        return None, []
    game = game_data[0]

    # Fetch short description
    game["short_description"] = fetch_short_description(game_id, max_retries=max_retries) or ""

    # Fetch reviews page by page until the limit is reached or a page comes back empty
    time.sleep(sleep_time)
    reviews = []
    usernames = set()
    rating_sum = 0.0
    n_rated = 0
    page = 1
    while len(reviews) < review_limit:
        new_reviews, _ = fetch_reviews(game_id, page, max_retries=max_retries)
        if not new_reviews:
            break
        # Keep only what fits under review_limit (a page may hold more than the remainder).
        remaining = review_limit - len(reviews)
        if len(new_reviews) > remaining:
            new_reviews = new_reviews[:int(remaining)]
        reviews.extend(new_reviews)
        usernames.update(r["user"] for r in new_reviews)
        rated = [r["rating"] for r in new_reviews if r["rating"] != "N/A"]
        rating_sum += sum(rated)
        n_rated += len(rated)
        page += 1
        time.sleep(REQUEST_DELAY)
    game["reviews"] = reviews

    # Add rating
    game["rating"] = rating_sum / n_rated if n_rated > 0 else 0.0
    logger.info(f"Game ID {game_id}: {game['name']} - {len(reviews)} reviews")
    return game_data, list(usernames)

## 1.2 Execution

Retrieve top games list from the csv

In [39]:
# Top games data
top_games_df = pd.read_csv(TOP_GAMES_LIST_FILE)
# Assumes the CSV rows are ordered by rank, so positions map to ranks — TODO confirm
game_ids = top_games_df['id'].tolist()[GAME_RANK_MIN-1:GAME_RANK_MAX]

# Remove games already collected
if not OVERWRITE:
    try:
        with open(BOARD_GAMES_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        pass  # Nothing collected yet: keep the full list
    else:
        # IDs are stored as strings in the JSON file (they come from XML attributes),
        # so both sides are compared as strings.
        collected_ids = [str(game["id"]) for game in data]
        collected_id_set = set(collected_ids)  # Constant-time membership test
        game_ids = [game_id for game_id in game_ids if str(game_id) not in collected_id_set]
logger.info(f"Games to collect: {len(game_ids)}")

Download the XML page for each game and extract the data, saving it periodically

In [40]:
# Tracks whether the games file has been written during this run.
# It starts as True when OVERWRITE is False, so the first save appends to the file instead of replacing it.
saved_once = not OVERWRITE

In [41]:
# Download data
all_games = []  # Games fetched since the last backup
all_users = set()  # Usernames of every reviewer seen during this run


def _flush_games(games):
    """Write `games` to BOARD_GAMES_FILE: replace the file on the first save of an OVERWRITE run, merge otherwise."""
    global saved_once
    if not saved_once:
        save_to_json(games, BOARD_GAMES_FILE)
        saved_once = True
    else:
        append_to_json(games, BOARD_GAMES_FILE)


logger.info("BoardGames data scraping...")
for i, game_id in enumerate(tqdm(game_ids, desc="Boardgames download"), start=1):
    if logger.isEnabledFor(logging.INFO):
        print()  # Keep log lines off the progress-bar line
    game_data, users = fetch_game_data(game_id, review_limit=REVIEWS_PER_GAME_LIMIT, max_retries=MAX_RETRIES)
    if game_data:
        all_games.extend(game_data)
        all_users.update(users)

    # Periodic backup, so an interrupted run loses at most BACKUP_PERIOD games
    if i % BACKUP_PERIOD == 0:
        _flush_games(all_games)
        if logger.isEnabledFor(logging.INFO):
            print()
        logger.info(f"Saved {i} games into '{BOARD_GAMES_FILE}'")

        # Reset variable
        all_games = []

    time.sleep(REQUEST_DELAY)  # Delay

# Save any remaining games
if logger.isEnabledFor(logging.INFO):
    print()
if all_games:
    _flush_games(all_games)
    logger.info(f"Saved remaining {len(all_games)} games into '{BOARD_GAMES_FILE}'")

# Save usernames: a resumed run (OVERWRITE False) merges them into the existing file
# so that the users collected by previous runs are not lost.
if OVERWRITE:
    save_to_json(list(all_users), USERNAMES_FILE)
else:
    append_to_json(list(all_users), USERNAMES_FILE)
logger.info(f"Saved {len(all_users)} users into '{USERNAMES_FILE}'")

logger.info(f"BoardGames data scraping completed. Data saved into '{BOARD_GAMES_FILE}'.")

Boardgames download: 100%|██████████| 1/1 [00:12<00:00, 12.61s/it]
