# 4. Retrieve users

Requirements

In [1]:
# Install the dependencies listed in ../requirements.txt into the kernel's environment (-q: quiet output).
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


Imports

In [2]:
import os
import time

import pandas as pd

import requests
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm
import logging
from bs4 import BeautifulSoup

Logging configuration

In [3]:
# Configure the root logger. With level WARNING, the logger.info(...) progress
# messages emitted by the cells below are suppressed; lower the level to
# logging.INFO to see them.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger()

Constants

In [4]:
# Directory layout (relative paths, resolved against the current working directory)
# Root data directory; also holds the game ranking CSV (TOP_GAMES_LIST_FILE)
DATA_DIR = os.path.join("..", "data")
# Holds the input list of usernames (USERNAMES_FILE)
TEMP_DIR = os.path.join(DATA_DIR, "temp")
# Holds the downloaded user records (USERS_FILE)
RAW_DIR = os.path.join(DATA_DIR, "raw")

In [None]:
# Make sure every data directory exists (already existing ones are left untouched)
for _data_dir in (DATA_DIR, TEMP_DIR, RAW_DIR):
    os.makedirs(_data_dir, exist_ok=True)

In [None]:
# Files
# Input: CSV of ranked games; its 'id' column is read to build the set of games to keep
TOP_GAMES_LIST_FILE = os.path.join(DATA_DIR, "boardgames_ranks.csv")
# Input: JSON collection of usernames to download
USERNAMES_FILE = os.path.join(TEMP_DIR, "usernames.json")
# Output: JSON list of user records (profile fields plus a "collection" list)
USERS_FILE = os.path.join(RAW_DIR, "users.json")

In [None]:
# General download parameters
# Number of successfully fetched users between two writes to USERS_FILE
BACKUP_PERIOD = 200

# Delay in seconds (passed to time.sleep) before each user-profile request
REQUEST_DELAY = 0.01
# Base delay in seconds before a collection request; doubled after every failed attempt
COLLECTION_REQUEST_DELAY = 0.1

# Max number of attempts for a user-profile request
MAX_RETRIES = 5
# Max number of attempts for a collection request
MAX_COLLECTION_RETRIES = 7

In [None]:
# If True, USERS_FILE is replaced on the first write; if False, users already
# present in USERS_FILE are skipped and new records are appended to it
OVERWRITE = False
# If True, every remaining username is downloaded and USER_NUM is ignored
ALL_USERS = False
# Target number of users; users already stored in USERS_FILE count towards it
USER_NUM = 5000

In [5]:
# Game interval definition (1-based ranks, both bounds inclusive).
# Used below to select rows GAME_RANK_MIN-1 .. GAME_RANK_MAX-1 of the ranking CSV.
GAME_RANK_MIN = 1   # Rank of the first game to keep
GAME_RANK_MAX = 2000    # Rank of the last game to keep
GAME_NUM = GAME_RANK_MAX - GAME_RANK_MIN + 1    # Size of the interval (not referenced in the cells visible here)

In [6]:
# URLs
# Base URL of the BoardGameGeek XML API; the "/user" and "/collection" endpoint
# paths are appended to it by the fetch functions.
BGG_BASE_URL = "https://boardgamegeek.com/xmlapi2"

Utility functions

In [7]:
def save_to_json(data, filename):
    """
    Save data to a JSON file, replacing any previous content.

    The data is serialised in memory before the file is opened, so a
    serialisation failure raises without truncating an existing file.

    Parameters:
    data (any): The data to be saved to the JSON file. This can be any data type that is serializable to JSON.
    filename (str): The name of the file where the data will be saved.

    Returns:
    None

    Raises:
    TypeError, ValueError: If data cannot be serialised to JSON.
    OSError: If the file cannot be opened or written.
    """
    # Serialise first: open(..., "w") truncates the target immediately, so
    # dumping straight into the file would destroy the previous content
    # whenever the data turns out not to be serialisable.
    payload = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(payload)

In [8]:
def append_to_json(new_data, filename):
    """
    Add new items to the JSON list stored in a file, without duplicates.

    Two items are duplicates when their JSON serialisation with sorted keys
    is identical. The file is created if it does not exist yet.

    Args:
        new_data (iterable): The new items to add (list, tuple, generator, ...).
        filename (str): The name of the JSON file.

    Returns:
        None

    Raises:
        json.JSONDecodeError: If the existing file does not contain valid JSON.
        TypeError: If the existing file does not contain a JSON list, or an
            item cannot be serialised to JSON.
        OSError: If the file cannot be read or written.
    """
    # Read existing data from the file.
    try:
        with open(filename, "r", encoding="utf-8") as f:
            existing_data = json.load(f)
    except FileNotFoundError:
        existing_data = []

    # Combine the existing data with the new data and remove duplicates.
    # The dict keeps the position of the first occurrence of each item.
    combined_data = {
        json.dumps(item, sort_keys=True): item
        for item in existing_data + list(new_data)
    }
    payload = json.dumps(list(combined_data.values()), ensure_ascii=False, indent=4)

    # Write to a sibling temporary file and swap it in with os.replace, so an
    # interruption while writing never leaves the accumulated data truncated.
    tmp_filename = f"{filename}.tmp"
    with open(tmp_filename, "w", encoding="utf-8") as f:
        f.write(payload)
    os.replace(tmp_filename, filename)

## 4.1 Functions

Retrieve general user information

In [9]:
def fetch_general_user_data(username, sleep_time=REQUEST_DELAY, max_retries=MAX_RETRIES, timeout=30):
    """
    Retrieve general user data from BoardGameGeek.

    Args:
        username (str): The username of the user.
        sleep_time (float): The time to sleep (in seconds) before each request.
        max_retries (int): The maximum number of attempts. Values below 1 are
            treated as 1, so the function always terminates.
        timeout (float): Seconds to wait for the server before giving up on a request.

    Returns:
        dict: The general user data on success, with keys "id", "name",
            "firstname", "lastname", "avatarlink", "yearregistered",
            "lastlogin", "stateorprovince", "country" and an empty "buddies" list.
        list: An empty list on failure (request exception, retries exhausted
            or unparsable response). Callers should test the result with
            ``if not user`` rather than ``is None``.
    """
    url = f"{BGG_BASE_URL}/user"
    # Let requests build the query string so that spaces and reserved
    # characters in the username are percent-encoded correctly.
    params = {"name": username, "buddies": 1}

    status_code = None
    # Retry loop for fetching the user profile; bounded even if max_retries <= 0
    for _ in range(max(1, int(max_retries))):
        # Pause briefly before every attempt
        time.sleep(sleep_time)
        try:
            response = requests.get(url, params=params, timeout=timeout)
        except Exception as e:
            # A failing request (connection error, timeout, ...) is not retried
            logger.error(f"Error fetching user {username}: {e}.")
            return []
        status_code = response.status_code
        if status_code == 200:
            break
    else:
        # The loop ended without a 200 response
        logger.error(f"Error fetching user {username}: {status_code}. Retries exhausted.")
        return []

    try:
        root = ET.fromstring(response.text)
    except ET.ParseError as e:
        logger.error(f"Error fetching user {username}: {e}.")
        return []

    def _value(tag):
        """Return the 'value' attribute of the child element `tag`, or "" if absent."""
        node = root.find(tag)
        return "" if node is None else node.attrib.get("value", "")

    # Extract user details
    return {
        "id": root.attrib.get("id"),
        "name": root.attrib.get("name"),
        "firstname": _value("firstname"),
        "lastname": _value("lastname"),
        "avatarlink": _value("avatarlink"),
        "yearregistered": _value("yearregistered"),
        "lastlogin": _value("lastlogin"),
        "stateorprovince": _value("stateorprovince"),
        "country": _value("country"),
        "buddies": []  # Buddies will be generated from scratch
    }

Retrieve user's collection, restricted to the already downloaded board games

In [10]:
def fetch_user_collection(username, games_set, sleep_time=REQUEST_DELAY, max_retries=MAX_RETRIES, timeout=30):
    """
    Retrieve the user collection from BoardGameGeek.

    Only board games (expansions excluded) whose id is in `games_set` are kept.

    Args:
        username (str): The username of the user.
        games_set (set): The set of game ids (integers) to filter the collection.
        sleep_time (float): Base delay in seconds before a request; it is
            doubled after every unsuccessful attempt (exponential backoff).
        max_retries (int): The maximum number of attempts. Values below 1 are
            treated as 1, so the function always terminates.
        timeout (float): Seconds to wait for the server before giving up on a request.

    Returns:
        list: One dict per kept game with keys "id", "name", "year_published",
            "image" and "adding_time" (missing fields are None). An empty list
            is returned both for an empty collection and on failure.
    """
    url = f"{BGG_BASE_URL}/collection"
    # Let requests build the query string so that spaces and reserved
    # characters in the username are percent-encoded correctly.
    params = {
        "username": username,
        "subtype": "boardgame",
        "excludesubtype": "boardgameexpansion",
    }

    status_code = None
    # Retry loop for fetching collection data. Any non-200 answer is retried;
    # the recorded run output of this notebook shows 202 and 429 responses
    # (per the BGG API docs, 202 means the collection export is still queued).
    for attempt in range(max(1, int(max_retries))):
        # Pause before every attempt, with exponential backoff
        time.sleep(sleep_time * (2 ** attempt))
        try:
            response = requests.get(url, params=params, timeout=timeout)
        except Exception as e:
            # A failing request (connection error, timeout, ...) is not retried
            logger.error(f"Error fetching {username}'s collection: {e}.")
            return []
        status_code = response.status_code
        if status_code == 200:
            break
    else:
        # The loop ended without a 200 response
        logger.error(f"Error fetching {username}'s collection: {status_code}. Retries exhausted.")
        return []

    # Parse the XML content
    try:
        root = ET.fromstring(response.text)
    except ET.ParseError as e:
        logger.error(f"Error fetching {username}'s collection: {e}.")
        return []

    def _text(item, tag):
        """Return the text of the child element `tag` of `item`, or None if absent."""
        node = item.find(tag)
        return None if node is None else node.text

    games = []
    # Iterate through each item (game) in the collection
    for item in root.findall("item"):
        object_id = item.attrib.get("objectid")
        try:
            wanted = int(object_id) in games_set
        except (TypeError, ValueError):
            # Item without a usable numeric id: cannot be matched, skip it
            continue
        if not wanted:
            continue

        status = item.find("status")
        games.append({
            "id": object_id,
            "name": _text(item, "name"),
            "year_published": _text(item, "yearpublished"),
            "image": _text(item, "image"),
            "adding_time": status.attrib.get("lastmodified") if status is not None else None,
        })

    return games

## 4.2 Execution

Prepare the list of users to retrieve

In [11]:
# Load the list of usernames previously collected
with open(USERNAMES_FILE, "r", encoding="utf-8") as f:
    usernames = list(json.load(f))
logger.info(f"Total number of users collected: {len(usernames)}")

# Users already stored in USERS_FILE, keyed by name. Initialised up front so
# the name is defined (and always a dict) on every path, including
# OVERWRITE=True and a first run where USERS_FILE does not exist yet.
users = {}

# Remove the users already collected
if not OVERWRITE:
    try:
        with open(USERS_FILE, "r", encoding="utf-8") as f:
            users = {user["name"]: user for user in json.load(f)}
    except FileNotFoundError:
        pass
    usernames = [username for username in usernames if username not in users]
logger.info(f"Total number of users not yet collected: {len(usernames)}")

# Remove duplicates first (keeping first-seen order), then limit the number of
# users, so that duplicates do not eat into the requested quota
usernames = list(dict.fromkeys(usernames))
if not ALL_USERS:
    # Clamp at 0: a negative slice bound would select almost the whole list
    # when more users are already stored than USER_NUM
    usernames = usernames[:max(0, USER_NUM - len(users))]
usernames = set(usernames)
logger.info(f"Total number of users to collect: {len(usernames)}")

# Retrieve Top games ids
top_games_df = pd.read_csv(TOP_GAMES_LIST_FILE)
# .loc slices by label and includes both bounds.
# NOTE(review): assumes the CSV rows are ordered by rank (row 0 = rank 1) - confirm.
games = set(top_games_df.loc[GAME_RANK_MIN-1:GAME_RANK_MAX-1, 'id'])

Retrieve users' information

In [12]:
# Write-mode flag for USERS_FILE: while False, the next write replaces the file
# (save_to_json); once True, every write appends to it (append_to_json).
# With OVERWRITE=False it starts as True, so existing records are never discarded.
saved_once = not OVERWRITE

In [13]:
all_data = []   # Records fetched since the last write to USERS_FILE
i = 0           # Number of users successfully fetched so far

for username in tqdm(usernames):
    if logger.isEnabledFor(logging.INFO):
        # Presumably starts a fresh line so INFO messages do not share a line with the progress bar
        print()

    # Retrieve general user's profile data
    user = fetch_general_user_data(username, sleep_time=REQUEST_DELAY, max_retries=MAX_RETRIES)
    # Failure is signalled with an empty list (not None), so test for any
    # falsy result; otherwise user['id'] below raises a TypeError.
    if not user:
        continue
    logger.info(f"User {username} ({user['id']}) retrieved. Retrieving user's collection...")

    # Retrieve user's collection
    # NOTE(review): a failed download also yields an empty list, so such users
    # are stored with an empty collection and skipped on later runs.
    collection = fetch_user_collection(username, games_set=games, sleep_time=COLLECTION_REQUEST_DELAY, max_retries=MAX_COLLECTION_RETRIES)
    logger.info(f"User {username} ({user['id']}) collection retrieved.")

    # Append user's collection to the general user's profile data
    user["collection"] = collection

    # Append user's data to the list of all data
    all_data.append(user)

    i += 1
    if i % BACKUP_PERIOD == 0:
        if not saved_once:
            # First write of an overwriting run: replace the file
            save_to_json(all_data, USERS_FILE)
            saved_once = True
        else:
            # Append the data to the JSON file
            append_to_json(all_data, USERS_FILE)
        logger.info(f"Saved {i} user(s) into '{USERS_FILE}'")

        # Reset the data list
        all_data = []

# Save the records left over after the last periodic backup
if all_data:
    if not saved_once:
        # First write of an overwriting run: replace the file
        save_to_json(all_data, USERS_FILE)
        # Keep the flag in sync so re-running this cell appends instead of overwriting
        saved_once = True
    else:
        # Append the data to the JSON file
        append_to_json(all_data, USERS_FILE)
    logger.info(f"Saved {i} user(s) into '{USERS_FILE}'")

 16%|█▌        | 531/3400 [1:54:36<9:56:14, 12.47s/it] 2025-01-17 01:40:21,306 - ERROR - Error fetching TheLimestoneCowboy's collection: 429. Retries exhausted.
 16%|█▌        | 532/3400 [1:54:55<11:33:49, 14.52s/it]2025-01-17 01:40:40,430 - ERROR - Error fetching Bizud's collection: 202. Retries exhausted.
 16%|█▌        | 534/3400 [1:55:16<9:24:07, 11.81s/it] 2025-01-17 01:41:01,576 - ERROR - Error fetching Now Thats A FancyDan's collection: 429. Retries exhausted.
 32%|███▏      | 1074/3400 [3:54:33<7:59:24, 12.37s/it]2025-01-17 03:40:18,535 - ERROR - Error fetching Perimones's collection: 429. Retries exhausted.
 32%|███▏      | 1075/3400 [3:54:52<9:15:09, 14.33s/it]2025-01-17 03:40:37,422 - ERROR - Error fetching MeepleDimples's collection: 202. Retries exhausted.
 32%|███▏      | 1076/3400 [3:55:11<10:07:54, 15.69s/it]2025-01-17 03:40:56,230 - ERROR - Error fetching akomen999's collection: 202. Retries exhausted.
 32%|███▏      | 1077/3400 [3:55:30<10:43:48, 16.63s/it]2025-01-17 