# 2. Retrieve threads

> Note:
> - I dati sono sufficienti ma occorre ristrutturarli, in quanto diversi da come li vorremmo
> - Le reply non sono dirette: il subject delle reply è "Re: [subject originale]" e il body è il testo della reply

Requirements

In [33]:
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


Imports

In [20]:
import os
import time

import pandas as pd

import requests
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm
import logging
from bs4 import BeautifulSoup

Logging configuration

In [21]:
# Configure logging once for the whole notebook. With the level set to WARNING,
# the logger.info(...) progress messages emitted by the cells below are suppressed;
# lower the level to logging.INFO to see them.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
# Root logger (no name passed), shared by every function in this notebook.
logger = logging.getLogger()

Constants

In [None]:
# Directory layout (relative paths): data/ sits one level above this notebook's
# folder and contains the temp/ and raw/ sub-directories.
DATA_DIR = os.path.join(os.pardir, "data")
TEMP_DIR, RAW_DIR = (os.path.join(DATA_DIR, leaf) for leaf in ("temp", "raw"))

In [None]:
# Create the data directories up front; exist_ok=True makes this safe to re-run.
for _directory in (DATA_DIR, TEMP_DIR, RAW_DIR):
    os.makedirs(_directory, exist_ok=True)

In [None]:
# Files
# CSV with the ranked game list; read with pandas in the execution section.
TOP_GAMES_LIST_FILE = os.path.join(DATA_DIR, "boardgames_ranks.csv")
# JSON list of usernames; loaded before the download and rewritten at the end.
USERNAMES_FILE = os.path.join(TEMP_DIR, "usernames.json")
# JSON output: one entry per game, with its forums, threads and messages.
FORUMS_FILE = os.path.join(RAW_DIR, "forums.json")

In [None]:
# General download parameters
# Collected games are flushed to FORUMS_FILE every BACKUP_PERIOD games.
BACKUP_PERIOD = 25
# Seconds slept before every HTTP request attempt (passed as sleep_time).
# NOTE(review): 0.01 s is a very short pause for a public API - confirm it respects BGG's rate limits.
REQUEST_DELAY = 0.01
# Number of attempts allowed per request (passed as max_retries).
MAX_RETRIES = 5

In [None]:
# If True, every game in the rank interval is downloaded again and the first write
# replaces FORUMS_FILE. If False, games already present in FORUMS_FILE are skipped
# and new results are appended to it.
OVERWRITE = False

In [None]:
# Game interval definition
# Both bounds are inclusive and 1-based; they are used to select rows
# GAME_RANK_MIN-1 .. GAME_RANK_MAX-1 of the ranking CSV.
# NOTE(review): this assumes the CSV rows are ordered by rank - confirm.
GAME_RANK_MIN = 1   # Rank of the first game to download
GAME_RANK_MAX = 2000    # Rank of the last game to download
GAME_NUM = GAME_RANK_MAX - GAME_RANK_MIN + 1    # Number of games in the interval

In [23]:
# Specific download parameters
MAX_FORUMS_PER_GAME = 10    # Passed as max_forums: how many forums of a game are considered
MIN_THREADS_PER_FORUM = 8   # Passed as min_threads to fetch_forum_list
MAX_THREADS_PER_FORUM = 20  # Passed as max_threads: how many threads of a forum are considered
# A thread is kept only when its article count is strictly between these two values.
MIN_ARTICLES_PER_THREAD = 10
# Also passed as max_posts: the cap on messages downloaded per thread.
MAX_ARTICLES_PER_THREAD = 70

In [24]:
# URLs
# Base of the BoardGameGeek XML API 2; the "forumlist", "forum" and "thread"
# endpoints used by the fetch functions are appended to it.
BGG_BASE_URL = "https://boardgamegeek.com/xmlapi2"

Utility functions

In [25]:
def save_to_json(data, filename):
    """
    Serialize ``data`` as JSON and write it to ``filename``, replacing any previous content.

    The file is written as UTF-8 with non-ASCII characters kept as-is and a
    4-space indentation.

    Parameters:
    data (any): Any JSON-serializable object.
    filename (str): Path of the file to write.

    Returns:
    None
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, **dump_options)

In [2]:
def append_to_json(new_data, filename):
    """
    Merge ``new_data`` into the JSON list stored in ``filename``, dropping duplicates.

    Two items are considered duplicates when their JSON serialization (with
    sorted keys) is identical. A missing file is treated as an empty list.
    The whole file is rewritten with the merged list.

    Parameters:
    new_data (list): The items to add.
    filename (str): Path of the JSON file holding a list.

    Returns:
    None
    """
    # Load what is already stored, if anything.
    try:
        with open(filename, "r", encoding="utf-8") as source:
            stored_items = json.load(source)
    except FileNotFoundError:
        stored_items = []

    # Index every item by its canonical JSON form: a repeated item keeps the
    # position of its first occurrence.
    unique_items = {}
    for entry in stored_items + new_data:
        unique_items[json.dumps(entry, sort_keys=True)] = entry
    merged_items = list(unique_items.values())

    # Rewrite the file with the merged list.
    with open(filename, "w", encoding="utf-8") as target:
        json.dump(merged_items, target, ensure_ascii=False, indent=4)

## 2.1 Functions

Fetches the list of forums associated with a game ID

In [27]:
def fetch_forum_list(game_id, max_forums=100, min_threads=5, sleep_time=0.5, max_retries=5, timeout=30):
    """
        Fetches the list of forums associated with a game.

        Only the first ``max_forums`` forums of the response are examined, and
        among those only the ones with more than ``min_threads`` threads are kept,
        so fewer than ``max_forums`` forums may be returned.

        Args:
            game_id (int): The ID of the game for which to fetch the forums.
            max_forums (int, optional): The maximum number of forums to examine. Defaults to 100.
            min_threads (int, optional): A forum is kept only if it has strictly more
                threads than this value. Defaults to 5.
            sleep_time (float, optional): Seconds to sleep before each request attempt. Defaults to 0.5.
            max_retries (int, optional): The maximum number of request attempts. Values
                below 1 are treated as a single attempt. Defaults to 5.
            timeout (float, optional): Seconds to wait for the server before giving up
                on an attempt. Defaults to 30.

        Returns:
            list: A list of dictionaries, each containing information about a forum:
                - id (str): The ID of the forum.
                - title (str): The title of the forum.
                - num_threads (int): The number of threads in the forum.
                - num_posts (int): The number of posts in the forum.
                - last_post_date (str): The date of the last post in the forum.
            An empty list is returned when the request or the XML parsing fails.
    """
    # Build the URL for requesting the forum list
    url = f"{BGG_BASE_URL}/forumlist?id={game_id}&type=thing"
    attempts_left = max_retries

    # Retry loop: leave it only with a 200 response or by returning []
    while True:
        # Pause briefly before every attempt
        time.sleep(sleep_time)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # Network-level failure: give up on this game
            logger.error(f"Error fetching forums for game {game_id}: {e}.")
            return []
        if response.status_code == 200:
            break
        attempts_left -= 1
        # "<=" also terminates when the caller passes max_retries <= 0
        if attempts_left <= 0:
            logger.error(f"Error fetching forums for game {game_id}: {response.status_code}. Retries exhausted.")
            return []

    # Parse the XML response
    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        logger.error(f"Error parsing forums for game {game_id}: {e}.")
        return []

    forums = []
    # Iterate over each forum in the XML, respecting the max_forums limit
    for forum in root.findall("forum")[:max_forums]:
        raw_num_threads = forum.attrib.get("numthreads")
        if raw_num_threads is None:
            continue
        num_threads = int(raw_num_threads)
        if num_threads <= min_threads:
            continue
        # NOTE(review): the forumlist response appears to expose the post count as
        # "numposts"; "posts" is kept as a fallback - confirm against the API docs.
        raw_num_posts = forum.attrib.get("numposts", forum.attrib.get("posts", 0))
        forums.append({
            "id": forum.attrib.get("id"),
            "title": forum.attrib.get("title"),
            "num_threads": num_threads,
            "num_posts": int(raw_num_posts),
            "last_post_date": forum.attrib.get("lastpostdate")
        })
    # Return the collected forum data
    return forums

Fetches the list of threads associated with a forum ID

In [28]:
def fetch_threads_from_forum(forum_id, max_threads=5, min_articles=10, max_articles=100, sleep_time=0.5, max_retries=5, timeout=30):
    """
        Fetches threads from a specific forum, limiting the number of threads.

        Only the first ``max_threads`` threads of the response are examined, and
        among those only the ones whose article count is strictly between
        ``min_articles`` and ``max_articles`` are kept.

        Args:
            forum_id (int): The ID of the forum to fetch threads from.
            max_threads (int, optional): The maximum number of threads to examine. Defaults to 5.
            min_articles (int, optional): Exclusive lower bound on a thread's article count. Defaults to 10.
            max_articles (int, optional): Exclusive upper bound on a thread's article count. Defaults to 100.
            sleep_time (float, optional): Seconds to sleep before each request attempt. Defaults to 0.5.
            max_retries (int, optional): The maximum number of request attempts. Values
                below 1 are treated as a single attempt. Defaults to 5.
            timeout (float, optional): Seconds to wait for the server before giving up
                on an attempt. Defaults to 30.

        Returns:
            list: A list of dictionaries, each containing details of a thread. Each dictionary has the following keys:
                - thread_id (str): The ID of the thread.
                - author (str): The author of the thread.
                - subject (str): The subject of the thread.
                - num_articles (int): The number of articles in the thread.
                - post_date (str): The post date of the thread.
                - last_post_date (str): The last post date of the thread.
            An empty list is returned when the request or the XML parsing fails.
    """
    # Build the URL for requesting the forum's threads
    url = f"{BGG_BASE_URL}/forum?id={forum_id}"
    attempts_left = max_retries

    # Retry loop: leave it only with a 200 response or by returning []
    while True:
        # Pause briefly before every attempt
        time.sleep(sleep_time)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # Network-level failure: give up on this forum
            logger.error(f"Error fetching threads for forum {forum_id}: {e}.")
            return []
        if response.status_code == 200:
            break
        attempts_left -= 1
        # "<=" also terminates when the caller passes max_retries <= 0
        if attempts_left <= 0:
            logger.error(f"Error fetching threads for forum {forum_id}: {response.status_code}. Retries exhausted.")
            return []

    # Parse the XML response
    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        logger.error(f"Error parsing threads for forum {forum_id}: {e}.")
        return []

    threads = []
    # Iterate over each thread in the XML, respecting the max_threads limit
    for thread in root.findall("threads/thread")[:max_threads]:
        raw_num_articles = thread.attrib.get("numarticles")
        if raw_num_articles is None:
            continue
        num_articles = int(raw_num_articles)
        # Keep only threads whose size is strictly inside the requested range
        if not (min_articles < num_articles < max_articles):
            continue
        threads.append({
            "thread_id": thread.attrib.get("id"),
            "author": thread.attrib.get("author"),
            "subject": thread.attrib.get("subject"),
            "num_articles": num_articles,
            "post_date": thread.attrib.get("postdate"),
            "last_post_date": thread.attrib.get("lastpostdate")
        })
    # Return the collected thread data
    return threads

Fetches the list of messages associated with a thread ID

In [29]:
def fetch_messages_from_thread(thread_id, max_posts=5, sleep_time=0.5, max_retries=5, timeout=30):
    """
        Fetches the messages of a specific thread with a single request.

        Only the first ``max_posts`` articles of the response are returned; no
        pagination is performed.

        Args:
            thread_id (int): The ID of the thread to fetch messages from.
            max_posts (int, optional): The maximum number of posts to fetch. Defaults to 5.
            sleep_time (float, optional): Seconds to sleep before each request attempt. Defaults to 0.5.
            max_retries (int, optional): The maximum number of request attempts. Values
                below 1 are treated as a single attempt. Defaults to 5.
            timeout (float, optional): Seconds to wait for the server before giving up
                on an attempt. Defaults to 30.
        Returns:
            tuple: A tuple containing:
                - messages (list): A list of dictionaries, each containing details of a message
                  (article_id, username, post_date, edit_date, num_edits, subject, content).
                - usernames (list): A list of unique usernames who posted in the thread.
            Two empty lists are returned when the request fails, the XML cannot be
            parsed, or the response contains no <articles> element.
    """

    messages = []
    usernames = set()

    # Build the URL for requesting the thread messages
    url = f"{BGG_BASE_URL}/thread?id={thread_id}"
    attempts_left = max_retries

    # Retry loop: leave it only with a 200 response or by returning empty results
    while True:
        # Pause briefly before every attempt
        time.sleep(sleep_time)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # Network-level failure: give up on this thread
            logger.error(f"Error fetching messages for thread {thread_id}: {e}.")
            return [], []
        if response.status_code == 200:
            break
        attempts_left -= 1
        # "<=" also terminates when the caller passes max_retries <= 0
        if attempts_left <= 0:
            logger.error(f"Error fetching messages for thread {thread_id}: {response.status_code}. Retries exhausted.")
            return [], []

    # Parse the XML response
    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        logger.error(f"Error parsing messages for thread {thread_id}: {e}.")
        return [], []

    # A 200 response without an <articles> element would otherwise raise AttributeError
    articles = root.find("articles")
    if articles is None:
        logger.error(f"Error fetching messages for thread {thread_id}: no articles in response.")
        return [], []

    # Iterate over each article in the XML, respecting the max_posts limit
    for article in articles.findall("article")[:max_posts]:
        username = article.attrib.get("username")
        # Do not record a missing username as None in the username list
        if username is not None:
            usernames.add(username)
        subject = article.find("subject")
        body = article.find("body")
        # Capture relevant message information in a dictionary
        messages.append({
            "article_id": article.attrib.get("id"),
            "username": username,
            "post_date": article.attrib.get("postdate"),
            "edit_date": article.attrib.get("editdate"),
            "num_edits": int(article.attrib.get("numedits", 0)),
            "subject": subject.text if subject is not None else None,
            "content": body.text if body is not None else None
        })
    logger.info(f"\t\tDownloaded {len(messages)} messages from thread {thread_id}")

    # Return the collected messages and unique usernames
    return messages, list(usernames)

## 2.2 Execution

Retrieves the game list from the csv

In [30]:
# Usernames collected so far (USERNAMES_FILE must already exist)
with open(USERNAMES_FILE, "r", encoding="utf-8") as f:
    usernames = set(json.load(f))

# Top games data: keep only the id and name of the games in the requested rank interval
top_games_df = pd.read_csv(TOP_GAMES_LIST_FILE)
games = top_games_df.loc[GAME_RANK_MIN-1:GAME_RANK_MAX-1, ['id', 'name']]

# Remove games already collected
if not OVERWRITE:
    try:
        with open(FORUMS_FILE, "r", encoding="utf-8") as f:
            collected_ids = {game["id"] for game in json.load(f)}
    except FileNotFoundError:
        # Nothing has been collected yet: keep every game
        pass
    else:
        # Filter with a boolean mask so that `games` stays a DataFrame
        # (iterating a DataFrame yields its column labels, not its rows).
        games = games[~games["id"].isin(collected_ids)]
logger.info(f"Games to collect: {len(games)}")

Download thread content for each game

In [31]:
# Tracks whether FORUMS_FILE may be appended to. With OVERWRITE = False it starts
# as True, so every write appends to the existing file; with OVERWRITE = True it
# starts as False, so the first write replaces the file and later writes append.
saved_once = not OVERWRITE

In [32]:
# Games collected since the last write to FORUMS_FILE
all_data = []
game_iter = tqdm(zip(games["name"], games["id"]), desc="Fetching forums", total=games.shape[0])
for i, (name, game_id) in enumerate(game_iter, start=1):
    # Keep the progress bar and the INFO log lines on separate rows
    if logger.isEnabledFor(logging.INFO):
        print()
    game = dict(id=game_id, name=name, forums=[])

    # Forums of the current game
    forums = fetch_forum_list(
        game_id,
        max_forums=MAX_FORUMS_PER_GAME,
        min_threads=MIN_THREADS_PER_FORUM,
        sleep_time=REQUEST_DELAY,
        max_retries=MAX_RETRIES,
    )
    logger.info(f"Retrieving {len(forums)} forums for game {game['name']}...")

    for forum in forums:
        # Threads of the current forum
        threads = fetch_threads_from_forum(
            forum["id"],
            max_threads=MAX_THREADS_PER_FORUM,
            min_articles=MIN_ARTICLES_PER_THREAD,
            max_articles=MAX_ARTICLES_PER_THREAD,
            sleep_time=REQUEST_DELAY,
            max_retries=MAX_RETRIES,
        )
        logger.info(f"\tRetrieving {len(threads)} threads for forum {forum['title']}...")

        for thread in threads:
            # Messages of the current thread, plus the users who wrote them
            messages, users = fetch_messages_from_thread(
                thread["thread_id"],
                max_posts=MAX_ARTICLES_PER_THREAD,
                sleep_time=REQUEST_DELAY,
                max_retries=MAX_RETRIES,
            )
            thread["messages"] = messages
            usernames.update(users)

        # Attach the downloaded threads to their forum
        forum["threads"] = threads

    # Attach the forums to the game and queue the game for the next write
    game["forums"] = forums
    all_data.append(game)

    # Only every BACKUP_PERIOD-th game triggers a write
    if i % BACKUP_PERIOD != 0:
        continue

    # The very first write of an OVERWRITE run replaces the file; all others append
    persist = append_to_json if saved_once else save_to_json
    persist(all_data, FORUMS_FILE)
    saved_once = True
    logger.info(f"Saved {i} games into '{FORUMS_FILE}'")

    # Start a new batch
    all_data = []

# Write whatever is left from the last, incomplete batch
if logger.isEnabledFor(logging.INFO):
    print()
if all_data:
    if saved_once:
        append_to_json(all_data, FORUMS_FILE)
    else:
        save_to_json(all_data, FORUMS_FILE)
    logger.info(f"Saved remaining {len(all_data)} games forums into '{FORUMS_FILE}'")

# Persist the full set of usernames seen so far
save_to_json(list(usernames), USERNAMES_FILE)

Fetching forums:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching forums: 100%|██████████| 1/1 [00:34<00:00, 34.17s/it]
