# 3. threads.json

Requirements

In [87]:
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


Imports

In [88]:
import os
import time

import pandas as pd
import numpy as np

import json
from tqdm import tqdm
import logging

from bs4 import BeautifulSoup
import re

import uuid
import random
from datetime import datetime, timedelta

Logging configuration

In [89]:
# Configure logging at INFO level with a "timestamp - level - message" layout.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Calling getLogger() without a name returns the root logger, used by every cell below.
logger = logging.getLogger()

Constants

In [90]:
# Directories (all relative to the notebook's working directory)
DATA_DIR = os.path.join("..", "data")  # root of every data folder below
TEMP_DIR = os.path.join(DATA_DIR, "temp")  # intermediate artefacts (e.g. the user map)
RAW_DIR = os.path.join(DATA_DIR, "raw")  # raw input dumps read by this notebook
MONGO_DIR = os.path.join(DATA_DIR, "clean", "mongo")  # cleaned output; threads.json is written here
NEO4J_DIR = os.path.join(DATA_DIR, "clean", "neo4j")  # created below but not otherwise used in this notebook

In [91]:
# Create every data directory up front; exist_ok makes the cell safe to re-run.
for directory in (DATA_DIR, TEMP_DIR, RAW_DIR, MONGO_DIR, NEO4J_DIR):
    os.makedirs(directory, exist_ok=True)

In [92]:
# Files
# Intermediate files
MAP_FILE = os.path.join(TEMP_DIR, "user_map.json")  # loaded below as `user_map`, keyed by raw username
RECENT_REVIEWS = os.path.join(TEMP_DIR, "recent_reviews.json")  # not read or written in this notebook

# Raw inputs
FORUMS_FILE = os.path.join(RAW_DIR, "forums.json")  # loaded below as `raw_forums`
BOARD_GAMES_FILE = os.path.join(RAW_DIR, "boardgames&reviews.json")  # raw games (id, name, yearReleased are read)

# Cleaned collections
GAMES_JSON = os.path.join(MONGO_DIR, "games.json")  # cleaned games, matched against the raw games
THREADS_JSON = os.path.join(MONGO_DIR, "threads.json")  # output of this notebook

Utility functions

In [93]:
def save_to_json(data, filename):
    """
    Write ``data`` to ``filename`` as pretty-printed UTF-8 JSON.

    Non-ASCII characters are written as-is (not escaped) and nested
    structures are indented with four spaces. An existing file is overwritten.

    Parameters:
    data (any): Any JSON-serializable object.
    filename (str): Path of the file to write.

    Returns:
    None
    """
    with open(filename, mode="w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)

In [94]:
def append_to_json(new_data, filename):
    """
    Merge new records into a JSON list file, dropping exact duplicates.

    Two records are duplicates when their JSON serialization with sorted
    keys is identical. The first occurrence fixes the record's position in
    the output. A missing file is treated as an empty list.

    Parameters:
    new_data (list): Records to add to the file.
    filename (str): Path of the JSON file holding a list of records.

    Returns:
    None
    """
    # Load what is already stored, if anything.
    try:
        with open(filename, "r", encoding="utf-8") as source:
            stored_items = json.load(source)
    except FileNotFoundError:
        stored_items = []

    # Key each record by its canonical JSON form so duplicates collapse.
    unique_by_key = {}
    for record in stored_items + new_data:
        unique_by_key[json.dumps(record, sort_keys=True)] = record

    # Rewrite the file with the merged records.
    merged_items = list(unique_by_key.values())
    with open(filename, "w", encoding="utf-8") as target:
        json.dump(merged_items, target, ensure_ascii=False, indent=4)

## Functions

In [95]:
def truncate_html(content, length):
    """
    Return ``content`` cut down to at most ``length`` characters of text.

    Only text nodes count towards the limit; markup is free. The document is
    cut at the text node where the budget runs out, everything after that
    point is removed, and the remaining tree is serialized, so every tag that
    was opened is also closed.

    The previous implementation appended ``str(tag)`` for every tag it
    visited (which already contains all nested text) and then visited the
    nested text nodes again, so text inside tags was duplicated and never
    counted towards the limit.

    Parameters:
    content (str): HTML fragment to truncate.
    length (int): Maximum number of text characters to keep. Negative
        values are treated as 0.

    Returns:
    str: The truncated HTML fragment, serialized by BeautifulSoup (special
        characters in text are therefore entity-escaped).
    """
    soup = BeautifulSoup(content, 'html.parser')
    limit = max(length, 0)
    consumed = 0

    # Snapshot the descendants: the tree is modified once the cut point is found.
    for node in list(soup.descendants):
        # NavigableString subclasses str; Tag objects are skipped here.
        if not isinstance(node, str):
            continue
        if consumed + len(node) < limit:
            consumed += len(node)
            continue

        # The budget ends inside (or exactly at the end of) this text node.
        kept_text = node[:limit - consumed]

        # Remove everything that follows the cut point in document order:
        # the later siblings of this node and of each of its ancestors.
        cursor = node
        while cursor is not None:
            for later_sibling in list(cursor.next_siblings):
                later_sibling.extract()
            cursor = cursor.parent

        if kept_text:
            # Rebuild with the same node class so comments stay comments.
            node.replace_with(type(node)(kept_text))
        else:
            node.extract()
        break

    return str(soup)

## Execution

In [96]:
# Raw games: keep only the fields needed to match them against the cleaned collection.
with open(BOARD_GAMES_FILE, "r", encoding="utf-8") as f:
    raw_games = [
        {
            "id": int(raw_game["id"]),
            "name": raw_game["name"],
            # Negative release years are mapped to 0.
            # NOTE(review): presumably mirrors how games.json was cleaned - confirm.
            "yearReleased": raw_game["yearReleased"] if raw_game["yearReleased"] >= 0 else 0
        }
        for raw_game in json.load(f)
    ]

# Cleaned games: the same three fields, embedded later in every thread.
with open(GAMES_JSON, "r", encoding="utf-8") as f:
    clean_games = [
        {
            "id": game["id"],
            "name": game["name"],
            "yearReleased": game["yearReleased"]
        }
        for game in json.load(f)
    ]

# Index the cleaned games by (name, yearReleased) so each raw game is matched with
# a single lookup instead of a scan over every cleaned game. setdefault keeps the
# first cleaned game for a key, as the original nested loop with `break` did.
clean_game_by_key = {}
for clean_game in clean_games:
    clean_game_by_key.setdefault((clean_game["name"], clean_game["yearReleased"]), clean_game)

# Map raw game id -> cleaned game; raw games without a match are left out.
game_dict = {}
for raw_game in raw_games:
    matched_game = clean_game_by_key.get((raw_game["name"], raw_game["yearReleased"]))
    if matched_game is not None:
        game_dict[raw_game["id"]] = matched_game
logger.info(f"Loaded {len(game_dict)} games.")

with open(FORUMS_FILE, "r", encoding="utf-8") as f:
    raw_forums = json.load(f)
logger.info(f"Loaded {len(raw_forums)} forums.")

with open(MAP_FILE, "r", encoding="utf-8") as f:
    user_map = json.load(f)
logger.info(f"Loaded {len(user_map)} users.")

2025-01-20 15:44:35,414 - INFO - Loaded 2000 games.
2025-01-20 15:44:55,523 - INFO - Loaded 302 forums.
2025-01-20 15:44:55,759 - INFO - Loaded 121274 users.


In [101]:
# Build the thread documents: one per raw thread whose author is in user_map,
# each holding the messages whose author is in user_map and whose content is set.
threads = []

for raw_game in tqdm(raw_forums, desc="Processing threads"):
    # Resolve the cleaned game once per raw game. A raw game without a match in
    # game_dict is skipped with a warning instead of raising KeyError on its
    # first thread.
    game = game_dict.get(raw_game["id"])
    if game is None:
        logger.warning(f"Skipping game {raw_game['id']}: no matching entry in game_dict.")
        continue

    for raw_forum in raw_game["forums"]:
        for raw_thread in raw_forum["threads"]:
            if raw_thread["author"] not in user_map:
                continue
            thread = {
                "id": str(uuid.uuid4()),
                "author": user_map[raw_thread["author"]],
                "content": raw_thread["subject"],
                "postDate": datetime.strptime(raw_thread["post_date"], "%a, %d %b %Y %H:%M:%S %z").isoformat(),
                "lastPostDate": None,  # stays None when no message is kept
                "tag": raw_forum["title"],
                "game": game,
                "messages": []
            }
            # Latest message time as an aware datetime. Comparing the ISO strings
            # instead would order them lexicographically, which is wrong as soon
            # as two messages carry different UTC offsets.
            last_post_dt = None

            # Add messages and update last post date
            for raw_message in raw_thread["messages"]:
                if raw_message["username"] not in user_map:
                    continue
                # Messages without content are dropped before any parsing is done.
                if raw_message["content"] is None:
                    continue

                post_dt = datetime.strptime(raw_message["post_date"], "%Y-%m-%dT%H:%M:%S%z")
                message = {
                    "id": str(uuid.uuid4()),
                    "author": user_map[raw_message["username"]],
                    "postDate": post_dt.isoformat(),
                    "content": None
                }

                # Manage message content: the quoted reply (if any) is the <font>
                # block with this colour; it is removed from the stored content.
                soup = BeautifulSoup(raw_message["content"], "html.parser")
                font_tag = soup.find("font", {"color": "#2121A4"})
                if font_tag is not None:
                    html_reply = str(font_tag)
                    font_tag.decompose()

                    # Extract reply info. Every lookup is checked so that a quote
                    # block with unexpected markup is ignored instead of raising
                    # AttributeError on None.
                    reply_soup = BeautifulSoup(html_reply, 'html.parser')
                    title_div = reply_soup.find('div', class_='quotetitle')
                    body_div = reply_soup.find('div', class_='quotebody')
                    username_tag = title_div.find('b') if title_div is not None else None
                    body_tag = body_div.find('i') if body_div is not None else None
                    author_match = re.search(r'^(.*) wrote:', username_tag.text) if username_tag is not None else None

                    if author_match is not None and body_tag is not None:
                        username = author_match.group(1)
                        quoted_body = body_tag.decode_contents().strip()

                        if username in user_map:
                            username = user_map[username]
                            # Search the reply message in previous messages
                            for prev_message in thread["messages"]:
                                if prev_message["author"] == username and (quoted_body in prev_message["content"].strip()):
                                    message["replyTo"] = {
                                        "username": prev_message["author"],
                                        "messageUUID": prev_message['id'],
                                        "contentPreview": truncate_html(prev_message["content"], 100)  # Insert only the first 100 characters of the message
                                    }
                                    break

                message["content"] = str(soup)

                thread["messages"].append(message)
                if last_post_dt is None or post_dt > last_post_dt:
                    last_post_dt = post_dt
                    thread["lastPostDate"] = message["postDate"]

            threads.append(thread)

  soup = BeautifulSoup(raw_message["content"], "html.parser")
  soup = BeautifulSoup(content, 'html.parser')
Processing threads: 100%|██████████| 302/302 [05:55<00:00,  1.18s/it]


In [102]:
# Write the thread documents (nothing is written when the list is empty), then
# drop the reference so the list can be garbage-collected.
thread_count = len(threads)
if thread_count:
    save_to_json(threads, THREADS_JSON)
logger.info(f"Saved {thread_count} threads to {THREADS_JSON}.")
threads = None

2025-01-20 16:00:11,862 - INFO - Saved 9665 threads to ..\data\clean\mongo\threads.json.
