# 3. tournaments.json

Requirements

In [59]:
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


Imports

In [60]:
import os
import time

import pandas as pd
import numpy as np

import json
from tqdm import tqdm
import logging

from bs4 import BeautifulSoup
import re

import uuid
import random
import math
from datetime import datetime, timedelta
import dateparser

import string
import secrets
import hashlib

Logging configuration

In [61]:
# Root logger: INFO and above, each record rendered as "<timestamp> - <level> - <message>".
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger()

Constants

In [62]:
# Directory layout, relative to the folder the notebook runs from.
DATA_DIR = os.path.join("..", "data")
RAW_DIR = os.path.join(DATA_DIR, "raw")
TEMP_DIR = os.path.join(DATA_DIR, "temp")
# Both cleaned-data targets live under the same "clean" folder.
_CLEAN_DIR = os.path.join(DATA_DIR, "clean")
MONGO_DIR = os.path.join(_CLEAN_DIR, "mongo")
NEO4J_DIR = os.path.join(_CLEAN_DIR, "neo4j")

In [63]:
# Make sure every working directory exists (already existing ones are left untouched).
for directory in (DATA_DIR, TEMP_DIR, RAW_DIR, MONGO_DIR, NEO4J_DIR):
    os.makedirs(directory, exist_ok=True)

In [64]:
# File paths, grouped by the directory they live in.

# Temporary data
MAP_FILE = os.path.join(TEMP_DIR, "user_map.json")

# Raw data
BOARD_GAMES_FILE = os.path.join(RAW_DIR, "boardgames&reviews.json")
TOURNAMENTS_FILE = os.path.join(RAW_DIR, "tournaments.json")

# Cleaned data for Mongo
GAMES_JSON = os.path.join(MONGO_DIR, "games.json")
TOURNAMENTS_JSON = os.path.join(MONGO_DIR, "tournaments.json")

# Cleaned data for Neo4j: user graph
NEO4J_USERS = os.path.join(NEO4J_DIR, "users.json")
NEO4J_FOLLOWERS = os.path.join(NEO4J_DIR, "followers.json")

# Cleaned data for Neo4j: tournaments and their relationships
NEO4J_TOURNAMENTS_JSON = os.path.join(NEO4J_DIR, "tournaments.json")
NEO4J_GAME_TOURNAMENTS_JSON = os.path.join(NEO4J_DIR, "game_tournaments.json")
NEO4J_ADMINISTRATORS_JSON = os.path.join(NEO4J_DIR, "administrators.json")
NEO4J_PARTICIPANTS_JSON = os.path.join(NEO4J_DIR, "participants.json")
NEO4J_WINNERS_JSON = os.path.join(NEO4J_DIR, "winners.json")

Utility functions

In [65]:
def save_to_json(data, filename):
    """
    Serialize ``data`` as JSON and write it to ``filename``.

    The file is opened in write mode with UTF-8 encoding, so any previous
    content is replaced. Non-ASCII characters are written as-is and the output
    is indented by 4 spaces.

    Parameters:
    data (any): A JSON-serializable object.
    filename (str): Path of the file to write.

    Returns:
    None
    """
    with open(filename, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)

In [66]:
def append_to_json(new_data, filename):
    """
    Add new items to a JSON file that holds a list, skipping duplicates.

    Two items are duplicates when their JSON serialization with sorted keys is
    identical. Existing items keep their position; new items are appended in
    the order they are given. A missing file is treated as an empty list.

    Parameters:
    new_data (iterable): The items to add. Any iterable is accepted (list,
        tuple, generator, ...).
    filename (str): The name of the JSON file.

    Returns:
    None

    Raises:
    TypeError: If the file exists but its top-level JSON value is not a list.
    """
    # Read existing data from the file.
    try:
        with open(filename, "r", encoding="utf-8") as f:
            existing_data = json.load(f)
    except FileNotFoundError:
        existing_data = []

    if not isinstance(existing_data, list):
        raise TypeError(
            f"{filename} must contain a JSON list, found {type(existing_data).__name__}"
        )

    # Keyed by canonical serialization: a dict keeps the position of the first
    # occurrence of each key, so the original ordering is preserved.
    unique_items = {}
    for item in [*existing_data, *new_data]:
        unique_items[json.dumps(item, sort_keys=True)] = item

    # Save the combined data back to the file.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(list(unique_items.values()), f, ensure_ascii=False, indent=4)

## Functions

In [67]:
def get_allowed_users(admin):
    """
    Return the users that have a mutual follow relationship with ``admin``.

    Reads the module-level ``follows`` mapping (username -> usernames that
    user follows).

    Parameters:
    admin (str): Username of the administrator.

    Returns:
    list: Usernames followed by ``admin`` that also follow ``admin`` back;
        empty when ``admin`` follows nobody.
    """
    mutual = []
    if admin in follows:
        for user in follows[admin]:
            # Keep only the followed users who return the follow.
            if admin in follows.get(user, ()):
                mutual.append(user)
    return mutual

In [68]:
def calculate_proximity_scores(users, admin_location):
    """
    Score how close each user is to the administrator's location.

    Parameters:
    users (iterable): User dicts with a "location" entry that is either None
        or a dict with "city", "stateOrProvince" and "country" keys.
    admin_location (dict or None): Location of the administrator, with the
        same keys. When it is None nothing can match, so every user scores 0.

    Returns:
    list: One score per user, in input order:
        3 for the same city, 2 for the same state/province, 2 for the same
        country, 0 otherwise (including users without a location).
    """
    # Without a reference location there is nothing to compare against.
    if admin_location is None:
        return [0 for _ in users]

    scores = []
    for user in users:
        loc = user["location"]
        if loc is None:
            scores.append(0)  # No location = lowest score
        elif loc["city"] is not None and loc["city"] == admin_location["city"]:
            scores.append(3)  # Same city
        elif loc["stateOrProvince"] is not None and loc["stateOrProvince"] == admin_location["stateOrProvince"]:
            scores.append(2)  # Same state or province
        elif loc["country"] is not None and loc["country"] == admin_location["country"]:
            # NOTE(review): same score as the state/province match; confirm
            # whether a country-only match was meant to score 1.
            scores.append(2)  # Same country
        else:
            scores.append(0)  # No match
    return scores

In [69]:
def sample_with_exponential_probability(users, proximity_scores, k):
    """
    Draw up to ``k`` distinct users, favouring those with a higher score.

    Each user gets a weight of ``exp(score - max_score)``. Users are drawn
    without replacement, so the same user never appears twice in the result.

    Args:
        users (list): Candidate users.
        proximity_scores (list): One score per user, in the same order
            (higher values = closer).
        k (int): Number of users to sample.

    Returns:
        list: The selected users, at most ``min(k, len(users))`` of them.
            Empty when there are no users or ``k`` is not positive.

    Raises:
        ValueError: If ``users`` and ``proximity_scores`` differ in length.
    """
    users = list(users)
    proximity_scores = list(proximity_scores)
    if len(users) != len(proximity_scores):
        raise ValueError("users and proximity_scores must have the same length")
    if k <= 0 or not users:
        return []

    # Subtracting the maximum keeps every exponent <= 0, so exp() cannot overflow.
    max_score = max(proximity_scores)
    weights = [math.exp(score - max_score) for score in proximity_scores]

    # Weighted sampling without replacement: give every user the random key
    # log(u) / weight with u uniform in (0, 1], then keep the k largest keys.
    keys = []
    for weight in weights:
        if weight > 0.0:
            # 1.0 - random() lies in (0, 1], so the logarithm is always defined.
            keys.append(math.log(1.0 - random.random()) / weight)
        else:
            # A weight that underflowed to zero is only picked as a last resort.
            keys.append(-math.inf)

    ranked = sorted(range(len(users)), key=keys.__getitem__, reverse=True)
    return [users[index] for index in ranked[:k]]

## Execution

In [70]:
# Load the raw games, keeping only the fields used to match them with the cleaned games.
with open(BOARD_GAMES_FILE, "r", encoding="utf-8") as f:
    raw_games = [
        {
            "id": int(raw_game["id"]),
            "name": raw_game["name"],
            # Negative release years are clamped to 0 before matching.
            "yearReleased": raw_game["yearReleased"] if raw_game["yearReleased"] >= 0 else 0
        }
        for raw_game in json.load(f)
    ]
# Load the cleaned games (id, name and release year only).
with open(GAMES_JSON, "r", encoding="utf-8") as f:
    clean_games = [
        {
            "id": game["id"],
            "name": game["name"],
            "yearReleased": game["yearReleased"]
        }
        for game in json.load(f)
    ]

# Index the cleaned games by (name, yearReleased) so each raw game is matched
# with one lookup instead of a scan over every cleaned game. setdefault keeps
# the first cleaned game for a given key.
clean_games_by_key = {}
for clean_game in clean_games:
    clean_games_by_key.setdefault((clean_game["name"], clean_game["yearReleased"]), clean_game)

# Map raw game id -> cleaned game; raw games without a match are left out.
game_dict = {}
for raw_game in raw_games:
    clean_game = clean_games_by_key.get((raw_game["name"], raw_game["yearReleased"]))
    if clean_game is not None:
        game_dict[raw_game["id"]] = clean_game
logger.info(f"Loaded {len(game_dict)} games.")

with open(TOURNAMENTS_FILE, "r", encoding="utf-8") as f:
    raw_tournaments = json.load(f)
logger.info(f"Loaded {len(raw_tournaments)} tournaments.")

with open(NEO4J_USERS, "r", encoding="utf-8") as f:
    users = json.load(f)
logger.info(f"Loaded {len(users)} users.")

# Read as UTF-8 like every other JSON file here: decoding it as latin-1 would
# garble non-ASCII usernames, which then would not match the ones in users.json.
with open(NEO4J_FOLLOWERS, "r", encoding="utf-8") as f:
    relationships = json.load(f)
logger.info(f"Loaded {len(relationships)} followers.")

2025-01-21 10:55:51,065 - INFO - Loaded 2000 games.
2025-01-21 10:55:51,147 - INFO - Loaded 199 tournaments.
2025-01-21 10:55:51,180 - INFO - Loaded 5000 users.
2025-01-21 10:55:51,707 - INFO - Loaded 259629 followers.


In [71]:
# Adjacency map of the follow graph: username -> set of usernames that user follows.
follows = {}
for edge in relationships:
    source, target = edge["follower"], edge["followed"]
    if source not in follows:
        follows[source] = set()
    follows[source].add(target)
logger.info(f"Loaded {len(follows)} follows.")

2025-01-21 10:55:52,040 - INFO - Loaded 4999 follows.


In [72]:
# Find the games that have no entry in the raw tournaments yet.
# populated_games: id -> name of every game listed in the raw tournaments.
populated_games = {entry["id"]: entry["name"] for entry in raw_tournaments}
missing_ids = set(game_dict) - set(populated_games)
missing_games = [
    {"id": missing_id, "name": game_dict[missing_id]["name"]}
    for missing_id in missing_ids
]
# Preview the first few missing games.
missing_games[:5]

[{'id': 1, 'name': 'Die Macher'},
 {'id': 4098, 'name': 'Age of Steam'},
 {'id': 3, 'name': 'Samurai'},
 {'id': 4099, 'name': 'Keythedral'},
 {'id': 5, 'name': 'Acquire'}]

In [73]:
# Display the id -> name mapping of the games listed in the raw tournaments.
populated_games

{342942: 'Ark Nova',
 167791: 'Terraforming Mars',
 220308: 'Gaia Project',
 193738: 'Great Western Trail',
 169786: 'Scythe',
 173346: '7 Wonders Duel',
 120677: 'Terra Mystica',
 266192: 'Wingspan',
 251247: 'Barrage',
 373106: 'Sky Team',
 3076: 'Puerto Rico',
 185343: 'Anachrony',
 170216: 'Blood Rage',
 31260: 'Agricola',
 231733: 'Obsession',
 276025: 'Maracaibo',
 216132: 'Clans of Caledonia',
 383179: 'Age of Innovation',
 230802: 'Azul',
 28143: 'Race for the Galaxy',
 317985: 'Beyond the Sun',
 93: 'El Grande',
 68448: '7 Wonders',
 322289: "Darwin's Journey",
 225694: 'Decrypto',
 236457: 'Architects of the West Kingdom',
 310873: 'Carnegie',
 364073: 'Splendor Duel',
 122515: 'Keyflower',
 42: 'Tigris & Euphrates',
 18602: 'Caylus',
 73439: 'Troyes',
 163412: 'Patchwork',
 144733: 'Russian Railroads',
 196340: 'Yokohama',
 414317: 'Harmonies',
 254640: 'Just One',
 132531: 'Roll for the Galaxy',
 30549: 'Pandemic',
 281259: 'The Isle of Cats',
 118048: 'Targi',
 263918: 'Ca

In [74]:
# Give every game without tournaments a random set of tournaments borrowed
# from the games that already have some.

# First raw entry for each game id.
first_entry_by_id = {}
for raw_tournament in raw_tournaments:
    first_entry_by_id.setdefault(raw_tournament["id"], raw_tournament)

# Donors are the populated games that actually have tournaments to lend.
# Computing them once avoids rescanning raw_tournaments for every draw and
# removes the need to retry until a non-empty game is picked.
donor_games = [
    first_entry_by_id[donor_id]
    for donor_id in populated_games
    if first_entry_by_id[donor_id]["tournaments"]
]
if missing_games and not donor_games:
    raise ValueError("No populated game has tournaments to copy from.")

generated_raw_tournaments = []
for game in missing_games:
    generated_element = {
        "id": game["id"],
        "name": game["name"],
        "tournaments": []
    }
    # Add a random number of tournaments for the game
    for _ in range(random.randint(5, 15)):
        rnd_game_tournament = random.choice(donor_games)
        # Work on a copy: renaming the picked dict in place would also rename
        # the donor's own tournament and every other game that borrowed it.
        # A shallow copy is enough because only the top-level name is rewritten.
        rnd_tournament = dict(random.choice(rnd_game_tournament["tournaments"]))
        # Replace the donor game's name with this game's name (case insensitive)
        if rnd_game_tournament["name"].lower() in rnd_tournament["name"].lower():
            rnd_tournament["name"] = re.sub(
                re.escape(rnd_game_tournament["name"]),
                # A function replacement inserts the name literally; a plain
                # string would have its backslashes treated as escapes.
                lambda _match, name=game["name"]: name,
                rnd_tournament["name"],
                flags=re.IGNORECASE,
            )
        generated_element["tournaments"].append(rnd_tournament)
    generated_raw_tournaments.append(generated_element)
raw_tournaments.extend(generated_raw_tournaments)
logger.info(f"Expanded tournaments to {len(raw_tournaments)}.")

2025-01-21 10:55:52,986 - INFO - Expanded tournaments to 2000.


In [79]:
# Build, for every raw tournament, the Mongo document and the Neo4j nodes and
# relationships (game, administrator, participants, winner).
tournaments = []
neo4j_tournaments = []
neo4j_game_tournaments = []
neo4j_participants = []
neo4j_winners = []
neo4j_administrators = []
STATUS = {
    "future": "OPEN_FOR_REGISTRATION",
    "full": "CLOSE_FOR_REGISTRATION",
    "progress": "IN_PROGRESS",
    "finished": "FINISHED"
}
VISIBILITY = ["PUBLIC", "PRIVATE", "INVITE"]
# Statuses of tournaments that have not started yet.
UPCOMING_STATUSES = (STATUS["future"], STATUS["full"])

for raw_game in tqdm(raw_tournaments):
    for raw_tournament in raw_game["tournaments"]:

        ##################### MONGO TOURNAMENT #####################

        tournament = {
            "id": str(uuid.uuid4()),

            "name": raw_tournament["name"],
            "game": game_dict[raw_game["id"]],

            "type": raw_tournament["type"],
            "typeDescription": raw_tournament["type_description"],

            "startingTime": None,
            "location": {
                "city": None,
                "stateOrProvince": None,
                "country": None
            },

            "numParticipants": int(raw_tournament["num_participants"]),
            "minParticipants": int(raw_tournament["min_participants"]),
            "maxParticipants": int(raw_tournament["max_participants"]),

            "administrator": None,
            "winner": None,

            "visibility": None,

            "options": {k: v for k, v in raw_tournament["options"].items() if k != "Reputation required"}
        }

        # A future tournament that already reached its maximum is closed for registration.
        if raw_tournament["status"] == "future" and raw_tournament["num_participants"] == raw_tournament["max_participants"]:
            status = STATUS["full"]
        else:
            status = STATUS[raw_tournament["status"]]

        # Set the starting time of the tournament (always stored as an ISO
        # string with minute precision).
        if raw_tournament["starting_time"] is not None:
            try:
                tournament["startingTime"] = datetime.strptime(raw_tournament["starting_time"], "%m/%d/%Y %H:%M").isoformat(timespec='minutes')
                # Only an upcoming tournament whose date is already in the past
                # gets a new date; ISO strings of equal precision compare
                # chronologically.
                if status in UPCOMING_STATUSES and tournament["startingTime"] < datetime.now().isoformat(timespec='minutes'):
                    tournament["startingTime"] = (datetime.now() + timedelta(days=random.randint(15, 30))).isoformat(timespec='minutes')
            except ValueError:
                # Unparseable date: generate one that is consistent with the status.
                if status in UPCOMING_STATUSES:
                    tournament["startingTime"] = (datetime.now() + timedelta(days=random.randint(15, 30))).isoformat(timespec='minutes')
                elif status == "IN_PROGRESS":
                    tournament["startingTime"] = (datetime.now() - timedelta(days=random.randint(1, 30))).isoformat(timespec='minutes')
                elif status == "FINISHED":
                    tournament["startingTime"] = (datetime.now() - timedelta(days=random.randint(31, 60))).isoformat(timespec='minutes')
                else:
                    continue
        else:
            # Missing date: generate one that is consistent with the status.
            # It is converted to an ISO string because it is parsed back with
            # fromisoformat below and later serialized to JSON.
            if status in UPCOMING_STATUSES:
                tournament["startingTime"] = (datetime.now() + timedelta(days=random.randint(1, 30))).isoformat(timespec='minutes')
            elif status == "IN_PROGRESS":
                tournament["startingTime"] = (datetime.now() - timedelta(days=random.randint(1, 30))).isoformat(timespec='minutes')
            elif status == "FINISHED":
                tournament["startingTime"] = (datetime.now() - timedelta(days=random.randint(31, 60))).isoformat(timespec='minutes')
            else:
                continue

        # Set the administrator of the tournament and its location
        admin = random.choice(users)
        tournament["administrator"] = admin["username"]
        tournament["location"] = admin["location"]

        # Set the visibility of the tournament (stored as a plain str)
        tournament["visibility"] = str(np.random.choice(VISIBILITY, p=[0.3, 0.3, 0.4]))
        if tournament["visibility"] == "PRIVATE":
            tournament["allowed"] = get_allowed_users(admin["username"])
            if len(tournament["allowed"]) == 0:
                # Nobody could join a private tournament: make it public instead.
                tournament["visibility"] = "PUBLIC"
            elif len(tournament["allowed"]) < tournament["numParticipants"]:
                # Not enough allowed users: shrink the tournament to fit them.
                tournament["minParticipants"] = len(tournament["allowed"])
                tournament["numParticipants"] = len(tournament["allowed"])

        ##################### NEO4J TOURNAMENT #####################

        # Add the tournament to the list of tournaments
        neo4j_tournament = {
            "id": tournament["id"],
            "name": tournament["name"],
            "visibility": tournament["visibility"],
            "maxParticipants": tournament["maxParticipants"],
            "startingTime": tournament["startingTime"],
            "location": tournament["location"],
        }
        neo4j_tournaments.append(neo4j_tournament)

        ##################### NEO4J RELATED TO #####################

        # Set the game-tournament relationship
        neo4j_game_tournaments.append({
            "game": tournament["game"]["id"],
            "tournament": tournament["id"],
        })

        ##################### NEO4J ADMINISTRATORS #####################

        # Add the administrator to the list of administrators; the tournament
        # is created 1 to 30 days before it starts.
        neo4j_administrator = {
            "tournament": tournament["id"],
            "administrator": tournament["administrator"],
            "creationTime": (datetime.fromisoformat(tournament["startingTime"]) - timedelta(days=random.randint(1, 30))).isoformat()
        }
        neo4j_administrators.append(neo4j_administrator)


        ##################### NEO4J PARTICIPANTS #####################

        # Set the participants of the tournament
        tournament_participants = []
        if tournament["visibility"] == "PUBLIC":
            # All users, weighted by proximity with an exponential probability distribution
            proximity_scores = calculate_proximity_scores(
                users,
                tournament["location"]
            )
            tournament_participants = sample_with_exponential_probability(
                [user["username"] for user in users],
                proximity_scores,
                tournament["numParticipants"]
            )
        elif tournament["visibility"] == "PRIVATE":
            # Participants only from the allowed list, weighted by proximity.
            # Candidates and scores are built from the same list so that every
            # weight lines up with the user it was computed for.
            allowed_usernames = set(tournament["allowed"])
            allowed_users = [user for user in users if user["username"] in allowed_usernames]
            if allowed_users:
                proximity_scores = calculate_proximity_scores(
                    allowed_users,
                    tournament["location"]
                )
                tournament_participants = sample_with_exponential_probability(
                    [user["username"] for user in allowed_users],
                    proximity_scores,
                    tournament["numParticipants"]
                )
        elif tournament["visibility"] == "INVITE":
            # all the nearest users
            nearby_users = calculate_proximity_scores(
                users,
                tournament["location"]
            )
            # Sort users by proximity score in descending order
            sorted_users = [user for _, user in sorted(zip(nearby_users, users), key=lambda pair: pair[0], reverse=True)]
            # Select the top users based on the number of participants
            tournament_participants = [user["username"] for user in sorted_users[:tournament["numParticipants"]]]

        # Add the participants to the list of participants. Each registration
        # happens at least one day after the creation, and otherwise no later
        # than the start of the tournament or the current time, whichever
        # comes first.
        creation_time = datetime.fromisoformat(neo4j_administrator["creationTime"])
        registration_limit = min(datetime.fromisoformat(tournament["startingTime"]), datetime.now())
        max_registration_days = max(1, (registration_limit - creation_time).days)
        for participant in tournament_participants:
            neo4j_participants.append({
                "tournament": tournament["id"],
                "participant": participant,
                "registrationTime": (creation_time + timedelta(days=random.randint(1, max_registration_days))).isoformat()
            })

        ##################### NEO4J WINNERS #####################

        # Set the winner of the tournament; a winner can only be drawn when
        # the tournament actually has participants.
        if status == "FINISHED" and tournament_participants:
            tournament["winner"] = random.choice(tournament_participants)
            # Add the winner to the list of winners
            neo4j_winners.append({
                "tournament": tournament["id"],
                "winner": tournament["winner"],
                "winnerTime": ((datetime.fromisoformat(tournament["startingTime"])) + timedelta(days=random.randint(1, 30))).isoformat()
            })

        # Append the tournament to the list of tournaments.
        tournaments.append(tournament)

100%|██████████| 2000/2000 [01:39<00:00, 20.03it/s]


In [80]:
# Write every generated collection to its file and log how many records were saved.
# Each entry is (records, destination file, label used in the log line).
outputs = [
    (tournaments, TOURNAMENTS_JSON, "tournaments"),
    (neo4j_tournaments, NEO4J_TOURNAMENTS_JSON, "tournaments"),
    (neo4j_game_tournaments, NEO4J_GAME_TOURNAMENTS_JSON, "game-tournament relationships"),
    (neo4j_participants, NEO4J_PARTICIPANTS_JSON, "participants"),
    (neo4j_winners, NEO4J_WINNERS_JSON, "winners"),
    (neo4j_administrators, NEO4J_ADMINISTRATORS_JSON, "administrators"),
]
for records, destination, label in outputs:
    save_to_json(records, destination)
    logger.info(f"Saved {len(records)} {label}.")

2025-01-21 11:06:42,915 - INFO - Saved 20006 tournaments.
2025-01-21 11:06:43,666 - INFO - Saved 20006 tournaments.
2025-01-21 11:06:43,868 - INFO - Saved 20006 game-tournament relationships.
2025-01-21 11:06:52,067 - INFO - Saved 531808 participants.
2025-01-21 11:06:52,312 - INFO - Saved 9799 winners.
2025-01-21 11:06:52,652 - INFO - Saved 20006 administrators.
