# 1. games.json / reviews.json

Requirements

In [1]:
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


Imports

In [2]:
import os
import time

import pandas as pd
import numpy as np

import json
from tqdm import tqdm
import logging

import uuid
import random
from datetime import datetime, timedelta

Logging configuration

In [3]:
# Configure the root logger once for the whole notebook: INFO and above,
# each record prefixed with its timestamp and severity.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Every cell below logs through the root logger.
logger = logging.getLogger()

Constants

In [4]:
# Directory layout, relative to the notebook's working directory
DATA_DIR = os.path.join(os.pardir, "data")

# Inputs and intermediate artifacts
RAW_DIR = os.path.join(DATA_DIR, "raw")
TEMP_DIR = os.path.join(DATA_DIR, "temp")

# Cleaned outputs, one sub-directory per target database
_CLEAN_DIR = os.path.join(DATA_DIR, "clean")
MONGO_DIR = os.path.join(_CLEAN_DIR, "mongo")
NEO4J_DIR = os.path.join(_CLEAN_DIR, "neo4j")

In [5]:
# Create every working directory up front; directories that already exist
# are left untouched.
for _directory in (DATA_DIR, TEMP_DIR, RAW_DIR, MONGO_DIR, NEO4J_DIR):
    os.makedirs(_directory, exist_ok=True)

In [6]:
# Despite the name this is a calendar year: it is the floor applied to a game's
# release year before that year is passed to generate_upload_date, so no
# generated upload date falls before this year.
MIN_UPLOAD_TIME = 2000

In [7]:
# Files
# Intermediate artifacts in TEMP_DIR: the user map is read by this notebook,
# while the birth dates and recent reviews are written by it.
MAP_FILE = os.path.join(TEMP_DIR, "user_map.json")
BIRTH_DATE_FILE = os.path.join(TEMP_DIR, "birth_dates.json")
RECENT_REVIEWS = os.path.join(TEMP_DIR, "recent_reviews.json")

# Raw inputs read by this notebook.
BOARD_GAMES_FILE = os.path.join(RAW_DIR, "boardgames&reviews.json")
USERS_FILE = os.path.join(RAW_DIR, "users.json")

# Cleaned outputs written by this notebook (MongoDB and Neo4j directories).
GAMES_JSON = os.path.join(MONGO_DIR, "games.json")
REVIEWS_JSON = os.path.join(MONGO_DIR, "reviews.json")
NEO4J_GAMES_JSON = os.path.join(NEO4J_DIR, "games.json")

Utility functions

In [8]:
def save_to_json(data, filename):
    """
    Serialize `data` as pretty-printed JSON and write it to `filename`.

    The file is opened in write mode (any previous content is replaced) and
    encoded as UTF-8. Non-ASCII characters are written as-is rather than
    escaped, and nested structures are indented by 4 spaces.

    Parameters:
    data (any): Any JSON-serializable value.
    filename (str): Path of the file to write.

    Returns:
    None
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)

In [9]:
def append_to_json(new_data, filename):
    """
    Merge `new_data` into the JSON list stored in `filename`, dropping duplicates.

    Two items are considered duplicates when their JSON serialization with
    sorted keys is identical. Items keep the position of their first
    occurrence. A missing file is treated as an empty list.

    Args:
        new_data (list): The items to add.
        filename (str): Path of the JSON file to update.
    """
    # Load what is already stored; a file that does not exist yet is fine.
    try:
        with open(filename, "r", encoding="utf-8") as src:
            stored = json.load(src)
    except FileNotFoundError:
        stored = []

    # Key each item by its canonical serialization so equal items collapse
    # into a single dictionary entry.
    merged = {}
    for entry in stored + new_data:
        merged[json.dumps(entry, sort_keys=True)] = entry

    # Rewrite the whole file with the merged list.
    with open(filename, "w", encoding="utf-8") as dst:
        json.dump(list(merged.values()), dst, ensure_ascii=False, indent=4)

## Functions

In [10]:
def generate_upload_date(year_released, popularity_score, std_dev=0.3):
    """
    Draw a random upload date for a game from its release year and popularity.

    The upload year is sampled from a normal distribution whose center lies
    between the release year and last year. A popularity of 1 centers it on
    the release year, a popularity of 0 centers it on last year. The sampled
    year is truncated to an integer and clamped to
    [max(year_released, 1), last year]; the upper bound is applied last.

    Args:
        year_released (int): The year the game was released.
        popularity_score (float): Score from 0 (least popular) to 1 (most popular).
        std_dev (float): Standard deviation, in years, of the normal distribution.

    Returns:
        datetime: A random day (at midnight) within the selected upload year.
    """
    latest_year = datetime.now().year - 1

    # The less popular the game, the further the center drifts from the
    # release year towards last year.
    drift = int((latest_year - year_released) * (1 - popularity_score))
    sampled_year = int(random.gauss(year_released + drift, std_dev))

    # Not before the release (nor before year 1, the smallest year datetime
    # accepts), and never later than last year.
    earliest_year = max(year_released, 1)
    upload_year = min(max(sampled_year, earliest_year), latest_year)

    # Pick a uniformly random day of that year (Jan 1st .. Dec 31st inclusive).
    first_day = datetime(upload_year, 1, 1)
    last_offset = (datetime(upload_year, 12, 31) - first_day).days
    return first_day + timedelta(days=random.randint(0, last_offset))

In [11]:
def generate_review_date_gamma(upload_date, shape=2, scale=30, max_years=5):
    """
    Generate a realistic review postDate using a gamma distribution.

    The review is placed a gamma-distributed number of days after the upload
    date, clamped to a window that:
      * spans 365 days for every year between the upload year and last year,
      * is at most `max_years` * 365 days long,
      * never extends past the current moment,
      * is never negative, so the review cannot precede the upload.

    Args:
        upload_date (datetime): The date the game was uploaded to the site.
        shape (float): Shape parameter of the gamma distribution (α).
        scale (float): Scale parameter of the gamma distribution (β).
        max_years (int | float | None): Maximum years after the upload date
            for a review. Pass None to disable this cap.

    Returns:
        datetime: A postDate for the review, never earlier than `upload_date`.
    """
    now = datetime.now()

    # Window length implied by the data horizon (end of last year).
    window_days = ((now.year - 1) - upload_date.year) * 365

    # Honor the documented cap on how long after upload a review may appear.
    if max_years is not None:
        window_days = min(window_days, int(max_years * 365))

    end_date = min(upload_date + timedelta(days=window_days), now)

    # A window that ends before the upload date collapses to zero days so the
    # review lands on the upload date instead of before it.
    max_offset = max((end_date - upload_date).days, 0)

    # Always draw from the gamma distribution, even when the window is empty,
    # so the random stream advances the same way for every review.
    random_days = min(int(random.gammavariate(shape, scale)), max_offset)

    return upload_date + timedelta(days=random_days)

In [12]:
def generate_birth_dates(review_dates, random_prob=0.7):
    """
    Generate plausible birth dates from the date of each user's first review.

    For every user an age is drawn at the time of the first review: with
    probability `random_prob` it is uniform in 18-40, otherwise uniform in
    41-60. The birth date is the review date minus that many 365-day years
    (leap days are ignored, so the result is approximate).

    :param review_dates: Dictionary {user: first review date}, with dates
        formatted as "%Y-%m-%dT%H:%M:%SZ".
    :param random_prob: Probability of drawing the age from the 18-40 range.
    :return: Dictionary {user: birth date formatted as "%Y-%m-%d"}.
    """
    birth_dates = {}

    for user, first_review in review_dates.items():
        # Parse the first review timestamp.
        reviewed_at = datetime.strptime(first_review, "%Y-%m-%dT%H:%M:%SZ")

        # Choose the age bracket first, then an age within that bracket.
        is_younger_bracket = random.random() < random_prob
        youngest, oldest = (18, 40) if is_younger_bracket else (41, 60)
        age = random.randint(youngest, oldest)

        # Step back `age` years of 365 days each and keep only the date part.
        born_on = reviewed_at - timedelta(days=age * 365)
        birth_dates[user] = born_on.strftime("%Y-%m-%d")

    return birth_dates

## Execution

Load the raw games (with their reviews) from `raw/boardgames&reviews.json`, the user map from `temp/user_map.json`, and the users from `raw/users.json`.

In [13]:
# Raw board games, each carrying its own list of reviews
with open(BOARD_GAMES_FILE, mode="r", encoding="utf-8") as games_file:
    raw_games = json.load(games_file)
logger.info(f"Loaded {len(raw_games)} games.")

# User map, used later to translate a raw review's user into a username
with open(MAP_FILE, mode="r", encoding="utf-8") as map_file:
    user_map = json.load(map_file)
logger.info(f"Loaded user map with {len(user_map)} users.")

# User records, used later to attach a location to each review
with open(USERS_FILE, mode="r", encoding="utf-8") as users_file:
    raw_users = json.load(users_file)
logger.info(f"Loaded {len(raw_users)} users.")

2025-01-20 12:13:38,883 - INFO - Loaded 2120 games.
2025-01-20 12:13:39,037 - INFO - Loaded user map with 121274 users.
2025-01-20 12:13:44,138 - INFO - Loaded 5000 users.


In [14]:
# Build, in a single pass over the raw games:
#   * games      -> full game documents for MongoDB
#   * neo4jgames -> reduced game nodes for Neo4j
#   * reviews    -> review documents for MongoDB (rated reviews only)
games = []
neo4jgames = []
reviews = []

# Index the users by name once, so each review resolves its author's location
# with a dictionary lookup instead of rescanning the whole user list.
# setdefault keeps the first record seen for a name, which is the record a
# first-match linear search would have returned.
users_by_name = {}
for raw_user in raw_users:
    users_by_name.setdefault(raw_user["name"], raw_user)

for raw_game in tqdm(raw_games):

    # Reviews whose rating is "N/A" are ignored for both the rating
    # statistics and the review documents.
    rated_reviews = [r for r in raw_game["reviews"] if r["rating"] != "N/A"]

    ########################## MONGO BOARDGAME ##############################

    game = {
        "id": str(uuid.uuid4()),

        "name": raw_game["name"],
        # Negative release years are stored as 0
        "yearReleased": max(raw_game["yearReleased"], 0),
        "uploadTime": None,

        "description": raw_game["description"],
        "shortDescription": raw_game["short_description"],

        "averageRating": None,
        "ratingVoters": None,

        "minPlayers": raw_game["minPlayers"],
        "maxPlayers": raw_game["maxPlayers"],
        "minSuggAge": raw_game["minSuggAge"],
        "minPlaytime": raw_game["minPlayTime"],
        "maxPlaytime": raw_game["maxPlayTime"],

        "designers": raw_game["designers"],
        "artists": raw_game["artists"],
        "publishers": raw_game["publishers"],

        "categories": raw_game["categories"],

        "mechanics": raw_game["mechanisms"],
        "family": raw_game["family"]
    }

    # Generate average rating and rating voters
    ratings = [rated_review["rating"] for rated_review in rated_reviews]
    game["ratingVoters"] = len(ratings)
    game["averageRating"] = sum(ratings) / game["ratingVoters"] if game["ratingVoters"] > 0 else None

    # Generate upload time. The popularity passed in is the average rating
    # divided by 10 (0 for unrated games), and the release year is floored at
    # MIN_UPLOAD_TIME.
    popularity = game["averageRating"] / 10 if game["averageRating"] is not None else 0
    upload_date = generate_upload_date(max(game["yearReleased"], MIN_UPLOAD_TIME), popularity)
    game["uploadTime"] = upload_date.isoformat() + 'Z'

    # Append to games list
    games.append(game)

    ######################### NEO4J BOARDGAME ##############################

    neo4jgame = {
        "id": game["id"],
        "name": game["name"],
        "yearReleased": game["yearReleased"],
        "shortDescription": game["shortDescription"],
        "categories": game["categories"]
    }

    # Append to neo4j games list
    neo4jgames.append(neo4jgame)

    ######################### MONGO REVIEWS ##############################

    for raw_review in rated_reviews:
        review = {
            "id": str(uuid.uuid4()),

            "postDate": None,
            "author": {
                "username": user_map[raw_review["user"]],
                "birthDate": None
            },

            "location": {
                "city": None,
                "stateOrProvince": None,
                "country": None
            },

            "game": {
                "id": game["id"],
                "name": game["name"],
                "yearReleased": game["yearReleased"],
                "shortDescription": game["shortDescription"]
            },

            "rating": raw_review["rating"],
            "content": raw_review["comment"]
        }

        # Generate post date, reusing the upload datetime computed above
        # instead of parsing it back from its ISO string for every review
        review["postDate"] = generate_review_date_gamma(upload_date).isoformat() + 'Z'

        # Insert author location (left as None when the author is unknown)
        user = users_by_name.get(review["author"]["username"])
        if user:
            review["location"]["city"] = user.get("city", None)
            review["location"]["stateOrProvince"] = user.get("stateorprovince", None)
            review["location"]["country"] = user.get("country", None)

        # Append to reviews list
        reviews.append(review)

100%|██████████| 2120/2120 [06:40<00:00,  5.30it/s]


In [15]:
# Ensure any user wrote at most one review per game: only the first review
# found for each (game, author) pair is kept.
#
# The per-game rating count and rating sum are accumulated from the kept
# reviews only, so that statistics derived from them describe exactly the
# reviews that are saved and no user is counted twice as a voter.
unique_reviews = []
game_partial_num = {}
game_partial_sum = {}
seen_reviews = set()
for review in reviews:
    game_id = review["game"]["id"]
    author = review["author"]["username"]
    if (game_id, author) in seen_reviews:
        # Duplicate: drop it and leave the game's statistics untouched
        continue
    seen_reviews.add((game_id, author))
    unique_reviews.append(review)
    game_partial_num[game_id] = game_partial_num.get(game_id, 0) + 1
    game_partial_sum[game_id] = game_partial_sum.get(game_id, 0) + review["rating"]
reviews = unique_reviews

In [16]:
# Collect, for each user, a compact summary of their 3 most recent reviews
recent_reviews = {}
for review in reviews:
    user = review["author"]["username"]
    post_date = review["postDate"]
    reviewed_game = review["game"]

    # Reduced copy of the review: no author, no location, and no short
    # description for the game
    summary = {
        "id": review["id"],
        "postDate": post_date,
        "game": {
            "id": reviewed_game["id"],
            "name": reviewed_game["name"],
            "yearReleased": reviewed_game["yearReleased"]
        },
        "rating": review["rating"],
        "content": review["content"]
    }

    # Add the summary to what the user already has, order newest first by
    # postDate, and keep the top 3
    candidates = recent_reviews.get(user, []) + [summary]
    candidates.sort(key=lambda entry: entry["postDate"], reverse=True)
    recent_reviews[user] = candidates[:3]

In [17]:
# Refresh each game's voter count and average rating from the per-game totals;
# a game with no voters gets an average of None
for game in games:
    voters = game_partial_num.get(game["id"], 0)
    game["ratingVoters"] = voters
    if voters > 0:
        game["averageRating"] = game_partial_sum.get(game["id"], 0) / voters
    else:
        game["averageRating"] = None

In [18]:
# Earliest postDate per user. The dates are compared as strings, which works
# because they all share the same ISO layout.
user_first_review = {}
for review in reviews:
    user = review["author"]["username"]
    post_date = review["postDate"]
    if user in user_first_review:
        user_first_review[user] = min(user_first_review[user], post_date)
    else:
        user_first_review[user] = post_date

# Derive a birth date for every user from their first review
birth_dates = generate_birth_dates(user_first_review)

# Copy each author's birth date into all of their reviews
for review in reviews:
    author_info = review["author"]
    author_info["birthDate"] = birth_dates[author_info["username"]]

In [19]:
# Persist every artifact. A collection is written only when it is non-empty,
# and "Saved ..." is logged only when a file was actually written; an empty
# collection produces a warning instead. Each variable is reset to None once
# handled so the data can be released from memory.

if recent_reviews:
    save_to_json(recent_reviews, RECENT_REVIEWS)
    logger.info(f"Saved {len(recent_reviews)} recent reviews to {RECENT_REVIEWS}.")
else:
    logger.warning(f"No recent reviews to save; {RECENT_REVIEWS} was not written.")
recent_reviews = None

if birth_dates:
    save_to_json(birth_dates, BIRTH_DATE_FILE)
    logger.info(f"Saved {len(birth_dates)} birth dates to {BIRTH_DATE_FILE}.")
else:
    logger.warning(f"No birth dates to save; {BIRTH_DATE_FILE} was not written.")
birth_dates = None

if games:
    save_to_json(games, GAMES_JSON)
    logger.info(f"Saved {len(games)} games to {GAMES_JSON}.")
else:
    logger.warning(f"No games to save; {GAMES_JSON} was not written.")
games = None

if neo4jgames:
    save_to_json(neo4jgames, NEO4J_GAMES_JSON)
    logger.info(f"Saved {len(neo4jgames)} games to {NEO4J_GAMES_JSON}.")
else:
    logger.warning(f"No games to save; {NEO4J_GAMES_JSON} was not written.")
neo4jgames = None

if reviews:
    save_to_json(reviews, REVIEWS_JSON)
    logger.info(f"Saved {len(reviews)} reviews to {REVIEWS_JSON}.")
else:
    logger.warning(f"No reviews to save; {REVIEWS_JSON} was not written.")
reviews = None

2025-01-20 12:20:31,245 - INFO - Saved 5000 recent reviews to ..\data\temp\recent_reviews.json.
2025-01-20 12:20:31,279 - INFO - Saved 5000 birth dates to ..\data\temp\birth_dates.json.
2025-01-20 12:20:31,604 - INFO - Saved 2120 games to ..\data\clean\mongo\games.json.
2025-01-20 12:20:31,684 - INFO - Saved 2120 games to ..\data\clean\neo4j\games.json.
2025-01-20 12:22:31,519 - INFO - Saved 675226 reviews to ..\data\clean\mongo\reviews.json.
