# 2. users.json

Requirements

In [21]:
# Install the packages listed in ../requirements.txt into the kernel's environment (-q reduces pip's output).
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


Imports

In [22]:
import os
import time

import pandas as pd
import numpy as np

import json
from tqdm import tqdm
import logging

import uuid
import random
from datetime import datetime, timedelta

import string
import secrets
import hashlib

Logging configuration

In [23]:
# Root logger at INFO level; every record is stamped with its time and level.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger()  # no name given, so this is the root logger

Constants

In [24]:
# Directory layout, relative to the notebook's working directory
DATA_DIR = os.path.join(os.pardir, "data")

# Inputs: raw files and intermediate artefacts
RAW_DIR = os.path.join(DATA_DIR, "raw")
TEMP_DIR = os.path.join(DATA_DIR, "temp")

# Outputs: one sub-directory of "clean" per destination
_CLEAN_DIR = os.path.join(DATA_DIR, "clean")
MONGO_DIR = os.path.join(_CLEAN_DIR, "mongo")
NEO4J_DIR = os.path.join(_CLEAN_DIR, "neo4j")

In [25]:
# Make sure every working directory exists (a no-op for those already present)
for directory in (DATA_DIR, TEMP_DIR, RAW_DIR, MONGO_DIR, NEO4J_DIR):
    os.makedirs(directory, exist_ok=True)

In [26]:
# File paths

# Raw inputs
USERS_FILE = os.path.join(RAW_DIR, "users.json")
BOARD_GAMES_FILE = os.path.join(RAW_DIR, "boardgames&reviews.json")

# Intermediate files kept in the temp directory
RECENT_REVIEWS = os.path.join(TEMP_DIR, "recent_reviews.json")
BIRTH_DATE_FILE = os.path.join(TEMP_DIR, "birth_dates.json")
MAP_FILE = os.path.join(TEMP_DIR, "user_map.json")

# Cleaned outputs, one per output directory
NEO4J_USERS_JSON = os.path.join(NEO4J_DIR, "users.json")
MONGO_USERS_JSON = os.path.join(MONGO_DIR, "users.json")

Utility functions

In [27]:
def save_to_json(data, filename):
    """
    Save data to a JSON file.

    The data is serialized in memory before the file is opened for writing, so
    a value that cannot be encoded raises without truncating or partially
    overwriting a file that already exists.

    Parameters:
    data (any): The data to be saved to the JSON file. This can be any data type that is serializable to JSON.
    filename (str): The name of the file where the data will be saved.

    Returns:
    None

    Raises:
    TypeError: If data contains a value that is not JSON serializable.
    OSError: If the file cannot be opened or written.
    """
    # Encode first: opening with "w" truncates the target immediately, so any
    # encoding error must surface before that point.
    serialized = json.dumps(data, ensure_ascii=False, indent=4)

    with open(filename, "w", encoding="utf-8") as f:
        f.write(serialized)

In [28]:
def append_to_json(new_data, filename):
    """
    Add new items to a JSON file that holds a list, skipping duplicates.

    Two items count as duplicates when their JSON serializations with sorted
    keys are equal, so dicts that differ only in key order are merged. Items
    keep the position of their first occurrence. A missing file is treated as
    an empty list.

    Parameters:
    new_data (iterable): The items to add. Any iterable is accepted (list, tuple, generator, ...).
    filename (str): The name of the JSON file.

    Returns:
    None

    Raises:
    TypeError: If the file does not contain a JSON list, or an item is not JSON serializable.
    OSError: If the file cannot be read or written.
    """
    # Read existing data from the file.
    try:
        with open(filename, "r", encoding="utf-8") as f:
            existing_data = json.load(f)
    except FileNotFoundError:
        existing_data = []

    # Materialise new_data so tuples and generators can be concatenated too.
    all_items = existing_data + list(new_data)

    # Key each item by its canonical serialization to drop duplicates while
    # preserving first-seen order (dicts keep insertion order).
    unique_items = {json.dumps(item, sort_keys=True): item for item in all_items}

    # Encode before opening for write so a failure cannot truncate the file.
    serialized = json.dumps(list(unique_items.values()), ensure_ascii=False, indent=4)

    # Save the combined data back to the file.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(serialized)

## Functions

In [29]:
def generate_random_password(length=12):
    """
    Generate a random password made of letters, digits and special symbols.

    Each character is drawn independently with `secrets.choice`.
    """
    alphabet = "".join((string.ascii_letters, string.digits, "!@#$%^&*()-_=+<>?"))

    picked = []
    for _ in range(length):
        picked.append(secrets.choice(alphabet))

    return "".join(picked)

In [30]:
def hash_password(password):
    """
    Hash a password with SHA-256 and return the digest as a hexadecimal string.

    The password is encoded as UTF-8 before hashing.
    """
    # NOTE(review): this is a single unsalted SHA-256 pass; confirm that is the
    # format the consumer of these records expects before changing it.
    digest = hashlib.sha256()
    digest.update(password.encode("utf-8"))
    return digest.hexdigest()

## Execution

In [31]:
def _load_json(path, label):
    """
    Read a UTF-8 JSON file, log how many top-level entries it holds, and return the parsed data.

    Parameters:
    path (str): The JSON file to read.
    label (str): Plural noun used in the log message (e.g. "users").

    Returns:
    any: The parsed JSON content.
    """
    with open(path, "r", encoding="utf-8") as f:
        loaded = json.load(f)
    logger.info(f"Loaded {len(loaded)} {label}.")
    return loaded


# Every input is loaded and logged the same way.
raw_users = _load_json(USERS_FILE, "users")
user_map = _load_json(MAP_FILE, "user mappings")
birth_dates = _load_json(BIRTH_DATE_FILE, "birth dates")
recent_reviews = _load_json(RECENT_REVIEWS, "recent reviews")

2025-01-20 12:35:34,774 - INFO - Loaded 5000 users.
2025-01-20 12:35:34,888 - INFO - Loaded 121274 user mappings.
2025-01-20 12:35:34,909 - INFO - Loaded 5000 birth dates.


In [32]:
EMAIL_DOMAINS = ["gmail.com", "yahoo.com", "hotmail.com", "outlook.com"]

# Generated password lengths cycle through MIN_PASSWORD_LENGTH .. MIN_PASSWORD_LENGTH + 7
# according to the user's position in the input.
MIN_PASSWORD_LENGTH = 8
PASSWORD_LENGTH_SPREAD = 8

users = []
neo4j_users = []

for user_index, raw_user in enumerate(tqdm(raw_users)):
    username = raw_user["name"]
    first_name = raw_user["firstname"]
    last_name = raw_user["lastname"]

    # Location fields are optional in the raw record; absent ones become None.
    location = {
        "city": raw_user.get("city"),
        "stateOrProvince": raw_user.get("stateorprovince"),
        "country": raw_user.get("country"),
    }

    ########################## MONGO USERS ##############################

    user = {
        "id": str(uuid.uuid4()),

        "username": username,

        "firstName": first_name,
        "lastName": last_name,

        # The running index keeps addresses distinct for users sharing a name.
        "email": f"{first_name.lower()}.{last_name.lower()}{user_index}@{random.choice(EMAIL_DOMAINS)}",
        # Only the hash is stored; the generated plaintext is discarded.
        "password": hash_password(
            generate_random_password(MIN_PASSWORD_LENGTH + user_index % PASSWORD_LENGTH_SPREAD)
        ),

        # Raises KeyError if the user has no entry in birth_dates.
        "birthDate": birth_dates[username],

        "location": location,

        # Placeholders, left empty by this notebook.
        "followers": None,
        "following": None,
        "tournaments": {
            "participated": None,
            "won": None,
            "created": None,
        },

        # Raises KeyError if the user has no entry in recent_reviews.
        "mostRecentReviews": recent_reviews[username],

        "role": "ROLE_USER",
    }

    # Append to user list
    users.append(user)

    ######################## NEO4J USERS ###########################

    neo4j_user = {
        "username": username,
        # Copied so the two records do not share one mutable dict.
        "location": dict(location),
    }

    # Append to neo4j user list
    neo4j_users.append(neo4j_user)

100%|██████████| 5000/5000 [00:00<00:00, 19305.51it/s]


In [33]:
# Save users to JSON. A save is reported only when a file was actually written;
# the else branch also covers a re-run of this cell after the list was released.
if users:
    save_to_json(users, MONGO_USERS_JSON)
    logger.info(f"Saved {len(users)} users to {MONGO_USERS_JSON}.")
else:
    logger.warning(f"No users to save to {MONGO_USERS_JSON}.")
users = None  # drop the reference so the list can be garbage-collected

# Save neo4j users to JSON, with the same guard as above.
if neo4j_users:
    save_to_json(neo4j_users, NEO4J_USERS_JSON)
    logger.info(f"Saved {len(neo4j_users)} users to {NEO4J_USERS_JSON}.")
else:
    logger.warning(f"No users to save to {NEO4J_USERS_JSON}.")
neo4j_users = None  # drop the reference so the list can be garbage-collected

2025-01-20 12:35:36,658 - INFO - Saved 5000 users to ..\data\clean\mongo\users.json.
2025-01-20 12:35:36,779 - INFO - Saved 5000 users to ..\data\clean\neo4j\users.json.
