# 4. Followers and Likes

Requirements

In [1]:
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


Imports

In [2]:
import os
import time

import pandas as pd
import numpy as np

import json
from tqdm import tqdm
import logging

from bs4 import BeautifulSoup
import re

import uuid
import random
from datetime import datetime, timedelta

import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter

from joblib import Parallel, delayed

Logging configuration

In [3]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger()

Constants

In [4]:
# Repositories
DATA_DIR = os.path.join("..", "data")
TEMP_DIR = os.path.join(DATA_DIR, "temp")
RAW_DIR = os.path.join(DATA_DIR, "raw")
MONGO_DIR = os.path.join(DATA_DIR, "clean", "mongo")
NEO4J_DIR = os.path.join(DATA_DIR, "clean", "neo4j")

In [5]:
# Repos creation
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(TEMP_DIR, exist_ok=True)
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(MONGO_DIR, exist_ok=True)
os.makedirs(NEO4J_DIR, exist_ok=True)

In [14]:
# Files
MAP_FILE = os.path.join(TEMP_DIR, "user_map.json")
RECENT_REVIEWS = os.path.join(TEMP_DIR, "recent_reviews.json")

BOARD_GAMES_FILE = os.path.join(RAW_DIR, "boardgames&reviews.json")
RAW_USERS_FILE = os.path.join(RAW_DIR, "users.json")

GAMES_JSON = os.path.join(MONGO_DIR, "games.json")
THREADS_JSON = os.path.join(MONGO_DIR, "threads.json")
REVIEWS_JSON = os.path.join(MONGO_DIR, "reviews.json")
USERS_JSON = os.path.join(MONGO_DIR, "users.json")

NEO4J_USERS = os.path.join(NEO4J_DIR, "users.json")
NEO4J_LIKES = os.path.join(NEO4J_DIR, "likes.json")
NEO4J_FOLLOWERS = os.path.join(NEO4J_DIR, "followers.json")

Utility functions

In [7]:
def save_to_json(data, filename):
    """
    Save data to a JSON file.

    Parameters:
    data (any): The data to be saved to the JSON file. This can be any data type that is serializable to JSON.
    filename (str): The name of the file where the data will be saved.

    Returns:
    None
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [8]:
def append_to_json(new_data, filename):
    """
    Adds new data to an existing JSON file without duplicates.

        new_data (list): The new data to add.
        filename (str): The name of the JSON file.
    """
    # Read existing data from the file.
    try:
        with open(filename, "r", encoding="utf-8") as f:
            existing_data = json.load(f)
    except FileNotFoundError:
        existing_data = []

    # Combine the existing data with the new data and remove duplicates.
    combined_data = {json.dumps(item, sort_keys=True): item for item in existing_data + new_data}.values()

    # Save the combined data back to the file.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(list(combined_data), f, ensure_ascii=False, indent=4)

## Functions

In [9]:
def game_similarity(user1, user2, users):
    likes1 = set(users[user1]["likes"])
    likes2 = set(users[user2]["likes"])
    if not likes1 or not likes2:
        return 0
    return len(likes1 & likes2) / len(likes1 | likes2)

In [10]:
def review_similarity(user1, user2, users):
    reviews1 = set(users[user1]["reviews"])
    reviews2 = set(users[user2]["reviews"])
    if not reviews1 and not reviews2:
        return 0
    return len(reviews1 & reviews2) / len(reviews1 | reviews2)

In [11]:
def thread_similarity(user1, user2, users):
    threads1 = set(users[user1]["threads"])
    threads2 = set(users[user2]["threads"])
    if not threads1 and not threads2:
        return 0
    return len(threads1 & threads2) / len(threads1 | threads2)

In [12]:
def game_thread_similarity(user1, user2, users):
    game_threads1 = set(users[user1]["game_threads"])
    game_threads2 = set(users[user2]["game_threads"])
    if not game_threads1 and not game_threads2:
        return 0
    return len(game_threads1 & game_threads2) / len(game_threads1 | game_threads2)

In [13]:
def combined_similarity(user1, user2, users, weights=(0.4, 0.3, 0.2, 0.1)):
    w_games, w_reviews, w_threads, w_game_threads = weights
    return (
        w_games * game_similarity(user1, user2, users) +
        w_reviews * review_similarity(user1, user2, users) +
        w_threads * thread_similarity(user1, user2, users) +
        w_game_threads * game_thread_similarity(user1, user2, users)
    )

In [14]:
def calculate_edges(user1, users, threshold=0.5):
    local_edges = []
    for user2 in users.keys():
        if user1 != user2:
            prob = combined_similarity(user1, user2, users)
            if prob > threshold and random.random() < prob:
                local_edges.append((user1, user2))
    return local_edges

## Execution

In [29]:
with open(BOARD_GAMES_FILE, "r", encoding="utf-8") as f:
    raw_games = [ 
        {
            "id": int(raw_game["id"]), 
            "name": raw_game["name"],
            "yearReleased": raw_game["yearReleased"] if raw_game["yearReleased"] >= 0 else 0
        } 
        for raw_game in json.load(f)
    ]
with open(GAMES_JSON, "r", encoding="utf-8") as f:
    clean_games = [ 
        {
            "id": game["id"], 
            "name": game["name"],
            "yearReleased": game["yearReleased"]
        } 
        for game in json.load(f)
    ]
game_dict = {}
for raw_game in raw_games:
    for clean_game in clean_games:
        if raw_game["name"] == clean_game["name"] and raw_game["yearReleased"] == clean_game["yearReleased"]:
            game_dict[raw_game["id"]] = clean_game
            break
logger.info(f"Loaded {len(game_dict)} games.")

2025-01-21 02:15:50,377 - INFO - Loaded 2000 games.


In [36]:
with open(NEO4J_USERS, "r", encoding="utf-8") as f:
    users = [user["username"] for user in json.load(f)]
    user_objects = {user: {"reviews": [], "likes": [], "threads": [], "game_threads": []} for user in users}
logger.info(f"Loaded {len(users)} users from {NEO4J_USERS}")
users = None
logger.info(f"Loading user data...")

with open(REVIEWS_JSON, "r", encoding="utf-8") as f:
    reviews = json.load(f)
for review in reviews:
    user_objects[review["author"]["username"]]["reviews"].append(review["game"]["id"])
reviews = None
logger.info(f"Loaded reviews.")

with open(THREADS_JSON, "r", encoding="utf-8") as f:
    threads = json.load(f)
for thread in threads:
    for message in thread["messages"]:
        user_objects[message["author"]]["threads"].append(thread["id"])
        user_objects[message["author"]]["game_threads"].append(thread["game"]["id"])
threads = None
logger.info(f"Loaded threads.")

with open(MAP_FILE, "r", encoding="utf-8") as f:
    user_map = json.load(f)
with open(RAW_USERS_FILE, "r", encoding="utf-8") as f:
    raw_users = json.load(f)
for raw_user in raw_users:
    for like in raw_user["collection"]:
        if int(like["id"]) in game_dict:
            user_objects[user_map[raw_user["name"]]]["likes"].append(game_dict[int(like["id"])]["id"])
raw_users = None
logger.info(f"Loaded likes.")

2025-01-21 02:29:44,606 - INFO - Loaded 5000 users from ..\data\clean\neo4j\users.json
2025-01-21 02:29:44,607 - INFO - Loading user data...
2025-01-21 02:30:12,751 - INFO - Loaded reviews.
2025-01-21 02:30:19,376 - INFO - Loaded threads.
2025-01-21 02:30:26,075 - INFO - Loaded likes.


In [37]:
print(user_objects["ska_dad"]["likes"])
users = user_objects

['bd4e151b-fc86-4df9-8ff8-6d7bc71aa694', '037713e2-2488-4e92-827c-0c335610f899', '6db29df3-b110-4e56-9a0d-f8b3b0a4153d', 'c79e2dcd-d599-4fb5-a4af-40abffa3ee5e', 'c1db8d08-8689-44ee-8ba1-b0b2cb9d969f', '1409b765-e3d2-4e94-b818-1e4cd7e8d69e', '1c7abe5b-5cd8-44a7-852c-d681c4289bc3', 'fe1849b5-573c-4c03-8268-f8592a77bdae', '552dbc0b-85a5-4f8f-8c28-85b59c574a67', '2fdd8d09-f43b-471c-81b8-e2876bfe78f6', 'f72739c7-6127-4e2c-ab31-ea94c830bc3f', '1f76d06d-e494-41a3-8cc3-12a68704e6be', '59ac80d2-2038-43a7-8a8a-31b39d4de38b', '3c432fe4-895f-4d04-b7e2-a54c5be6ced0', '57569489-dacd-4696-9866-c789ec542935', '9dec7c90-4671-44b2-935e-2971661da450', 'cfb1602c-f72b-4f12-b3c8-fcafa163d381', '0a94bf5e-56b4-46fa-aead-3607e691fe77', 'c0978497-ede6-493d-a9e0-1219654c6ca2', 'fc65d705-fac2-4285-a366-21005ed1e304', '77b4c92c-efbc-4832-90e4-425bd47eabd0', '985444d7-b88e-4915-ae90-be4404f0878b', '9a81f536-9ca6-4425-8ff1-1ce6eeede89e', '6b5ec8b2-aac7-45b2-84f1-2d3de1b62497', 'e15eca6a-9301-4057-b2e6-8ce9d2f94d0e',

In [18]:
# Similarity threshold
threshold = 0.5
edges = []

for user1 in tqdm(users.keys(), desc="Finding edges"):
    for user2 in users.keys():
        if user1 != user2:
            prob = combined_similarity(user1, user2, users)
            if prob > threshold and random.random() < prob:
                edges.append((user1, user2))  # user1 segue user2

Finding edges:   0%|          | 0/5000 [00:00<?, ?it/s]

Finding edges: 100%|██████████| 5000/5000 [51:37<00:00,  1.61it/s]  


In [19]:
influencers = sorted(users.keys(), key=lambda u: (
    len(users[u]["reviews"]) + len(users[u]["threads"])
), reverse=True)[:50]  # Top 50 influencer

# Aggiungi più follower per gli influencer
for influencer in influencers:
    for user in users.keys():
        if user != influencer and random.random() < 0.8:
            edges.append((user, influencer))

In [20]:
reciprocal_probability = 0.3

for edge in edges.copy():
    if random.random() < reciprocal_probability:
        edges.append((edge[1], edge[0]))  # Relazione inversa

In [21]:
print("Nodes:", len(users))
print("Edges:", len(edges))

Nodes: 5000
Edges: 259674


In [22]:
in_degrees = Counter(edge[1] for edge in edges)
out_degrees = Counter(edge[0] for edge in edges)

print("Follower distribution:", in_degrees.most_common(10))
print("Following distribution:", out_degrees.most_common(10))

Follower distribution: [('Base the Bass', 4074), ('dasfungames', 4068), ('RGG_Ken', 4066), ('Elio19', 4052), ('garbagerunner', 4047), ('Pembrose', 4041), ('bnordeng', 4039), ('Stephilmike', 4039), ('gmbalbee', 4036), ('DrSly', 4035)]
Following distribution: [('Stephilmike', 1328), ('RGG_Ken', 1322), ('joespleen', 1281), ('JFAudy', 1277), ('Hellman1001', 1274), ('JJazz', 1273), ('MisuVir', 1272), ('curtc', 1267), ('garbagerunner', 1266), ('hollowdeathcult', 1265)]


In [31]:
with open(USERS_JSON, "r", encoding="latin-1") as f:
    birth_dates = json.load(f)
birth_dates = {birth["username"]: (datetime.strptime(birth["birthDate"], "%Y-%m-%d") + timedelta(days=18*365)).isoformat() for birth in birth_dates}

In [33]:
if edges:
    with open(NEO4J_FOLLOWERS, "w") as f:
        json.dump([
            {
                "follower": edge[0], 
                "followed": edge[1],
                "since": random.choice(pd.date_range(
                    min(birth_dates[edge[0]], birth_dates[edge[1]]),
                    max(birth_dates[edge[0]], birth_dates[edge[1]]))).isoformat(),
            } 
            for edge in edges], 
            f, ensure_ascii=False, indent=4)
logger.info(f"Saved {len(edges)} edges to {NEO4J_FOLLOWERS}")

2025-01-21 02:19:49,123 - INFO - Saved 259674 edges to ..\data\clean\neo4j\followers.json


In [42]:
likes = []
release_dict = {game_info["id"]: game_info["yearReleased"] for game_id, game_info in game_dict.items()}
for user in users.keys():
    for like in users[user]["likes"]:
        likes.append(
            {
                "user": user, 
                "game": like,
                "timestamp": random.choice(pd.date_range(
                    release_dict[like], 
                    datetime.now())).isoformat()
            }
        )
if likes:
    with open(NEO4J_LIKES, "w") as f:
        json.dump(likes, f, ensure_ascii=False, indent=4)
logger.info(f"Saved {len(likes)} likes to {NEO4J_LIKES}")

2025-01-21 02:41:34,490 - INFO - Saved 814939 likes to ..\data\clean\neo4j\likes.json
