In [17]:
import polars as pl

In [18]:
COMMENTS_PATH = '/Users/martinctl/Developer/EPFL/MA1/ada/project-data/gaming_comments.parquet'
COMMENTS_PATH_GAMES = '/Users/martinctl/Developer/EPFL/MA1/ada/project-data/gaming_comments_with_games.parquet'

# videos_df = pl.read_csv('../data/youniverse/filtered/gaming_videos_with_games.tsv', separator='\t')
# comments_df = pl.read_csv('../data/youniverse/filtered/gaming_comments.tsv', separator='\t')
videos_df = pl.read_parquet('/Users/martinctl/Developer/EPFL/MA1/ada/project-data/gaming_videos_with_games.parquet')

In [19]:
comments_df = pl.scan_parquet(COMMENTS_PATH)
comments_df.head(5).collect()

author,video_id
i64,str
2,"""9pQILRT42Cg"""
2,"""PWWRzCyuiFU"""
5,"""9MuGpmXGlsY"""
5,"""UvZPbfUkMGw"""
11,"""qj9sjQjQ19M"""


In [20]:
games_count = videos_df.group_by('video_game').agg(pl.count('video_game').alias('count')).sort('count', descending=True)
games_count

video_game,count
str,u32
"""minecraft""",766154
"""fortnite""",377337
"""call of duty""",368658
"""league of legends""",236065
"""roblox""",205762
…,…
"""pokémon unite""",2
"""the great ace attorney adventu…",2
"""road 96""",1
"""triangle strategy""",1


In [21]:
# Get only the display_id and video_game columns
joined_df = videos_df.select([
    pl.col('display_id'),
    pl.col('video_game'),
])
joined_df

# Create a set of mappings from display_id to video_game
df = joined_df.to_pandas()
videos = pl.Series('video_id', joined_df.select(pl.col('display_id')))
display_id_to_game = dict(zip(df['display_id'], df['video_game']))

def get_game(display_id: str):
    return display_id_to_game.get(display_id, None)

In [22]:
# import random
# import time

# random.seed(42)
# random_display_id = random.sample(list(display_id_to_game.keys()), 1_000_000)

# start = time.time()
# for display_id in random_display_id:
#     get_game(display_id)
    
# print((time.time() - start) * 2000, 's')

In [23]:
comments_valid = comments_df.filter(pl.col('video_id').is_in(videos))

comments_game = comments_valid.with_columns(pl.col('video_id').map_elements(get_game, return_dtype=str))

# # We can now save the comments with the game
# comments_game.sink_parquet('/Users/martinctl/Developer/EPFL/MA1/ada/project-data/gaming_comments_with_games.parquet')

comments_game_df = (pl.scan_parquet(COMMENTS_PATH_GAMES, n_rows=100_000_000)).unique()

In [25]:
from itertools import combinations
from tqdm import tqdm

# ! Take 30 minutes to run

def get_edges(comments):
    rows = comments.shape[0]
    
    curr_author = None
    games = set()
    edges = {}
    authors = 0

    for com in tqdm(comments.iter_rows(), total=rows):
        author, game = com
        if author != curr_author:
            curr_author = author
            if len(games) > 1:
                for edge in combinations(games, 2):
                    edge = tuple(sorted(edge))
                    edges[edge] = edges.get(edge, 0) + 1
            games = set()
            authors += 1

        games.add(game)
    
    return sorted(edges.items(), key=lambda x: x[1], reverse=True)

sorted_edges = get_edges(pl.read_parquet(COMMENTS_PATH_GAMES))
sorted_edges

100%|██████████| 1369135342/1369135342 [30:03<00:00, 759362.86it/s]


[(('fortnite', 'minecraft'), 8238018),
 (('minecraft', 'roblox'), 7883337),
 (('call of duty', 'minecraft'), 5680816),
 (('grand theft auto', 'minecraft'), 5109961),
 (('fortnite', 'roblox'), 4295719),
 (('call of duty', 'fortnite'), 3848304),
 (('five nights at freddys', 'minecraft'), 3805796),
 (('call of duty', 'grand theft auto'), 3583126),
 (('fortnite', 'grand theft auto'), 3274580),
 (('garrys mod', 'minecraft'), 2791642),
 (('minecraft', 'the sims'), 2530688),
 (('happy wheels', 'minecraft'), 2356728),
 (('five nights at freddys', 'roblox'), 1742120),
 (('grand theft auto', 'roblox'), 1675014),
 (('five nights at freddys', 'fortnite'), 1573621),
 (('hello neighbor', 'minecraft'), 1511571),
 (('minecraft', 'yandere simulator'), 1506154),
 (('league of legends', 'minecraft'), 1503122),
 (('clash royale', 'minecraft'), 1502185),
 (('clash of clans', 'minecraft'), 1497485),
 (('call of duty', 'garrys mod'), 1492649),
 (('garrys mod', 'grand theft auto'), 1492326),
 (('counterstrike

In [27]:
edges_pldf = pl.DataFrame({
    'game1': [edge[0][0] for edge in sorted_edges],
    'game2': [edge[0][1] for edge in sorted_edges],
    'weight': [edge[1] for edge in sorted_edges]
})

# uncomment to save the edges
# edges_pldf.write_csv('../data/games_edges.csv')

In [None]:
# # Polars version (doesn't work on full dataset)

# comments_video_game = comments_df.join(joined_df.lazy(), left_on='video_id', right_on='display_id')
# comments_channels = comments_video_game.select([
#     pl.col('author'),
#     pl.col('video_game')
# ]).unique()

# counts_per_author = (
#     comments_game_df
#     .group_by("author")
#     .agg(pl.col("video_game").count().alias("count"))
#     .filter(pl.col("count") > 1)
# )

# comments_several_channels = comments_game_df.join(counts_per_author, on="author", how="inner").select(['author', 'video_game'])

# pairs = comments_several_channels.join(comments_several_channels, on="author", how="inner").filter(pl.col('video_game') < pl.col('video_game_right')).select(['video_game', 'video_game_right'])

# pairs_grouped = pairs.group_by(['video_game', 'video_game_right']).agg(pl.col('video_game').count().alias('weight')).sort('weight', descending=True)

# pairs_grouped.collect()

### Weight adjustment and filtering

In [28]:
def adjust_weight(edges: pl.DataFrame, popularity: dict, threshold: int, alpha: float, beta: float = 1.0) -> dict:
    adjusted_edges = {}
    for edge in edges.iter_rows():
        game1, game2, weight = edge
        # Get the popularity of the games
        p1 = popularity[game1]
        p2 = popularity[game2]
        # Adjust the weight using the formula
        adjusted_weight = int((weight ** beta) * ((min(p1, p2) / max(p1, p2)) ** alpha))
        if adjusted_weight > threshold:
            adjusted_edges[(game1, game2)] = adjusted_weight
    return adjusted_edges

In [32]:
edges = pl.read_csv("../data/games_edges_all.csv")
popularity = dict(zip(games_count["video_game"], games_count["count"]))
alpha = 0.5
beta = 0.75
threshold = 5000

adjusted_edges = adjust_weight(edges, popularity, threshold, alpha, beta)


edges_dict = {}
for edge in edges.iter_rows():
    game1, game2, weight = edge
    edges_dict[(game1, game2)] = weight

unique_games = set()
for edge in adjusted_edges:
    unique_games.add(edge[0])
    unique_games.add(edge[1])
    
print(f'After filtering : {len(adjusted_edges)} edges and {len(unique_games)} nodes')

print(f'Before adjustment ({len(edges_dict)} edges)')
print(f'fortnite -> minecraft: {edges_dict[("fortnite", "minecraft")]}')
print(f'fifa 14 -> minecraft: {edges_dict[("fifa 14", "minecraft")]}')
print(f'fifa 14 -> fifa 15: {edges_dict[("fifa 14", "fifa 15")]}')

print(f'After adjustment ({len(adjusted_edges)} edges)')
print(f'fortnite -> minecraft: {adjusted_edges[("fortnite", "minecraft")]} if ("fortnite", "minecraft") in adjusted_edges else 0')
print(f'fifa 14 -> minecraft: {adjusted_edges[("fifa 14", "minecraft")] if ("fifa 14", "minecraft") in adjusted_edges else 0}')
print(f'fifa 14 -> fifa 15: {adjusted_edges[("fifa 14", "fifa 15")] if ("fifa 14", "fifa 15") in adjusted_edges else 0}')

After filtering : 916 edges and 119 nodes
Before adjustment (540605 edges)
fortnite -> minecraft: 8238018
fifa 14 -> minecraft: 437157
fifa 14 -> fifa 15: 433682
After adjustment (916 edges)
fortnite -> minecraft: 107912 if ("fortnite", "minecraft") in adjusted_edges else 0
fifa 14 -> minecraft: 0
fifa 14 -> fifa 15: 15747


In [53]:
import networkx as nx
import community as community_louvain  # Install via `pip install python-louvain`
import json

# Example input (edges list with weights after adjustment)

# Create graph
G = nx.Graph()
for (source, target), weight in adjusted_edges.items():
    G.add_edge(source, target, weight=weight)

# Create node_size which is the popularity of the game divided by 2000

# # Compute positions using Fruchterman-Reingold (spring layout)
positions = nx.forceatlas2_layout(G, max_iter=500, seed=42, dissuade_hubs=True, jitter_tolerance=0.6, gravity=50)

# Community detection using Louvain method
partition = community_louvain.best_partition(G, random_state=42)

# Prepare nodes and edges for JSON output
nodes = []
node_map = {}  # Maps node names to IDs
for i, (node, pos) in enumerate(positions.items()):
    node_map[node] = str(i)
    pop = popularity[node]
    nodes.append({
        "id": str(i),
        "name": node,
        "x": pos[0] * 1000,  # Scale positions for better visualization
        "y": pos[1] * 1000,
        "symbolSize": (pop / 2000) ** 0.5,  # Example: size proportional to degree
        "value": pop,
        "category": partition[node]
    })

edges = [
    {"source": node_map[source], "target": node_map[target]}
    for source, target in G.edges()
]

categories = [
    {"name": name}
    for name in set(partition.values())
]

# Output JSON
output = {
    "nodes": nodes,
    "edges": edges,
    "categories": categories
}

print(f'Number of nodes in the network: {len(nodes)}')
print(f'Number of edges in the network: {len(edges)}')
print(f'Number of communities in the network: {len(categories)}')

# Save to file
with open('../datastory/data/games_network.json', 'w') as f:
    json.dump(output, f, indent=2)

Number of nodes in the network: 119
Number of edges in the network: 916
Number of communities in the network: 5
