In [1]:
import numpy as np
import h5py
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from scipy.stats import wasserstein_distance
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [2]:
def compute_distributional_distance(g1, g2):
    g1 = np.array(g1)
    g2 = np.array(g2)
    
    dist = np.mean([
        wasserstein_distance(g1[i], g2[j]) 
        for i in range(g1.shape[0])
        for j in range(g2.shape[0])
    ])
    return dist

In [3]:
def compute_distance_for_pair(i, j, games, embeddings):
    return i, j, compute_distributional_distance(embeddings[games[i]], embeddings[games[j]])

In [4]:
def load_embeddings(path):
    review_embeddigns = {}
    with h5py.File(path, 'r') as f:
        for key in f.keys():
            review_embeddigns[key] = list(f[key])
    return review_embeddigns

In [None]:
review_embeddings = load_embeddings('data/positive_review_embeddings.h5')
games = list(review_embeddings.keys())
n_games = len(games)
distance_matrix = np.zeros((n_games, n_games))

results = Parallel(n_jobs=100, backend='loky')(delayed(compute_distance_for_pair)(i, j, games, review_embeddings) 
                                              for i in tqdm(range(n_games), desc=f'Computing distances') 
                                              for j in range(i+1, n_games))

Computing distances:   0%|          | 0/77 [00:00<?, ?it/s]

In [8]:
results

[(0, 1, 0.02341067940760892),
 (0, 2, 0.022470201831768587),
 (0, 3, 0.02199411595921661),
 (0, 4, 0.021124700626415995),
 (0, 5, 0.024186084497763532),
 (0, 6, 0.020626441756217518),
 (0, 7, 0.023505757863437615),
 (0, 8, 0.02685858787385296),
 (0, 9, 0.021604949495085495),
 (0, 10, 0.023631785947964404),
 (0, 11, 0.025692850660076597),
 (0, 12, 0.02131685192493714),
 (0, 13, 0.023356711609629054),
 (0, 14, 0.020310977558925334),
 (0, 15, 0.022123104859826825),
 (0, 16, 0.02594026692974096),
 (0, 17, 0.02292644807245616),
 (0, 18, 0.021699111415975046),
 (0, 19, 0.022861227268920495),
 (0, 20, 0.021172919364914323),
 (0, 21, 0.022275721559171638),
 (0, 22, 0.028301699309352207),
 (0, 23, 0.023777533855515797),
 (0, 24, 0.021771122838273798),
 (0, 25, 0.0249168420848178),
 (0, 26, 0.026647351661042158),
 (0, 27, 0.02197339946226671),
 (0, 28, 0.02850848216331657),
 (0, 29, 0.020016542599415488),
 (0, 30, 0.021248207689458794),
 (0, 31, 0.024557379528271535),
 (0, 32, 0.0233867109239719

In [13]:
for i, j, distance in results:
	distance_matrix[i, j] = distance
	distance_matrix[j, i] = distance
    
clustering = AgglomerativeClustering(n_clusters=50, metric='precomputed', linkage='average')
labels = clustering.fit_predict(distance_matrix)

clusters = {}
for game, label in zip(games, labels):
    if label not in clusters:
        clusters[label] = []
    clusters[label].append(game)

print(clusters)

{0: ['american-truck-simulator-starter-pack-california', 'apex-legends', 'battlefield-3', 'borderlands-2', 'conan-exiles', 'dark-souls-ii', 'diablo-iii', 'enter-the-gungeon', 'fallout-4', 'final-fantasy-xiv-endwalker', 'guilty-gear-xx-accent-core-plus', 'guitar-hero', 'halo-reach', 'kingdom-rush', 'league-of-legends', 'magic-the-gathering-arena', 'manhunt', 'mass-effect-2', 'overwatch', 'path-of-exile', 'pathfinder-kingmaker', 'return-to-castle-wolfenstein-enemy-territory', 'shenmue-ii', 'the-elder-scrolls-v-skyrim', 'tom-clancys-rainbow-six-siege', 'tomb-raider', 'tribes-ascend', 'valkyrie-profile-2-silmeria'], 33: ['angry-birds'], 37: ['animal-crossing-new-horizons'], 43: ['battlefield-v'], 45: ['call-of-duty-modern-warfare-2'], 39: ['command-and-conquer-red-alert'], 38: ['counter-strike-source'], 27: ['crash-bandicoot-4-its-about-time'], 29: ['diablo'], 47: ['dishonored'], 31: ['doom-eternal'], 28: ['dota-2'], 32: ['fallout-3'], 34: ['fifa-18'], 35: ['fifa-20'], 36: ['final-fantasy-