In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the game metadata and keep only the games that have a description.
df = pd.read_csv("bggrecommender/data/bgg_gamelist_all_details.csv")
# Reset the index after filtering so that row labels equal row positions.
# Later cells look names up using positions returned by np.argsort over
# `embeddings`; without the reset, label-based lookups on `names` can silently
# return a different game once any row has been dropped.
df = df[df["description"].notna()].reset_index(drop=True)
ids = df["id"]
names = df["name"]

# Precomputed description embeddings; assumed to hold one row per remaining
# game, in the same order as `df` -- only the row count can be verified here.
embeddings = np.load("bggrecommender/data/embeddings.npy")
assert len(df) == embeddings.shape[0], (
    f"metadata has {len(df)} rows but embeddings has {embeddings.shape[0]}"
)

In [14]:
# Load the "embeddings" array from the compressed archive. np.load on an .npz
# file returns a lazy NpzFile that keeps the file handle open, so read the
# array inside a context manager to make sure the handle is closed.
with np.load("bggrecommender/data/embeddings.npz") as npz_archive:
    embeddings_comp = npz_archive["embeddings"]

In [3]:
# L2-normalize every row so that a dot product between two rows is their
# cosine similarity. The instructor-xl embeddings used here were already
# unit-length, so this step leaves them effectively unchanged.
embeddings = embeddings / np.sqrt(
    (embeddings * embeddings).sum(axis=1, keepdims=True)
)

In [5]:
# Sanity check: display the L2 norm of every row as a column vector.
np.sqrt((embeddings * embeddings).sum(axis=1, keepdims=True))

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [17]:
# Display the dimensions of the embedding matrix.
np.shape(embeddings)

(24486, 768)

In [21]:
# Use the embedding stored in row 12 (CATAN) as the example query vector.
# A query can also be selected by game id instead of by row, e.g.
# embeddings[ids == 6707].squeeze().
query = embeddings[12, :]
query.shape

(768,)

In [22]:
# Rank every game by its dot product with the query vector and list the
# closest matches.

# Number of neighbours to report; drives both the header and the slice so the
# printed label always matches the number of rows listed.
n_results = 20

similarities = embeddings.dot(query)
sorted_ix = np.argsort(-similarities)  # row positions, most similar first

# Rank 0 is skipped on the assumption that it is the query itself (the query
# is a row of `embeddings`, so it scores highest against itself).
top_n = list(sorted_ix[1:n_results + 1])
print(f"top {n_results} results:")
for k in top_n:
    print(f"row {k}, similarity {similarities[k]}")

top 10 results:
row 238, similarity 0.9249699797717552
row 3872, similarity 0.9078441041138743
row 7781, similarity 0.8993142923018856
row 12958, similarity 0.8923270730570181
row 7343, similarity 0.8921123175026343
row 13037, similarity 0.8913179618970136
row 24382, similarity 0.8882377481422172
row 7402, similarity 0.8834125880273019
row 10693, similarity 0.8820077955509469
row 7789, similarity 0.8818567183142362
row 5484, similarity 0.8814609056904009
row 4378, similarity 0.8809738402659556
row 711, similarity 0.8797513701882536
row 14691, similarity 0.8794540142994617
row 15096, similarity 0.8793949653481807
row 18012, similarity 0.8773672260223191
row 13457, similarity 0.8772927690517545
row 11511, similarity 0.8767464241870259
row 23229, similarity 0.8762897664586161
row 4889, similarity 0.8752335268833704


In [23]:
# `top_n` holds row positions produced by np.argsort, so look the names up
# positionally. Plain names[top_n] is label-based on an integer-indexed Series
# and can return different games when labels and positions do not coincide.
list(names.iloc[top_n])

['Catan Card Game',
 'The Settlers of Zarahemla',
 'Catan Dice Game',
 'New Haven',
 'Desert Bazaar',
 'Catan: Family Edition',
 'Fugitive (Second Edition)',
 'Simply Catan',
 'Struggle for Catan',
 'Die Siedler von Catan: Junior',
 'Candamir: The First Settlers',
 'Anno 1503',
 'Roads & Boats',
 'Double Mission: Beyond the Object',
 'The Lords of Rock',
 'Bronze',
 'Bania',
 'Star Trek: Catan',
 'The Market: A Pocket Game',
 'La Strada']

### SVM

In [25]:
from sklearn import svm

# Number of top-ranked rows to print and keep in `top_n`; drives both the
# header and the slice so the printed label matches the number of rows listed.
n_results = 100

# Create the "dataset": every embedding is a training row, all labelled
# negative except the single positive example.
x = embeddings
y = np.zeros(x.shape[0])
# Mark the game with id 13 (CATAN) as the one positive example. The mask is
# converted to a plain boolean array so it is applied by position.
y[(ids == 13).to_numpy()] = 1

# Train our (Exemplar) SVM.
# docs: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
# class_weight='balanced' compensates for having one positive among many
# negatives; random_state is fixed so re-running the cell is reproducible.
clf = svm.LinearSVC(
    class_weight='balanced',
    verbose=False,
    max_iter=10000,
    tol=1e-6,
    C=0.1,
    random_state=0,
)
clf.fit(x, y)  # train

# Infer on whatever data you wish, e.g. the original data. The signed
# decision-function value serves as the similarity score.
similarities = clf.decision_function(x)
sorted_ix = np.argsort(-similarities)  # row positions, highest score first
top_n = list(sorted_ix[:n_results])
print(f"top {n_results} results:")
for k in top_n:
    print(f"row {k}, similarity {similarities[k]}")

top 10 results:
row 12, similarity 0.9927654268894202
row 13037, similarity 0.02769531505664169
row 238, similarity -0.04301077072541859
row 14691, similarity -0.13577372255562492
row 9993, similarity -0.1506640572050848
row 13927, similarity -0.15190723643975979
row 11511, similarity -0.15536846557135386
row 3872, similarity -0.18717087126884502
row 7781, similarity -0.19285689233556702
row 7402, similarity -0.2300652410095494
row 15288, similarity -0.25712353808070687
row 5484, similarity -0.2719627019873725
row 7789, similarity -0.2941698186977274
row 2631, similarity -0.302143327722725
row 9084, similarity -0.32932068936052183
row 8907, similarity -0.3338197756191692
row 24382, similarity -0.3447192491188129
row 18782, similarity -0.34738139766193155
row 11725, similarity -0.36762386533086067
row 2479, similarity -0.40858637005762366
row 7343, similarity -0.41714638825508765
row 17982, similarity -0.4327583319635654
row 24025, similarity -0.43891250705961893
row 711, similarity -0.

In [26]:
# `top_n` holds row positions produced by np.argsort, so look the names up
# positionally. Plain names[top_n] is label-based on an integer-indexed Series
# and can return different games when labels and positions do not coincide.
names.iloc[top_n]

12                                                   CATAN
13037                                Catan: Family Edition
238                                        Catan Card Game
14691                    Double Mission: Beyond the Object
9993     Catan Histories: Settlers of America – Trails ...
                               ...                        
13215                                               Castro
9045                                           Carson City
1188                                          Bauernschlau
20364                    Colonialism: Expanded 2nd edition
12035                                          Castle Dice
Name: name, Length: 100, dtype: object