In [1]:
import numpy as np
import pandas as pd

In [34]:
# Load the cleaned game list and keep only the rows that have a description.
df = pd.read_csv("data/bgg_gamelist_cleaned.csv")
# Reset the index after filtering so index labels equal row positions: later
# cells look up `ids` / `names` with positions produced by np.argsort over the
# rows of `embeddings`.
df = df[df["description"].notna()].reset_index(drop=True)
ids = df["id"]
names = df["name"]

# np.load on an .npz returns a lazily-read archive object; use it as a context
# manager so the underlying file handle is closed once the array is read.
with np.load("data/embeddings.npz") as archive:
    embeddings = archive["embeddings"]

# Downstream cells treat row i of `embeddings` as belonging to row i of `df`.
assert len(df) == embeddings.shape[0], (
    f"game list has {len(df)} rows but embeddings has {embeddings.shape[0]}"
)

In [36]:
# L2-normalise each row (divide by the square root of its sum of squares).
# As the original note says, the instructor-xl embeddings were already
# normalised, so this is expected to be close to a no-op.
embeddings = np.divide(
    embeddings,
    np.sqrt(np.square(embeddings).sum(axis=1, keepdims=True)),
)

In [35]:
# Sanity check: per-row L2 norms, kept as a column vector (keepdims=True).
np.sqrt(np.square(embeddings).sum(axis=1, keepdims=True))

array([[1.       ],
       [1.       ],
       [1.       ],
       ...,
       [1.       ],
       [1.0000001],
       [1.       ]], dtype=float32)

In [37]:
# Display the array dimensions. Later cells use the first axis as the row
# (one entry per element of `ids`) and take single rows as query vectors.
embeddings.shape

(24610, 768)

In [38]:
# Pick the query vector by row position.
# NOTE(review): the original comment called this row "Samurai", but the SVM
# cell further down labels `ids == 13` as the positive example and that example
# surfaces as row 12 -- confirm which game row 12 actually is before trusting
# the label.
query = embeddings[12] # row 12; originally annotated "Use Samurai as an example" -- TODO confirm
# Alternative: select the query by game id instead of by row position.
#query = embeddings[ids == 6707].squeeze()
query.shape

(768,)

In [39]:
# Nearest neighbours by dot product with the query. The rows were L2-normalised
# in an earlier cell, so the dot product is the cosine similarity.
N_NEIGHBORS = 20  # how many neighbours to report (the header used to say 10)

similarities = embeddings.dot(query)
sorted_ix = np.argsort(-similarities)  # negate to sort in descending order

# Skip position 0: the query is itself a row of `embeddings`, so the best match
# is expected to be the query row.
top_n = sorted_ix[1:N_NEIGHBORS + 1].tolist()

print(f"top {N_NEIGHBORS} results:")
for k in top_n:
  print(f"row {k}, similarity {similarities[k]}")

top 10 results:
row 12965, similarity 0.9119023084640503
row 238, similarity 0.9105767011642456
row 3872, similarity 0.9104617834091187
row 4378, similarity 0.9054649472236633
row 11232, similarity 0.9050711393356323
row 15102, similarity 0.9046278595924377
row 24473, similarity 0.904604971408844
row 7347, similarity 0.9006145000457764
row 18820, similarity 0.8993796110153198
row 20314, similarity 0.8980516195297241
row 17991, similarity 0.8970128297805786
row 23034, similarity 0.8964637517929077
row 14698, similarity 0.8959041237831116
row 12195, similarity 0.8956476449966431
row 2146, similarity 0.8944798707962036
row 10456, similarity 0.8909525275230408
row 9090, similarity 0.8900841474533081
row 3314, similarity 0.8898947238922119
row 5635, similarity 0.8897260427474976
row 7786, similarity 0.8894286155700684


In [41]:
# `top_n` holds row positions (from np.argsort over the embedding rows), so
# look them up positionally. Plain `ids[top_n]` is label-based and only agrees
# when the Series index happens to be 0..n-1.
ids.iloc[top_n].tolist()

[146158,
 278,
 6778,
 8166,
 107529,
 178958,
 370597,
 24083,
 244330,
 269504,
 230266,
 329588,
 172980,
 131366,
 3076,
 84876,
 40370,
 5406,
 12681,
 27710]

In [40]:
# `top_n` holds row positions (from np.argsort over the embedding rows), so
# look them up positionally. Plain `names[top_n]` is label-based and only
# agrees when the Series index happens to be 0..n-1.
names.iloc[top_n].tolist()

['New Haven',
 'Catan Card Game',
 'The Settlers of Zarahemla',
 'Anno 1503',
 'Kingdom Builder',
 'The Lords of Rock',
 'Fugitive (Second Edition)',
 'Desert Bazaar',
 'Scarabya',
 'Two Robots',
 'Scare It!',
 'Chili Mafia',
 'Double Mission: Beyond the Object',
 'Eight-Minute Empire',
 'Puerto Rico',
 'The Castles of Burgundy',
 'Islas Canarias',
 'New England',
 'Neuland',
 'Catan Dice Game']

### SVM

In [25]:
from sklearn import svm

# Exemplar SVM: train a linear classifier with one game as the only positive
# and every other game as a negative, then rank all games by their signed
# distance to the decision boundary.
POSITIVE_ID = 13      # game id of the single positive example
N_SVM_RESULTS = 100   # how many ranked rows to report (the header used to say 10)

# create the "Dataset": every embedding row is a sample, labelled 0 unless it
# belongs to the positive game.
x = embeddings
y = np.zeros(x.shape[0])
y[(ids == POSITIVE_ID).to_numpy()] = 1
assert y.sum() >= 1, f"no row with id {POSITIVE_ID}; nothing to use as the positive example"

# train our (Exemplar) SVM
# docs: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
# class_weight='balanced' reweights the classes inversely to their frequency,
# which matters with a single positive. random_state pins the solver's
# pseudo-random number generation so re-running the cell gives the same ranking.
clf = svm.LinearSVC(
    class_weight='balanced',
    verbose=False,
    max_iter=10000,
    tol=1e-6,
    C=0.1,
    random_state=0,
)
clf.fit(x, y) # train

# infer on whatever data you wish, e.g. the original data
similarities = clf.decision_function(x)
sorted_ix = np.argsort(-similarities)  # negate to sort in descending order

# The positive example itself is part of `x`, so it is included in the ranking.
top_n = sorted_ix[:N_SVM_RESULTS].tolist()
print(f"top {N_SVM_RESULTS} results:")
for k in top_n:
  print(f"row {k}, similarity {similarities[k]}")

top 10 results:
row 12, similarity 0.9921468167791094
row 8366, similarity -0.12454733979373334
row 13044, similarity -0.1461113224984164
row 24473, similarity -0.26539029629990896
row 2631, similarity -0.2879438038187119
row 3872, similarity -0.33209068221639226
row 9090, similarity -0.3336816009576016
row 7794, similarity -0.35497497511054843
row 11512, similarity -0.3564880611509442
row 3314, similarity -0.35709652488835575
row 7621, similarity -0.36093129794186907
row 14698, similarity -0.3737815749459906
row 5484, similarity -0.38100333590007396
row 7347, similarity -0.3843897128584447
row 4378, similarity -0.3884725555314986
row 8215, similarity -0.39367582995252925
row 15102, similarity -0.4060399605143724
row 431, similarity -0.4129465274048697
row 8623, similarity -0.41682610715728974
row 17991, similarity -0.4249049463034168
row 238, similarity -0.42708153121136794
row 20314, similarity -0.4321367208203536
row 20801, similarity -0.43922777554315984
row 7404, similarity -0.440

In [32]:
# Show the names of the ranked rows after the first ten. `top_n` holds row
# positions, so use positional lookup; plain `names[...]` is label-based and
# only agrees when the Series index happens to be 0..n-1.
names.iloc[top_n[10:]]

7621                              Claim It!
14698     Double Mission: Beyond the Object
5484           Candamir: The First Settlers
7347                          Desert Bazaar
4378                              Anno 1503
                        ...                
13466                     Imperial Settlers
6775                             Terra Nova
11206                                Hawaii
8913     Settlers of Catan: Gallery Edition
20895                      Pappy Winchester
Name: name, Length: 90, dtype: object