In [1]:
import numpy as np
import pandas as pd

In [47]:
# Load the cleaned game list and keep only the games that have a description.
# reset_index(drop=True) is essential: after filtering, the original index has
# gaps, so without it index labels would no longer equal row positions and any
# later lookup by row position (e.g. ids[top_n]) would hit the wrong games.
df = pd.read_csv("data/bgg_gamelist_cleaned.csv")
df = df[df["description"].notna()].reset_index(drop=True)
ids = df["id"]
names = df["name"]

# Read the embedding matrix and close the .npz archive right away instead of
# leaving the file handle open for the life of the kernel.
with np.load("data/embeddings.npz") as archive:
    embeddings = archive["embeddings"]

# Later cells index `ids`/`names` with row numbers of `embeddings`, so the two
# must line up one-to-one.
assert len(df) == embeddings.shape[0], (
    f"{len(df)} games but {embeddings.shape[0]} embedding rows: "
    "the game list and the embeddings are out of sync"
)

In [52]:
# L2-normalize the rows, as is common. In this case the embeddings obtained
# from instructor-xl were already normalized, so this leaves them unchanged.
row_norms = np.sqrt(np.square(embeddings).sum(axis=1, keepdims=True))
embeddings = embeddings / row_norms

In [53]:
# Sanity check: display the L2 norm of every row (expected to be 1 after normalization).
np.sqrt(np.square(embeddings).sum(axis=1, keepdims=True))

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)

In [54]:
# Display the dimensions of the embedding matrix (one row per embedded item).
embeddings.shape

(24610, 768)

In [59]:
# Pick the query vector by row position; row 2 is used as the example ("Samurai").
query = embeddings[2]
# Alternative: select the query by game id instead of by row position.
#query = embeddings[ids == 6707].squeeze()
# Display the shape to confirm the query is a single 1-D vector.
query.shape

(768,)

In [60]:
# Rank every row by its dot product with the query. The rows are L2-normalized,
# so the dot product is the cosine similarity.
n_neighbours = 20  # how many nearest neighbours to list

similarities = embeddings.dot(query)
sorted_ix = np.argsort(-similarities)  # negate to sort in descending order

# Rank 0 is skipped: the query is itself a row of `embeddings`, so the best
# match is expected to be the query row.
top_n = list(sorted_ix[1:1 + n_neighbours])

# The header reports the number of rows actually listed (it used to say
# "top 10" while listing 20).
print(f"top {len(top_n)} results:")
for k in top_n:
  print(f"row {k}, similarity {similarities[k]}")

top 10 results:
row 20176, similarity 0.8918618559837341
row 21065, similarity 0.8870534896850586
row 8896, similarity 0.8848761320114136
row 9908, similarity 0.8840431571006775
row 13796, similarity 0.8794407844543457
row 20169, similarity 0.8769826889038086
row 15232, similarity 0.8763738870620728
row 16751, similarity 0.8746436238288879
row 15876, similarity 0.873382031917572
row 11902, similarity 0.8709729909896851
row 7314, similarity 0.8709137439727783
row 13584, similarity 0.8707891702651978
row 39, similarity 0.8701850175857544
row 22313, similarity 0.870140552520752
row 8756, similarity 0.8685733079910278
row 11418, similarity 0.868300199508667
row 59, similarity 0.8682471513748169
row 20785, similarity 0.8680010437965393
row 8686, similarity 0.8678731918334961
row 21236, similarity 0.8675068616867065


In [57]:
# top_n holds row positions in `embeddings`, so look the ids up by position.
# Plain ids[top_n] is label-based and returns the wrong rows (or raises
# KeyError) when the filtered DataFrame's index has gaps.
list(ids.iloc[top_n])

[146158,
 278,
 6778,
 8166,
 107529,
 178984,
 370649,
 24083,
 244331,
 269511,
 230267,
 329593,
 172994,
 131366,
 3076,
 84876,
 40370,
 5406,
 12681,
 27710]

In [61]:
# top_n holds row positions in `embeddings`, so look the names up by position.
# Plain names[top_n] is label-based and returns the wrong rows (or raises
# KeyError) when the filtered DataFrame's index has gaps.
list(names.iloc[top_n])

['Jinja',
 'Small Samurai Empires',
 'Bushido: Der Weg des Kriegers',
 'Seven Card Samurai',
 'Kaleido',
 'TacTiki',
 'Joraku',
 'Rising Sun',
 'Martial Art',
 'Seven Swords',
 'Oshi',
 'Age of War',
 'Tigris & Euphrates',
 'Make Make',
 'Seii Taishogun',
 'Edo',
 'Samurai: Game of Politics and Warfare in Feudal Japan',
 'Harakiri: Blades of Honor',
 'Rice Wars',
 'Tajuto']

### SVM

In [62]:
from sklearn import svm

# Exemplar SVM: train a linear classifier with the query game as the single
# positive example and every other game as a negative, then rank all games by
# their signed distance to the decision boundary.
query_id = 3     # id of the game used as the single positive example
n_results = 100  # number of top-scoring rows to list (the query row included)

# create the "Dataset": all embeddings, labelled 0 except for the query game
x = embeddings
y = np.zeros(x.shape[0])
y[(ids == query_id).to_numpy()] = 1  # boolean mask, so this is positional

# train our (Exemplar) SVM
# docs: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
# class_weight='balanced' re-weights the classes inversely to their frequency,
# which matters with one positive against thousands of negatives.
# random_state pins the solver's random number generator so reruns give the same ranking.
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1, random_state=0)
clf.fit(x, y) # train

# infer on whatever data you wish, e.g. the original data
similarities = clf.decision_function(x)
sorted_ix = np.argsort(-similarities)  # negate to sort in descending order

# top_n keeps the best-scoring row (expected to be the query itself) at
# position 0; the cell that lists the names drops it with top_n[1:].
top_n = list(sorted_ix[:n_results])

# The header reports the number of rows actually listed (it used to say
# "top 10" while listing 100).
print(f"top {len(top_n)} results:")
for k in top_n:
  print(f"row {k}, similarity {similarities[k]}")

top 10 results:
row 2, similarity 0.9936124985445265
row 9502, similarity -0.32443506625691154
row 20696, similarity -0.3386445503357365
row 21236, similarity -0.4154693141723109
row 10135, similarity -0.4185579787387178
row 8352, similarity -0.44136167959357664
row 22453, similarity -0.46153434744901234
row 1913, similarity -0.4620022978538547
row 8756, similarity -0.47208237272862363
row 9908, similarity -0.47240341859356305
row 8364, similarity -0.4813598180769806
row 10507, similarity -0.48663675596958966
row 21873, similarity -0.4929671261679248
row 18895, similarity -0.49493684864833987
row 20176, similarity -0.49878394852262975
row 4907, similarity -0.5048031206639167
row 18282, similarity -0.5076105679846947
row 17406, similarity -0.5101065949531415
row 18275, similarity -0.5187516159150984
row 21065, similarity -0.519561338050269
row 13192, similarity -0.5242041669607571
row 19197, similarity -0.5280147100309718
row 8896, similarity -0.5346411799258728
row 15733, similarity -0

In [64]:
# Skip position 0 (the top-scoring row, expected to be the query itself) and
# look the rest up by position: top_n holds row positions in `embeddings`,
# whereas plain names[...] is label-based and breaks when the index has gaps.
list(names.iloc[top_n[1:]])

['Ninjato',
 'Sencha',
 'Tajuto',
 'Takenoko',
 'Senji',
 'Night Parade of a Hundred Yokai',
 "James Clavell's Shogun",
 'Seii Taishogun',
 'Seven Card Samurai',
 'Monastery',
 'Kaiten Sushi',
 'Jubako',
 'The Way of the Bear',
 'Jinja',
 'Oriente',
 'Dôjima',
 'TA‐KE',
 'Niwa',
 'Small Samurai Empires',
 'Spirits of the Rice Paddy',
 'Shikoku',
 'Bushido: Der Weg des Kriegers',
 '曼荼羅 (Mandara)',
 'Daimyo',
 'Web of Power',
 'Java',
 'Martial Art',
 'Samurai',
 'Joraku',
 'Samurai',
 'The major four of Heizei',
 'Machi',
 'Make Make',
 'Fan & Mallet (団扇と小槌)',
 'Hatu Matu: Chief of Easter Island',
 'Rice Dice',
 'Gorinto',
 'Toledo 1085',
 'Der weiße Lotus',
 'Rising Sun',
 'Ninja Squad',
 'Samurai & Katana',
 'Kami-sama',
 'Bluff',
 'Bamboo',
 'Nippon',
 'Brian Boru: High King of Ireland',
 'Fiji',
 'Tenkatoitsu',
 'Rice Wars',
 'Cronberg',
 'Tokaido Duo',
 'Far East War 1592',
 'Matcha',
 'Oshi',
 'Qin',
 'Tenka',
 'Justinian',
 'Die Osterinsel',
 'Deities',
 'Majolica',
 'Daxu',
 'Ki