In [18]:
import re
from html import unescape
from pathlib import Path

import numpy as np
import pandas as pd
import tiktoken
from InstructorEmbedding import INSTRUCTOR
from rapidfuzz import fuzz, process
from tqdm.auto import tqdm

  from tqdm.autonotebook import trange


In [3]:
# Every data path in this notebook is built from the absolute working directory.
work_dir = Path().resolve()
raw_csv_path = work_dir / "data/bgg_gamelist_all_details.csv"
df = pd.read_csv(raw_csv_path)

In [4]:
# Peek at the two columns the rest of the notebook works with.
df.loc[:, ["name", "description"]].head()

Unnamed: 0,name,description
0,Die Macher,Die Macher is a game about seven sequential po...
1,Dragonmaster,Dragonmaster is a trick-taking card game based...
2,Samurai,Samurai is set in medieval Japan. Players comp...
3,Tal der Könige,When you see the triangular box and the luxuri...
4,Acquire,"In Acquire, each player strategically invests ..."


In [5]:
# Drop games that have no description, then renumber the rows 0..n-1.
# The embedding matrix is built from this frame in row order and later cells
# use positions in that matrix to look rows up in `df`; without the reset,
# any dropped row would leave a gap in the index and those lookups would
# silently point at the wrong game.
df = df.dropna(subset=["description"]).reset_index(drop=True)

In [6]:
# Decode HTML character references (e.g. "&amp;") in the raw descriptions.
# Background on entities: https://www.w3schools.com/html/html_entities.asp
df["description_clean"] = df["description"].map(unescape)

In [15]:
def anonymize_titles(row):
    """Mask the game's own name inside its cleaned description.

    Every occurrence of ``row['name']`` in ``row['description_clean']`` is
    replaced by the literal placeholder ``[GAME_NAME]``, regardless of how
    the occurrence is capitalised.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'description_clean'`` and ``'name'``.

    Returns
    -------
    str
        The description with the name masked. The description is returned
        unchanged when the name is missing, not a string, or blank.
    """
    description = row['description_clean']
    name = row['name']

    # A blank name would match at every position and a non-string name
    # (e.g. NaN) cannot be searched for, so there is nothing to mask.
    if not isinstance(name, str) or not name.strip():
        return description

    # One case-insensitive pass over the text:
    #  * catches any capitalisation, not just a fixed handful of variants;
    #  * never re-scans its own output, so a name such as "Game" cannot
    #    match inside an already inserted "[GAME_NAME]";
    #  * re.escape keeps punctuation in names ("?", "(", "+", ...) literal.
    return re.sub(re.escape(name), "[GAME_NAME]", description, flags=re.IGNORECASE)

In [16]:
# Build 'description_anon' by running anonymize_titles on each row of the frame.
df["description_anon"] = df.apply(anonymize_titles, axis="columns")

In [17]:
# Persist the cleaned frame (without the index column) next to the raw data.
cleaned_csv_path = work_dir / "data/bgg_gamelist_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False, encoding="utf-8")

In [19]:
# Plain Python list of the anonymised descriptions, in frame order.
descriptions = df["description_anon"].tolist()
descriptions[:5]

['[GAME_NAME] is a game about seven sequential political races in different regions of Germany. Players are in charge of national political parties, and must manage limited resources to help their party to victory. The winning party will have the most victory points after all the regional elections. There are four different ways of scoring victory points. First, each regional election can supply one to eighty victory points, depending on the size of the region and how well your party does in it. Second, if a party wins a regional election and has some media influence in the region, then the party will receive some media-control victory points. Third, each party has a national party membership which will grow as the game progresses and this will supply a fair number of victory points. Lastly, parties score some victory points if their party platform matches the national opinions at the end of the game.\n\nThe 1986 edition featured four parties from the old West Germany and supported 3-4

In [20]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` encodes to under the named tiktoken encoding."""
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

# Rate applied per 1,000 tokens in the cost estimate below.
# NOTE(review): presumably the USD price of a hosted embedding API — confirm.
PRICE_PER_1K_TOKENS = 0.0004

# Accumulate into `total_tokens` rather than a variable called `sum`:
# rebinding the builtin would break every later sum(...) call in this kernel.
total_tokens = sum(
    num_tokens_from_string(description, "cl100k_base")
    for description in descriptions
)

# (token count over all descriptions, estimated cost at the rate above)
total_tokens, total_tokens / 1000 * PRICE_PER_1K_TOKENS

(5966556, 2.3866224)

In [21]:
# Instruction text paired with each description when it is encoded.
instruction = "Represent the Board Game description:"
# Start from an empty collection of embeddings.
embeddings = []
# Load the INSTRUCTOR checkpoint used to embed the descriptions.
model = INSTRUCTOR('hkunlp/instructor-xl')

load INSTRUCTOR_Transformer
max_seq_length  512


In [23]:
# Encode one description at a time, each paired with the instruction text.
# The results are collected in a list that is local to this cell, so
# re-running the cell always rebuilds the matrix from scratch instead of
# appending to (or failing on) whatever `embeddings` currently holds.
description_embeddings = [
    model.encode([[instruction, description]])
    for description in tqdm(descriptions)
]

# Stack the per-description results row-wise into one 2-D array.
# Unlike np.squeeze on the stacked list, this keeps the row axis even when
# there is only a single description.
embeddings = np.vstack(description_embeddings)

100%|██████████| 24610/24610 [48:55<00:00,  8.38it/s] 


In [24]:
# Show the first five embedding rows as a quick sanity check.
embeddings[0:5]

array([[ 0.01089991, -0.00284493, -0.01085802, ..., -0.02386524,
        -0.00545416,  0.04095952],
       [ 0.01260252,  0.01407043, -0.0187649 , ..., -0.03197284,
        -0.0354678 ,  0.05217465],
       [ 0.00757259, -0.00513413, -0.02251686, ..., -0.0095973 ,
         0.01520702,  0.08106932],
       [ 0.00116381, -0.0018552 , -0.00422471, ..., -0.01492945,
         0.00567882,  0.05711071],
       [ 0.02843119, -0.02714729,  0.01903991, ..., -0.03269599,
        -0.04426932,  0.01512   ]], dtype=float32)

In [58]:
# Scale every row to unit Euclidean length, then store the matrix on disk.
row_norms = np.sqrt(np.square(embeddings).sum(axis=1, keepdims=True))
embeddings = embeddings / row_norms

np.savez_compressed(work_dir/"data/embeddings.npz", embeddings=embeddings)

In [43]:
# Per-row Euclidean length of the embedding matrix (one value per row).
np.sqrt(np.square(embeddings).sum(axis=1, keepdims=True))

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)

In [28]:
# Row of the embedding matrix used as the example query (CATAN).
query_row = 12
query = embeddings[query_row]
# Alternative kept from the original notebook:
#query = embeddings[ids == 6707].squeeze()
query.shape

(768,)

In [30]:
# Score every row against the query with a dot product; the rows were scaled
# to unit length in an earlier cell, so this acts as cosine similarity.
similarities = np.dot(embeddings, query)
# Row positions ordered from highest to lowest score.
sorted_ix = np.argsort(-similarities)

print("top results:")
top_n = []
for row_ix in sorted_ix[:21]:
    print(f"row {row_ix}, similarity {similarities[row_ix]}")
    top_n.append(row_ix)

top results:
row 12, similarity 1.0000001192092896
row 12965, similarity 0.9119023084640503
row 238, similarity 0.9105767011642456
row 3872, similarity 0.9104617834091187
row 4378, similarity 0.9054649472236633
row 11232, similarity 0.9050711393356323
row 15102, similarity 0.9046278595924377
row 24473, similarity 0.904604971408844
row 7347, similarity 0.9006145000457764
row 18820, similarity 0.8993796110153198
row 20314, similarity 0.8980516195297241
row 17991, similarity 0.8970128297805786
row 23034, similarity 0.8964637517929077
row 14698, similarity 0.8959041237831116
row 12195, similarity 0.8956476449966431
row 2146, similarity 0.8944798707962036
row 10456, similarity 0.8909525275230408
row 9090, similarity 0.8900841474533081
row 3314, similarity 0.8898947238922119
row 5635, similarity 0.8897260427474976
row 7786, similarity 0.8894286155700684


In [31]:
names = df["name"]
# `top_n` holds row *positions* in the embedding matrix, so the names must be
# looked up by position. Plain names[top_n] is a label lookup, which returns
# the wrong games (or raises) whenever the frame's index has gaps, e.g. after
# rows without a description were dropped.
list(names.iloc[top_n])

['CATAN',
 'New Haven',
 'Catan Card Game',
 'The Settlers of Zarahemla',
 'Anno 1503',
 'Kingdom Builder',
 'The Lords of Rock',
 'Fugitive (Second Edition)',
 'Desert Bazaar',
 'Scarabya',
 'Two Robots',
 'Scare It!',
 'Chili Mafia',
 'Double Mission: Beyond the Object',
 'Eight-Minute Empire',
 'Puerto Rico',
 'The Castles of Burgundy',
 'Islas Canarias',
 'New England',
 'Neuland',
 'Catan Dice Game']

### SVM

In [33]:
from sklearn import svm

# Id of the game used as the single positive example, and how many of the
# top-ranked rows to print and keep in `top_n`.
exemplar_id = 13
n_results = 100

ids = df["id"]

# Build the training set: every embedding is a negative example except the
# row(s) whose id equals `exemplar_id`.
x = embeddings
y = np.zeros(x.shape[0])
# Convert the comparison to a plain boolean array so the mask is applied by
# position, independent of the frame's index labels.
y[(ids == exemplar_id).to_numpy()] = 1
if not y.any():
    raise ValueError(f"no row with id {exemplar_id}; the SVM needs a positive example")

# train our (Exemplar) SVM
# docs: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
clf.fit(x, y) # train

# Score the same rows the model was trained on and rank them, best first.
similarities = clf.decision_function(x)
sorted_ix = np.argsort(-similarities)
top_n = []
# The header is derived from `n_results` so it always matches the row count
# (it previously said "top 10" while 100 rows were listed).
print(f"top {n_results} results:")
for k in sorted_ix[:n_results]:
  print(f"row {k}, similarity {similarities[k]}")
  top_n.append(k)

top 10 results:
row 12, similarity 0.9921468465167289
row 8366, similarity -0.1245473268946986
row 13044, similarity -0.14611131065267347
row 24473, similarity -0.2653902864124237
row 2631, similarity -0.28794378992105085
row 3872, similarity -0.332090672765712
row 9090, similarity -0.33368159471277814
row 7794, similarity -0.35497496652896543
row 11512, similarity -0.3564880539268225
row 3314, similarity -0.3570965137151445
row 7621, similarity -0.36093120332841444
row 14698, similarity -0.37378156805589446
row 5484, similarity -0.3810033274403306
row 7347, similarity -0.38438970536333883
row 4378, similarity -0.38847254359100125
row 8215, similarity -0.39367581931287465
row 15102, similarity -0.40603995236606105
row 431, similarity -0.41294659638761566
row 8623, similarity -0.41682609696727846
row 17991, similarity -0.4249049416649814
row 238, similarity -0.4270815218867049
row 20314, similarity -0.4321367087715029
row 20801, similarity -0.43922777636717936
row 7404, similarity -0.44

In [35]:
# `top_n` contains row positions from the ranking above, so select by
# position; names[top_n] would treat them as index labels instead.
names.iloc[top_n]

3872         The Settlers of Zarahemla
9090                    Islas Canarias
7794     Die Siedler von Catan: Junior
11512                       Milestones
3314                       New England
Name: name, dtype: object