In [3]:
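# Run this cell first. To reuse the saved embeddings instead of recomputing them,
# skip ahead and run from the "Cosine Similarity" section onwards (it reloads data/cards_bert.csv).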

import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the cleaned card table that the preprocessing and embedding cells below operate on.
df = pd.read_csv(filepath_or_buffer='data/cards_clean.csv')

In [2]:
from scripts_and_functions.functions import preprocess_text

# Derive the 'text' column from the raw 'oracle_text' column via the project helper,
# then keep only the rows whose 'text' value is not missing.
df = (
    df
    .assign(text=preprocess_text(df['oracle_text']))
    .dropna(subset=['text'])
)

In [5]:
# The tokenizer and the encoder are both loaded from the same pretrained checkpoint.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# Put the model in evaluation mode; eval() returns the module, so its repr is the cell output.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [7]:
#from scripts_and_functions.functions import bert_embedding

def bert_embedding(text):
    '''
    Takes in a string and returns the BERT [CLS] embedding of the string.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    text : str
        The text to embed. Text that tokenizes to more positions than the model
        supports is truncated instead of failing inside the model.

    Returns
    -------
    numpy.ndarray
        1-D array holding the last-layer hidden state of the first token
        ([CLS]); 768 values for the bert-base-uncased model loaded above.
    '''
    # add_special_tokens adds [CLS], [SEP], <s>... tokens in the right way for each model.
    # truncation/max_length cap the sequence at the size of the model's position-embedding
    # table; without them an over-long text makes the forward pass raise.
    input_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    input_ids = torch.tensor(input_ids).unsqueeze(0)  # Batch size 1

    # Inference only: no gradients are needed to read out the embeddings.
    with torch.no_grad():
        outputs = model(input_ids)

    # outputs[0] contains the hidden states of the last layer.
    # Index [0, 0, :] selects batch item 0, token 0 ([CLS]), all hidden dimensions.
    embeddings = outputs[0][0, 0, :].numpy()

    return embeddings

# Embed every card's cleaned text. bert_embedding already returns a NumPy array, so the
# values must NOT be passed through np.fromstring(x[1:-1], sep=' ') here: that string-parsing
# step only applies to the text form read back from a CSV file.
df['bert'] = df['text'].apply(bert_embedding)

### Cosine Similarity

In [4]:
def _parse_embedding(serialized):
    '''Turn one stored embedding string back into a NumPy array.'''
    # Drop the first and last character (the surrounding brackets), then parse the
    # whitespace-separated numbers in between.
    return np.fromstring(serialized[1:-1], sep=' ')


# Embeddings come back from the CSV as strings, so they must be converted
# to NumPy arrays right after reading the file.
df = pd.read_csv('data/cards_bert.csv')
df['bert'] = df['bert'].apply(_parse_embedding)

In [16]:
# Cards used as example queries for the similarity searches below.
test_names = [
    'Torbran, Thane of Red Fell',
    'Rocco, Street Chef',
    'Sai, Master Thopterist',
]
# Keep the rows of df whose 'name' is one of the test names.
test_cards = df.loc[df['name'].isin(test_names)]

In [17]:
# Query vector: the embedding of the first test card, shaped as a single-row 2-D array.
query_vector = test_cards['bert'].iloc[0].reshape(1, -1)

# Cosine similarity of every card embedding against the query (one column of scores).
cosine = cosine_similarity(list(df['bert']), query_vector)

# Row positions of the five highest scores, best first: argsort is ascending,
# so walk the last five entries backwards.
top5 = np.argsort(cosine.ravel())[:-6:-1]
top5

array([ 1823, 14870,  5949, 10336,  3127], dtype=int64)

### Nearest Neighbors

In [18]:
from sklearn.neighbors import NearestNeighbors

# Index all card embeddings with a KD-tree so the 5 nearest neighbours of a query can be looked up.
embedding_rows = df['bert'].tolist()
nn = NearestNeighbors(algorithm='kd_tree', n_neighbors=5)
nn.fit(embedding_rows)

In [19]:
# Display the first three selected test cards (their row order matches the neighbour results below).
test_cards.head(n=3)

Unnamed: 0.1,Unnamed: 0,id,name,mana_cost,cmc,type_line,oracle_text,power,toughness,colors,color_identity,keywords,legalities,text,bert
1823,1823,064ce69c-da9c-4d7b-8ec1-4ad300c011d1,"Torbran, Thane of Red Fell",{1}{R}{R}{R},4.0,Legendary Creature — Dwarf Noble,If a red source you control would deal damage ...,2,4,['R'],['R'],[],"{'standard': 'not_legal', 'future': 'not_legal...",if a red source you control would deal damage ...,"[-0.304124594, -0.172151417, 0.0978919193, 0.1..."
6279,6279,19316cbb-d1af-4ab7-b588-78637503e986,"Sai, Master Thopterist",{2}{U},3.0,Legendary Creature — Human Artificer,"Whenever you cast an artifact spell, create a ...",1,4,['U'],['U'],[],"{'standard': 'not_legal', 'future': 'not_legal...","whenever you cast an artifact spell, create a ...","[-0.0737882555, -0.374748558, -0.142685503, -0..."
12970,12970,421c2ed3-be60-4814-a82c-a0e3fbb97e63,"Rocco, Street Chef",{R}{G}{W},3.0,Legendary Creature — Elf Druid,"At the beginning of your end step, each player...",2,4,"['G', 'R', 'W']","['G', 'R', 'W']",['Food'],"{'standard': 'legal', 'future': 'legal', 'hist...","at the beginning of your end step, each player...","[-0.0846293569, -0.849071622, -0.0179633982, -..."


In [30]:
# Look up the nearest neighbours of every test card in one call; each result has one row per query card.
query_vectors = test_cards['bert'].tolist()
distances, indices = nn.kneighbors(query_vectors)
indices

array([[ 1823, 14870,  5949, 10336,  2650],
       [ 6279,  3150,  5533, 25877,  3496],
       [12970, 17523,  8455,  5178, 14659]], dtype=int64)

In [23]:
# Report the neighbours of the first test card, pairing each row index with its distance.
first_query = zip(indices[0], distances[0])
for rank, (index, distance) in enumerate(first_query, start=1):
    print(f"Rank: {rank}, Index: {index}, Distance: {distance}")

Rank: 1, Index: 1823, Distance: 0.0
Rank: 2, Index: 14870, Distance: 2.96660180306115
Rank: 3, Index: 5949, Distance: 4.703805388099132
Rank: 4, Index: 10336, Distance: 4.788979196948505
Rank: 5, Index: 2650, Distance: 4.9183919577294635
