### Data cleaning and importing

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the card data from a pickle file; the path is relative to the current working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle('data/cards_clean_final.pkl')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer constructed with no arguments, i.e. scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [6]:
# Show the first row to check which columns the loaded frame contains.
df.head(1)

Unnamed: 0,id,name,highres_image,mana_cost,cmc,type_line,oracle_text,power,toughness,color_identity,...,Disturb,oracle_text_tokens,type_line_tokens,glove_embedding,glove_embedding_type_line,word2vec_embedding,word2vec_embedding_type_line,fasttext_embedding,fasttext_embedding_type_line,bert_embedding
0,0000579f-7b35-4ed3-b44c-db2a538066fe,Fury Sliver,True,{5}{R},6.0,Creature — Sliver,All Sliver creatures have double strike.,3.0,3.0,[R],...,0,"[all, sliver, creatures, have, double strike]","[creature, sliver]","[-0.1752575, 0.62444496, 0.66865623, -0.255527...","[-0.28311002, 0.507995, 0.9661, 0.11280501, 0....","[0.051513672, 0.017791748, -0.048034668, 0.063...","[0.19970703, 0.0063476562, -0.0904541, -0.0258...","[0.0073304246, -0.0107843, 0.017301675, 0.0127...","[0.00787795, 0.0245347, 0.008605201, 0.0164214...","[-0.014190583, 0.041132353, 0.060768433, -0.15..."


In [9]:
import re


def preprocess_text(text):
    """Normalise a card's rules text before TF-IDF vectorisation.

    Steps, in order:
      1. Float inputs (e.g. NaN for a missing value) are converted with ``str``.
      2. The text is lower-cased.
      3. Newlines are replaced by single spaces.
      4. Every brace-delimited token such as ``{T}`` or ``{1}`` is replaced by
         the word ``symbol``. No separator is inserted, so adjacent tokens fuse:
         ``{1}{R}`` becomes ``symbolsymbol``.
      5. Every character that is neither a word character nor whitespace is removed.

    Args:
        text: A string, or a float standing in for a missing value.

    Returns:
        The normalised string.
    """
    if isinstance(text, float):
        text = str(text)
    text = text.lower()
    text = text.replace("\n", " ")
    text = re.sub(r'{[^}]+}', 'symbol', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text


# Build the processed-text column only when the loaded frame does not already
# carry it, so a column that is already present is never overwritten while a
# fresh-kernel run still ends up with the column available.
if 'oracle_text_processed' not in df.columns:
    df['oracle_text_processed'] = df['oracle_text'].apply(preprocess_text)

In [11]:
# Fit the vectorizer on the processed rules text and transform it in one step.
# Requires df['oracle_text_processed'] to already exist when this cell runs.
X = vectorizer.fit_transform(df['oracle_text_processed'])
# Convert X to a dense array and store one row vector per card in the 'tfidf' column.
# NOTE(review): the dense copy holds rows x vocabulary values — watch memory on large vocabularies.
df['tfidf'] = list(X.toarray())

### Cosine Similarity

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Cards used as queries for the similarity checks.
test_names = [
    'Torbran, Thane of Red Fell',
    'Rocco, Street Chef',
    'Sai, Master Thopterist',
]
# Mask-based selection keeps df's row order, not the order of test_names.
test_cards = df.loc[df['name'].isin(test_names)]

In [17]:
# Score every card against the first test card, reshaped into a single-row 2-D query.
query_vector = test_cards['tfidf'].iloc[0].reshape(1, -1)
cosine_sim = cosine_similarity(df['tfidf'].tolist(), query_vector)
# Row positions of the five highest scores, best first.
# The query card is itself a row of df, so it is one of the candidates.
top5 = cosine_sim.ravel().argsort()[::-1][:5]
top5

array([ 1841, 14915, 16296, 25800, 28166], dtype=int64)

In [19]:
# top5 holds positional row numbers, so select with .iloc rather than label-based .loc.
df['oracle_text'].iloc[top5]

1887     If a red source you control would deal damage ...
15300    If a red or artifact source you control would ...
16719    Flying, haste\nIf a spell would deal damage to...
26465    If a red spell would deal damage to a permanen...
28869    Protection from red\nIf a red source would dea...
Name: oracle_text, dtype: object

### Nearest Neighbors

In [12]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index over the dense TF-IDF vectors: 5 neighbours per query, KD-tree search.
# No `metric` argument is passed, so scikit-learn's default distance is used
# (NOTE(review): Euclidean according to the sklearn docs — confirm for the installed version).
# NOTE(review): the sklearn docs say tree-based search loses its advantage in high
# dimensions; consider benchmarking algorithm='brute' on these vectors.
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(df['tfidf'].tolist())

In [14]:
# Preview the query cards selected for the similarity checks.
test_cards.head(3)

Unnamed: 0,id,name,mana_cost,cmc,type_line,oracle_text,power,toughness,colors,color_identity,keywords,legalities,text,tfidf
1899,064ce69c-da9c-4d7b-8ec1-4ad300c011d1,"Torbran, Thane of Red Fell",{1}{R}{R}{R},4.0,Legendary Creature — Dwarf Noble,If a red source you control would deal damage ...,2,4,['R'],['R'],[],"{'standard': 'not_legal', 'future': 'not_legal...",if a red source you control would deal damage ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6514,19316cbb-d1af-4ab7-b588-78637503e986,"Sai, Master Thopterist",{2}{U},3.0,Legendary Creature — Human Artificer,"Whenever you cast an artifact spell, create a ...",1,4,['U'],['U'],[],"{'standard': 'not_legal', 'future': 'not_legal...",whenever you cast an artifact spell create a 1...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
13452,421c2ed3-be60-4814-a82c-a0e3fbb97e63,"Rocco, Street Chef",{R}{G}{W},3.0,Legendary Creature — Elf Druid,"At the beginning of your end step, each player...",2,4,"['G', 'R', 'W']","['G', 'R', 'W']",['Food'],"{'standard': 'legal', 'future': 'legal', 'hist...",at the beginning of your end step each player ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [15]:
# Query the fitted index with the TF-IDF vector of every test card at once.
query_vectors = test_cards['tfidf'].tolist()
distances, indices = nn.kneighbors(query_vectors)

In [16]:
# Report the neighbours of the first query card only (row 0 of both result arrays).
for rank, (index, distance) in enumerate(zip(indices[0], distances[0]), start=1):
    print(f"Rank: {rank}, Index: {index}, Distance: {distance}")

Rank: 1, Index: 1823, Distance: 0.0
Rank: 2, Index: 14870, Distance: 0.2103649050410412
Rank: 3, Index: 16261, Distance: 0.6196337873034967
Rank: 4, Index: 25815, Distance: 0.7512734237693233
Rank: 5, Index: 11355, Distance: 0.7618967789432729
