### Data cleaning and importing

In [29]:
import pandas as pd
import numpy as np

# Load the cleaned card table; the path is relative to the notebook's working directory.
cards_path = 'data/cards_clean.csv'
df = pd.read_csv(cards_path)

In [30]:
# Keep only rows that have rules text. Index labels are not reset, so
# label-based and position-based lookups can differ from here on.
df = df.dropna(subset=['oracle_text'])

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with library defaults; it is fitted on the cleaned text in a later cell.
vectorizer = TfidfVectorizer()

In [92]:
import re

def preprocess_text(text):
    """Normalize a card's rules text into lowercase, punctuation-free words.

    Steps, in order: lowercase the text; turn newlines into spaces; replace
    every brace-delimited symbol (e.g. ``{T}``, ``{2}``) with the standalone
    word ``symbol``; delete every character that is neither a word character
    nor whitespace; collapse runs of whitespace into single spaces.

    Args:
        text: The raw text. Any non-string value (for example a float NaN
            coming from a missing cell) is converted with ``str()`` first.

    Returns:
        The cleaned, single-spaced string with no leading/trailing spaces.
    """
    if not isinstance(text, str):
        text = str(text)
    text = text.lower().replace("\n", " ")
    # Pad the placeholder with spaces so that adjacent symbols ("{G}{G}") or a
    # symbol touching a word do not fuse into one token such as "symbolsymbol".
    text = re.sub(r'\{[^}]+\}', ' symbol ', text)
    text = re.sub(r'[^\w\s]', '', text)
    # The padding above and the removed punctuation leave repeated spaces behind.
    return ' '.join(text.split())

# Clean every card's rules text element-wise and preview the first five results.
df['text'] = df['oracle_text'].map(preprocess_text)
df['text'].head(5)

0              all sliver creatures have double strike
1    when kor outfitter enters the battlefield you ...
2    flying when siren lookout enters the battlefie...
3    enchant creature target a creature as you cast...
4    paradox  draw a card for each spell youve cast...
Name: text_processed, dtype: object

In [93]:
# Fit the vectorizer on the cleaned text and get one TF-IDF row per card.
X = vectorizer.fit_transform(df['text'])
# Store each card's row as a dense array in the frame. Note that toarray()
# materialises the whole matrix densely, which can be memory-heavy.
df['tfidf'] = [row for row in X.toarray()]

### Cosine Similarity

In [61]:
from sklearn.metrics.pairwise import cosine_similarity

In [99]:
# Cards used as similarity queries. The selected rows come back in `df`
# order, not in the order the names are listed here.
test_names = [
    'Torbran, Thane of Red Fell',
    'Rocco, Street Chef',
    'Sai, Master Thopterist',
]
is_test_card = df['name'].isin(test_names)
test_cards = df[is_test_card]

In [None]:
# Query vector: the first test card's TF-IDF row, shaped as a single-row matrix.
query_vec = test_cards['tfidf'].iloc[0].reshape(1, -1)
# Similarity of every card to the query (one column, one row per card).
cosine_sim = cosine_similarity(df['tfidf'].tolist(), query_vec)
# Row positions of the five highest similarities, most similar first.
top5 = cosine_sim.ravel().argsort()[::-1][:5]
top5

In [98]:
# `top5` holds row positions (not index labels), so select positionally.
df['text'].iloc[top5]

1899     if a red source you control would deal damage ...
15436    if a red or artifact source you control would ...
16876    flying haste if a spell would deal damage to y...
26799    if a red spell would deal damage to a permanen...
11782    flying first strike if a source would deal dam...
Name: text_processed, dtype: object

### Nearest Neighbors

In [62]:
from sklearn.neighbors import NearestNeighbors

# Build a 5-nearest-neighbour index over every card's TF-IDF vector.
# NOTE(review): scikit-learn documents tree-based search as inefficient for
# high-dimensional data; consider algorithm='auto' or 'brute' — confirm
# that the reported distances stay acceptable before switching.
nn = NearestNeighbors(algorithm='kd_tree', n_neighbors=5)
nn.fit(list(df['tfidf']))

In [67]:
# Use the first selected test card as the query and show which card it is.
test_case = test_cards.iloc[0]
test_vec = test_case.loc['tfidf']
test_case.loc['name']

'Torbran, Thane of Red Fell'

In [70]:
# Query the index with a single-row matrix; both results have one row,
# holding the neighbour distances and their row positions respectively.
distances, indices = nn.kneighbors(test_vec.reshape(1, -1))

In [71]:
# Report each neighbour of the single query, closest first, with its row position and distance.
for rank, (index, distance) in enumerate(zip(indices[0], distances[0]), start=1):
    print(f"Rank: {rank}, Index: {index}, Distance: {distance}")

Rank: 1, Index: 1823, Distance: 0.0
Rank: 2, Index: 14870, Distance: 0.2103649050410412
Rank: 3, Index: 16261, Distance: 0.6196337873034967
Rank: 4, Index: 25815, Distance: 0.7512734237693233
Rank: 5, Index: 11355, Distance: 0.7618967789432729


In [72]:
# Show the original rules text of the rank-2 neighbour (rank 1 is at distance
# 0.0, presumably the query card itself). The row position is read from
# `indices` rather than hard-coded, so the cell stays correct if the data or
# the fitted model changes.
df.iloc[indices[0][1]]['oracle_text']

'If a red or artifact source you control would deal damage to an opponent or a permanent an opponent controls, it deals that much damage plus 1 instead.'

In [82]:
# Raw rules text of every row named 'Trace of Abundance', as an array.
df.loc[df['name'] == 'Trace of Abundance', 'oracle_text'].values

array(["Enchant land\nEnchanted land has shroud. (It can't be the target of spells or abilities.)\nWhenever enchanted land is tapped for mana, its controller adds an additional one mana of any color."],
      dtype=object)