In [605]:
#importations and models

import pandas as pd
import gensim.downloader as api
import numpy as np
import re
import gensim.downloader as api
from gensim.models import Word2Vec, FastText

# Load GloVe embeddings from gensim's online repository
glove_word_vectors = api.load('glove-wiki-gigaword-100')

# Load Word2Vec embeddings from gensim's online repository
word2vec_model = api.load('word2vec-google-news-300')

# Load FastText embeddings from gensim's online repository
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

In [606]:
#original file upload
file_path = "mycards.json"
df = pd.read_json(file_path)

In [607]:
#cleaning
df = df[df.lang == 'en']
df = df[df.layout != 'token']
df = df[df.layout != 'art_series']
df = df[df.layout != 'double_faced_token']
df = df[df.layout != 'vanguard']
df = df[df.layout != 'emblem']
df = df[df.layout != 'planar']
df = df[df.layout != 'scheme']
df = df[df.type_line != 'Hero']
df = df[df.type_line != 'Conspiracy']
df = df[~((df.name == 'Plains') | (df.name == 'Island') | (df.name == 'Swamp') | (df.name == 'Mountain') | (df.name == 'Forest'))]
df = df[~((df.name == 'Plains // Plains') | (df.name == 'Island // Island') | (df.name == 'Swamp // Swamp') | (df.name == 'Mountain // Mountain') | (df.name == 'Forest // Forest'))]
df=df[df.set_type != 'memorabilia']
df = df.drop_duplicates(subset=['name'])
df = df[df['set_name'] != 'Unhinged']
df = df[df['set_name'] != 'Unglued']
df = df[df['set_name'] != 'Unstable']

#two-sided
df_explode = df[df['name'].str.contains("//")]
df_explode = df_explode.explode('card_faces')
df_explode.reset_index(drop=True, inplace=True)
df_normalized = pd.json_normalize(df_explode['card_faces'])

#keeping id numbers
df_normalized['id']=df_explode['id']
df_normalized['color_identity']=df_explode['color_identity']
df_normalized['colors']=df_explode['colors']
df_normalized['keywords']=df_explode['keywords']
df_normalized['legalities']=df_explode['legalities']

# Concatenate the normalized DataFrame with the original DataFrame
df = pd.concat([df, df_normalized], ignore_index=True)

#dropping duals after 2-sided split
df=df[~df.name.str.contains('//')]

#column dropping
cols_to_drop = np.array(df.columns[[0, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15, 16, 23, 27, 28, 29, 30, 31, 32, 33] + list(range(34, 85))])
df.drop(columns=cols_to_drop, inplace=True)
cols_to_drop = np.array(df.columns[list(range(12, 20))])
df.drop(columns=cols_to_drop, inplace=True)

#further cleaning
df['oracle_text'] = df['oracle_text'].fillna('')
df['power'] = df['power'].fillna(0)
df['toughness'] = df['toughness'].fillna(0)
df['power'] = pd.to_numeric(df['power'], errors='coerce')
df['toughness'] = pd.to_numeric(df['toughness'], errors='coerce')
df['power'].fillna(power_mean, inplace=True)
df['toughness'].fillna(toughness_mean, inplace=True)

#silly card
df=df[~df.name.str.contains('The Ultimate Nightmare of Wizards of the Coast® Customer Service')]

In [608]:
#getting mana counts
def count_mana_symbols(text):
    counts = {'colorless': 0, 'U': 0, 'G': 0, 'W': 0, 'B': 0, 'R': 0, 'X': 0, 'C' :0, 'P':0, 'S':0}
    pattern = r'{([^}]*)}'
    mana_symbols = re.findall(pattern, text)
    for symbol in mana_symbols:
        if symbol.isdigit():
            counts['colorless'] += int(symbol)
        elif '/' in symbol:
            colors = symbol.split('/')
            for color in colors:
                if color in counts:
                    counts[color] += 0.5
                else:
                    print(f"Unexpected color: {color}")
        elif symbol in counts:
            counts[symbol] += 1
        else:
            print(f"Unexpected symbol: {symbol}")
    
    return counts
df = df.join(df['mana_cost'].astype(str).apply(lambda x: pd.Series(count_mana_symbols(x))))

#filling cmc
df.loc[df.cmc.isna(), 'cmc'] = df[['colorless', 'U', 'G', 'W', 'B', 'R', 'X', 'C', 'P', 'S']].sum(axis=1)

column_mapping = {
    'colorless': 'colorless_int',
    'U': 'U_int',
    'G': 'G_int',
    'W': 'W_int',
    'B': 'B_int',
    'R': 'R_int',
    'X': 'X_int',
    'C': 'C_int',
    'P': 'P_int',
    'S': 'S_int'
}

# Rename the columns using the mapping
df.rename(columns=column_mapping, inplace=True)

# Define a list of all possible colors
all_colors = ['W', 'U', 'B', 'R', 'G']

# Create separate binary columns for each color and one for "None"
for color in all_colors + ['None']:
    df[f'color_{color}'] = df['color_identity'].apply(lambda x: 1 if color in x else 0 if color in x else 1 if x == [] else 0)


Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2
Unexpected color: 2


In [609]:
from collections import Counter
keyword_counts = Counter(keyword for sublist in df['keywords'] for keyword in sublist)
selected_keywords = [keyword for keyword, count in keyword_counts.items() if count > 50]
for keyword in selected_keywords:
    df[keyword] = df['keywords'].apply(lambda x: 1 if keyword in x else 0)

In [610]:
# Function to preprocess text
def preprocess_text(text):
    if isinstance(text, float):
        text = str(text)
    text = text.lower()
    text = text.replace("\n", " ")
    text = re.sub(r'{[^}]+}', 'symbol', text)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize text based on word boundaries
    tokens = re.findall(r'\b\w+\b', text)
    
    # Specify multi-word phrases to be merged
    multi_word_phrases = ['first strike', 'double strike']  # Add other phrases as needed
    
    # Merge consecutive tokens forming multi-word phrases
    merged_tokens = []
    i = 0
    while i < len(tokens):
        if i < len(tokens) - 1 and ' '.join([tokens[i], tokens[i+1]]) in multi_word_phrases:
            merged_tokens.append(' '.join([tokens[i], tokens[i+1]]))
            i += 2
        else:
            merged_tokens.append(tokens[i])
            i += 1
    
    return merged_tokens


In [611]:
# Function to obtain embeddings using GloVe
def get_glove_embedding(tokens):
    embeddings = []
    for token in tokens:
        try:
            embeddings.append(glove_word_vectors[token])
        except KeyError:
            # Handle out-of-vocabulary tokens
            pass
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(glove_word_vectors.vector_size)

# Function to obtain embeddings using Word2Vec
def get_word2vec_embedding(tokens):
    embeddings = []
    for token in tokens:
        if token in word2vec_model:
            embeddings.append(word2vec_model[token])
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

# Function to obtain embeddings using FastText
def get_fasttext_embedding(tokens):
    embeddings = []
    for token in tokens:
        if token in fasttext_model:
            embeddings.append(fasttext_model[token])
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(fasttext_model.vector_size)

# Apply preprocessing and get embeddings for each embedding type
df['oracle_text_tokens'] = df['oracle_text'].apply(preprocess_text)
df['type_line_tokens'] = df['type_line'].apply(preprocess_text)
df['glove_embedding'] = df['oracle_text_tokens'].apply(get_glove_embedding)
df['glove_embedding_type_line'] = df['type_line_tokens'].apply(get_glove_embedding)
df['word2vec_embedding'] = df['oracle_text_tokens'].apply(get_word2vec_embedding)
df['word2vec_embedding_type_line'] = df['type_line_tokens'].apply(get_word2vec_embedding)
df['fasttext_embedding'] = df['oracle_text_tokens'].apply(get_fasttext_embedding)
df['fasttext_embedding_type_line'] = df['type_line_tokens'].apply(get_fasttext_embedding)

In [612]:
df.to_csv("cards_clean_final.csv", index=False)


In [613]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# 1. Separate features into different groups
numerical_features = df[['cmc', 'power', 'toughness', 'colorless_int', 'U_int', 'G_int', 'W_int', 'B_int', 'R_int', 'X_int', 'C_int', 'P_int', 'S_int']]
binary_features = df[list(df.columns[22:86])]
text_embeddings = df[list(df.columns[88:94])]

# 2. Normalize numerical features
scaler = StandardScaler()
normalized_numerical_features = scaler.fit_transform(numerical_features)

# 3. Reshape embedding features
reshaped_glove_embedding = np.array(df['glove_embedding'].tolist())
reshaped_word2vec_embedding = np.array(df['word2vec_embedding'].tolist())
reshaped_fasttext_embedding = np.array(df['fasttext_embedding'].tolist())

# 4. Concatenate features
X = np.concatenate([
    normalized_numerical_features, 
    binary_features.values, 
    reshaped_glove_embedding,
    reshaped_word2vec_embedding,
    reshaped_fasttext_embedding
], axis=1)

# 5. Train KNN model
knn = NearestNeighbors(n_neighbors=5, algorithm='auto')
knn.fit(X)



In [614]:
def get_features_for_card(card_name):
    # Extract features for the given card name
    card_numerical_features = df.loc[df['name'] == card_name, numerical_features.columns].values
    card_binary_features = df.loc[df['name'] == card_name, binary_features.columns].values
    card_text_embeddings = df.loc[df['name'] == card_name, text_embeddings.columns].values
    
    # Normalize numerical features
    card_numerical_features_normalized = scaler.transform(card_numerical_features)
    
    # Reshape embedding features
    reshaped_glove_embedding = np.array(df.loc[df['name'] == card_name, 'glove_embedding'].tolist())
    reshaped_word2vec_embedding = np.array(df.loc[df['name'] == card_name, 'word2vec_embedding'].tolist())
    reshaped_fasttext_embedding = np.array(df.loc[df['name'] == card_name, 'fasttext_embedding'].tolist())

    # Concatenate features
    card_features = np.concatenate([
        card_numerical_features_normalized, 
        card_binary_features, 
        reshaped_glove_embedding,
        reshaped_word2vec_embedding,
        reshaped_fasttext_embedding
    ], axis=1)
    
    return card_features

def find_similar_cards(card_name, k=5):
    # Get features for the input card
    input_card_features = get_features_for_card(card_name)
    
    # Find most similar cards to the input card
    distances, indices = knn.kneighbors(input_card_features)
    
    # Output most similar cards
    similar_cards = df.iloc[indices[0]]
    
    return similar_cards

# Example usage:
input_card_name = "Fury Sliver"
similar_cards = find_similar_cards(input_card_name)
similar_cards['name']




0                Fury Sliver
19971       Battering Sliver
17850          Battle Sliver
27176         Cyclops Tyrant
9530     Bonesplitter Sliver
Name: name, dtype: object

In [None]:
#%pip install torch

In [None]:
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

if 'oracle_text_preprocessed' not in df.columns:
    df['oracle_text_preprocessed'] = df['oracle_text'].apply(lambda x: x.lower() if isinstance(x, str) else x)

def encode_text_with_bert(text):
    if isinstance(text, str) and text.strip():  # Check if the input text is a non-empty string
        tokens = tokenizer.tokenize(text)
        if tokens:  # Check if tokens are generated
            inputs = tokenizer.encode_plus(tokens, return_tensors='pt', padding=True, truncation=True)
            with torch.no_grad():
                outputs = model(**inputs)
                embeddings = outputs[0].mean(dim=1)  # Access the output tensor and use the mean of the hidden states as embeddings
            return embeddings.numpy()
    # Return zeros if the input text is not valid for encoding
    return np.zeros((1, model.config.hidden_size))

df['bert_embedding'] = df['oracle_text_preprocessed'].apply(encode_text_with_bert)