# Imports

In [None]:
import gensim
from gensim import corpora
from pprint import pprint
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import numpy as np
import sklearn
import pandas as pd
import umap
from pymongo import MongoClient
from sklearn.neighbors import NearestNeighbors
import tqdm
import pickle

# Read Data 

In [None]:
# Sanity check: displays the type of the `spark` object, which is not created
# anywhere in this notebook (presumably injected by the kernel - TODO confirm).
type(spark)

In [None]:
# Location of the card dataset (Parquet) that the rest of the notebook works on.
dataset_path = '/mtgp/artifacts/dataset/M21_cards.parquet'

# Load the dataset through the `spark` object's Parquet reader.
df = spark.read.parquet(dataset_path)

In [None]:
def _row_to_dict(row):
    """Return the ``asDict()`` representation of a single row of ``df``."""
    return row.asDict()


# Map every row of the underlying RDD to its dictionary form.
as_dict = df.rdd.map(_row_to_dict)

In [None]:
# Collect the per-row dictionaries into `cards`; later cells index into it,
# take its length and iterate over it.
cards = as_dict.collect()

In [None]:
# Total number of collected cards (before any filtering on card text).
# The original called `len()` without an argument, which raises TypeError.
num_cards = len(cards)

# Tokenize

In [None]:
# Name of the card field that holds the text used for training.
field_text = "filteredText"

# Keep only the cards that have the text field set to a non-None value,
# reduced to the three attributes needed downstream.
data_list = [
    {'number': card['number'], 'text': card[field_text], 'name': card['name']}
    for card in cards
    if card.get(field_text) is not None
]

# NOTE: despite its name, `card_names` holds the card *numbers*.
card_names = [entry['number'] for entry in data_list]

# Raw texts, in the same order as `data_list`.
data = [entry['text'] for entry in data_list]

In [None]:
# Build one TaggedDocument per text: lower-cased, tokenized with NLTK, and
# tagged with the text's position in `data` (as a string).
tagged_data = []
for position, raw_text in enumerate(data):
    if raw_text is None:
        # Skip missing texts; the position counter still advances.
        continue
    tokens = word_tokenize(raw_text.lower())
    tagged_data.append(TaggedDocument(words=tokens, tags=[str(position)]))

In [None]:
# Number of documents that were actually tokenized and tagged.
num_tagged_cards = len(tagged_data)

# Train model

In [None]:
# Doc2Vec hyperparameters:
#   max_epochs - number of training iterations
#   vec_size   - dimensionality of the document vectors
#   alpha      - initial learning rate
max_epochs, vec_size, alpha = 100, 1000, 0.025

In [None]:
# Create the Doc2Vec model from the hyperparameters defined above.
# `min_count=1` keeps every token, `dm=1` selects the distributed-memory variant.
model = Doc2Vec(
    dm=1,
    vector_size=vec_size,
    alpha=alpha,
    min_alpha=0.00025,
    min_count=1,
)

# Scan the tagged documents once to build the vocabulary before training.
model.build_vocab(tagged_data)

In [None]:
# Train in a single call and let gensim manage the learning-rate schedule:
# it decays linearly from `model.alpha` to `model.min_alpha` over `max_epochs`.
#
# The previous version called `train()` inside a Python loop with
# `epochs=model.epochs` and adjusted `alpha`/`min_alpha` by hand. That ran
# `max_epochs * model.epochs` passes in total, and the first call used a
# different decay schedule from the rest. Gensim's documentation describes
# this loop as an error-prone anti-pattern.
model.train(
    tagged_data,
    total_examples=model.corpus_count,
    epochs=max_epochs,
)

In [None]:
# model.save("d2v.model")

In [None]:
# Pull every trained document vector out of the model, by integer position.
X = [model.docvecs[idx] for idx in tqdm.tqdm(range(len(model.docvecs)))]

# Stack the vectors into one array: one row per document.
docvecs = np.array(X)

# Project and embed data

In [None]:
# Seeded random state shared by the projection/embedding cells below.
# It is stateful: every consumer advances it.
rng = np.random.RandomState(seed=0)

In [None]:
# `import sklearn` alone does not guarantee that the `decomposition`
# submodule is loaded; import it explicitly instead of relying on another
# import having pulled it in as a side effect.
import sklearn.decomposition

# PCA with default settings, seeded with the shared random state.
pca = sklearn.decomposition.PCA(random_state=rng)

# NOTE(review): `X_docvecs` is not used by any later cell in this notebook;
# the UMAP embedding is fitted on the raw `docvecs` - confirm this is intended.
X_docvecs = pca.fit_transform(docvecs)

In [None]:
# UMAP configuration: 10 neighbours, cosine metric, and the shared random
# state used both for fitting and as the transform seed.
umap_settings = {
    'n_neighbors': 10,
    'metric': 'cosine',
    'random_state': rng,
    'transform_seed': rng,
}
mdl_umap = umap.UMAP(**umap_settings)

# Fit on the document vectors and keep the resulting embedding.
# NOTE(review): this uses the raw `docvecs`, not the PCA output `X_docvecs`.
embedded_docvecs = mdl_umap.fit_transform(docvecs)

In [None]:
# with open("../mdl_umap.pickle","wb") as fp:
#     pickle.dump(mdl_umap, fp)
# fp.close()

# Find similar cards (up to 20) and cache the results

In [None]:
# For every embedded card, find its nearest neighbours (up to 20) in the
# embedding space and record their card numbers.
#
# Fixes relative to the previous version:
#   * The index is fitted once, not once per card.
#   * Rows of `embedded_docvecs` come from the documents built out of
#     `data_list` (cards with text), so numbers are resolved through
#     `data_list`, not the unfiltered `cards`, which misaligned results
#     whenever a card had no text.
#   * The neighbour count is capped by the number of samples, and the query
#     no longer hard-codes a 2-dimensional embedding.
#
# As before, each card's own entry is part of its neighbour list, because the
# query points are the fitted points themselves.
n_neighbors = min(20, len(embedded_docvecs))

knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
knn.fit(embedded_docvecs)

# One batched query: row i holds the neighbour indices of embedded vector i.
neighs_ids = knn.kneighbors(embedded_docvecs, return_distance=False)

similarity_results = [
    {
        'card_number': entry['number'],
        'similar': [data_list[int(idx)]['number'] for idx in row],
    }
    for entry, row in zip(data_list, neighs_ids)
]

# Save feature vectors 

In [None]:
# Connect to the MongoDB instance on localhost.
client = MongoClient(host="localhost")

# Handle to the 'mtggg' database used by the (commented-out) insert cells.
db = client['mtggg']

In [None]:
# feats = [{'number': card['number'], 'docvect': feature_vector.tolist(), 'embedded_vect': embedded_vector.tolist()} 
#              for card, feature_vector, embedded_vector in zip(cards, docvecs, embedded_docvecs)
# ]

In [None]:
# result = db.ml.feats.v1.insert_many(feats)
# result.acknowledged

In [None]:
# result = db.ml.similar.insert_many(similarity_results)
# result.acknowledged

# Plot scatter

In [None]:
# Re-fit UMAP with the same settings as the earlier embedding cell.
# NOTE(review): this overwrites `mdl_umap` and `embedded_docvecs`. `rng` is
# a shared, stateful RandomState, so this fit is not guaranteed to reproduce
# the embedding that the similarity results were computed from - confirm
# whether the earlier embedding should be reused.
umap_settings = {
    'n_neighbors': 10,
    'metric': 'cosine',
    'random_state': rng,
    'transform_seed': rng,
}
mdl_umap = umap.UMAP(**umap_settings)
embedded_docvecs = mdl_umap.fit_transform(docvecs)

In [None]:
# Assemble the plotting frame column by column so that the coordinates keep
# their numeric dtype. The previous `np.hstack` of the float embedding with
# the string columns coerced the whole array - including x and y - to
# strings.
#
# The embedding is expected to have exactly two columns (the original code
# named them 'x' and 'y'). As before, 'name' is filled from `card_names`.
# NOTE: this rebinds `df`, which previously referred to the Spark DataFrame.
df = pd.DataFrame(
    {
        'x': embedded_docvecs[:, 0],
        'y': embedded_docvecs[:, 1],
        'text': data,
        'name': card_names,
    },
    columns=['x', 'y', 'text', 'name'],
)

In [None]:
import plotly
import plotly.express as px

# Enable plotly rendering inside the notebook.
plotly.offline.init_notebook_mode(connected=True)

# Scatter plot of the embedding; hovering a point shows the 'name' and
# 'text' columns of the plotting frame.
hover_columns = ['name', 'text']
fig = px.scatter(df, x='x', y='y', hover_data=hover_columns)
fig.show()