# Imports

In [76]:
import gensim
from gensim import corpora
from pprint import pprint
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import numpy as np
import sklearn
import umap
from pymongo import MongoClient
from sklearn.neighbors import NearestNeighbors
import tqdm
import pickle

# Read Data 

In [63]:
type(spark)

pyspark.sql.session.SparkSession

In [2]:
df = spark.read.parquet('../dataset/THB_cards.parquet')

In [3]:
as_dict = df.rdd.map(lambda row: row.asDict())

In [4]:
cards = as_dict.collect()

# Tokenize

In [46]:
field_text = "filteredText"

data_list = [{'number': card['number'], 'text': card[field_text]} for card in cards if field_text in card if card[field_text] is not None]
data = list(map(lambda card: card['text'], data_list))

In [47]:
tagged_data = [
    TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
    for i, _d in enumerate(data) if _d is not None
]

# Train model

In [48]:
max_epochs = 100
vec_size = 50
alpha = 0.025

In [49]:
model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1)
model.build_vocab(tagged_data)

In [50]:
for epoch in tqdm.tqdm(range(max_epochs)):
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

100%|██████████| 100/100 [00:11<00:00,  8.80it/s]


In [10]:
# model.save("d2v.model")

In [11]:
X = list()

for i in tqdm.tqdm(range(len(model.docvecs))):
    X.append(model.docvecs[i])
    
docvecs = np.array(X)

100%|██████████| 268/268 [00:00<00:00, 29699.68it/s]


# Project and embed data

In [71]:
rng = np.random.RandomState(0)

In [72]:
# pca = sklearn.decomposition.PCA(random_state=rng)
# X = pca.fit_transform(docvecs)

In [73]:
mdl_umap = umap.UMAP(n_neighbors=5, metric='cosine', random_state=rng, transform_seed=rng)
embedded_docvecs = mdl_umap.fit_transform(docvecs)

In [74]:
with open("../mdl_umap.pickle","wb") as fp:
    pickle.dump(mdl_umap, fp)
fp.close()

# Find similar cards (up to 20) and cache the results

In [111]:
similarity_results = list()

for card, embedded_vector in zip(cards, embedded_docvecs):
    knn = NearestNeighbors(n_neighbors=10)
    knn.fit(embedded_docvecs)
    
    distances, neighs_ids = knn.kneighbors(np.reshape(embedded_vector,[1, 2]), 20, return_distance=True)
    
    similarity_results.append({'number': card['number'], 'similar': neighs_ids[0].tolist()})

# Save feature vectors 

In [101]:
client = MongoClient("localhost")
db = client['mtgp']

In [102]:
feats = [{'number': card['number'], 'docvect': feature_vector.tolist(), 'embedded_vect': embedded_vector.tolist()} 
             for card, feature_vector, embedded_vector in zip(cards, docvecs, embedded_docvecs)
]

In [103]:
result = db.ml.feats.v1.insert_many(feats)
result.acknowledged

True

In [112]:
result = db.ml.similar.v1.insert_many(similarity_results)
result.acknowledged

True

# Plot scatter

In [18]:
# import plotly.express as px
# fig = px.scatter(x=xy[:, 0], y=xy[:, 1])
# fig.show()

In [19]:
# import matplotlib.pyplot as plt
# plt.scatter(xy[:, 0], xy[:, 1])