<div align="center"><h3>HW2, Part1</h3></div>
<div align="center"><h5>Mohammadreza Ghofrani, 400131076</h5></div>

In [1]:
from hazm import *
import numpy as np
import pandas as pd
from pprint import pprint
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from gensim.corpora import Dictionary
from gensim.models import Word2Vec, TfidfModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
class MaxHolder:
    """Track the largest value seen so far together with an associated index.

    Usage:
        holder = MaxHolder()
        holder.max = (value, index)   # offer a candidate; kept only if larger
        best_value, best_index = holder.max
        del holder.max                # forget everything and start over

    Values are compared with ``>``, so anything comparable to a float works,
    including single-element NumPy arrays. On ties the earliest candidate
    is kept because the comparison is strict.
    """

    def __init__(self):
        self._reset()

    def _reset(self):
        """Put the holder back into its initial "nothing seen yet" state."""
        self._max = -np.inf
        self._max_index = None

    @property
    def max(self):
        """Return ``(best_value, best_index)``; ``(-inf, None)`` if nothing was offered."""
        return (self._max, self._max_index)

    @max.setter
    def max(self, value):
        # `value` is a (candidate, index) pair; indexing (rather than unpacking)
        # keeps accepting sequences that carry extra trailing items.
        v = value[0]
        v_idx = value[1]
        if v > self._max:
            self._max = v
            self._max_index = v_idx

    @max.deleter
    def max(self):
        # Reset both fields. Deleting only `_max` would leave a stale index
        # behind and make the next read of `.max` raise AttributeError.
        self._reset()

# Step 1: Word Representation

In [3]:
# Load both CSV splits. `train_df` provides the articles the models are fitted
# on; `test_df` provides the query articles that are looked up by their `id`.
train_df, test_df = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/test.csv'),
)

In [4]:
# Here each article is transformed into a list of its tokens.
# The hazm normalizer is built once and shared by every call; constructing a
# fresh one per article is loop-invariant work.
_normalizer = Normalizer()


def tokenizer(text):
    """Normalize `text` with hazm and split it into a list of word tokens.

    Args:
        text: raw article text (a string).

    Returns:
        The list of tokens produced by hazm's `word_tokenize` on the
        normalized text.
    """
    return word_tokenize(_normalizer.normalize(text))

# One token list per training article, in the same order as the rows of train_df.
tokens_of_articles = [tokenizer(raw_article) for raw_article in train_df['article']]

In [5]:
# Word2Vec hyper-parameters, gathered in one place for readability.
# min_count=0 keeps every token in the vocabulary, so each word of the gensim
# Dictionary built from the same corpus can later be looked up in the model.
word2vec_params = dict(
    vector_size=300,
    window=5,
    min_count=0,
    sg=1,
    epochs=5,
    workers=4,
)
word2vec_model = Word2Vec(sentences=tokens_of_articles, **word2vec_params)

# Step 2: Document Representation

## Part 1

In [6]:
# Token -> integer id mapping built from the training articles.
dct = Dictionary(tokens_of_articles)
# Bag-of-words form of every article: a list of (token_id, count) pairs.
bow_corpus = list(map(dct.doc2bow, tokens_of_articles))
# TF-IDF weighting fitted on the bag-of-words corpus.
tfidf_model = TfidfModel(bow_corpus)

In [7]:
def get_tfidf_repr(doc):
    """Embed a document as the TF-IDF-weighted average of its word vectors.

    Args:
        doc: iterable of ``(word_id, tfidf_score)`` pairs, e.g. the result of
            ``tfidf_model[dct.doc2bow(tokens)]``. Word ids are resolved through
            the module-level ``dct`` and embedded with ``word2vec_model``.

    Returns:
        A 1-D vector: ``sum(score * vector) / sum(score)``. If the document has
        no weighted terms (empty input, or weights summing to zero) a zero
        vector of the model's dimensionality is returned instead of dividing
        by zero.
    """
    doc_repr = 0
    doc_weight_sum = 0
    for word_id, tfidf_score in doc:
        word = dct[word_id]
        word2vec_score = word2vec_model.wv[word]
        doc_repr += (tfidf_score * word2vec_score)
        doc_weight_sum += tfidf_score

    if doc_weight_sum == 0:
        # Nothing to average: without this guard the expression below is 0 / 0
        # and raises ZeroDivisionError.
        return np.zeros(word2vec_model.wv.vector_size, dtype=np.float32)
    return doc_repr / doc_weight_sum

In [8]:
# TF-IDF-weighted word2vec embedding of every training article, in corpus order.
word2vec_representation_of_documents = [
    get_tfidf_repr(weighted_bow) for weighted_bow in tfidf_model[bow_corpus]
]

## Part 2

In [9]:
# Tag every training article with its position in `tokens_of_articles`.
tagged_articles = [TaggedDocument(art, [i]) for i, art in enumerate(tokens_of_articles)]

# The corpus is deliberately NOT passed to the constructor. When it is, gensim
# builds the vocabulary and trains right away; the explicit build_vocab() below
# would then re-initialise the model and train() would run a second full
# training, throwing the first one away.
doc2vec_model = Doc2Vec(vector_size=300, window=5,
                        min_count=0, workers=6, dm=0, epochs=7)
doc2vec_model.build_vocab(tagged_articles)
doc2vec_model.train(tagged_articles, total_examples=doc2vec_model.corpus_count,
                    epochs=doc2vec_model.epochs)

In [10]:
# Infer a doc2vec embedding for every training article, in corpus order.
doc2vec_representation_of_documents = [
    doc2vec_model.infer_vector(tokens) for tokens in tokens_of_articles
]

# Step 3: Similar Documents

## Part 1

In [11]:
print('Using TF-IDF&word2vec')

# Stack the training embeddings once so each query needs a single vectorised
# cosine-similarity call instead of one call per training document.
word2vec_doc_matrix = np.vstack(word2vec_representation_of_documents)

for ref_docid in ['Doc1', 'Doc3', 'Doc5', 'Doc25', 'Doc36']:
    article = test_df[test_df['id'] == ref_docid]['article'].values[0]
    ref_article_tokens = tokenizer(article)
    ref_article_embedding = get_tfidf_repr(tfidf_model[dct.doc2bow(ref_article_tokens)])

    # Row 0 holds the similarity of the query to every training document.
    sims = cosine_similarity(ref_article_embedding[np.newaxis], word2vec_doc_matrix)[0]
    # argmax returns the first maximum, i.e. the earliest document on ties.
    docindex = int(np.argmax(sims))
    sim = sims[docindex]

    # Two-axis .iloc: row `docindex`, first column (the document id). Avoids
    # the deprecated positional `Series[0]` lookup.
    sim_docid = train_df.iloc[docindex, 0]
    print(f'\tmost similar document to {ref_docid} is {sim_docid} with similarity {sim:.4f}')

Using TF-IDF&word2vec
	most similar document to Doc1 is Doc165 with similarity 0.9894
	most similar document to Doc3 is Doc19 with similarity 0.9961
	most similar document to Doc5 is Doc26 with similarity 0.9885
	most similar document to Doc25 is Doc679 with similarity 1.0000
	most similar document to Doc36 is Doc7 with similarity 0.9885


## Part 2

In [12]:
print('Using doc2vec')

# Stack the training embeddings once so each query needs a single vectorised
# cosine-similarity call instead of one call per training document.
doc2vec_doc_matrix = np.vstack(doc2vec_representation_of_documents)

for ref_docid in ['Doc1', 'Doc3', 'Doc5', 'Doc25', 'Doc36']:
    article = test_df[test_df['id'] == ref_docid]['article'].values[0]
    ref_article_tokens = tokenizer(article)
    ref_article_embedding = doc2vec_model.infer_vector(ref_article_tokens)

    # Row 0 holds the similarity of the query to every training document.
    sims = cosine_similarity(ref_article_embedding[np.newaxis], doc2vec_doc_matrix)[0]
    # argmax returns the first maximum, i.e. the earliest document on ties.
    docindex = int(np.argmax(sims))
    sim = sims[docindex]

    # Two-axis .iloc: row `docindex`, first column (the document id). Avoids
    # the deprecated positional `Series[0]` lookup.
    sim_docid = train_df.iloc[docindex, 0]
    print(f'\tmost similar document to {ref_docid} is {sim_docid} with similarity {sim:.3f}')

Using doc2vec
	most similar document to Doc1 is Doc33 with similarity 0.675
	most similar document to Doc3 is Doc19 with similarity 0.861
	most similar document to Doc5 is Doc0 with similarity 0.628
	most similar document to Doc25 is Doc679 with similarity 0.996
	most similar document to Doc36 is Doc406 with similarity 0.613


# Step 4: Similar Words

## Finding most similar words

In [13]:
# For every query word, print its 3 nearest neighbours in the word2vec space
# and collect the vectors, labels and group ids of the neighbours plus the
# query word itself for the visualisations further down.
query_words = ['تهران', 'بهداشت', 'دفاع', 'رودخانه', 'سرد', 'فرهنگ', 'استقلال']

sim_word_embeddings = []
sim_word_embeddings_label = []
sim_word_embeddings_cluster = []

for cluster_id, w in enumerate(query_words):
    neighbours = word2vec_model.wv.most_similar(positive=[w], topn=3)

    print(f'most similar words to {w} are:')
    pprint(neighbours)
    print()

    # Neighbours first, query word last; all share the same cluster id.
    group_words = [neighbour for neighbour, _ in neighbours] + [w]
    sim_word_embeddings.extend(word2vec_model.wv.get_vector(word) for word in group_words)
    sim_word_embeddings_label.extend(group_words)
    sim_word_embeddings_cluster.extend([cluster_id] * len(group_words))

# One row per collected word.
sim_word_embeddings = np.array(sim_word_embeddings)

most similar words to تهران are:
[('تبریز', 0.6092485785484314),
 ('اصفهان', 0.6041479110717773),
 ('مشهد', 0.5989333391189575)]

most similar words to بهداشت are:
[('باروری', 0.8025951385498047),
 ('مراقبتهای', 0.7638570666313171),
 ('بهورزان', 0.7540599703788757)]

most similar words to دفاع are:
[('مقدس', 0.703788697719574),
 ('ضدموشکی', 0.6654548645019531),
 ('پشتیبانی', 0.6579294800758362)]

most similar words to رودخانه are:
[('دریاچه', 0.8565274477005005),
 ('کارون', 0.8331038355827332),
 ('ارتفاعات', 0.8268086910247803)]

most similar words to سرد are:
[('زمستان', 0.7902213931083679),
 ('مرطوب', 0.7889065146446228),
 ('زمستان\u200cهای', 0.7770519852638245)]

most similar words to فرهنگ are:
[('ارشاد', 0.7776681780815125),
 ('سمعی', 0.6865524649620056),
 ('هنر', 0.6761164665222168)]

most similar words to استقلال are:
[('پاس', 0.70670086145401),
 ('صنام', 0.6973098516464233),
 ('پیکان', 0.6805547475814819)]



## Visualizing most similar words

### 2D visualization

In [14]:
# Project the collected word vectors onto their first two principal components.
pca_2d = PCA(n_components=2)
sim_word_2d_embeddings = pca_2d.fit_transform(sim_word_embeddings)

# One labelled marker per word; colour encodes the query-word group.
scatter_2d = go.Scatter(
    x=sim_word_2d_embeddings[:, 0],
    y=sim_word_2d_embeddings[:, 1],
    mode="markers+text",
    marker={"color": sim_word_embeddings_cluster},
    name="Markers and Text",
    text=sim_word_embeddings_label,
    textposition="bottom center",
)

fig = go.Figure()
fig.add_trace(scatter_2d)
fig.update_layout(
    title_text="Visualization of word embeddings in 2D",
    title_x=0.5,
    font={"family": "Vazir", "size": 13},
    width=1000,
    height=700,
)

# Save the figure as a static image.
fig.write_image("../images/2d_embedding_similarity.png")

### 3D visualization

In [15]:
# Project the collected word vectors onto their first three principal components.
# The result gets its own `_3d_` name so it neither mislabels the data nor
# overwrites the 2D projection computed in the previous cell.
pca_3d = PCA(n_components=3)
sim_word_3d_embeddings = pca_3d.fit_transform(sim_word_embeddings)

fig = go.Figure()
# One labelled marker per word; colour encodes the query-word group.
fig.add_trace(go.Scatter3d(
    x=sim_word_3d_embeddings[:,0],
    y=sim_word_3d_embeddings[:,1],
    z=sim_word_3d_embeddings[:,2],
    mode="markers+text",
    marker=dict(color=sim_word_embeddings_cluster),
    name="Markers and Text",
    text=sim_word_embeddings_label,
    textposition="bottom center"
))

fig.update_layout(
    title_text="Visualization of word embeddings in 3D",
    title_x=0.5,
    font=dict(
        family="Vazir",
        size=13
    ),
    width=800,
    height=1000
)

# Save the figure as a static image.
fig.write_image("../images/3d_embedding_similarity.png")