In [None]:
import numpy as np
import torch
import pandas as pd
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

***
The data file contains four news articles: three of them are about the Tesla Roadster car in space, and the fourth is about a different topic (a gas company). I want to check the similarity of the four articles using cosine similarity and Euclidean distance with different vector representations of words. 
I will follow the article https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630. 
The expectation is that the first three articles will be assessed as similar to each other, while the fourth one will be assessed as different from all of them.

A nice article on the interpretation of cosine similarity and Euclidean distance
https://www.baeldung.com/cs/euclidean-distance-vs-cosine-similarity
Cosine similarity is a metric used to measure how similar the documents are irrespective of their size. The cosine similarity is advantageous because even if the two similar documents are far apart by the Euclidean distance (due to the size of the document), chances are they may still be oriented closer together. 

***

In [None]:
# Read the header-less CSV and keep its first column (label 0) as the Series of article texts.
news_table = pd.read_csv('data/roadster_news.csv', header=None)
data = news_table[0]

In [None]:
# Display the loaded Series so the article texts can be inspected.
data

***
First, let's compute the cosine similarity and the Euclidean distance (ED) using the TF-IDF matrix.
***



In [None]:
# TF-IDF over the raw articles.
# A token is a run of 4-20 word characters that are not digits; NLTK's Russian
# stop-word list is passed to the vectorizer.
russian_stop_words = nltk.corpus.stopwords.words('russian')
vectorizer = TfidfVectorizer(
    token_pattern=r'\b[^\d\W]{4,20}\b',
    stop_words=russian_stop_words,
)
tfidf_mat = vectorizer.fit_transform(data)

In [None]:
# Pairwise comparison of the TF-IDF rows (one row per document).
# TfidfVectorizer L2-normalises every row by default (norm='l2'), so the plain
# dot product of two rows already equals their cosine similarity.
cosi = []  # (doc_i, doc_j, cosine similarity)
ed = []    # (doc_i, doc_j, Euclidean distance)
for r1, r2 in itertools.combinations(range(tfidf_mat.shape[0]), 2):
    row1, row2 = tfidf_mat[r1], tfidf_mat[r2]
    # Use the sparse-aware `@` operator. np.dot is not sparse-aware: it only
    # works by falling back to the operands' `*`, which is a matrix product for
    # scipy sparse matrices but element-wise for scipy sparse arrays.
    c = (row1 @ row2.T).toarray()[0][0]
    d = np.sqrt((row2 - row1).power(2).sum())
    cosi.append((r1, r2, c))
    ed.append((r1, r2, d))
# Cross-check with scikit-learn if needed:
#   cosine_similarity(tfidf_mat), euclidean_distances(tfidf_mat)

In [None]:
# Rank the pairs in place: cosine similarity descending, distance ascending.
cosi.sort(key=lambda pair: pair[2], reverse=True)
ed.sort(key=lambda pair: pair[2])

# Pull out the extremes of each ranking before reporting them.
top_a, top_b, top_sim = cosi[0]
low_a, low_b, low_sim = cosi[-1]
near_a, near_b, near_dist = ed[0]
far_a, far_b, far_dist = ed[-1]

print(f'Most similar texts are {top_a} and {top_b} (cosine similarity is {top_sim} ):')
print(f'Least similar texts are {low_a} and {low_b} (cosine similarity is {low_sim} ):')
print(f'The smallest distance between {near_a} and {near_b} (distance is {near_dist} ):')
print(f'Biggest distance between texts {far_a} and {far_b} (distance is {far_dist} ):')

In [None]:
#print(data[cosi[-1][0]])
#print('-')
#print(data[cosi[-1][1]])

***
Now let's try using GloVe word embeddings. For simplicity, we will consider each document as one sentence and work with doc vectors. Because I use articles written in Russian, I use word embeddings from Navec (https://github.com/natasha/navec#downloads) that were trained using Russian news articles (navec_news_v1_1B_250K_300d_100q.tar). 
***

In [None]:
from navec import Navec

# Load the Navec news embeddings from the local archive.
navec_archive = 'data/embeddings/navec_news_v1_1B_250K_300d_100q.tar'
nv = Navec.load(navec_archive)

In [None]:
# Tokenise every article the same way the fitted vectorizer did.
# The vectorizer lowercases text before tokenising (lowercase=True is the
# default), so its vocabulary holds lowercase terms only. Run its preprocessor
# first so the vocabulary lookup sees lowercase tokens; testing the raw token
# against the vocabulary silently drops every capitalised word (sentence
# starts, proper nouns).
preprocess = vectorizer.build_preprocessor()
tokenize = vectorizer.build_tokenizer()
vocabulary = vectorizer.vocabulary_
# The vocabulary filter also removes stop words, which never enter the vocabulary.
tokens = [
    [t for t in tokenize(preprocess(doc)) if t in vocabulary]
    for doc in data
]

***
Extract from the article:
Now we have to represent every document as a single vector. We can either average or sum over every word vector and convert every 64X300 representation into a 300-dimensional representation. But averaging or summing over all the words would lose the semantic and contextual meaning of the documents. Different lengths of the documents would also have an adverse effect on such operations.

One better way of doing this could be taking a weighted average of word vectors using the tf-idf weights. This can handle the variable length problem to a certain extent but cannot keep the semantic and contextual meaning of words. After doing that we can use the pairwise distances to calculate similar documents as we did in the tf-idf model.
***

In [None]:
from sklearn.preprocessing import normalize

In [None]:
# Build one vector per document as the TF-IDF-weighted sum of its word embeddings.
emb_sz = nv.pq.dim
tfidf_df = pd.DataFrame(tfidf_mat.toarray())
docs_emb_glove = np.zeros((len(data), emb_sz))
for i in range(len(data)):
    # The TF-IDF weight already reflects how often a term occurs in the
    # document, so each distinct term is added once; looping over every
    # occurrence would multiply the weight by the term count a second time.
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # floating-point summation order stays deterministic.
    for t in dict.fromkeys(tokens[i]):
        # Terms without an embedding are skipped.
        if t in nv.vocab:
            docs_emb_glove[i] += nv[t] * tfidf_df[vectorizer.vocabulary_[t]][i]

In [None]:
# Cosine similarity is taken on unit-length copies of the document vectors;
# the Euclidean distance is taken on the raw (un-normalised) vectors.
docs_emb_glove_norm = normalize(docs_emb_glove, axis=1, norm='l2')
cosi = []  # (doc_i, doc_j, cosine similarity)
ed = []    # (doc_i, doc_j, Euclidean distance)
n_docs = docs_emb_glove.shape[0]
for first, second in itertools.combinations(range(n_docs), 2):
    delta = docs_emb_glove[second] - docs_emb_glove[first]
    similarity = np.dot(docs_emb_glove_norm[first], docs_emb_glove_norm[second].T)
    distance = np.sqrt(np.power(delta, 2).sum())
    cosi.append((first, second, similarity))
    ed.append((first, second, distance))
# Cross-check with scikit-learn if needed:
#   cosine_similarity(docs_emb_glove), euclidean_distances(docs_emb_glove)

In [None]:
# Rank the pairs in place: cosine similarity descending, distance ascending.
cosi.sort(key=lambda pair: pair[2], reverse=True)
ed.sort(key=lambda pair: pair[2])

# Pull out the extremes of each ranking before reporting them.
top_a, top_b, top_sim = cosi[0]
low_a, low_b, low_sim = cosi[-1]
near_a, near_b, near_dist = ed[0]
far_a, far_b, far_dist = ed[-1]

print(f'Most similar texts are {top_a} and {top_b} (cosine similarity is {top_sim} ):')
print(f'Least similar texts are {low_a} and {low_b} (cosine similarity is {low_sim} ):')
print(f'The smallest distance between {near_a} and {near_b} (distance is {near_dist} ):')
print(f'Biggest distance between texts {far_a} and {far_b} (distance is {far_dist} ):')

***
Now use Word2Vec embeddings. 
I used CBOW embeddings (news_upos_cbow_300_2_2017.bin.gz) from RusVectores trained on news articles: https://rusvectores.org/ru/models/
For preprocessing and POS-tagging I used this script https://github.com/akutuzov/webvectors/blob/master/preprocessing/rus_preprocessing_udpipe.py

***

In [None]:
import gensim

In [None]:
# Load the binary word2vec-format vectors as a gensim KeyedVectors object.
# NOTE(review): the markdown notes name news_upos_cbow_300_2_2017.bin.gz, but the
# file loaded here is news_0_300_2.bin — confirm this is the intended model.
rusvec_path = './data/embeddings/news_0_300_2.bin'
rusvec = gensim.models.KeyedVectors.load_word2vec_format(rusvec_path, binary=True)

In [None]:
# Read the pre-processed articles: each non-empty line is one document and its
# whitespace-separated fields are the document's tokens.
# The encoding is stated explicitly because the platform default is not UTF-8
# everywhere. NOTE(review): assumes the file was written as UTF-8 — confirm.
with open('./data/roadster_news_pos.txt', encoding='utf-8') as f:
    # Iterate the file lazily and skip lines that yield no tokens (blank or
    # whitespace-only); they would otherwise become empty documents.
    pos_tokens = [toks for toks in (line.split() for line in f) if toks]

In [None]:
def fake_tokenizer(text):
    """Pass-through tokenizer: return the argument unchanged.

    Intended for documents that are already sequences of tokens, so that a
    vectorizer configured with this callable performs no further splitting.
    """
    return text
# Build the TF-IDF matrix for the tagged and lemmatised tokens.
# Each document is already a list of tokens, so the tokenizer is a pass-through
# and lowercasing is disabled (the preprocessor would otherwise call .lower()
# on a list). token_pattern=None states that the regex is intentionally unused,
# which avoids scikit-learn's "token_pattern will not be used" warning.
vectorizer_rusvec = TfidfVectorizer(tokenizer=fake_tokenizer, lowercase=False, token_pattern=None)
tfidf_mat_rusvec = vectorizer_rusvec.fit_transform(pos_tokens)

In [None]:
# Build one vector per document as the TF-IDF-weighted sum of its word embeddings.
emb_sz = rusvec.vector_size
# Size the result by the documents actually iterated (pos_tokens), not by
# `data`: if the two ever differ in length the loop would either index out of
# bounds or leave silent all-zero rows.
docs_emb_w2v = np.zeros((len(pos_tokens), emb_sz))
tfidf_rusvec_df = pd.DataFrame(tfidf_mat_rusvec.toarray())
for i in range(len(pos_tokens)):
    # The TF-IDF weight already reflects how often a term occurs in the
    # document, so each distinct term is added once; looping over every
    # occurrence would multiply the weight by the term count a second time.
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # floating-point summation order stays deterministic.
    for t in dict.fromkeys(pos_tokens[i]):
        # Terms missing from the embedding model are skipped.
        if t in rusvec:
            docs_emb_w2v[i] += rusvec.get_vector(t) * tfidf_rusvec_df[vectorizer_rusvec.vocabulary_[t]][i]
           

In [None]:
# Cosine similarity is taken on unit-length copies of the document vectors;
# the Euclidean distance is taken on the raw (un-normalised) vectors.
docs_emb_w2v_norm = normalize(docs_emb_w2v, axis=1, norm='l2')
cosi = []  # (doc_i, doc_j, cosine similarity)
ed = []    # (doc_i, doc_j, Euclidean distance)
n_docs = docs_emb_w2v.shape[0]
for first, second in itertools.combinations(range(n_docs), 2):
    delta = docs_emb_w2v[second] - docs_emb_w2v[first]
    similarity = np.dot(docs_emb_w2v_norm[first], docs_emb_w2v_norm[second].T)
    distance = np.sqrt(np.power(delta, 2).sum())
    cosi.append((first, second, similarity))
    ed.append((first, second, distance))

In [None]:
# Rank the pairs in place: cosine similarity descending, distance ascending.
cosi.sort(key=lambda pair: pair[2], reverse=True)
ed.sort(key=lambda pair: pair[2])

# Pull out the extremes of each ranking before reporting them.
top_a, top_b, top_sim = cosi[0]
low_a, low_b, low_sim = cosi[-1]
near_a, near_b, near_dist = ed[0]
far_a, far_b, far_dist = ed[-1]

print(f'Most similar texts are {top_a} and {top_b} (cosine similarity is {top_sim} ):')
print(f'Least similar texts are {low_a} and {low_b} (cosine similarity is {low_sim} ):')
print(f'The smallest distance between {near_a} and {near_b} (distance is {near_dist} ):')
print(f'Biggest distance between texts {far_a} and {far_b} (distance is {far_dist} ):')

In [None]:
# Display scikit-learn's pairwise cosine-similarity matrix for the Word2Vec
# document vectors.
cosine_similarity(docs_emb_w2v)

In [None]:
# Display scikit-learn's pairwise Euclidean-distance matrix for the raw
# (un-normalised) Word2Vec document vectors.
euclidean_distances(docs_emb_w2v)