In [11]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

import nltk
import re
import gensim
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariiabogdanova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
documents = []
for i in range (0, 50):
    FILENAME = "Data/Docs/{}.txt".format(i)
    with open(FILENAME, 'r', encoding="utf8", errors="ignore") as inputfile:
        lines = inputfile.readlines()
        for line in lines:
            documents.append(line)

# Document Similarity: word2vec

In [13]:
documents_df = pd.DataFrame(documents,columns=['documents'])

# removing special characters and stop words from the text

stopwords_list = stopwords.words('english')
documents_df['documents_cleaned'] = documents_df.documents.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stopwords_list) )
documents_df.head(5)

Unnamed: 0,documents,documents_cleaned
0,The national executive of the strife-torn Demo...,national executive strife torn democrats last ...
1,Cash-strapped financial services group AMP has...,cash strapped financial services group amp she...
2,The United States government has said it wants...,united states government said wants see presid...
3,A radical armed Islamist group with ties to Te...,radical armed islamist group ties tehran baghd...
4,Washington has sharply rebuked Russia over bom...,washington sharply rebuked russia bombings geo...


In [14]:
tfidfvectoriser = TfidfVectorizer(max_features = 64)
tfidfvectoriser.fit(documents_df.documents_cleaned)
tfidf_vectors = tfidfvectoriser.transform(documents_df.documents_cleaned)
tfidf_vectors = tfidf_vectors.toarray()

In [15]:
# tokenize and pad every document to make them of the same size
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents_df.documents_cleaned)
tokenized_documents = tokenizer.texts_to_sequences(documents_df.documents_cleaned)
tokenized_paded_documents = pad_sequences(tokenized_documents,maxlen = 64,padding = 'post')
vocab_size = len(tokenizer.word_index)+1

print(tokenized_paded_documents[0])

[203  65 455 456  39   3  21 204 205 457 103  28  15 206 104 105  22 458
 106 459 460 461   7   1 107 462 106 463 207   7   1  66 107 203  65   3
  21 208 209 108   1 210 211 105  22 212  15 104 464 465  22 109 110 111
 466 213 214 467   0   0   0   0   0   0]


In [18]:
W2V_PATH = "GoogleNews-vectors-negative300.bin.gz"
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

In [19]:
embedding_matrix = np.zeros((vocab_size,300))
for word,i in tokenizer.word_index.items():
    if word in model_w2v:
        embedding_matrix[i] = model_w2v[word]
        
# creating document-word embeddings
document_word_embeddings = np.zeros((len(tokenized_paded_documents),64,300))

for i in range(len(tokenized_paded_documents)):
    for j in range(len(tokenized_paded_documents[0])):
        document_word_embeddings[i][j] = embedding_matrix[tokenized_paded_documents[i][j]]
        
# tf-idf vectors do not keep the original sequence of words, converting them into actual word sequences from the documents
document_embeddings = np.zeros((len(tokenized_paded_documents),300))
words = tfidfvectoriser.get_feature_names()

for i in range(len(document_word_embeddings)):
    for j in range(len(words)):
        document_embeddings[i] += embedding_matrix[tokenizer.word_index[words[j]]]*tfidf_vectors[i][j]
        
document_embeddings_word2vec = document_embeddings/np.sum(tfidf_vectors,axis=1).reshape(-1,1)
    



In [20]:
pairwise_similarities_word2vec = cosine_similarity(document_embeddings_word2vec)

# with open('Binaries/pairwise_similarities_word2vec.pickle','wb') as f:
#     pickle.dump(pairwise_similarities_word2vec, f)

# Document Similarity: doc2vec

In [24]:
tagged_data = [TaggedDocument(words = word_tokenize(doc), tags = [i]) for i, doc in enumerate(documents_df.documents_cleaned)]

model_d2v = Doc2Vec(vector_size = 100, alpha = 0.025, min_count=1)
model_d2v.build_vocab(tagged_data)

model_d2v.train(tagged_data, total_examples = model_d2v.corpus_count, epochs = 200)

document_embeddings_doc2vec = np.zeros((documents_df.shape[0],100))
for i in range(len(document_embeddings)):
    document_embeddings_doc2vec[i] = model_d2v.docvecs[i]

  document_embeddings_doc2vec[i] = model_d2v.docvecs[i]


In [25]:
pairwise_similarities_doc2vec = cosine_similarity(document_embeddings_doc2vec)

# with open('Binaries/pairwise_similarities_doc2vec.pickle','wb') as f:
#     pickle.dump(pairwise_similarities_doc2vec, f)

In [26]:
# Saving scores to csv
human_evaluation_data = pd.read_csv("Data/AverageSimilarities_fixed.csv")
human_evaluation_data["Similarity_word2vec"] = pairwise_similarities_word2vec[human_evaluation_data.Document_1-1, human_evaluation_data.Document_2-1]
human_evaluation_data["Similarity_doc2vec"] = pairwise_similarities_doc2vec[human_evaluation_data.Document_1-1, human_evaluation_data.Document_2-1]
human_evaluation_data.head(5)

human_evaluation_data.to_csv('Data/AverageSimilarities_fixed.csv', index=False)
human_evaluation_data.head(10)

Unnamed: 0,Document_1,Document_2,Similarity_avg,Similarity_avg_normalized,Similarity_word2vec,Similarity_doc2vec,Similarity_tf_idf
0,1,2,1.5,0.125,0.202667,0.41292,0.021084
1,1,3,1.2,0.05,0.512574,0.224046,0.004666
2,1,4,1.0,0.0,0.279664,0.201932,0.028945
3,1,5,1.5,0.125,0.372543,0.225909,0.001599
4,1,6,2.5,0.375,0.250421,0.37509,0.013378
5,1,7,1.3,0.075,0.266212,0.253162,0.017224
6,1,8,1.2,0.05,0.291865,0.346643,0.020166
7,1,9,1.0,0.0,0.461878,0.529536,0.065105
8,1,10,1.3,0.075,0.308305,0.205616,0.003985
9,1,11,1.3,0.075,0.35048,0.15723,0.003457


In [30]:
np.corrcoef(human_evaluation_data.Similarity_avg, human_evaluation_data.Similarity_doc2vec)

array([[1.        , 0.41866115],
       [0.41866115, 1.        ]])