In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats


import re
import os
import gensim
import pickle

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.keyedvectors import KeyedVectors

import ssl
# Replace ssl's default HTTPS context factory with the unverified one, when this
# Python build exposes it.
# NOTE(review): presumably this is here so the nltk/stanza downloads below work on
# machines with certificate problems -- confirm. The override applies to the whole
# kernel session, not just to those downloads.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No ssl._create_unverified_context available: keep the default context.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources requested by name below.
nltk.download('punkt')
nltk.download("stopwords")
# NOTE(review): duplicate of the `stopwords` import a few lines above; harmless.
from nltk.corpus import stopwords
import stanza
print("Downloading English model...")
# Fetch the English stanza resources used by stanza.Pipeline(lang='en', ...) later on.
stanza.download('en')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariiabogdanova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mariiabogdanova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Downloading English model...


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-04-27 17:44:49 INFO: Downloading default packages for language: en (English)...
2022-04-27 17:44:50 INFO: File exists: /Users/mariiabogdanova/stanza_resources/en/default.zip.
2022-04-27 17:44:54 INFO: Finished downloading models and saved to /Users/mariiabogdanova/stanza_resources.


# Document Similarity: word2vec

In [2]:
# Shared stanza pipeline, built lazily on first use and then reused across calls.
_STANZA_PIPELINE = None


def _get_stanza_pipeline():
    """Return the shared English tokenize+lemma pipeline, building it on first use."""
    global _STANZA_PIPELINE
    if _STANZA_PIPELINE is None:
        _STANZA_PIPELINE = stanza.Pipeline(lang='en', processors='tokenize, lemma',  verbose=False)
    return _STANZA_PIPELINE


def tokenize_and_normalize(doc_str, stopwords, nlp=None):
    """Tokenizes, lemmatizes, lowercases and removes stop words.

    The text is run through a stanza pipeline, every word is replaced by its
    lowercased lemma, lemmas found in ``stopwords`` are dropped, and the
    remaining lemmas are joined with single spaces.

    Parameters
    ----------
    doc_str : str
        the raw text of one document
    stopwords : iterable of strings
        lowercase words that should be removed
    nlp : callable, optional
        a pipeline that maps a string to an object providing ``iter_words()``;
        each yielded word must expose ``lemma`` (and ``text``, used only when
        ``lemma`` is None). When omitted, a shared stanza English pipeline is
        created once and reused on subsequent calls.

    Returns
    -------
    normalized_doc : str
        the kept lemmas joined by single spaces
    """
    if nlp is None:
        # Building a Pipeline per call is wasteful; reuse a single instance.
        nlp = _get_stanza_pipeline()

    stopword_set = set(stopwords)  # constant-time membership checks

    kept_lemmas = []
    for word in nlp(doc_str).iter_words():
        # Fall back to the surface form if no lemma was produced for this word,
        # instead of failing on None.lower().
        lemma = word.lemma if word.lemma is not None else word.text
        lemma = lemma.lower()
        if lemma not in stopword_set:
            kept_lemmas.append(lemma)
    return ' '.join(kept_lemmas)



In [3]:
## Remove the numbers at the start and end of the documents.
DATAFILE = "./Data/LeePincombeWelshDocuments.txt"
CLEANFILE = "./Data/cleanLPW.txt"
INDIVIDUAL_DOCS = "./Data/Docs"
stopwords_english = stopwords.words('english')

# One file per document is written into this directory; make sure it exists so
# the cell also runs on a fresh checkout.
os.makedirs(INDIVIDUAL_DOCS, exist_ok=True)

with open(DATAFILE, 'r', encoding="utf8", errors="ignore") as inputfile:
    # The first and the last line of the input are skipped.
    raw_lines = inputfile.readlines()[1:-1]

normalized_docs = []
# Mode "w" truncates any output left from a previous run, so the clean file is
# opened once here instead of being deleted and then re-opened for every line.
with open(CLEANFILE, 'w', encoding="utf8") as outputfile:
    for i, line in enumerate(raw_lines):
        # Drop the first "<digits>. " occurrence (the document number).
        # NOTE(review): the pattern is unanchored and allows zero digits, so a
        # line without a leading number would lose its first ". " instead.
        start_removed = re.sub(r"(\d*\.\s)", "", line, count=1)
        # Drop the first "(<digits> words)" marker.
        end_removed = re.sub(r"\(\d* words\)", "", start_removed, count=1)

        normalized_docs.append(tokenize_and_normalize(end_removed, stopwords_english).split())

        outputfile.write(end_removed)
        # Documents are numbered from 0 in the order they appear in the input.
        with open(os.path.join(INDIVIDUAL_DOCS, f"{i}.txt"), "w", encoding="utf8") as docfile:
            docfile.write(end_removed)

# Each normalized document as one space-separated string.
documents = [' '.join(doc) for doc in normalized_docs]

In [4]:
# Load the table of document pairs. Later cells read its Document_1, Document_2
# and Similarity_avg columns (presumably averaged human similarity ratings --
# verify against the data source), and a later cell writes the file back.
human_evaluation_data = pd.read_csv("Data/AverageSimilarities_fixed.csv")
human_evaluation_data.head(5)

Unnamed: 0,Document_1,Document_2,Similarity_avg,Similarity_avg_normalized,Similarity_word2vec,Similarity_doc2vec,Similarity_tf_idf,Similarities_SBERT
0,1,2,1.5,0.125,0.202667,0.41292,0.021084,0.226575
1,1,3,1.2,0.05,0.512574,0.224046,0.004666,0.249689
2,1,4,1.0,0.0,0.279664,0.201932,0.028945,0.231258
3,1,5,1.5,0.125,0.372543,0.225909,0.001599,0.160115
4,1,6,2.5,0.375,0.250421,0.37509,0.013378,0.189578


In [5]:
documents_df = pd.DataFrame(documents, columns=['documents'])

stopwords_list = stopwords.words('english')
_stopword_set = frozenset(stopwords_list)  # constant-time membership checks


def _clean_document(text):
    """Replace non-letters by spaces, lowercase, and drop stop-word tokens.

    Each whitespace-separated token of ``text`` has every character outside
    a-z/A-Z replaced by a space and is lowercased; the cleaned token is computed
    once and kept unless it equals a stop word. Kept tokens are joined by single
    spaces. A token such as "strife-torn" becomes "strife torn" and is compared
    to the stop-word list as that whole string.
    """
    cleaned_tokens = (re.sub(r'[^a-zA-Z]', ' ', token).lower() for token in text.split())
    return " ".join(token for token in cleaned_tokens if token not in _stopword_set)


documents_df['documents_cleaned'] = documents_df.documents.apply(_clean_document)
documents_df.head(5)

Unnamed: 0,documents,documents_cleaned
0,national executive strife-torn democrat last n...,national executive strife torn democrat last n...
1,cash-strapped financial service group amp shel...,cash strapped financial service group amp shel...
2,united state government say want see president...,united state government say want see president...
3,radical armed islamist group tie tehran baghda...,radical armed islamist group tie tehran baghda...
4,washington sharply rebuked russia bombing geor...,washington sharply rebuked russia bombing geor...


In [6]:
def vectorize(w2v_model, doc: str) -> np.ndarray:
    """Embed a document as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    w2v_model : mapping-like
        object supporting ``w2v_model[word]`` that raises ``KeyError`` for
        unknown words; if it has a ``vector_size`` attribute, that size is used
        for the fallback vector described below
    doc : str
        whitespace-separated text; it is lowercased before lookup

    Returns
    -------
    vector : np.ndarray
        element-wise mean of the vectors of all words found in the model. If no
        word is found, a zero vector of length ``w2v_model.vector_size`` is
        returned; when the model has no ``vector_size``, NaN is returned.
    """
    word_vecs = []
    # split() without an argument ignores repeated whitespace, so no empty
    # tokens are looked up.
    for word in doc.lower().split():
        try:
            word_vecs.append(w2v_model[word])
        except KeyError:
            # Out-of-vocabulary word: it does not contribute to the mean.
            continue

    if not word_vecs:
        # np.mean([]) would emit a RuntimeWarning and return a scalar NaN.
        dim = getattr(w2v_model, "vector_size", None)
        if dim is None:
            return np.float64("nan")
        return np.zeros(dim)

    return np.mean(word_vecs, axis=0)

def _cosine_sim(vecA, vecB):
    """Cosine similarity of two vectors, with 0.0 for degenerate inputs.

    Parameters
    ----------
    vecA, vecB : array-like
        the vectors to compare

    Returns
    -------
    csim : float
        ``dot(vecA, vecB) / (||vecA|| * ||vecB||)``; 0.0 when either norm is
        zero or when the result would be NaN.
    """
    denom = np.linalg.norm(vecA) * np.linalg.norm(vecB)
    # Check the denominator first so that a zero vector does not trigger a
    # 0/0 RuntimeWarning.
    if denom == 0 or np.isnan(denom):
        return 0.0
    csim = np.dot(vecA, vecB) / denom
    if np.isnan(np.sum(csim)):
        return 0.0
    return csim

def calculate_similarity(w2v_model, source_doc, target_docs=None, threshold=0):
    """Score target documents against a source document by cosine similarity.

    Parameters
    ----------
    w2v_model : mapping-like
        word-vector model passed through to ``vectorize``
    source_doc : str
        the document every target is compared with
    target_docs : str or iterable of str, optional
        a single document or a collection of documents; None or an empty
        value yields an empty result
    threshold : float, optional
        only targets whose score is strictly greater than this are returned

    Returns
    -------
    results : list of dict
        ``{"score": ..., "doc": ...}`` entries sorted by score, highest first
    """
    if target_docs is None:
        return []

    if isinstance(target_docs, str):
        candidates = [target_docs] if target_docs else []
    else:
        # Materialize so that arrays/Series (whose truth value is ambiguous)
        # and one-shot iterables are handled as well.
        candidates = list(target_docs)

    if not candidates:
        return []

    source_vec = vectorize(w2v_model, source_doc)
    results = []
    for doc in candidates:
        sim_score = _cosine_sim(source_vec, vectorize(w2v_model, doc))
        if sim_score > threshold:
            results.append({"score": sim_score, "doc": doc})

    # Sort once after all scores are collected rather than on every iteration.
    results.sort(key=lambda k: k["score"], reverse=True)
    return results

In [7]:
# Pre-trained word2vec vectors can be downloaded from
# https://code.google.com/archive/p/word2vec/

# Relative path: the file is looked up from the notebook's working directory.
model_path = 'GoogleNews-vectors-negative300.bin'
# binary=True tells gensim to read the file as the binary word2vec format.
w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [8]:
# Embed every cleaned document once, instead of re-vectorizing both documents
# for each of the evaluated pairs.
doc_vectors = [vectorize(w2v_model, doc) for doc in documents_df.documents_cleaned.values]

data = []
for doc1, doc2 in zip(human_evaluation_data.Document_1.values, human_evaluation_data.Document_2.values):
    # Document ids in the evaluation table are 1-based; doc_vectors is 0-based.
    # The cosine is computed directly: going through calculate_similarity() and
    # taking element [0] raises IndexError for any pair whose score is <= 0,
    # because such scores are filtered out by its default threshold.
    data.append(_cosine_sim(doc_vectors[doc1 - 1], doc_vectors[doc2 - 1]))

In [9]:
# Attach the word2vec scores (one per row, in row order) and persist the table.
# NOTE: this overwrites the same CSV file that was loaded earlier in the notebook.
human_evaluation_data["Similarity_word2vec"] = data
human_evaluation_data.to_csv('Data/AverageSimilarities_fixed.csv', index=False)

In [11]:
# Pearson correlation between the Similarity_avg column and the word2vec scores;
# [0] keeps the correlation coefficient and drops the p-value.
scipy.stats.pearsonr(human_evaluation_data.Similarity_avg, human_evaluation_data.Similarity_word2vec)[0]

0.6116588694445468