In [None]:
# Load libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk, re, string, os
import gensim, spacy, glove
import sent2vec

from IPython.display import display
from sklearn.metrics.pairwise import cosine_similarity
from scipy import linalg, mat, dot
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_curve, auc

from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.wrappers import FastText
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# Load datasets

# Read every dataset fully into a list of lines and close the file right away.
# A list can be iterated any number of times, so the corpus-building cells
# below stay re-runnable; an open file handle is exhausted after one pass and
# would yield an empty corpus if those cells were executed a second time.
with open("SICK.txt", "r") as sick_file:
    sick = sick_file.readlines()
with open("msr_train.txt", "r") as msr_train_file:
    msr1 = msr_train_file.readlines()
with open("msr_test.txt", "r") as msr_test_file:
    msr2 = msr_test_file.readlines()
#ppdb = open("ppdb-2.0-l-all", "r")

## Data munging

In [None]:
def read_text_file(file):
    '''Returns every line yielded by ``file`` except the first one, as a list'''
    # Position 0 is dropped; all later lines are kept unchanged (newline included).
    return [text for position, text in enumerate(file) if position != 0]

In [None]:
def read_more_files(file1, file2=None):
    '''Builds one corpus list from one or two files.

    Each file is read with read_text_file. When ``file2`` is given, its lines
    are appended after those of ``file1``. The number of lines in the corpus
    is printed before the list is returned.
    '''
    corpus = read_text_file(file1)

    # Identity test: `== None` would call a custom __eq__ on the argument.
    if file2 is not None:
        corpus = corpus + read_text_file(file2)

    print(len(corpus))
    return corpus

In [None]:
# Implementation

#ppdb_corpus = read_more_files(ppdb)
#ppdb_corpus

Shell commands used to train the fastText (skip-gram and CBOW) and sent2vec (unigram and unigram+bigram) models:

```bash
./fasttext skipgram -input billwords_preprocessed.txt -dim 300 -thread 20 -minCount 5 -neg 5 -ws 5 -lr 0.025 -output modelft_sg

./fasttext cbow -input billwords_preprocessed.txt -dim 300 -thread 20 -minCount 5 -neg 5 -ws 5 -lr 0.05 -output modelft_cbow

./fasttext sent2vec -input billwords_preprocessed.txt -output modelsn_train -minCount 5 -dim 300 -epoch 5 -lr 0.2 -wordNgrams 1 -loss ns -neg 5 -thread 20

./fasttext sent2vec -input billwords_preprocessed.txt -output modelsn_unibi -minCount 5 -dim 300 -epoch 5 -lr 0.2 -wordNgrams 2 -loss ns -neg 5 -thread 20
```

In [None]:
# Implementation

sick_corpus = read_more_files(sick)
sick_corpus

In [None]:
# Implementation

msr_corpus = read_more_files(msr1, msr2)
msr_corpus

In [None]:
def _is_numeric_field(field):
    '''Returns True when the whole tab-separated field parses as a number'''
    try:
        float(field)
    except ValueError:
        return False
    return True


def extract_sentences_scores(corpus):
    '''Creates a list of sentence pairs omitting punctuation and a list of similarity scores.

    Each corpus line is split on tabs. The first field that is not a number is
    taken as sentence 1 and the field right after it as sentence 2; both are
    lower-cased, stripped and cleared of punctuation. The column in which
    sentence 1 sits selects the score column:

    * sentence 1 in column 1 -> score read from column 4
    * sentence 1 in column 3 -> score read from column 0

    Lines with any other layout (blank lines, too few columns) are skipped so
    that the two returned lists always stay index-aligned.

    Returns:
        (sentences, scores): a list of [sent1, sent2] pairs and a list of floats.

    Raises:
        ValueError: if the score column of a recognised line is not a number.
    '''
    # Built once: the translation table does not depend on the line.
    translator = str.maketrans("", "", string.punctuation)
    # Column of sentence 1 -> column holding the score for that layout.
    score_column_for = {1: 4, 3: 0}

    sentences, scores = [], []
    for line in corpus:
        words = line.split('\t')

        # The whole field is tested, not just its first character, so that a
        # sentence beginning with a digit is still recognised as text.
        first_text = next(
            (i for i, word in enumerate(words) if not _is_numeric_field(word)),
            None,
        )
        score_column = score_column_for.get(first_text)
        if score_column is None or len(words) <= max(first_text + 1, score_column):
            continue

        score = float(words[score_column])
        sent1 = words[first_text].lower().strip().translate(translator)
        sent2 = words[first_text + 1].lower().strip().translate(translator)

        sentences.append([sent1, sent2])
        scores.append(score)

    return sentences, scores

In [None]:
# Implementation

sick_sentences, sick_scores = extract_sentences_scores(sick_corpus)
msr_sentences, msr_scores = extract_sentences_scores(msr_corpus)

In [None]:
sick_sentences

In [None]:
msr_sentences

## Stemming and Lemmatization

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

def stemming_words(sentences):
    '''Returns the sentence pairs with every token replaced by its Porter stem.

    Each sentence is tokenized with word_tokenize, each token is stemmed, and
    the stems are joined back into one space-separated string.
    '''
    stemmer = PorterStemmer()
    return [
        [
            ' '.join(stemmer.stem(token) for token in word_tokenize(text))
            for text in pair
        ]
        for pair in sentences
    ]
        

In [None]:
# Implementation

sick_sentences_stem = stemming_words(sick_sentences)
msr_sentences_stem = stemming_words(msr_sentences)

In [None]:
sick_sentences_stem

In [None]:
msr_sentences_stem

In [None]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    '''Maps a treebank-style POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag gives ''.
    '''
    # Attribute names are resolved lazily so that only the matching constant
    # is ever looked up on the wordnet object.
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attr)
    return ''
    

In [None]:
from nltk import pos_tag

def pos_tagging(sentences):
    '''Tokenizes and POS-tags every sentence of every pair.

    Returns a list shaped like ``sentences`` in which each sentence string is
    replaced by a list of (token, wordnet_pos) tuples, where wordnet_pos is
    the value get_wordnet_pos returns for the tag assigned by pos_tag.
    '''
    return [
        [
            [(token, get_wordnet_pos(tag)) for token, tag in pos_tag(word_tokenize(text))]
            for text in pair
        ]
        for pair in sentences
    ]

In [None]:
# Implementation

sick_sentences_pos = pos_tagging(sick_sentences)
msr_sentences_pos = pos_tagging(msr_sentences)

In [None]:
sick_sentences_pos

In [None]:
msr_sentences_pos

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatizing_words(sentences):
    '''Lemmatizes the words of every sentence using their POS tags.

    The sentences are first tagged with pos_tagging. Tokens that received a
    WordNet POS are lemmatized with that POS; tokens whose tag is '' (no
    WordNet equivalent) are kept unchanged. The resulting words are joined
    into one space-separated string per sentence.

    Returns a list of [sent1, sent2, ...] string lists, one per input pair.
    '''
    sentences_pos = pos_tagging(sentences)

    lemmatizer = WordNetLemmatizer()

    sentences_lemma = []
    for pair in sentences_pos:
        pair_new = []
        for sent in pair:
            # Untagged tokens are passed through explicitly instead of relying
            # on an exception from the lemmatizer, so genuine errors surface.
            lemmas = [
                lemmatizer.lemmatize(token, pos=wntag) if wntag else token
                for token, wntag in sent
            ]
            # Joined once per sentence; an empty sentence yields '' rather
            # than reusing the text of the previous sentence.
            pair_new.append(' '.join(lemmas))

        sentences_lemma.append(pair_new)

    return sentences_lemma

In [None]:
# Implementation

sick_sentences_lemma = lemmatizing_words(sick_sentences)
msr_sentences_lemma = lemmatizing_words(msr_sentences)

In [None]:
sick_sentences_lemma

In [None]:
msr_sentences_lemma

## Filtering stopwords

In [None]:
from nltk.corpus import stopwords as nltk_stopwords

# The corpus module keeps its own name so this cell can be re-run: binding the
# word list to the same name as the module would hide the module on a second run.
stopwords = nltk_stopwords.words('english')

def remove_stopwords(sentences, stopwords):

    '''Removes the given stopwords from every sentence of every pair.

    Each sentence is tokenized with word_tokenize; tokens found in
    ``stopwords`` are dropped and the remaining tokens are joined with single
    spaces. Returns a list with the same pair structure as ``sentences``.
    '''

    # Set membership is O(1); the result is the same as scanning the list.
    stopword_set = set(stopwords)

    filtered_words = []
    for pair in sentences:
        pair_new = []
        for sent in pair:
            kept = [word for word in word_tokenize(sent) if word not in stopword_set]
            pair_new.append(' '.join(kept))

        filtered_words.append(pair_new)

    return filtered_words

In [None]:
# Implementation

sick_filtered = remove_stopwords(sick_sentences_lemma, stopwords)
msr_filtered = remove_stopwords(msr_sentences_lemma, stopwords)

In [None]:
sick_filtered

In [None]:
msr_filtered

In [None]:
def convert_scores_to_binary(scores_list, threshold=0.5):
    '''Converts continuous scores to binary labels.

    Scores strictly below ``threshold`` become 0; every other score becomes 1.
    With the default threshold of 0.5, values that are already 0 or 1 are
    returned unchanged (as ints).

    Args:
        scores_list: iterable of numeric scores.
        threshold: cut-off value; defaults to 0.5.

    Returns:
        A list of 0/1 ints, one per input score.
    '''
    return [0 if score < threshold else 1 for score in scores_list]

## Bag-of-Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def run_bow_model(sentences):
    '''Creates BoW model and calculates cosine similarity of each sentence pair.

    The CountVectorizer (English stop words removed, unigrams and bigrams) is
    re-fitted on every pair, so each pair is compared in its own vocabulary.

    Returns a list with one float cosine similarity per pair.
    '''
    count_vec = CountVectorizer(stop_words="english", analyzer='word', ngram_range=(1, 2),
                                max_df=1.0, min_df=0.5, max_features=None)

    bow_scores = []
    for pair in sentences:
        bow_model = count_vec.fit_transform(pair).toarray()

        cos_sim = cosine_similarity(bow_model[0, :].reshape(1, -1), bow_model[1, :].reshape(1, -1))
        # cos_sim is a 1x1 matrix: pick the element explicitly, since calling
        # float() on an array with ndim > 0 is deprecated in recent NumPy.
        bow_scores.append(float(cos_sim[0, 0]))

    return bow_scores

In [None]:
# Implementation

sick_bow_scores = run_bow_model(sick_sentences)
msr_bow_scores = run_bow_model(msr_sentences)

sick_bow_stem_scores = run_bow_model(sick_sentences_stem)
msr_bow_stem_scores = run_bow_model(msr_sentences_stem)

sick_bow_lem_scores = run_bow_model(sick_sentences_lemma)
msr_bow_lem_scores = run_bow_model(msr_sentences_lemma)

sick_bow_stw_scores = run_bow_model(sick_filtered)
msr_bow_stw_scores = run_bow_model(msr_filtered)

In [None]:
sick_bow_scores

In [None]:
msr_bow_scores

In [None]:
df_sick_bow = pd.DataFrame({'Relatedness': sick_scores, 'BoW': sick_bow_scores, 'BoW_stem': sick_bow_stem_scores,
                           'BoW_lemma': sick_bow_lem_scores, 'BoW_filt': sick_bow_stw_scores})
df_sick_bow

In [None]:
# Pearson's correlation of scores
df_sick_bow.corr(method='pearson')

In [None]:
# Spearman's correlation of scores
df_sick_bow.corr(method='spearman')

In [None]:
# Convert scores to binary

msr_bow_binscores = convert_scores_to_binary(msr_bow_scores, 0.7)
msr_bow_stem_binscores = convert_scores_to_binary(msr_bow_stem_scores, 0.7)
msr_bow_lem_binscores = convert_scores_to_binary(msr_bow_lem_scores, 0.7)
msr_bow_stw_binscores = convert_scores_to_binary(msr_bow_stw_scores, 0.7)

In [None]:
df_msr_bow = pd.DataFrame({'Similarity': msr_scores, 'BoW': msr_bow_binscores, 'BoW_stem': msr_bow_stem_binscores,
                           'BoW_lemma': msr_bow_lem_binscores, 'BoW_filt': msr_bow_stw_binscores})
df_msr_bow

In [None]:
# Confusion matrix of BoW scores

#plt.figure(figsize=(6, 4))

msr_bow_cf = pd.DataFrame(confusion_matrix(df_msr_bow['Similarity'], df_msr_bow['BoW']),  
                      columns=['BoW 0', 'BoW 1'], index=['MSR 0', 'MSR 1'])

msr_bow_acc = accuracy_score(df_msr_bow['Similarity'], df_msr_bow['BoW'])

msr_bow_stem_cf = pd.DataFrame(confusion_matrix(df_msr_bow['Similarity'], df_msr_bow['BoW_stem']),  
                      columns=['BoW_stem 0', 'BoW_stem 1'], index=['MSR 0', 'MSR 1'])

msr_bow_stem_acc = accuracy_score(df_msr_bow['Similarity'], df_msr_bow['BoW_stem'])

msr_bow_lem_cf = pd.DataFrame(confusion_matrix(df_msr_bow['Similarity'], df_msr_bow['BoW_lemma']),  
                      columns=['BoW_lemma 0', 'BoW_lemma 1'], index=['MSR 0', 'MSR 1'])

msr_bow_lem_acc = accuracy_score(df_msr_bow['Similarity'], df_msr_bow['BoW_lemma'])

msr_bow_stw_cf = pd.DataFrame(confusion_matrix(df_msr_bow['Similarity'], df_msr_bow['BoW_filt']),  
                      columns=['BoW_filt 0', 'BoW_filt 1'], index=['MSR 0', 'MSR 1'])

msr_bow_stw_acc = accuracy_score(df_msr_bow['Similarity'], df_msr_bow['BoW_filt'])

#sns.heatmap(msr_bow_cf, annot=True, cmap='Blues')
#plt.show()
display(msr_bow_cf)
print('Accuracy:', msr_bow_acc)
display(msr_bow_stem_cf)
print('Accuracy:', msr_bow_stem_acc)
display(msr_bow_lem_cf)
print('Accuracy:', msr_bow_lem_acc)
display(msr_bow_stw_cf)
print('Accuracy:', msr_bow_stw_acc)

In [None]:
# ROC curve and AUC score

plt.figure(figsize=(6, 4))
# ROC curve; AUC score
fpr, tpr, thresholds = roc_curve(msr_scores, msr_bow_stw_scores, pos_label=1)
roc_auc_dt = auc(fpr, tpr)

plt.title('ROC Curve - BoW_filt')
plt.plot(fpr, tpr, 'b',
label='AUC = %0.4f'% roc_auc_dt)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## TF-IDF model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def run_tfidf_model(sentences):
    '''Creates TF-IDF model and calculates cosine similarity of each sentence pair.

    The TfidfVectorizer (English stop words removed, unigrams and bigrams) is
    re-fitted on every pair, so vocabulary and IDF weights are pair-specific.

    Returns a list with one float cosine similarity per pair.
    '''
    tfidf_vec = TfidfVectorizer(stop_words="english", analyzer='word', ngram_range=(1, 2),
                                max_df=1.0, min_df=0.5, max_features=None)

    tfidf_scores = []
    for pair in sentences:
        tfidf_model = tfidf_vec.fit_transform(pair).toarray()

        cos_sim = cosine_similarity(tfidf_model[0, :].reshape(1, -1), tfidf_model[1, :].reshape(1, -1))
        # cos_sim is a 1x1 matrix: pick the element explicitly, since calling
        # float() on an array with ndim > 0 is deprecated in recent NumPy.
        tfidf_scores.append(float(cos_sim[0, 0]))

    return tfidf_scores

In [None]:
# Implementation

sick_tfidf_scores = run_tfidf_model(sick_sentences)
msr_tfidf_scores = run_tfidf_model(msr_sentences)

sick_tfidf_stem_scores = run_tfidf_model(sick_sentences_stem)
msr_tfidf_stem_scores = run_tfidf_model(msr_sentences_stem)

sick_tfidf_lem_scores = run_tfidf_model(sick_sentences_lemma)
msr_tfidf_lem_scores = run_tfidf_model(msr_sentences_lemma)

sick_tfidf_stw_scores = run_tfidf_model(sick_filtered)
msr_tfidf_stw_scores = run_tfidf_model(msr_filtered)

In [None]:
sick_tfidf_scores

In [None]:
msr_tfidf_scores

In [None]:
df_sick_tfidf = pd.DataFrame({'Relatedness': sick_scores, 'TF-IDF': sick_tfidf_scores, 
                              'TF-IDF_stem': sick_tfidf_stem_scores, 'TF-IDF_lemma': sick_tfidf_lem_scores, 
                              'TF-IDF_filt': sick_tfidf_stw_scores})
df_sick_tfidf

In [None]:
# Pearson's correlation of scores
df_sick_tfidf.corr(method='pearson')

In [None]:
# Spearman's correlation of scores
df_sick_tfidf.corr(method='spearman')

In [None]:
# Convert scores to binary

msr_tfidf_binscores = convert_scores_to_binary(msr_tfidf_scores, 0.7)
msr_tfidf_stem_binscores = convert_scores_to_binary(msr_tfidf_stem_scores, 0.7)
msr_tfidf_lem_binscores = convert_scores_to_binary(msr_tfidf_lem_scores, 0.7)
msr_tfidf_stw_binscores = convert_scores_to_binary(msr_tfidf_stw_scores, 0.7)

# The threshold argument was missing here. 0.5 turns 0.0/1.0 labels into 0/1
# ints and leaves them unchanged if this cell is run again.
# NOTE(review): assumes the MSR gold labels are 0 or 1 - confirm against the data files.
msr_scores = convert_scores_to_binary(msr_scores, 0.5)

In [None]:
df_msr_tfidf = pd.DataFrame({'Similarity': msr_scores, 'TF-IDF': msr_tfidf_binscores, 
                             'TF-IDF_stem': msr_tfidf_stem_binscores, 'TF-IDF_lemma': msr_tfidf_lem_binscores, 
                             'TF-IDF_filt': msr_tfidf_stw_binscores})
df_msr_tfidf

In [None]:
# Confusion matrix of TF-IDF scores

#plt.figure(figsize=(6, 4))

msr_tfidf_cf = pd.DataFrame(confusion_matrix(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF']),  
                      columns=['TF-IDF 0', 'TF-IDF 1'], index=['MSR 0', 'MSR 1'])

msr_tfidf_acc = accuracy_score(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF'])

msr_tfidf_stem_cf = pd.DataFrame(confusion_matrix(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF_stem']),  
                      columns=['TF-IDF_stem 0', 'TF-IDF_stem 1'], index=['MSR 0', 'MSR 1'])

msr_tfidf_stem_acc = accuracy_score(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF_stem'])

msr_tfidf_lem_cf = pd.DataFrame(confusion_matrix(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF_lemma']),  
                      columns=['TF-IDF_lemma 0', 'TF-IDF_lemma 1'], index=['MSR 0', 'MSR 1'])

msr_tfidf_lem_acc = accuracy_score(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF_lemma'])

msr_tfidf_stw_cf = pd.DataFrame(confusion_matrix(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF_filt']),  
                      columns=['TF-IDF_filt 0', 'TF-IDF_filt 1'], index=['MSR 0', 'MSR 1'])

msr_tfidf_stw_acc = accuracy_score(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF_filt'])

#sns.heatmap(msr_tfidf_cf, annot=True, cmap='Blues')
#plt.show()
display(msr_tfidf_cf)
print('Accuracy:', msr_tfidf_acc)
display(msr_tfidf_stem_cf)
print('Accuracy:', msr_tfidf_stem_acc)
display(msr_tfidf_lem_cf)
print('Accuracy:', msr_tfidf_lem_acc)
display(msr_tfidf_stw_cf)
print('Accuracy:', msr_tfidf_stw_acc)

## Word2Vec

In [None]:
# Load Google's pre-trained Word2Vec model

modelwd_pret = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

In [None]:
modelwd_pret['dog']

In [None]:
modelwd_pret.most_similar(positive=['yes', 'yeah'])

In [None]:
modelwd_pret.similarity('good', 'great')

In [None]:
def create_words_list(sentences):
    '''Splits every sentence of every pair on whitespace.

    Returns a list of pairs in which each sentence string is replaced by the
    list of its words.
    '''
    return [[text.split() for text in pair] for pair in sentences]

In [None]:
# Implementation

sick_words_list = create_words_list(sick_filtered)
msr_words_list = create_words_list(msr_filtered)

In [None]:
sick_words_list

In [None]:
msr_words_list

In [None]:
def run_gensim_model(words_list, model):
    '''Scores each sentence pair by the cosine similarity of its mean word vectors.

    Args:
        words_list: list of pairs; each pair holds two sentences given as
            lists of words.
        model: any object supporting ``model[word]`` that raises KeyError for
            unknown words.

    Returns:
        A list with one float per pair. Words missing from the model are
        ignored (they would only rescale the mean, which does not affect the
        cosine). A pair scores 0.0 when either sentence has no known word or
        a zero-length vector.
    '''
    scores = []

    for pair in words_list:
        sentence_vectors = []
        for sent in pair:
            found = []
            for word in sent:
                try:
                    found.append(np.asarray(model[word]))
                except KeyError:
                    # Out-of-vocabulary word: skip it.
                    continue
            sentence_vectors.append(np.mean(found, axis=0) if found else None)

        first, second = sentence_vectors[0], sentence_vectors[1]
        score = 0.0
        if first is not None and second is not None:
            norm_product = np.linalg.norm(first) * np.linalg.norm(second)
            if norm_product > 0:
                score = float(np.dot(first, second) / norm_product)
        scores.append(score)

    return scores

In [None]:
# Implementation

sick_wdpret_scores = run_gensim_model(sick_words_list, modelwd_pret)
msr_wdpret_scores = run_gensim_model(msr_words_list, modelwd_pret)

In [None]:
sick_wdpret_scores

In [None]:
msr_wdpret_scores

In [None]:
def run_spacy_model(words_list, model):
    '''Scores each pair with the ``similarity`` method of the objects ``model`` returns.

    Both items of a pair are passed to ``model``; the result for the first
    item is asked for its similarity to the result for the second item.
    Returns one score per pair, in input order.
    '''
    return [model(pair[0]).similarity(model(pair[1])) for pair in words_list]

In [None]:
# NOTE(review): presumably meant to make spaCy score with the GoogleNews Word2Vec
# vectors; confirm that spacy.load honours a `vectors=` file path in the installed
# spaCy version, otherwise the "en" model's own vectors are what gets used - TODO verify.
modelwd_spacy = spacy.load("en", vectors="GoogleNews-vectors-negative300.bin")

In [None]:
# Implementation

sick_wdspacy_scores = run_spacy_model(sick_filtered, modelwd_spacy)
msr_wdspacy_scores = run_spacy_model(msr_filtered, modelwd_spacy)

In [None]:
sick_wdspacy_scores

In [None]:
msr_wdspacy_scores

In [None]:
# Save UTF-8 version of text for training

# Both files are managed by `with` so they are closed even if the copy fails.
# The output encoding is set explicitly so the result really is UTF-8 and does
# not depend on the platform default.
with open("/project/1-billion-word-language-modeling-benchmark-r13output.tar", "r",
          encoding="ISO-8859-1") as billwords_raw, \
     open('/project/billion_words_dataset/billwords_preprocessed.txt', 'w',
          encoding="utf-8") as billwords_clean:

    for i, line in enumerate(billwords_raw):
        if i==0:
            # Drop everything on the first line that precedes "While".
            # find() returns -1 when the marker is absent; slicing with -1
            # would keep only the last character, so the line is left as is.
            pos = line.find("While")
            if pos != -1:
                line = line[pos:]

        billwords_clean.write(line)

In [None]:
class MySentences(object):
    '''Re-iterable stream of tokenized lines from every file in a directory.

    Each iteration walks the entries of ``dirname`` (as listed by os.listdir),
    reads them line by line and yields every line as a list of words after
    removing punctuation, lower-casing and stripping surrounding whitespace.
    '''

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # The table is the same for every file, so it is built once per pass.
        translator = str.maketrans("", "", string.punctuation)
        for fname in os.listdir(self.dirname):
            # `with` closes each file as soon as it has been read (or when the
            # generator is discarded) instead of leaking the handle.
            with open(os.path.join(self.dirname, fname)) as handle:
                for line in handle:
                    line = line.translate(translator)
                    line = line.lower().strip()
                    yield line.split()

In [None]:
# Assign documents to directory of dataset for training

documents = MySentences("/project/billion_words_dataset")

In [None]:
# Train CBOW model

modelwd_cbow = Word2Vec(documents, size=300, window=5, min_count=5, workers=20, negative=5, sg=0)
#modelwd_train.train(documents, total_examples=len(documents), epochs=20)

In [None]:
# Train Skip-Gram model

modelwd_sg = Word2Vec(documents, size=300, window=5, min_count=5, workers=20, negative=5, sg=1)

In [None]:
# Save CBOW model

modelwd_cbow.wv.save_word2vec_format('modelwd_cbow.bin', binary=True)

In [None]:
# Save Skip-Gram model

modelwd_sg.wv.save_word2vec_format('modelwd_sg.bin', binary=True)

In [None]:
# Pass lists of words: when both arguments are given as bare strings they are
# iterated character by character instead of being treated as single words.
modelwd_cbow.wv.most_similar(positive=['good'], negative=['bad'])

In [None]:
modelwd_cbow.wv.similarity('person', 'people')

In [None]:
# Implementation

sick_wdcbow_scores = run_gensim_model(sick_words_list, modelwd_cbow)
msr_wdcbow_scores = run_gensim_model(msr_words_list, modelwd_cbow)

sick_wdsg_scores = run_gensim_model(sick_words_list, modelwd_sg)
msr_wdsg_scores = run_gensim_model(msr_words_list, modelwd_sg)

In [None]:
sick_wdcbow_scores

In [None]:
msr_wdsg_scores

In [None]:
df_sick_w2v = pd.DataFrame({'Relatedness': sick_scores, 'Word2Vec_gensim': sick_wdpret_scores, 
                            'Word2Vec_spacy': sick_wdspacy_scores, 'Word2Vec_cbow': sick_wdcbow_scores,
                            'Word2Vec_sg': sick_wdsg_scores})
df_sick_w2v

In [None]:
df_sick_w2v.corr(method='pearson')

In [None]:
df_sick_w2v.corr(method='spearman')

In [None]:
# Convert scores to binary

msr_wdpret_binscores = convert_scores_to_binary(msr_wdpret_scores, 0.7)
msr_wdspacy_binscores = convert_scores_to_binary(msr_wdspacy_scores, 0.7)
msr_wdcbow_binscores = convert_scores_to_binary(msr_wdcbow_scores, 0.7)
msr_wdsg_binscores = convert_scores_to_binary(msr_wdsg_scores, 0.7)

In [None]:
df_msr_w2v = pd.DataFrame({'Similarity': msr_scores, 'Word2Vec_gensim': msr_wdpret_binscores, 
                           'Word2Vec_spacy': msr_wdspacy_binscores, 'Word2Vec_cbow': msr_wdcbow_binscores,
                           'Word2Vec_sg': msr_wdsg_binscores})
df_msr_w2v

In [None]:
# Confusion matrix of Word2Vec scores

#plt.figure(figsize=(6, 4))

msr_wdpret_cf = pd.DataFrame(confusion_matrix(df_msr_w2v['Similarity'], df_msr_w2v['Word2Vec_gensim']),  
                      columns=['W2V_gensim 0', 'W2V_gensim 1'], index=['MSR 0', 'MSR 1'])

msr_wdpret_acc = accuracy_score(df_msr_w2v['Similarity'], df_msr_w2v['Word2Vec_gensim'])

msr_wdspacy_cf = pd.DataFrame(confusion_matrix(df_msr_w2v['Similarity'], df_msr_w2v['Word2Vec_spacy']),  
                      columns=['W2V_spacy 0', 'W2V_spacy 1'], index=['MSR 0', 'MSR 1'])

msr_wdspacy_acc = accuracy_score(df_msr_w2v['Similarity'], df_msr_w2v['Word2Vec_spacy'])

msr_wdcbow_cf = pd.DataFrame(confusion_matrix(df_msr_w2v['Similarity'], df_msr_w2v['Word2Vec_cbow']),  
                      columns=['W2V_cbow 0', 'W2V_cbow 1'], index=['MSR 0', 'MSR 1'])

msr_wdcbow_acc = accuracy_score(df_msr_w2v['Similarity'], df_msr_w2v['Word2Vec_cbow'])

msr_wdsg_cf = pd.DataFrame(confusion_matrix(df_msr_w2v['Similarity'], df_msr_w2v['Word2Vec_sg']),  
                      columns=['W2V_sg 0', 'W2V_sg 1'], index=['MSR 0', 'MSR 1'])

msr_wdsg_acc = accuracy_score(df_msr_w2v['Similarity'], df_msr_w2v['Word2Vec_sg'])

#sns.heatmap(msr_wdpret_cf, annot=True, cmap='Blues')
#plt.show()
display(msr_wdpret_cf)
print('Accuracy:', msr_wdpret_acc)
display(msr_wdspacy_cf)
print('Accuracy:', msr_wdspacy_acc)
display(msr_wdcbow_cf)
print('Accuracy:', msr_wdcbow_acc)
display(msr_wdsg_cf)
print('Accuracy:', msr_wdsg_acc)

## GloVe

In [None]:
# Load pretrained corpus of GloVe vectors

glove2word2vec(glove_input_file="glove.6B.300d.txt", word2vec_output_file="gensim_glove_vectors.txt")

modelgl_pret = KeyedVectors.load_word2vec_format('gensim_glove_vectors.txt', binary=False)  

In [None]:
# Implementation

sick_glpret_scores = run_gensim_model(sick_words_list, modelgl_pret)
msr_glpret_scores = run_gensim_model(msr_words_list, modelgl_pret)

In [None]:
sick_glpret_scores

In [None]:
msr_glpret_scores

In [None]:
#modelgl_spacy = spacy.load("en_vectors_web_lg")

In [None]:
# NOTE(review): presumably meant to make spaCy score with the GloVe 6B vectors;
# confirm that spacy.load honours a `vectors=` file path in the installed spaCy
# version, otherwise the "en" model's own vectors are what gets used - TODO verify.
modelgl_spacy = spacy.load("en", vectors="glove.6B.300d.txt")

In [None]:
# Implementation

sick_glspacy_scores = run_spacy_model(sick_filtered, modelgl_spacy)
msr_glspacy_scores = run_spacy_model(msr_filtered, modelgl_spacy)

In [None]:
sick_glspacy_scores

In [None]:
msr_glspacy_scores

In [None]:
from glove import Corpus, Glove

def run_glove_model(words_list, model):
    '''Scores each sentence pair by the cosine similarity of its mean GloVe vectors.

    Args:
        words_list: list of pairs; each pair holds two sentences given as
            lists of words.
        model: object exposing ``dictionary`` (word -> row index) and
            ``word_vectors`` (indexable by that row index).

    Returns:
        A list with one float per pair. Words missing from ``dictionary`` are
        ignored (they would only rescale the mean, which does not affect the
        cosine). A pair scores 0.0 when either sentence has no known word or
        a zero-length vector.
    '''
    scores = []

    for pair in words_list:
        sentence_vectors = []
        for sent in pair:
            found = []
            for word in sent:
                try:
                    found.append(np.asarray(model.word_vectors[model.dictionary[word]]))
                except KeyError:
                    # Word is not in the model dictionary: skip it.
                    continue
            sentence_vectors.append(np.mean(found, axis=0) if found else None)

        first, second = sentence_vectors[0], sentence_vectors[1]
        score = 0.0
        if first is not None and second is not None:
            norm_product = np.linalg.norm(first) * np.linalg.norm(second)
            if norm_product > 0:
                score = float(np.dot(first, second) / norm_product)
        scores.append(score)

    return scores

In [None]:
# Train GloVe model using dataset assigned to documents

documents = MySentences("/project/billion_words_dataset")

corpus = Corpus()

corpus.fit(documents, window=5)
 
modelgl_train = Glove(no_components=300, learning_rate=0.05)

In [None]:
# Train model with epochs

modelgl_train.fit(corpus.matrix, epochs=5, no_threads=20, verbose=True)

In [None]:
# Add word embeddings to corpus dictionary

modelgl_train.add_dictionary(corpus.dictionary)

In [None]:
modelgl_train.save('modelgl_train.txt')

In [None]:
# Load trained file from demo.sh

glove2word2vec(glove_input_file="/project/GloVe/modelgl_train.txt", word2vec_output_file="modelgl_vectors.txt")

# Load pretrained corpus of GloVe vectors
modelgl_train = KeyedVectors.load_word2vec_format('modelgl_vectors.txt', binary=False)  

In [None]:
modelgl_train.most_similar('frog')

In [None]:
# modelgl_train now refers to the KeyedVectors loaded from modelgl_vectors.txt,
# which is indexed directly by word; `.word_vectors` / `.dictionary` belong to
# the glove.Glove object that this name held before it was rebound.
modelgl_train['frog']

In [None]:
# Implementation

sick_gltrain_scores = run_gensim_model(sick_words_list, modelgl_train)
msr_gltrain_scores = run_gensim_model(msr_words_list, modelgl_train)

In [None]:
sick_gltrain_scores

In [None]:
msr_gltrain_scores

In [None]:
# Convert scores to binary

msr_glpret_binscores = convert_scores_to_binary(msr_glpret_scores, 0.7)
msr_glspacy_binscores = convert_scores_to_binary(msr_glspacy_scores, 0.7)
msr_gltrain_binscores = convert_scores_to_binary(msr_gltrain_scores, 0.7)

In [None]:
df_sick_glv = pd.DataFrame({'Relatedness': sick_scores, 'GloVe_gensim': sick_glpret_scores, 
                              'GloVe_spacy': sick_glspacy_scores, 'GloVe_train': sick_gltrain_scores})
df_sick_glv

In [None]:
df_sick_glv.corr(method='pearson')

In [None]:
df_sick_glv.corr(method='spearman')

In [None]:
df_msr_glv = pd.DataFrame({'Similarity': msr_scores, 'GloVe_gensim': msr_glpret_binscores, 
                              'GloVe_spacy': msr_glspacy_binscores, 'GloVe_train': msr_gltrain_binscores})
df_msr_glv

In [None]:
# Confusion matrix of GloVe scores

#plt.figure(figsize=(6, 4))

msr_glpret_cf = pd.DataFrame(confusion_matrix(df_msr_glv['Similarity'], df_msr_glv['GloVe_gensim']),  
                      columns=['GloVe_gensim 0', 'GloVe_gensim 1'], index=['MSR 0', 'MSR 1'])

msr_glpret_acc = accuracy_score(df_msr_glv['Similarity'], df_msr_glv['GloVe_gensim'])

msr_glspacy_cf = pd.DataFrame(confusion_matrix(df_msr_glv['Similarity'], df_msr_glv['GloVe_spacy']),  
                      columns=['GloVe_spacy 0', 'GloVe_spacy 1'], index=['MSR 0', 'MSR 1'])

msr_glspacy_acc = accuracy_score(df_msr_glv['Similarity'], df_msr_glv['GloVe_spacy'])

msr_gltrain_cf = pd.DataFrame(confusion_matrix(df_msr_glv['Similarity'], df_msr_glv['GloVe_train']),  
                      columns=['GloVe_train 0', 'GloVe_train 1'], index=['MSR 0', 'MSR 1'])

msr_gltrain_acc = accuracy_score(df_msr_glv['Similarity'], df_msr_glv['GloVe_train'])

#sns.heatmap(msr_glpret_cf, annot=True, cmap='Blues')
#plt.show()
display(msr_glpret_cf)
print('Accuracy:', msr_glpret_acc)
display(msr_glspacy_cf)
print('Accuracy:', msr_glspacy_acc)
display(msr_gltrain_cf)
print('Accuracy:', msr_gltrain_acc)

## fastText

In [None]:
# Load trained Skip-Gram word embeddings file

modelft_sg = FastText.load_fasttext_format('/project/fastText/modelft_sg.bin')

In [None]:
# Load trained CBOW word embeddings file

modelft_cbow = FastText.load_fasttext_format('/project/fastText/modelft_cbow.bin')

In [None]:
modelft_cbow.most_similar('teacher')

In [None]:
modelft_cbow.similarity('teacher', 'lecturer')

In [None]:
# Implementation

sick_ftsg_scores = run_gensim_model(sick_words_list, modelft_sg)
sick_ftcbow_scores = run_gensim_model(sick_words_list, modelft_cbow)

msr_ftsg_scores = run_gensim_model(msr_words_list, modelft_sg)
msr_ftcbow_scores = run_gensim_model(msr_words_list, modelft_cbow)

In [None]:
sick_ftcbow_scores

In [None]:
msr_ftsg_scores

In [None]:
# Convert scores to binary

msr_ftsg_binscores = convert_scores_to_binary(msr_ftsg_scores, 0.7)
msr_ftcbow_binscores = convert_scores_to_binary(msr_ftcbow_scores, 0.7)

In [None]:
df_sick_ft = pd.DataFrame({'Relatedness': sick_scores, 'fastText_cbow': sick_ftcbow_scores,
                            'fastText_sg': sick_ftsg_scores})
df_sick_ft

In [None]:
df_sick_ft.corr(method='pearson')

In [None]:
df_sick_ft.corr(method='spearman')

In [None]:
df_msr_ft = pd.DataFrame({'Similarity': msr_scores, 'fastText_cbow': msr_ftcbow_binscores,
                           'fastText_sg': msr_ftsg_binscores})
df_msr_ft

In [None]:
# Confusion matrix of fastText scores

#plt.figure(figsize=(6, 4))

msr_ftcbow_cf = pd.DataFrame(confusion_matrix(df_msr_ft['Similarity'], df_msr_ft['fastText_cbow']),  
                      columns=['FT_cbow 0', 'FT_cbow 1'], index=['MSR 0', 'MSR 1'])

msr_ftcbow_acc = accuracy_score(df_msr_ft['Similarity'], df_msr_ft['fastText_cbow'])

msr_ftsg_cf = pd.DataFrame(confusion_matrix(df_msr_ft['Similarity'], df_msr_ft['fastText_sg']),  
                      columns=['FT_sg 0', 'FT_sg 1'], index=['MSR 0', 'MSR 1'])

msr_ftsg_acc = accuracy_score(df_msr_ft['Similarity'], df_msr_ft['fastText_sg'])

#sns.heatmap(msr_wdpret_cf, annot=True, cmap='Blues')
#plt.show()
display(msr_ftcbow_cf)
print('Accuracy:', msr_ftcbow_acc)
display(msr_ftsg_cf)
print('Accuracy:', msr_ftsg_acc)

## Sent2Vec

In [None]:
# Load model

modelsn_train = sent2vec.Sent2vecModel()
modelsn_train.load_model('/project/sent2vec/modelsn_train.bin')

In [None]:
# Load model

modelsn_unibi = sent2vec.Sent2vecModel()
modelsn_unibi.load_model('/project/sent2vec/modelsn_unibi.bin')

In [None]:
def run_sent2vec_model(sentences, model):
    '''Scores each sentence pair by the cosine similarity of its sent2vec embeddings.

    Args:
        sentences: list of pairs of sentence strings.
        model: object exposing ``embed_sentence(sentence)``; the returned
            array is flattened to one dimension before comparison.

    Returns:
        A list with one float per pair. A pair scores 0.0 when either
        sentence cannot be embedded or has a zero-length embedding.
    '''
    scores = []

    for pair in sentences:
        embeddings = []
        for sent in pair:
            try:
                embeddings.append(np.ravel(model.embed_sentence(sent)))
            except Exception:
                # Best effort, as before: a sentence that cannot be embedded
                # is recorded as missing rather than aborting the whole run.
                embeddings.append(None)

        first, second = embeddings[0], embeddings[1]
        score = 0.0
        if first is not None and second is not None:
            norm_product = np.linalg.norm(first) * np.linalg.norm(second)
            if norm_product > 0:
                score = float(np.dot(first, second) / norm_product)
        scores.append(score)

    return scores

In [None]:
# Implementation

sick_sntrain_scores = run_sent2vec_model(sick_filtered, modelsn_train)
msr_sntrain_scores = run_sent2vec_model(msr_filtered, modelsn_train)

sick_snunibi_scores = run_sent2vec_model(sick_filtered, modelsn_unibi)
msr_snunibi_scores = run_sent2vec_model(msr_filtered, modelsn_unibi)

In [None]:
sick_snunibi_scores

In [None]:
msr_snunibi_scores

In [None]:
# Convert scores to binary

msr_sntrain_binscores = convert_scores_to_binary(msr_sntrain_scores, 0.7)
msr_snunibi_binscores = convert_scores_to_binary(msr_snunibi_scores, 0.7)

In [None]:
df_sick_s2v = pd.DataFrame({'Relatedness': sick_scores, 'sent2vec_train': sick_sntrain_scores, 
                            'sent2vec_unibi': sick_snunibi_scores})
df_sick_s2v

In [None]:
df_sick_s2v.corr(method='pearson')

In [None]:
df_sick_s2v.corr(method='spearman')

In [None]:
df_msr_s2v = pd.DataFrame({'Similarity': msr_scores, 'sent2vec_train': msr_sntrain_binscores, 
                           'sent2vec_unibi': msr_snunibi_binscores})
df_msr_s2v

In [None]:
# Confusion matrix of Sent2Vec scores

#plt.figure(figsize=(6, 4))

msr_sntrain_cf = pd.DataFrame(confusion_matrix(df_msr_s2v['Similarity'], df_msr_s2v['sent2vec_train']),  
                      columns=['S2V_train 0', 'S2V_train 1'], index=['MSR 0', 'MSR 1'])

msr_sntrain_acc = accuracy_score(df_msr_s2v['Similarity'], df_msr_s2v['sent2vec_train'])

msr_snunibi_cf = pd.DataFrame(confusion_matrix(df_msr_s2v['Similarity'], df_msr_s2v['sent2vec_unibi']),  
                      columns=['S2V_unibi 0', 'S2V_unibi 1'], index=['MSR 0', 'MSR 1'])

msr_snunibi_acc = accuracy_score(df_msr_s2v['Similarity'], df_msr_s2v['sent2vec_unibi'])

#sns.heatmap(msr_wdpret_cf, annot=True, cmap='Blues')
#plt.show()
display(msr_sntrain_cf)
print('Accuracy:', msr_sntrain_acc)
display(msr_snunibi_cf)
print('Accuracy:', msr_snunibi_acc)

## Doc2Vec

In [None]:
def create_docs_list(sentences):

    '''Flattens sentence pairs into a single list: first sentence, second sentence, next pair, ...

    Only the first two items of each pair are kept.
    '''

    return [sentence for pair in sentences for sentence in (pair[0], pair[1])]

In [None]:
def create_tagged_documents(sentences):

    '''Flattens the sentence pairs and wraps each sentence in a TaggedDocument

    Each sentence is split on whitespace and tagged with its position in the flattened list.
    Returns the flattened list together with the list of tagged documents.
    '''

    documents = create_docs_list(sentences)

    tagged = [
        TaggedDocument(words=text.split(), tags=[position])
        for position, text in enumerate(documents)
    ]

    return documents, tagged

In [None]:
# Flatten each dataset's sentence pairs (sentence 2k and 2k+1 form pair k) and tag every
# sentence with its position; the *_docs_list values are the raw sentence strings.
sick_docs_list, sick_tagged_docs = create_tagged_documents(sick_filtered)
msr_docs_list, msr_tagged_docs = create_tagged_documents(msr_filtered)

In [None]:
# Preview only: the full list holds two tagged documents per SICK pair.
sick_tagged_docs[:5]

In [None]:
# Preview only: the full list holds two tagged documents per MSR pair.
msr_tagged_docs[:5]

In [None]:
class MyDocuments(object):

    '''Re-iterable stream of TaggedDocument objects, one per line of every file in a directory

    Each iteration re-reads the files, strips punctuation, lower-cases and whitespace-splits
    every line, and tags it with a running integer that is unique across the whole directory.
    '''

    def __init__(self, dirname):
        # Directory whose entries are all opened as text files on iteration.
        self.dirname = dirname
    
    def __iter__(self):
        # The table does not depend on the file, so build it once per pass.
        translator = str.maketrans("", "", string.punctuation)

        # One counter for the whole directory: restarting it per file gave line i of every
        # file the same tag, so unrelated documents shared one document vector.
        # NOTE(review): unique tags mean one vector per line of the corpus -- check that
        # this fits in memory for very large datasets.
        doc_id = 0

        # Sorted so every pass (vocabulary scan, each epoch) assigns the same tag to the same line.
        for fname in sorted(os.listdir(self.dirname)):
            with open(os.path.join(self.dirname, fname)) as handle:
                for line in handle:
                    line = line.translate(translator)
                    line = line.lower().strip()
                    yield TaggedDocument(words=line.split(), tags=[doc_id])
                    doc_id += 1

In [None]:
# Assign documents to directory of dataset for training

# MyDocuments is re-iterable: every pass re-reads the files in this directory and yields
# one TaggedDocument per line, so the corpus is streamed rather than held in a list.
tagged_docs = MyDocuments("/project/billion_words_dataset")

In [None]:
# Train a Doc2Vec model with dm=1 (gensim's distributed-memory mode) on the streamed corpus above.
# NOTE(review): alpha and min_alpha are both 0.025, so the learning rate presumably stays
# constant instead of decaying over training -- confirm this is intentional.
modeldc_dm = Doc2Vec(tagged_docs, vector_size=300, window=5, min_count=5, workers=20, alpha=0.025, min_alpha=0.025, 
                     negative=5, dm=1, dbow_words=0)

In [None]:
def train_doc2vec_model(tagged_docs, dm, dbow_words):
    
    '''Trains a Doc2Vec model on the given tagged documents

    Parameters:
        tagged_docs: iterable of TaggedDocument objects, passed straight to Doc2Vec
        dm: forwarded as Doc2Vec's dm argument (1 = distributed memory, 0 = DBOW)
        dbow_words: forwarded as Doc2Vec's dbow_words argument

    Returns:
        The Doc2Vec instance; training runs inside the constructor because the corpus is passed to it.
    '''
    
    # `epochs` is the keyword gensim recognises. The earlier spelling `epoch` is not a
    # Doc2Vec parameter, so the intended 20 passes were never applied.
    # NOTE(review): alpha == min_alpha keeps the learning rate constant -- confirm intended.
    model = Doc2Vec(tagged_docs, vector_size=300, window=5, min_count=1, workers=4, alpha=0.025, min_alpha=0.025,
                    negative=5, epochs=20, dm=dm, dbow_words=dbow_words)
    
    return model

In [None]:
# Implementation

# Three Doc2Vec variants trained directly on the SICK sentences:
# dm=1, dm=0 with dbow_words=0, and dm=0 with dbow_words=1.
sick_modeldc_dm = train_doc2vec_model(sick_tagged_docs, dm=1, dbow_words=0)
sick_modeldc_dbow0 = train_doc2vec_model(sick_tagged_docs, dm=0, dbow_words=0)
sick_modeldc_dbow1 = train_doc2vec_model(sick_tagged_docs, dm=0, dbow_words=1)

# The same three variants trained on the MSR sentences.
msr_modeldc_dm = train_doc2vec_model(msr_tagged_docs, dm=1, dbow_words=0)
msr_modeldc_dbow0 = train_doc2vec_model(msr_tagged_docs, dm=0, dbow_words=0)
msr_modeldc_dbow1 = train_doc2vec_model(msr_tagged_docs, dm=0, dbow_words=1)

In [None]:
# Nearest neighbours of 'kid' among the word vectors of the SICK-trained dm=1 model.
# Word vectors live on `.wv`; most_similar on the model itself is deprecated in gensim 3.x
# and removed in later releases.
sick_modeldc_dm.wv.most_similar('kid')

In [None]:
# Nearest neighbours of 'kid' among the word vectors of the MSR-trained dm=1 model.
# Word vectors live on `.wv`; most_similar on the model itself is deprecated in gensim 3.x
# and removed in later releases.
msr_modeldc_dm.wv.most_similar('kid')

In [None]:
# infer_vector expects a list of tokens; sick_docs_list holds raw sentence strings, so split
# first (a bare string would be consumed character by character).
sick_modeldc_dm.infer_vector(sick_docs_list[0].split())

In [None]:
def run_doc2vec_model(docs_list, model):
    
    '''Runs a Doc2Vec model and computes a cosine similarity score for each sentence (document) pair

    Parameters:
        docs_list: flat list in which items 2k and 2k+1 form pair k; each item is either a raw
            sentence string (split on whitespace here) or an already tokenised list of words
        model: object exposing infer_vector(list_of_tokens) and returning a numeric vector

    Returns:
        List of Python floats, one per pair and in input order.

    Raises:
        ValueError: if docs_list holds an odd number of documents.
    '''

    if len(docs_list) % 2 != 0:
        raise ValueError('docs_list must hold an even number of documents (two per pair)')

    def as_tokens(doc):
        # infer_vector needs a list of words; a bare string would be read character by character.
        return doc.split() if isinstance(doc, str) else list(doc)

    scores = []

    for i in range(0, len(docs_list), 2):

        vector1 = model.infer_vector(as_tokens(docs_list[i]))
        vector2 = model.infer_vector(as_tokens(docs_list[i+1]))
        
        cos_sim = cosine_similarity(np.asarray(vector1).reshape(1, -1), np.asarray(vector2).reshape(1, -1))
        # cosine_similarity returns a 1x1 matrix; index it explicitly because converting a
        # 2-D array with float() is deprecated in recent NumPy releases.
        scores.append(float(cos_sim[0, 0]))
        
    return scores

In [None]:
# Implementation

# Cosine similarity of every SICK pair under each of the three SICK-trained Doc2Vec variants.
sick_dcdm_scores = run_doc2vec_model(sick_docs_list, sick_modeldc_dm)
sick_dcdbow0_scores = run_doc2vec_model(sick_docs_list, sick_modeldc_dbow0)
sick_dcdbow1_scores = run_doc2vec_model(sick_docs_list, sick_modeldc_dbow1)

# Same for the MSR pairs with the MSR-trained variants.
msr_dcdm_scores = run_doc2vec_model(msr_docs_list, msr_modeldc_dm)
msr_dcdbow0_scores = run_doc2vec_model(msr_docs_list, msr_modeldc_dbow0)
msr_dcdbow1_scores = run_doc2vec_model(msr_docs_list, msr_modeldc_dbow1)

In [None]:
# Convert scores to binary

# The MSR scores are compared as classes in the confusion matrices below, so the continuous
# cosine scores go through convert_scores_to_binary with 0.7 as its second argument
# (presumably a decision threshold -- the helper is defined earlier in the notebook; confirm there).
msr_dcdm_binscores = convert_scores_to_binary(msr_dcdm_scores, 0.7)
msr_dcdbow0_binscores = convert_scores_to_binary(msr_dcdbow0_scores, 0.7)
msr_dcdbow1_binscores = convert_scores_to_binary(msr_dcdbow1_scores, 0.7)

In [None]:
# Sanity check: this should equal len(sick_scores), otherwise the DataFrame in the next
# cell cannot be built from columns of different lengths.
len(sick_dcdm_scores)

In [None]:
# One row per SICK pair: sick_scores (labelled 'Relatedness', defined earlier in the notebook)
# next to the cosine score of each Doc2Vec variant, ready for the column-wise correlations below.
df_sick_d2v = pd.DataFrame({'Relatedness': sick_scores, 'Doc2Vec_dm': sick_dcdm_scores, 'Doc2Vec_dbow0': sick_dcdbow0_scores,
                              'Doc2Vec_dbow1': sick_dcdbow1_scores})
df_sick_d2v

In [None]:
# Pearson (linear) correlation between every pair of columns, including each variant vs. 'Relatedness'.
df_sick_d2v.corr(method='pearson')

In [None]:
# Spearman (rank) correlation for the same columns; it depends only on the ordering of the scores.
df_sick_d2v.corr(method='spearman')

In [None]:
# One row per MSR pair: msr_scores (labelled 'Similarity', defined earlier in the notebook)
# next to the binarised prediction of each Doc2Vec variant.
df_msr_d2v = pd.DataFrame({'Similarity': msr_scores, 'Doc2Vec_dm': msr_dcdm_binscores, 'Doc2Vec_dbow0': msr_dcdbow0_binscores,
                              'Doc2Vec_dbow1': msr_dcdbow1_binscores})
df_msr_d2v

In [None]:
# Confusion matrix of Doc2Vec scores

# Rows are the 'Similarity' labels (index 'MSR 0' / 'MSR 1'), columns the binarised
# predictions of each Doc2Vec variant.
msr_dcdm_cf = pd.DataFrame(confusion_matrix(df_msr_d2v['Similarity'], df_msr_d2v['Doc2Vec_dm']),
                      columns=['D2V_dm 0', 'D2V_dm 1'], index=['MSR 0', 'MSR 1'])

msr_dcdbow0_cf = pd.DataFrame(confusion_matrix(df_msr_d2v['Similarity'], df_msr_d2v['Doc2Vec_dbow0']),
                      columns=['D2V_dbow0 0', 'D2V_dbow0 1'], index=['MSR 0', 'MSR 1'])

msr_dcdbow1_cf = pd.DataFrame(confusion_matrix(df_msr_d2v['Similarity'], df_msr_d2v['Doc2Vec_dbow1']),
                      columns=['D2V_dbow1 0', 'D2V_dbow1 1'], index=['MSR 0', 'MSR 1'])

# display() renders each frame as a rich table instead of the plain-text repr that print() gives.
display(msr_dcdm_cf)
display(msr_dcdbow0_cf)
display(msr_dcdbow1_cf)