In [None]:
# Load libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk, re, string, os
import gensim, spacy

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

In [19]:
# Load datasets

sick = open("SICK.txt", "r")
msr1 = open("msr_train.txt", "r")
msr2 = open("msr_test.txt", "r")
#ppdb = open("ppdb-2.0-l-all", "r")

## Data munging

In [20]:
def read_text_file(file):
    
    '''Reads lines in file and appends to a corpus list'''
    
    corpus = []
    for i, line in enumerate(file):
        if i==0:
            continue
        else:
            corpus.append(line)
    
    return corpus

In [21]:
def read_more_files(file1, file2=None):
    
    '''Reads lines in two files and appends them to a corpus list'''
    
    if file2==None:
        corpus = read_text_file(file1)
        
    else:
        corpus1 = read_text_file(file1)
        corpus2 = read_text_file(file2)
        corpus = corpus1 + corpus2
    
    print(len(corpus))
    return corpus   

In [22]:
# Implementation

#ppdb_corpus = read_more_files(ppdb)
#ppdb_corpus

In [23]:
# Implementation

sick_corpus = read_more_files(sick)
sick_corpus

9840


['1\tA group of kids is playing in a yard and an old man is standing in the background\tA group of boys in a yard is playing and a man is standing in the background\tNEUTRAL\t4.5\tA_neutral_B\tB_neutral_A\tA group of children playing in a yard, a man in the background.\tA group of children playing in a yard, a man in the background.\tFLICKR\tFLICKR\tTRAIN\n',
 '2\tA group of children is playing in the house and there is no man standing in the background\tA group of kids is playing in a yard and an old man is standing in the background\tNEUTRAL\t3.2\tA_contradicts_B\tB_neutral_A\tA group of children playing in a yard, a man in the background.\tA group of children playing in a yard, a man in the background.\tFLICKR\tFLICKR\tTRAIN\n',
 '3\tThe young boys are playing outdoors and the man is smiling nearby\tThe kids are playing outdoors near a man with a smile\tENTAILMENT\t4.7\tA_entails_B\tB_entails_A\tThe children are playing outdoors, while a man smiles nearby.\tThe children are playing 

In [24]:
# Implementation

msr_corpus = read_more_files(msr1, msr2)
msr_corpus

5801


['1\t702876\t702977\tAmrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.\tReferring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.\n',
 "0\t2108705\t2108831\tYucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion.\tYucaipa bought Dominick's in 1995 for $693 million and sold it to Safeway for $1.8 billion in 1998.\n",
 "1\t1330381\t1330521\tThey had published an advertisement on the Internet on June 10, offering the cargo for sale, he added.\tOn June 10, the ship's owners had published an advertisement on the Internet, offering the explosives for sale.\n",
 '0\t3344667\t3344648\tAround 0335 GMT, Tab shares were up 19 cents, or 4.4%, at A$4.56, having earlier set a record high of A$4.57.\tTab shares jumped 20 cents, or 4.6%, to set a record closing high at A$4.57.\n',
 '1\t1236820\t1236712\tThe stock rose $2.11, or about 11 percent, to close Friday at $21.51 on

In [25]:
def extract_sentences_scores(corpus):
    
    '''Creates a list of sentence pairs omitting punctuation and a list of similarity scores'''
    
    sentences, scores = [], []
    for line in corpus:
        words = line.split('\t')
        
        for i, word in enumerate(words):
            try:
                int(word[0])
                continue
                
            except:
                sent1 = words[i].lower().strip()
                sent2 = words[i+1].lower().strip()
                
                translator = str.maketrans("", "", string.punctuation)
                sent1 = sent1.translate(translator)
                sent2 = sent2.translate(translator)
                
                sentences.append([sent1, sent2])
                
                if i==1:
                    score = words[4]
                    scores.append(float(score))
                
                elif i==3:
                    score = words[0]
                    scores.append(float(score))
                
                break
            
            else: continue
        
    return sentences, scores

In [26]:
# Implementation

sick_sentences, sick_scores = extract_sentences_scores(sick_corpus)
msr_sentences, msr_scores = extract_sentences_scores(msr_corpus)

In [27]:
sick_sentences

[['a group of kids is playing in a yard and an old man is standing in the background',
  'a group of boys in a yard is playing and a man is standing in the background'],
 ['a group of children is playing in the house and there is no man standing in the background',
  'a group of kids is playing in a yard and an old man is standing in the background'],
 ['the young boys are playing outdoors and the man is smiling nearby',
  'the kids are playing outdoors near a man with a smile'],
 ['the young boys are playing outdoors and the man is smiling nearby',
  'there is no boy playing outdoors and there is no man smiling'],
 ['the kids are playing outdoors near a man with a smile',
  'a group of kids is playing in a yard and an old man is standing in the background'],
 ['there is no boy playing outdoors and there is no man smiling',
  'a group of kids is playing in a yard and an old man is standing in the background'],
 ['a group of boys in a yard is playing and a man is standing in the backgro

In [28]:
msr_sentences

[['amrozi accused his brother whom he called the witness of deliberately distorting his evidence',
  'referring to him as only the witness amrozi accused his brother of deliberately distorting his evidence'],
 ['yucaipa owned dominicks before selling the chain to safeway in 1998 for 25 billion',
  'yucaipa bought dominicks in 1995 for 693 million and sold it to safeway for 18 billion in 1998'],
 ['they had published an advertisement on the internet on june 10 offering the cargo for sale he added',
  'on june 10 the ships owners had published an advertisement on the internet offering the explosives for sale'],
 ['around 0335 gmt tab shares were up 19 cents or 44 at a456 having earlier set a record high of a457',
  'tab shares jumped 20 cents or 46 to set a record closing high at a457'],
 ['the stock rose 211 or about 11 percent to close friday at 2151 on the new york stock exchange',
  'pge corp shares jumped 163 or 8 percent to 2103 on the new york stock exchange on friday'],
 ['revenu

## Stemming and Lemmatization

In [29]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

def stemming_words(sentences):
    
    '''Stems words in sentences using Porter Stemmer'''
    
    porter_stemmer = PorterStemmer()
    
    sentences_stem = []
    for pair in sentences:
        pair_new = []
        for sent in pair:
            sent_new = []
            words = word_tokenize(sent)
            for word in words:
                sent_new.append(porter_stemmer.stem(word))
            doc = ' '.join(sent_new)
            pair_new.append(doc)
        
        sentences_stem.append(pair_new)
    
    return sentences_stem
        

In [30]:
# Implementation

sick_sentences_stem = stemming_words(sick_sentences)
msr_sentences_stem = stemming_words(msr_sentences)

In [31]:
sick_sentences_stem

[['a group of kid is play in a yard and an old man is stand in the background',
  'a group of boy in a yard is play and a man is stand in the background'],
 ['a group of children is play in the hous and there is no man stand in the background',
  'a group of kid is play in a yard and an old man is stand in the background'],
 ['the young boy are play outdoor and the man is smile nearbi',
  'the kid are play outdoor near a man with a smile'],
 ['the young boy are play outdoor and the man is smile nearbi',
  'there is no boy play outdoor and there is no man smile'],
 ['the kid are play outdoor near a man with a smile',
  'a group of kid is play in a yard and an old man is stand in the background'],
 ['there is no boy play outdoor and there is no man smile',
  'a group of kid is play in a yard and an old man is stand in the background'],
 ['a group of boy in a yard is play and a man is stand in the background',
  'the young boy are play outdoor and the man is smile nearbi'],
 ['a group of 

In [32]:
msr_sentences_stem

[['amrozi accus hi brother whom he call the wit of deliber distort hi evid',
  'refer to him as onli the wit amrozi accus hi brother of deliber distort hi evid'],
 ['yucaipa own dominick befor sell the chain to safeway in 1998 for 25 billion',
  'yucaipa bought dominick in 1995 for 693 million and sold it to safeway for 18 billion in 1998'],
 ['they had publish an advertis on the internet on june 10 offer the cargo for sale he ad',
  'on june 10 the ship owner had publish an advertis on the internet offer the explos for sale'],
 ['around 0335 gmt tab share were up 19 cent or 44 at a456 have earlier set a record high of a457',
  'tab share jump 20 cent or 46 to set a record close high at a457'],
 ['the stock rose 211 or about 11 percent to close friday at 2151 on the new york stock exchang',
  'pge corp share jump 163 or 8 percent to 2103 on the new york stock exchang on friday'],
 ['revenu in the first quarter of the year drop 15 percent from the same period a year earlier',
  'with th

In [33]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    '''Gets POS tags from Wordnet'''
    
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''
    

In [34]:
from nltk import pos_tag

def pos_tagging(sentences):
    
    '''Part-of-Speech tagging using Wordnet treebank tags'''
    
    sentences_pos = []
    for pair in sentences:
        pair_new = []
        for sent in pair:
            sent_new = []
            words = word_tokenize(sent)
            tagged = pos_tag(words)
            for token, tag in tagged:
                wntag = get_wordnet_pos(tag)
                wordset = (token, wntag)
                sent_new.append(wordset)
            pair_new.append(sent_new) 
        sentences_pos.append(pair_new)
    
    return sentences_pos

In [35]:
# Implementation

sick_sentences_pos = pos_tagging(sick_sentences)
msr_sentences_pos = pos_tagging(msr_sentences)

In [36]:
sick_sentences_pos

[[[('a', ''),
   ('group', 'n'),
   ('of', ''),
   ('kids', 'n'),
   ('is', 'v'),
   ('playing', 'v'),
   ('in', ''),
   ('a', ''),
   ('yard', 'n'),
   ('and', ''),
   ('an', ''),
   ('old', 'a'),
   ('man', 'n'),
   ('is', 'v'),
   ('standing', 'v'),
   ('in', ''),
   ('the', ''),
   ('background', 'n')],
  [('a', ''),
   ('group', 'n'),
   ('of', ''),
   ('boys', 'n'),
   ('in', ''),
   ('a', ''),
   ('yard', 'n'),
   ('is', 'v'),
   ('playing', 'v'),
   ('and', ''),
   ('a', ''),
   ('man', 'n'),
   ('is', 'v'),
   ('standing', 'v'),
   ('in', ''),
   ('the', ''),
   ('background', 'n')]],
 [[('a', ''),
   ('group', 'n'),
   ('of', ''),
   ('children', 'n'),
   ('is', 'v'),
   ('playing', 'v'),
   ('in', ''),
   ('the', ''),
   ('house', 'n'),
   ('and', ''),
   ('there', ''),
   ('is', 'v'),
   ('no', ''),
   ('man', 'n'),
   ('standing', 'v'),
   ('in', ''),
   ('the', ''),
   ('background', 'n')],
  [('a', ''),
   ('group', 'n'),
   ('of', ''),
   ('kids', 'n'),
   ('is', 'v'),


In [37]:
msr_sentences_pos

[[[('amrozi', 'n'),
   ('accused', 'v'),
   ('his', ''),
   ('brother', 'n'),
   ('whom', ''),
   ('he', ''),
   ('called', 'v'),
   ('the', ''),
   ('witness', 'n'),
   ('of', ''),
   ('deliberately', 'r'),
   ('distorting', 'v'),
   ('his', ''),
   ('evidence', 'n')],
  [('referring', 'v'),
   ('to', ''),
   ('him', ''),
   ('as', ''),
   ('only', 'r'),
   ('the', ''),
   ('witness', 'n'),
   ('amrozi', 'n'),
   ('accused', 'v'),
   ('his', ''),
   ('brother', 'n'),
   ('of', ''),
   ('deliberately', 'r'),
   ('distorting', 'v'),
   ('his', ''),
   ('evidence', 'n')]],
 [[('yucaipa', 'n'),
   ('owned', 'v'),
   ('dominicks', 'n'),
   ('before', ''),
   ('selling', 'v'),
   ('the', ''),
   ('chain', 'n'),
   ('to', ''),
   ('safeway', 'v'),
   ('in', ''),
   ('1998', ''),
   ('for', ''),
   ('25', ''),
   ('billion', '')],
  [('yucaipa', 'r'),
   ('bought', 'v'),
   ('dominicks', 'n'),
   ('in', ''),
   ('1995', ''),
   ('for', ''),
   ('693', ''),
   ('million', ''),
   ('and', ''),


In [38]:
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatizing_words(sentences):
    
    '''Lemmatizes words in sentences that are POS-tagged'''
    
    sentences_pos = pos_tagging(sentences)
    
    lemmatizer = WordNetLemmatizer()
    
    sentences_lemma = []
    for pair in sentences_pos:
        pair_new = []
        for sent in pair:
            sent_new = []
            for wordset in sent:
                try:
                    sent_new.append(lemmatizer.lemmatize(wordset[0], pos=wordset[1]))
                except:
                    sent_new.append(wordset[0])
                doc = ' '.join(sent_new)
            pair_new.append(doc)
        
        sentences_lemma.append(pair_new)
    
    return sentences_lemma

In [39]:
# Implementation

sick_sentences_lemma = lemmatizing_words(sick_sentences)
msr_sentences_lemma = lemmatizing_words(msr_sentences)

In [40]:
sick_sentences_lemma

[['a group of kid be play in a yard and an old man be stand in the background',
  'a group of boy in a yard be play and a man be stand in the background'],
 ['a group of child be play in the house and there be no man stand in the background',
  'a group of kid be play in a yard and an old man be stand in the background'],
 ['the young boy be play outdoors and the man be smile nearby',
  'the kid be play outdoors near a man with a smile'],
 ['the young boy be play outdoors and the man be smile nearby',
  'there be no boy playing outdoors and there be no man smile'],
 ['the kid be play outdoors near a man with a smile',
  'a group of kid be play in a yard and an old man be stand in the background'],
 ['there be no boy playing outdoors and there be no man smile',
  'a group of kid be play in a yard and an old man be stand in the background'],
 ['a group of boy in a yard be play and a man be stand in the background',
  'the young boy be play outdoors and the man be smile nearby'],
 ['a gro

In [41]:
msr_sentences_lemma

[['amrozi accuse his brother whom he call the witness of deliberately distort his evidence',
  'refer to him as only the witness amrozi accuse his brother of deliberately distort his evidence'],
 ['yucaipa own dominick before sell the chain to safeway in 1998 for 25 billion',
  'yucaipa buy dominick in 1995 for 693 million and sell it to safeway for 18 billion in 1998'],
 ['they have publish an advertisement on the internet on june 10 offer the cargo for sale he add',
  'on june 10 the ship owner have publish an advertisement on the internet offer the explosive for sale'],
 ['around 0335 gmt tab share be up 19 cent or 44 at a456 have early set a record high of a457',
  'tab share jump 20 cent or 46 to set a record closing high at a457'],
 ['the stock rise 211 or about 11 percent to close friday at 2151 on the new york stock exchange',
  'pge corp share jump 163 or 8 percent to 2103 on the new york stock exchange on friday'],
 ['revenue in the first quarter of the year drop 15 percent f

## Filtering stopwords

In [42]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
    
def remove_stopwords(sentences, stopwords):
    
    '''Removes English stopwords from sentences'''
    
    filtered_words = []
    for pair in sentences:
        pair_new = []
        for sent in pair:
            sent_new = []
            words = word_tokenize(sent)
            for word in words:
                if word not in stopwords:
                    sent_new.append(word)
            doc = ' '.join(sent_new)
            pair_new.append(doc)
        
        filtered_words.append(pair_new)
    
    return filtered_words

In [43]:
# Implementation

sick_filtered = remove_stopwords(sick_sentences_lemma, stopwords)
msr_filtered = remove_stopwords(msr_sentences_lemma, stopwords)

In [44]:
sick_filtered

[['group kid play yard old man stand background',
  'group boy yard play man stand background'],
 ['group child play house man stand background',
  'group kid play yard old man stand background'],
 ['young boy play outdoors man smile nearby',
  'kid play outdoors near man smile'],
 ['young boy play outdoors man smile nearby',
  'boy playing outdoors man smile'],
 ['kid play outdoors near man smile',
  'group kid play yard old man stand background'],
 ['boy playing outdoors man smile',
  'group kid play yard old man stand background'],
 ['group boy yard play man stand background',
  'young boy play outdoors man smile nearby'],
 ['group child play house man stand background',
  'young boy play outdoors man smile nearby'],
 ['young boy play outdoors man smile nearby',
  'group kid play yard old man stand background'],
 ['brown dog attack another animal front tall man pant',
  'brown dog attack another animal front man pant'],
 ['brown dog attack another animal front man pant',
  'brown do

In [45]:
msr_filtered

[['amrozi accuse brother call witness deliberately distort evidence',
  'refer witness amrozi accuse brother deliberately distort evidence'],
 ['yucaipa dominick sell chain safeway 1998 25 billion',
  'yucaipa buy dominick 1995 693 million sell safeway 18 billion 1998'],
 ['publish advertisement internet june 10 offer cargo sale add',
  'june 10 ship owner publish advertisement internet offer explosive sale'],
 ['around 0335 gmt tab share 19 cent 44 a456 early set record high a457',
  'tab share jump 20 cent 46 set record closing high a457'],
 ['stock rise 211 11 percent close friday 2151 new york stock exchange',
  'pge corp share jump 163 8 percent 2103 new york stock exchange friday'],
 ['revenue first quarter year drop 15 percent period year earlier',
  'scandal hanging stewart company revenue first quarter year drop 15 percent period year earlier'],
 ['nasdaq weekly gain 1727 12 percent closing 152015 friday',
  'techlaced nasdaq composite ixic rally 3046 point 204 percent 152015'

## Bag-of-Words model

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def run_bow_model(sentences):
    
    '''Creates BoW model and calculates cosine similarity of each sentence pair'''
    
    count_vec = CountVectorizer(stop_words="english", analyzer='word', ngram_range=(1, 2), 
                               max_df=1.0, min_df=0.5, max_features=None)
    
    bow_matrix, bow_scores = [], []
    for pair in sentences:
        bow_model = count_vec.fit_transform(pair)
        bow_model = bow_model.toarray()
        bow_matrix.append(bow_model)

        cos_sim = cosine_similarity(bow_model[0,:].reshape(1,-1), bow_model[1,:].reshape(1, -1))
        bow_scores.append(float(cos_sim))
        
    
    return bow_matrix, bow_scores

In [31]:
# Implementation

sick_bow_matrix, sick_bow_scores = run_bow_model(sick_sentences)
msr_bow_matrix, msr_bow_scores = run_bow_model(msr_sentences)

sick_bow_stem_matrix, sick_bow_stem_scores = run_bow_model(sick_sentences_stem)
msr_bow_stem_matrix, msr_bow_stem_scores = run_bow_model(msr_sentences_stem)

sick_bow_lem_matrix, sick_bow_lem_scores = run_bow_model(sick_sentences_lemma)
msr_bow_lem_matrix, msr_bow_lem_scores = run_bow_model(msr_sentences_lemma)

sick_bow_stw_matrix, sick_bow_stw_scores = run_bow_model(sick_filtered)
msr_bow_stw_matrix, msr_bow_stw_scores = run_bow_model(msr_filtered)

In [32]:
sick_bow_matrix

[array([[1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1]],
       dtype=int64),
 array([[1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0],
        [1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]],
       dtype=int64),
 array([[1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0]],
       dtype=int64),
 array([[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0]], dtype=int64),
 array([[0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1]],
       dtype=int64),
 array([[0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0],
        [1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1]],
       dtype=int64),
 array([[1, 1, 0, 1, 1, 1, 1, 0, 1,

In [33]:
msr_bow_matrix

[array([[1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1],
        [1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]],
       dtype=int64),
 array([[0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
         1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
         0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0]], dtype=int64),
 array([[1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
         1, 1, 0, 0],
        [1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
         1, 0, 1, 1]], dtype=int64),
 array([[1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,
         1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1],
        [0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]], dtype=int64),
 array([[1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
         1, 0, 1, 0, 0, 1, 1

## TF-IDF model

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

def run_tfidf_model(sentences):
    
    '''Creates TF-IDF model and calculates cosine similarity of each sentence pair'''
    
    tfidf_vec = TfidfVectorizer(stop_words="english", analyzer='word', ngram_range=(1, 2), 
                               max_df=1.0, min_df=0.5, max_features=None)
    
    tfidf_matrix, tfidf_scores = [], []
    for pair in sentences:
        tfidf_model = tfidf_vec.fit_transform(pair)
        tfidf_model = tfidf_model.toarray()
        tfidf_matrix.append(tfidf_model)

        cos_sim = cosine_similarity(tfidf_model[0,:].reshape(1,-1), tfidf_model[1,:].reshape(1, -1))
        tfidf_scores.append(float(cos_sim))
           
    return tfidf_matrix, tfidf_scores

In [35]:
# Implementation

sick_tfidf_matrix, sick_tfidf_scores = run_tfidf_model(sick_sentences)
msr_tfidf_matrix, msr_tfidf_scores = run_tfidf_model(msr_sentences)

sick_tfidf_stem_matrix, sick_tfidf_stem_scores = run_tfidf_model(sick_sentences_stem)
msr_tfidf_stem_matrix, msr_tfidf_stem_scores = run_tfidf_model(msr_sentences_stem)

sick_tfidf_lem_matrix, sick_tfidf_lem_scores = run_tfidf_model(sick_sentences_lemma)
msr_tfidf_lem_matrix, msr_tfidf_lem_scores = run_tfidf_model(msr_sentences_lemma)

sick_tfidf_stw_matrix, sick_tfidf_stw_scores = run_tfidf_model(sick_filtered)
msr_tfidf_stw_matrix, msr_tfidf_stw_scores = run_tfidf_model(msr_filtered)

In [36]:
sick_tfidf_matrix

[array([[0.21404236, 0.        , 0.        , 0.21404236, 0.        ,
         0.30082908, 0.30082908, 0.30082908, 0.21404236, 0.21404236,
         0.30082908, 0.30082908, 0.21404236, 0.        , 0.30082908,
         0.21404236, 0.21404236, 0.21404236, 0.30082908, 0.        ],
        [0.23651397, 0.33241213, 0.33241213, 0.23651397, 0.33241213,
         0.        , 0.        , 0.        , 0.23651397, 0.23651397,
         0.        , 0.        , 0.23651397, 0.33241213, 0.        ,
         0.23651397, 0.23651397, 0.23651397, 0.        , 0.33241213]]),
 array([[0.23031454, 0.32369906, 0.32369906, 0.23031454, 0.32369906,
         0.        , 0.32369906, 0.32369906, 0.        , 0.        ,
         0.23031454, 0.23031454, 0.        , 0.        , 0.23031454,
         0.32369906, 0.        , 0.23031454, 0.23031454, 0.        ,
         0.        ],
        [0.20941475, 0.        , 0.        , 0.20941475, 0.        ,
         0.29432513, 0.        , 0.        , 0.29432513, 0.29432513,
        

In [37]:
msr_tfidf_matrix

[array([[0.23001377, 0.23001377, 0.23001377, 0.23001377, 0.23001377,
         0.32327633, 0.        , 0.32327633, 0.32327633, 0.23001377,
         0.23001377, 0.23001377, 0.23001377, 0.23001377, 0.        ,
         0.        , 0.23001377, 0.        , 0.32327633],
        [0.23001377, 0.23001377, 0.23001377, 0.23001377, 0.23001377,
         0.        , 0.32327633, 0.        , 0.        , 0.23001377,
         0.23001377, 0.23001377, 0.23001377, 0.23001377, 0.32327633,
         0.32327633, 0.23001377, 0.32327633, 0.        ]]),
 array([[0.        , 0.        , 0.        , 0.        , 0.18665039,
         0.26233061, 0.26233061, 0.26233061, 0.        , 0.        ,
         0.18665039, 0.        , 0.        , 0.        , 0.26233061,
         0.26233061, 0.18665039, 0.        , 0.26233061, 0.        ,
         0.        , 0.26233061, 0.26233061, 0.18665039, 0.        ,
         0.26233061, 0.26233061, 0.26233061, 0.        , 0.        ,
         0.18665039, 0.        , 0.26233061],
        

In [38]:
df_sick_bow = pd.DataFrame({'Relatedness': sick_scores, 'BoW': sick_bow_scores, 'BoW_stem': sick_bow_stem_scores,
                           'BoW_lemma': sick_bow_lem_scores, 'BoW_filt': sick_bow_stw_scores})
df_sick_bow

Unnamed: 0,Relatedness,BoW,BoW_stem,BoW_lemma,BoW_filt
0,4.500,0.572892,0.572892,0.572892,0.572892
1,3.200,0.501280,0.501280,0.501280,0.501280
2,4.700,0.334497,0.501745,0.501745,0.501745
3,3.600,0.647150,0.832050,0.554700,0.554700
4,3.400,0.311400,0.311400,0.311400,0.311400
5,3.300,0.172133,0.172133,0.086066,0.086066
6,3.700,0.230769,0.230769,0.230769,0.230769
7,3.000,0.153846,0.153846,0.153846,0.153846
8,3.700,0.143223,0.143223,0.143223,0.143223
9,4.900,0.836242,0.859338,0.836242,0.836242


In [39]:
df_sick_tfidf = pd.DataFrame({'Relatedness': sick_scores, 'TF-IDF': sick_tfidf_scores, 
                              'TF-IDF_stem': sick_tfidf_stem_scores, 'TF-IDF_lemma': sick_tfidf_lem_scores, 
                              'TF-IDF_filt': sick_tfidf_stw_scores})
df_sick_tfidf

Unnamed: 0,Relatedness,TF-IDF,TF-IDF_stem,TF-IDF_lemma,TF-IDF_filt
0,4.500,0.404992,0.404992,0.404992,0.404992
1,3.200,0.337619,0.337619,0.337619,0.337619
2,4.700,0.203006,0.338174,0.338174,0.338174
3,3.600,0.487191,0.729728,0.390186,0.390186
4,3.400,0.186771,0.186771,0.186771,0.186771
5,3.300,0.095554,0.095554,0.045574,0.045574
6,3.700,0.131849,0.131849,0.131849,0.131849
7,3.000,0.084286,0.084286,0.084286,0.084286
8,3.700,0.078040,0.078040,0.078040,0.078040
9,4.900,0.724107,0.758156,0.724107,0.724107


In [40]:
# Pearson's correlation of scores
df_sick_bow.corr(method='pearson')

Unnamed: 0,Relatedness,BoW,BoW_stem,BoW_lemma,BoW_filt
Relatedness,1.0,0.545892,0.568393,0.569404,0.569422
BoW,0.545892,1.0,0.972989,0.965416,0.965419
BoW_stem,0.568393,0.972989,1.0,0.981616,0.981618
BoW_lemma,0.569404,0.965416,0.981616,1.0,0.999999
BoW_filt,0.569422,0.965419,0.981618,0.999999,1.0


In [41]:
# Spearman's correlation of scores
df_sick_bow.corr(method='spearman')

Unnamed: 0,Relatedness,BoW,BoW_stem,BoW_lemma,BoW_filt
Relatedness,1.0,0.548808,0.560434,0.56333,0.563354
BoW,0.548808,1.0,0.971279,0.964409,0.964414
BoW_stem,0.560434,0.971279,1.0,0.982938,0.982943
BoW_lemma,0.56333,0.964409,0.982938,1.0,0.999999
BoW_filt,0.563354,0.964414,0.982943,0.999999,1.0


In [42]:
# Pearson's correlation of scores
df_sick_tfidf.corr(method='pearson')

Unnamed: 0,Relatedness,TF-IDF,TF-IDF_stem,TF-IDF_lemma,TF-IDF_filt
Relatedness,1.0,0.472194,0.493539,0.494923,0.494934
TF-IDF,0.472194,1.0,0.974425,0.965412,0.965413
TF-IDF_stem,0.493539,0.974425,1.0,0.976208,0.976209
TF-IDF_lemma,0.494923,0.965412,0.976208,1.0,1.0
TF-IDF_filt,0.494934,0.965413,0.976209,1.0,1.0


In [43]:
# Spearman's correlation of scores
df_sick_tfidf.corr(method='spearman')

Unnamed: 0,Relatedness,TF-IDF,TF-IDF_stem,TF-IDF_lemma,TF-IDF_filt
Relatedness,1.0,0.548133,0.559361,0.561944,0.561968
TF-IDF,0.548133,1.0,0.971182,0.964537,0.964542
TF-IDF_stem,0.559361,0.971182,1.0,0.982808,0.982812
TF-IDF_lemma,0.561944,0.964537,0.982808,1.0,0.999999
TF-IDF_filt,0.561968,0.964542,0.982812,0.999999,1.0


In [44]:
def convert_scores_to_binary(scores_list):
    
    '''Converts a percentage score to either a 0 or a 1'''
    
    bin_scores = []
    for score in scores_list:
        if score < 0.7:
            bin_scores.append(0)
        else:
            bin_scores.append(1)
    
    return bin_scores

In [45]:
# Convert scores to binary

msr_bow_binscores = convert_scores_to_binary(msr_bow_scores)
msr_bow_stem_binscores = convert_scores_to_binary(msr_bow_stem_scores)
msr_bow_lem_binscores = convert_scores_to_binary(msr_bow_lem_scores)
msr_bow_stw_binscores = convert_scores_to_binary(msr_bow_stw_scores)

msr_tfidf_binscores = convert_scores_to_binary(msr_tfidf_scores)
msr_tfidf_stem_binscores = convert_scores_to_binary(msr_tfidf_stem_scores)
msr_tfidf_lem_binscores = convert_scores_to_binary(msr_tfidf_lem_scores)
msr_tfidf_stw_binscores = convert_scores_to_binary(msr_tfidf_stw_scores)

msr_scores = convert_scores_to_binary(msr_scores)

In [46]:
df_msr_bow = pd.DataFrame({'Similarity': msr_scores, 'BoW': msr_bow_binscores, 'BoW_stem': msr_bow_stem_binscores,
                           'BoW_lemma': msr_bow_lem_binscores, 'BoW_filt': msr_bow_stw_binscores})
df_msr_bow

Unnamed: 0,Similarity,BoW,BoW_stem,BoW_lemma,BoW_filt
0,1,1,1,1,1
1,0,0,0,0,0
2,1,0,0,0,0
3,0,0,0,0,0
4,1,0,0,0,0
5,1,1,1,1,1
6,0,0,0,0,0
7,1,0,0,0,0
8,0,0,0,0,0
9,1,0,0,0,0


In [47]:
df_msr_tfidf = pd.DataFrame({'Similarity': msr_scores, 'TF-IDF': msr_tfidf_binscores, 
                             'TF-IDF_stem': msr_tfidf_stem_binscores, 'TF-IDF_lemma': msr_tfidf_lem_binscores, 
                             'TF-IDF_filt': msr_tfidf_stw_binscores})
df_msr_tfidf

Unnamed: 0,Similarity,TF-IDF,TF-IDF_stem,TF-IDF_lemma,TF-IDF_filt
0,1,0,0,0,0
1,0,0,0,0,0
2,1,0,0,0,0
3,0,0,0,0,0
4,1,0,0,0,0
5,1,1,1,1,1
6,0,0,0,0,0
7,1,0,0,0,0
8,0,0,0,0,0
9,1,0,0,0,0


In [48]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of BoW scores

#plt.figure(figsize=(6, 4))

msr_bow_cf = pd.DataFrame(confusion_matrix(df_msr_bow['Similarity'], df_msr_bow['BoW']),  
                      columns=['BoW 0', 'BoW 1'], index=['MSR 0', 'MSR 1'])

msr_bow_stem_cf = pd.DataFrame(confusion_matrix(df_msr_bow['Similarity'], df_msr_bow['BoW_stem']),  
                      columns=['BoW_stem 0', 'BoW_stem 1'], index=['MSR 0', 'MSR 1'])

msr_bow_lem_cf = pd.DataFrame(confusion_matrix(df_msr_bow['Similarity'], df_msr_bow['BoW_lemma']),  
                      columns=['BoW_lemma 0', 'BoW_lemma 1'], index=['MSR 0', 'MSR 1'])

msr_bow_stw_cf = pd.DataFrame(confusion_matrix(df_msr_bow['Similarity'], df_msr_bow['BoW_filt']),  
                      columns=['BoW_filt 0', 'BoW_filt 1'], index=['MSR 0', 'MSR 1'])

#sns.heatmap(msr_bow_cf, annot=True, cmap='Blues')
#plt.show()
print(msr_bow_cf, '\n')
print(msr_bow_stem_cf, '\n')
print(msr_bow_lem_cf, '\n')
print(msr_bow_stw_cf, '\n')

       BoW 0  BoW 1
MSR 0   1723    178
MSR 1   2682   1218 

       BoW_stem 0  BoW_stem 1
MSR 0        1692         209
MSR 1        2522        1378 

       BoW_lemma 0  BoW_lemma 1
MSR 0         1695          206
MSR 1         2552         1348 

       BoW_filt 0  BoW_filt 1
MSR 0        1695         206
MSR 1        2548        1352 



In [49]:
# Confusion matrix of TF-IDF scores

#plt.figure(figsize=(6, 4))

msr_tfidf_cf = pd.DataFrame(confusion_matrix(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF']),  
                      columns=['TF-IDF 0', 'TF-IDF 1'], index=['MSR 0', 'MSR 1'])

msr_tfidf_stem_cf = pd.DataFrame(confusion_matrix(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF_stem']),  
                      columns=['TF-IDF_stem 0', 'TF-IDF_stem 1'], index=['MSR 0', 'MSR 1'])

msr_tfidf_lem_cf = pd.DataFrame(confusion_matrix(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF_lemma']),  
                      columns=['TF-IDF_lemma 0', 'TF-IDF_lemma 1'], index=['MSR 0', 'MSR 1'])

msr_tfidf_stw_cf = pd.DataFrame(confusion_matrix(df_msr_tfidf['Similarity'], df_msr_tfidf['TF-IDF_filt']),  
                      columns=['TF-IDF_filt 0', 'TF-IDF_filt 1'], index=['MSR 0', 'MSR 1'])

#sns.heatmap(msr_tfidf_cf, annot=True, cmap='Blues')
#plt.show()
print(msr_tfidf_cf, '\n')
print(msr_tfidf_stem_cf, '\n')
print(msr_tfidf_lem_cf, '\n')
print(msr_tfidf_stw_cf, '\n')

       TF-IDF 0  TF-IDF 1
MSR 0      1866        35
MSR 1      3561       339 

       TF-IDF_stem 0  TF-IDF_stem 1
MSR 0           1864             37
MSR 1           3469            431 

       TF-IDF_lemma 0  TF-IDF_lemma 1
MSR 0            1860              41
MSR 1            3464             436 

       TF-IDF_filt 0  TF-IDF_filt 1
MSR 0           1860             41
MSR 1           3464            436 



## Word2Vec

In [47]:
from gensim.models import Word2Vec, KeyedVectors

# Load Google's pre-trained Word2Vec model
modelwd_pret = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

In [51]:
modelwd_pret['dog']

array([ 5.12695312e-02, -2.23388672e-02, -1.72851562e-01,  1.61132812e-01,
       -8.44726562e-02,  5.73730469e-02,  5.85937500e-02, -8.25195312e-02,
       -1.53808594e-02, -6.34765625e-02,  1.79687500e-01, -4.23828125e-01,
       -2.25830078e-02, -1.66015625e-01, -2.51464844e-02,  1.07421875e-01,
       -1.99218750e-01,  1.59179688e-01, -1.87500000e-01, -1.20117188e-01,
        1.55273438e-01, -9.91210938e-02,  1.42578125e-01, -1.64062500e-01,
       -8.93554688e-02,  2.00195312e-01, -1.49414062e-01,  3.20312500e-01,
        3.28125000e-01,  2.44140625e-02, -9.71679688e-02, -8.20312500e-02,
       -3.63769531e-02, -8.59375000e-02, -9.86328125e-02,  7.78198242e-03,
       -1.34277344e-02,  5.27343750e-02,  1.48437500e-01,  3.33984375e-01,
        1.66015625e-02, -2.12890625e-01, -1.50756836e-02,  5.24902344e-02,
       -1.07421875e-01, -8.88671875e-02,  2.49023438e-01, -7.03125000e-02,
       -1.59912109e-02,  7.56835938e-02, -7.03125000e-02,  1.19140625e-01,
        2.29492188e-01,  

In [52]:
modelwd_pret.most_similar(positive=['yes', 'yeah'])

  if np.issubdtype(vec.dtype, np.int):


[('Yeah', 0.7643884420394897),
 ('hey', 0.7373077869415283),
 ('Yes', 0.7288317680358887),
 ('Uh', 0.7206948399543762),
 ('mso_style_qformat', 0.7124799489974976),
 ('Oh', 0.7074445486068726),
 ('mso_style_noshow', 0.7043254971504211),
 ('Yeah_yeah', 0.6987195014953613),
 ('Oh_yeah', 0.6973084211349487),
 ('SADY', 0.6953877806663513)]

In [53]:
modelwd_pret.similarity('good', 'great')

  if np.issubdtype(vec.dtype, np.int):


0.72915095

In [54]:
def create_words_list(sentences):
    
    '''Creates a list with separate words in each sentence'''
    
    words_list = []

    for pair in sentences:
        pair_new = []
        for sent in pair:
            words = sent.split()
            pair_new.append(words)

        words_list.append(pair_new)
    
    return words_list

In [55]:
# Implementation

sick_words_list = create_words_list(sick_filtered)
msr_words_list = create_words_list(msr_filtered)

In [56]:
sick_words_list

[[['group', 'kid', 'play', 'yard', 'old', 'man', 'stand', 'background'],
  ['group', 'boy', 'yard', 'play', 'man', 'stand', 'background']],
 [['group', 'child', 'play', 'house', 'man', 'stand', 'background'],
  ['group', 'kid', 'play', 'yard', 'old', 'man', 'stand', 'background']],
 [['young', 'boy', 'play', 'outdoors', 'man', 'smile', 'nearby'],
  ['kid', 'play', 'outdoors', 'near', 'man', 'smile']],
 [['young', 'boy', 'play', 'outdoors', 'man', 'smile', 'nearby'],
  ['boy', 'playing', 'outdoors', 'man', 'smile']],
 [['kid', 'play', 'outdoors', 'near', 'man', 'smile'],
  ['group', 'kid', 'play', 'yard', 'old', 'man', 'stand', 'background']],
 [['boy', 'playing', 'outdoors', 'man', 'smile'],
  ['group', 'kid', 'play', 'yard', 'old', 'man', 'stand', 'background']],
 [['group', 'boy', 'yard', 'play', 'man', 'stand', 'background'],
  ['young', 'boy', 'play', 'outdoors', 'man', 'smile', 'nearby']],
 [['group', 'child', 'play', 'house', 'man', 'stand', 'background'],
  ['young', 'boy', 'pla

In [57]:
len(msr_words_list)

5801

In [58]:
from scipy import linalg, mat, dot

def run_gensim_model(words_list, model):
    
    '''Runs Word2Vec and GloVe models in gensim and computes cosine similarity scores for each sentence pair'''
    
    matrix_list, scores = [], []

    for pair in words_list:
        sum_list = []
        for sent in pair:
            embeddings_list = []
            for word in sent:
                try:
                    embeddings_list.append(model[word])
                except:
                    embeddings_list.append(np.array(0))
            sum_list.append(sum(embeddings_list))
    
        matrix_list.append(sum_list)
        cos_sim = cosine_similarity(sum_list[0].reshape(1, -1), sum_list[1].reshape(1, -1))
        #cos_sim = dot(sum_list[0], sum_list[1].T)/(linalg.norm(sum_list[0])*linalg.norm(sum_list[1]))
        scores.append(float(cos_sim))
        
    return matrix_list, scores

In [59]:
# Implementation

sick_wdpret_list, sick_wdpret_scores = run_gensim_model(sick_words_list, modelwd_pret)
msr_wdpret_list, msr_wdpret_scores = run_gensim_model(msr_words_list, modelwd_pret)

In [60]:
sick_wdpret_scores

[0.9636735916137695,
 0.8306825160980225,
 0.9213025569915771,
 0.9390094876289368,
 0.7341063618659973,
 0.7074224352836609,
 0.725022554397583,
 0.6616698503494263,
 0.7446925044059753,
 0.9708219170570374,
 0.947582483291626,
 0.7160701751708984,
 0.7605283260345459,
 0.6613755226135254,
 0.6162267327308655,
 0.6021708250045776,
 0.6294410228729248,
 0.6054657101631165,
 0.9624820351600647,
 0.9624820351600647,
 1.0,
 0.9307125806808472,
 0.9312766194343567,
 0.5628237724304199,
 0.5384774208068848,
 0.5472046136856079,
 0.5472046136856079,
 0.6041198372840881,
 0.6041198372840881,
 0.9999998807907104,
 0.9545743465423584,
 0.9670140743255615,
 0.957281768321991,
 0.7930080890655518,
 0.8074471950531006,
 0.611833930015564,
 0.7922298908233643,
 0.8211998343467712,
 0.7779366970062256,
 0.7922298908233643,
 0.8709091544151306,
 1.0,
 1.0,
 0.9114375114440918,
 0.9114375114440918,
 0.6363109946250916,
 0.6363109946250916,
 0.5457330942153931,
 0.6454955339431763,
 0.6454955339431763,

In [61]:
msr_wdpret_scores

[0.9622523784637451,
 0.8444454073905945,
 0.8942445516586304,
 0.8514528274536133,
 0.8924899697303772,
 0.9149340987205505,
 0.7283452749252319,
 0.7702909111976624,
 0.737818717956543,
 0.9641870260238647,
 0.9321104288101196,
 0.9475003480911255,
 0.7969210743904114,
 0.8499240279197693,
 0.9463925957679749,
 0.7930768728256226,
 0.8685598969459534,
 0.9314301013946533,
 0.7989785075187683,
 0.9457201361656189,
 0.9402678608894348,
 0.9261414408683777,
 0.9579047560691833,
 0.901711106300354,
 0.9227464199066162,
 0.942054808139801,
 0.910305917263031,
 0.7245621681213379,
 0.9046646356582642,
 0.6394489407539368,
 0.9104388356208801,
 0.9199749231338501,
 0.878268301486969,
 0.8733630180358887,
 0.8218045234680176,
 0.813219428062439,
 0.7622425556182861,
 0.9074845910072327,
 0.9627934694290161,
 0.8081318140029907,
 0.8975004553794861,
 0.8981140851974487,
 0.9113419055938721,
 0.8117059469223022,
 0.8162103891372681,
 0.9587899446487427,
 0.8742061257362366,
 0.8415859937667847

In [63]:
def run_spacy_model(words_list, model):
    
    '''Runs Word2Vec and GloVe models in SpaCy and computes cosine similarity scores for each sentence pair'''
    
    scores = []
    for pair in words_list:
        token1 = model(pair[0])
        token2 = model(pair[1])
        scores.append(token1.similarity(token2))

    return scores

In [64]:
modelwd_spacy = spacy.load("en", vectors="GoogleNews-vectors-negative300.bin")

In [None]:
# Implementation

sick_wdspacy_scores = run_spacy_model(sick_filtered, modelwd_spacy)
msr_wdspacy_scores = run_spacy_model(msr_filtered, modelwd_spacy)

In [None]:
sick_wdspacy_scores

In [None]:
msr_wdspacy_scores

In [None]:
# Convert scores to binary

msr_wdpret_binscores = convert_scores_to_binary(msr_wdpret_scores)
msr_wdspacy_binscores = convert_scores_to_binary(msr_wdspacy_scores)

In [None]:
df_sick_w2v = pd.DataFrame({'Relatedness': sick_scores, 'Word2Vec_gensim': sick_wdpret_scores, 
                              'Word2Vec_spacy': sick_wdspacy_scores})
df_sick_w2v

In [None]:
df_sick_w2v.corr('pearson')

In [None]:
df_sick_w2v.corr(method='spearman')

In [None]:
df_msr_w2v = pd.DataFrame({'Similarity': msr_scores, 'Word2Vec_gensim': msr_wdpret_binscores, 
                              'Word2Vec_spacy': msr_wdspacy_binscores})
df_msr_w2v

In [None]:
# Confusion matrix of Word2Vec scores

#plt.figure(figsize=(6, 4))

msr_wdpret_cf = pd.DataFrame(confusion_matrix(df_msr_w2v['Similarity'], df_msr_w2v['Word2Vec_gensim']),  
                      columns=['W2V_gensim 0', 'W2V_gensim 1'], index=['MSR 0', 'MSR 1'])

msr_wdspacy_cf = pd.DataFrame(confusion_matrix(df_msr_w2v['Similarity'], df_msr_w2v['Word2Vec_spacy']),  
                      columns=['W2V_spacy 0', 'W2V_spacy 1'], index=['MSR 0', 'MSR 1'])

#sns.heatmap(msr_wdpret_cf, annot=True, cmap='Blues')
#plt.show()
print(msr_wdpret_cf, '\n')
print(msr_wdspacy_cf, '\n')

In [None]:
#billwords = open("1-billion-word-language-modeling-benchmark-r13output.tar", "r", encoding="ISO-8859-1")

In [None]:
#billwords_corpus = read_more_files(billwords)

In [None]:
#billwords_corpus

In [74]:
def preprocess_training_data(corpus):
    
    '''Cleans up dataset used to train NLP models'''
    
    sentences = []
    for line in corpus:
        line = line.lower().strip()
        translator = str.maketrans("", "", string.punctuation)
        line = line.translate(translator)
        words = line.split(' ')    
        sentences.append(words)
        
    return sentences

In [160]:
# billwords_train = preprocess_training_data(billwords_corpus)

In [161]:
# billwords_train

In [11]:
billwords_prep_small = open('billwords_prep_small.txt', 'r', encoding="ISO-8859-1")

In [12]:
billwords_corpus = []
for line in billwords_prep_small:
      billwords_corpus.append(line)

billwords_corpus

In [13]:
import ast

documents = []
for string in billwords_corpus:
    string = string.strip()
    try:
        string = ast.literal_eval(string)
    except:
        continue
    
    documents.append(string)
    
documents

In [14]:
len(documents)

99999

In [78]:
billwords_raw = open("/project/1-billion-word-language-modeling-benchmark-r13output.tar", "r", encoding="ISO-8859-1")
billwords_clean = open('/project/billion_words_dataset/billwords_preprocessed.txt', 'w')

translator = str.maketrans("", "", string.punctuation)

for i, line in enumerate(billwords_raw):
    if i==0:
        pos = line.find("While")
        line = line[pos:]
        
    billwords_clean.write(line)                

billwords_raw.close()
billwords_clean.close()

In [79]:
class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname
    
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            translator = str.maketrans("", "", string.punctuation)
            for line in open(os.path.join(self.dirname, fname)):
                line = line.translate(translator)
                line = line.lower().strip()
                yield line.split()

In [72]:
# Assign documents to directory of dataset for training

documents = MySentences("/project/billion_words_dataset")

In [73]:
# Train CBOW model

modelwd_cbow = Word2Vec(documents, size=300, window=5, min_count=1, workers=10, negative=5, iter=10, sg=0)
#modelwd_train.train(documents, total_examples=len(documents), epochs=20)

In [None]:
# Train Skip-Gram model

modelwd_sg = Word2Vec(documents, size=300, window=5, min_count=1, workers=10, negative=5, iter=10, sg=1)

In [433]:
modelwd_train.wv.most_similar(positive='good', negative='bad')

  if np.issubdtype(vec.dtype, np.int):


[('killifer', 0.6078303456306458),
 ('doyle', 0.5975402593612671),
 ('andrew', 0.5960867404937744),
 ('maurice', 0.5941125154495239),
 ('jagielka', 0.5937049984931946),
 ('evans', 0.5906293392181396),
 ('barry', 0.5899553894996643),
 ('krasno', 0.5894507169723511),
 ('rick', 0.589293897151947),
 ('collins', 0.5891936421394348)]

In [432]:
modelwd_train.wv.similarity('person', 'people')

  if np.issubdtype(vec.dtype, np.int):


0.42196813

In [434]:
sick_wdtrain_list, sick_wdtrain_scores = run_gensim_model(sick_words_list, modelwd_train)
msr_wdtrain_list, msr_wdtrain_scores = run_gensim_model(msr_words_list, modelwd_train)



In [435]:
df_sick_w2v = pd.DataFrame({'Relatedness': sick_scores, 'Word2Vec_gensim': sick_wdpret_scores, 
                              'Word2Vec_spacy': sick_wdspacy_scores, 'Word2Vec_train': sick_wdtrain_scores})
df_sick_w2v

Unnamed: 0,Relatedness,Word2Vec_gensim,Word2Vec_spacy,Word2Vec_train
0,4.500,0.963674,0.934471,0.938273
1,3.200,0.830683,0.929274,0.855596
2,4.700,0.921303,0.906845,0.849417
3,3.600,0.939009,0.906082,0.906789
4,3.400,0.734106,0.821168,0.768219
5,3.300,0.707422,0.758366,0.749604
6,3.700,0.725023,0.735679,0.840648
7,3.000,0.661670,0.663246,0.694512
8,3.700,0.744693,0.770172,0.793686
9,4.900,0.970822,0.981019,0.986945


In [437]:
df_sick_w2v.corr()

Unnamed: 0,Relatedness,Word2Vec_gensim,Word2Vec_spacy,Word2Vec_train
Relatedness,1.0,0.710151,0.500123,0.482175
Word2Vec_gensim,0.710151,1.0,0.753489,0.75962
Word2Vec_spacy,0.500123,0.753489,1.0,0.717925
Word2Vec_train,0.482175,0.75962,0.717925,1.0


## GloVe

In [163]:
from gensim.scripts.glove2word2vec import glove2word2vec

glove2word2vec(glove_input_file="glove.6B.300d.txt", word2vec_output_file="gensim_glove_vectors.txt")

# Load pretrained corpus of GloVe vectors
modelgl_pret = KeyedVectors.load_word2vec_format('gensim_glove_vectors.txt', binary=False)  

In [164]:
# Implementation

sick_glpret_list, sick_glpret_scores = run_gensim_model(sick_words_list, modelgl_pret)
msr_glpret_list, msr_glpret_scores = run_gensim_model(msr_words_list, modelgl_pret)

In [165]:
sick_glpret_scores

[0.9673898742423979,
 0.9049961219589818,
 0.9365488116204969,
 0.9447734840576341,
 0.8034699506121595,
 0.7703386749042295,
 0.8233882393188526,
 0.7730371351489809,
 0.8231939142595198,
 0.9747467883617701,
 0.962306086102897,
 0.7730860678741542,
 0.6984007527869968,
 0.761183139189861,
 0.5000009460317223,
 0.6006252245525774,
 0.6438669584817147,
 0.6034958598004796,
 0.9664737509713145,
 0.9664737509713145,
 1.0000000341938045,
 0.9445193999165378,
 0.9531080966093424,
 0.6131358513061222,
 0.588577641016177,
 0.5771069445601583,
 0.5771069445601583,
 0.6397163715427918,
 0.6397163715427918,
 1.0000000723991926,
 0.9657523505756506,
 0.9817542153877362,
 0.9524995144184366,
 0.8520165701143931,
 0.8433891166092244,
 0.7089169115473194,
 0.8016994357184727,
 0.8141458252780919,
 0.8016341998412582,
 0.8016994357184727,
 0.8946989218029094,
 0.9999999654533472,
 0.9999999654533472,
 0.8711320290380328,
 0.8711320290380328,
 0.6812868530187383,
 0.6812868530187383,
 0.5818715021217

In [166]:
msr_glpret_scores

[0.9742032106147726,
 0.9043876090595238,
 0.9275947156465697,
 0.8872904552182909,
 0.9277970646854505,
 0.9642216327248664,
 0.7469173745884797,
 0.8474617932649608,
 0.75912608094813,
 0.9047370037324174,
 0.9675493030540878,
 0.9670247448243426,
 0.8248834788093853,
 0.9408211125545133,
 0.9352160256899146,
 0.8558412479936678,
 0.8839382353841414,
 0.9688724630549919,
 0.8681165095928951,
 0.960717604526317,
 0.9429981569822236,
 0.9364738624628408,
 0.976381961528126,
 0.9341142238274215,
 0.9313018577141466,
 0.8978019304621326,
 0.9326207185621468,
 0.8232721090971808,
 0.8995318540454004,
 0.759246394074967,
 0.918954009144289,
 0.9572647052379305,
 0.9144596477341377,
 0.8803523248215389,
 0.9085835142237373,
 0.922425218442224,
 0.8552821434628445,
 0.9420902367790318,
 0.9399875116502616,
 0.8426563221642736,
 0.9353127960904688,
 0.8771596806353651,
 0.9475228673730194,
 0.5917024903860861,
 0.9160311981631049,
 0.9670775301756409,
 0.9299764708608842,
 0.9440563036261659,

In [167]:
#modelgl_spacy = spacy.load("en_vectors_web_lg")

In [168]:
modelgl_spacy = spacy.load("en", vectors="glove.6B.300d.txt")

In [169]:
# Implementation

sick_glspacy_scores = run_spacy_model(sick_filtered, modelgl_spacy)
msr_glspacy_scores = run_spacy_model(msr_filtered, modelgl_spacy)

In [170]:
sick_glspacy_scores

[0.9344706123876111,
 0.9292744751097476,
 0.9068453122595578,
 0.9060817133964245,
 0.821168404797968,
 0.7583656676529401,
 0.7356786706044599,
 0.663246136722744,
 0.7701721561194161,
 0.9810194667946985,
 0.9666314113861985,
 0.8447036789664796,
 0.6510374101647025,
 0.7280880701505387,
 0.8016183158009678,
 0.7673222896530584,
 0.8072751974492481,
 0.7760966084387533,
 0.976458495826572,
 0.976458495826572,
 0.9774152092996571,
 0.9476664978634464,
 0.9659220231054534,
 0.8082229038065611,
 0.7578272522041156,
 0.7540800620739555,
 0.7540800620739555,
 0.7581158738761568,
 0.7940938620873446,
 0.9799031823366021,
 0.965236626734736,
 0.9630627274361503,
 0.9606614505210506,
 0.9157441841169094,
 0.8613483123004888,
 0.8333456686440717,
 0.8849238256892656,
 0.8964006192721236,
 0.903499226090323,
 0.8870320495045504,
 0.9056118220517834,
 1.0,
 0.9546688222661494,
 0.929826643844408,
 0.929826643844408,
 0.821183011103383,
 0.821183011103383,
 0.7359334501090717,
 0.84108257045869

In [171]:
msr_glspacy_scores

[0.9767360840134615,
 0.8873469955393184,
 0.9621340979322205,
 0.8727081620887773,
 0.9332425727012864,
 0.9696835123603695,
 0.8470602917792472,
 0.8333384949300862,
 0.8960366397196374,
 0.9077433793376651,
 0.9643711394015118,
 0.9809565508684839,
 0.8242031829476619,
 0.9286346960256133,
 0.955351698093588,
 0.9448004209746196,
 0.9561128315780698,
 0.9854030101032274,
 0.8980152979101641,
 0.9688497214828292,
 0.9612203511914554,
 0.9721766190905643,
 0.979859441206365,
 0.9559653889831572,
 0.9073966165119822,
 0.9286580444730577,
 0.9654592329536447,
 0.8449997413359437,
 0.9325574755747256,
 0.8229074172208559,
 0.9033790749355779,
 0.9263850043744875,
 0.8808338871252546,
 0.8878064260314706,
 0.9357832404523879,
 0.9358373820792676,
 0.8742023510684912,
 0.9184836023747017,
 0.9566387210267541,
 0.9359824266270271,
 0.9637012083292247,
 0.8595352685813905,
 0.9262111341226641,
 0.8728135387834408,
 0.8383927013260836,
 0.9871978408262398,
 0.9199339378116387,
 0.836218356688

In [172]:
# Implementation

sick_glspacy_scores = run_spacy_model(sick_filtered, modelgl_spacy)
msr_glspacy_scores = run_spacy_model(msr_filtered, modelgl_spacy)

In [173]:
sick_glspacy_scores

[0.9344706123876111,
 0.9292744751097476,
 0.9068453122595578,
 0.9060817133964245,
 0.821168404797968,
 0.7583656676529401,
 0.7356786706044599,
 0.663246136722744,
 0.7701721561194161,
 0.9810194667946985,
 0.9666314113861985,
 0.8447036789664796,
 0.6510374101647025,
 0.7280880701505387,
 0.8016183158009678,
 0.7673222896530584,
 0.8072751974492481,
 0.7760966084387533,
 0.976458495826572,
 0.976458495826572,
 0.9774152092996571,
 0.9476664978634464,
 0.9659220231054534,
 0.8082229038065611,
 0.7578272522041156,
 0.7540800620739555,
 0.7540800620739555,
 0.7581158738761568,
 0.7940938620873446,
 0.9799031823366021,
 0.965236626734736,
 0.9630627274361503,
 0.9606614505210506,
 0.9157441841169094,
 0.8613483123004888,
 0.8333456686440717,
 0.8849238256892656,
 0.8964006192721236,
 0.903499226090323,
 0.8870320495045504,
 0.9056118220517834,
 1.0,
 0.9546688222661494,
 0.929826643844408,
 0.929826643844408,
 0.821183011103383,
 0.821183011103383,
 0.7359334501090717,
 0.84108257045869

In [174]:
msr_glspacy_scores

[0.9767360840134615,
 0.8873469955393184,
 0.9621340979322205,
 0.8727081620887773,
 0.9332425727012864,
 0.9696835123603695,
 0.8470602917792472,
 0.8333384949300862,
 0.8960366397196374,
 0.9077433793376651,
 0.9643711394015118,
 0.9809565508684839,
 0.8242031829476619,
 0.9286346960256133,
 0.955351698093588,
 0.9448004209746196,
 0.9561128315780698,
 0.9854030101032274,
 0.8980152979101641,
 0.9688497214828292,
 0.9612203511914554,
 0.9721766190905643,
 0.979859441206365,
 0.9559653889831572,
 0.9073966165119822,
 0.9286580444730577,
 0.9654592329536447,
 0.8449997413359437,
 0.9325574755747256,
 0.8229074172208559,
 0.9033790749355779,
 0.9263850043744875,
 0.8808338871252546,
 0.8878064260314706,
 0.9357832404523879,
 0.9358373820792676,
 0.8742023510684912,
 0.9184836023747017,
 0.9566387210267541,
 0.9359824266270271,
 0.9637012083292247,
 0.8595352685813905,
 0.9262111341226641,
 0.8728135387834408,
 0.8383927013260836,
 0.9871978408262398,
 0.9199339378116387,
 0.836218356688

In [184]:
# Convert scores to binary

msr_glpret_binscores = convert_scores_to_binary(msr_glpret_scores)
msr_glspacy_binscores = convert_scores_to_binary(msr_glspacy_scores)

In [186]:
df_sick_glv = pd.DataFrame({'Relatedness': sick_scores, 'GloVe_gensim': sick_glpret_scores, 
                              'GloVe_spacy': sick_glspacy_scores})
df_sick_glv

Unnamed: 0,Relatedness,GloVe_gensim,GloVe_spacy
0,4.500,0.967390,0.934471
1,3.200,0.904996,0.929274
2,4.700,0.936549,0.906845
3,3.600,0.944773,0.906082
4,3.400,0.803470,0.821168
5,3.300,0.770339,0.758366
6,3.700,0.823388,0.735679
7,3.000,0.773037,0.663246
8,3.700,0.823194,0.770172
9,4.900,0.974747,0.981019


In [176]:
df_sick_glv.corr(method='pearson')

Unnamed: 0,Relatedness,GloVe_gensim,GloVe_spacy
Relatedness,1.0,0.689832,0.500123
GloVe_gensim,0.689832,1.0,0.753391
GloVe_spacy,0.500123,0.753391,1.0


In [190]:
df_sick_glv.corr(method='spearman')

Unnamed: 0,Relatedness,GloVe_gensim,GloVe_spacy
Relatedness,1.0,0.576087,0.473922
GloVe_gensim,0.576087,1.0,0.852549
GloVe_spacy,0.473922,0.852549,1.0


In [197]:
df_msr_glv = pd.DataFrame({'Similarity': msr_scores, 'GloVe_gensim': msr_glpret_binscores, 
                              'GloVe_spacy': msr_glspacy_binscores})
df_msr_glv

Unnamed: 0,Similarity,GloVe_gensim,GloVe_spacy
0,1,1,1
1,0,1,1
2,1,1,1
3,0,1,1
4,1,1,1
5,1,1,1
6,0,1,1
7,1,1,1
8,0,1,1
9,1,1,1


In [198]:
# Confusion matrix of GloVe scores

#plt.figure(figsize=(6, 4))

msr_glpret_cf = pd.DataFrame(confusion_matrix(df_msr_glv['Similarity'], df_msr_glv['GloVe_gensim']),  
                      columns=['GloVe_gensim 0', 'GloVe_gensim 1'], index=['MSR 0', 'MSR 1'])

msr_glspacy_cf = pd.DataFrame(confusion_matrix(df_msr_glv['Similarity'], df_msr_glv['GloVe_spacy']),  
                      columns=['GloVe_spacy 0', 'GloVe_spacy 1'], index=['MSR 0', 'MSR 1'])

#sns.heatmap(msr_glpret_cf, annot=True, cmap='Blues')
#plt.show()
print(msr_glpret_cf, '\n')
print(msr_glspacy_cf, '\n')

       GloVe_gensim 0  GloVe_gensim 1
MSR 0              44            1857
MSR 1              19            3881 

       GloVe_spacy 0  GloVe_spacy 1
MSR 0              9           1892
MSR 1             14           3886 



In [None]:
# billwords_train = preprocess_training_data(billwords_corpus) 

In [None]:
#modelgl_train = Word2Vec(documents, size=300, window=5, min_count=1, workers=10, negative=5, sg=0)

# modelgl_train.train(documents, total_examples=len(documents), epochs=20)

## Doc2Vec

In [200]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [351]:
def create_docs_list(sentences):
    
    '''Creates a list of sentences corresponding to a document'''
    
    docs_list = []
    for pair in sentences:
        docs_list.append(pair[0])
        docs_list.append(pair[1])
        
    return docs_list

In [352]:
def create_tagged_documents(sentences):
    
    '''Gives numerical tags to documents in a list'''
    
    docs_list = create_docs_list(sentences)
    
    tagged_docs = []
    for i, doc in enumerate(docs_list):
        tagged_docs.append(TaggedDocument(words=doc.split(), tags=[i]))
    
    return docs_list, tagged_docs

In [353]:
sick_docs_list, sick_tagged_docs = create_tagged_documents(sick_filtered)
msr_docs_list, msr_tagged_docs = create_tagged_documents(msr_filtered)

In [357]:
sick_tagged_docs

[TaggedDocument(words=['group', 'kid', 'play', 'yard', 'old', 'man', 'stand', 'background'], tags=[0]),
 TaggedDocument(words=['group', 'boy', 'yard', 'play', 'man', 'stand', 'background'], tags=[1]),
 TaggedDocument(words=['group', 'child', 'play', 'house', 'man', 'stand', 'background'], tags=[2]),
 TaggedDocument(words=['group', 'kid', 'play', 'yard', 'old', 'man', 'stand', 'background'], tags=[3]),
 TaggedDocument(words=['young', 'boy', 'play', 'outdoors', 'man', 'smile', 'nearby'], tags=[4]),
 TaggedDocument(words=['kid', 'play', 'outdoors', 'near', 'man', 'smile'], tags=[5]),
 TaggedDocument(words=['young', 'boy', 'play', 'outdoors', 'man', 'smile', 'nearby'], tags=[6]),
 TaggedDocument(words=['boy', 'playing', 'outdoors', 'man', 'smile'], tags=[7]),
 TaggedDocument(words=['kid', 'play', 'outdoors', 'near', 'man', 'smile'], tags=[8]),
 TaggedDocument(words=['group', 'kid', 'play', 'yard', 'old', 'man', 'stand', 'background'], tags=[9]),
 TaggedDocument(words=['boy', 'playing', 'ou

In [359]:
msr_tagged_docs

[TaggedDocument(words=['amrozi', 'accuse', 'brother', 'call', 'witness', 'deliberately', 'distort', 'evidence'], tags=[0]),
 TaggedDocument(words=['refer', 'witness', 'amrozi', 'accuse', 'brother', 'deliberately', 'distort', 'evidence'], tags=[1]),
 TaggedDocument(words=['yucaipa', 'dominick', 'sell', 'chain', 'safeway', '1998', '25', 'billion'], tags=[2]),
 TaggedDocument(words=['yucaipa', 'buy', 'dominick', '1995', '693', 'million', 'sell', 'safeway', '18', 'billion', '1998'], tags=[3]),
 TaggedDocument(words=['publish', 'advertisement', 'internet', 'june', '10', 'offer', 'cargo', 'sale', 'add'], tags=[4]),
 TaggedDocument(words=['june', '10', 'ship', 'owner', 'publish', 'advertisement', 'internet', 'offer', 'explosive', 'sale'], tags=[5]),
 TaggedDocument(words=['around', '0335', 'gmt', 'tab', 'share', '19', 'cent', '44', 'a456', 'early', 'set', 'record', 'high', 'a457'], tags=[6]),
 TaggedDocument(words=['tab', 'share', 'jump', '20', 'cent', '46', 'set', 'record', 'closing', 'high'

In [238]:
def train_doc2vec_model(tagged_docs, dm, dbow_words):
    
    '''Train a different Doc2Vec model using specific parameters'''
    
    model = Doc2Vec(tagged_docs, vector_size=300, window=5, min_count=1, workers=4, alpha=0.025, min_alpha=0.025, 
                     negative=5, epoch=20, dm=dm, dbow_words=dbow_words)
    
    return model

In [242]:
# Implementation

sick_modeldc_dm = train_doc2vec_model(sick_tagged_docs, dm=1, dbow_words=0)
sick_modeldc_dbow0 = train_doc2vec_model(sick_tagged_docs, dm=0, dbow_words=0)
sick_modeldc_dbow1 = train_doc2vec_model(sick_tagged_docs, dm=0, dbow_words=1)

msr_modeldc_dm = train_doc2vec_model(msr_tagged_docs, dm=1, dbow_words=0)
msr_modeldc_dbow0 = train_doc2vec_model(msr_tagged_docs, dm=0, dbow_words=0)
msr_modeldc_dbow1 = train_doc2vec_model(msr_tagged_docs, dm=0, dbow_words=1)

In [243]:
sick_modeldc_dm.most_similar('kid')

  if __name__ == '__main__':
  if np.issubdtype(vec.dtype, np.int):


[('game', 0.9285556077957153),
 ('pool', 0.9201208353042603),
 ('video', 0.9063012599945068),
 ('outdoors', 0.9048335552215576),
 ('inside', 0.8989999890327454),
 ('swimming', 0.8987455368041992),
 ('watch', 0.8922827243804932),
 ('kneel', 0.8885712623596191),
 ('outside', 0.88653564453125),
 ('child', 0.8843095302581787)]

In [244]:
msr_modeldc_dm.most_similar('kid')

  if __name__ == '__main__':
  if np.issubdtype(vec.dtype, np.int):


[('personal', 0.9994519352912903),
 ('torture', 0.9994388222694397),
 ('meanwhile', 0.9994034767150879),
 ('generation', 0.9993965029716492),
 ('screen', 0.9993873238563538),
 ('jane', 0.999384343624115),
 ('ohio', 0.9993799924850464),
 ('virtually', 0.9993706941604614),
 ('argue', 0.9993678331375122),
 ('pollution', 0.9993661642074585)]

In [268]:
sick_modeldc_dm.infer_vector(sick_docs_list[0])

array([-0.08247297,  0.02157035,  0.00473418,  0.01368151, -0.03300228,
       -0.08050673, -0.05173094,  0.02326801,  0.05959492,  0.05368848,
        0.00291628,  0.01523313,  0.06313308,  0.04977813, -0.0129049 ,
       -0.07755645, -0.0270309 , -0.02792282, -0.02503285,  0.01468294,
       -0.02643473, -0.00414101,  0.02981551, -0.02055242,  0.02492721,
        0.01076986,  0.0054597 , -0.02110852,  0.04578355,  0.03928532,
       -0.01650585,  0.0212544 ,  0.00380129, -0.04656714,  0.03454473,
       -0.04487377, -0.0004331 ,  0.0440932 ,  0.01983645, -0.0027807 ,
       -0.00986542,  0.00059547, -0.04989032, -0.08116433,  0.01558752,
       -0.00381449, -0.04212264,  0.01634743, -0.06078423,  0.00719242,
        0.02797614, -0.01380607,  0.03998131, -0.03545693, -0.01229541,
        0.04118319,  0.03201016,  0.04294879,  0.00921317,  0.04617571,
       -0.02649327,  0.00200632,  0.05409051, -0.03719008,  0.02774858,
        0.01337745,  0.09540803,  0.02814465, -0.00855728,  0.01

In [329]:
def run_doc2vec_model(docs_list, model):
    
    '''Runs Doc2Vec models and computes cosine similarity scores for each sentence (document) pair'''
    
    scores = []

    i = 0
    while i < len(docs_list):

        vector1 = model.infer_vector(docs_list[i])
        vector2 = model.infer_vector(docs_list[i+1])
        
        cos_sim = cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))
        #cos_sim = dot(vector1.T, vector2)/(linalg.norm(vector1)*linalg.norm(vector2))
        #print(cos_sim)
        scores.append(float(cos_sim))
        i += 2
        
    return scores

In [331]:
# Implementation

sick_dcdm_scores = run_doc2vec_model(sick_docs_list, sick_modeldc_dm)
sick_dcdbow0_scores = run_doc2vec_model(sick_docs_list, sick_modeldc_dbow0)
sick_dcdbow1_scores = run_doc2vec_model(sick_docs_list, sick_modeldc_dbow1)

msr_dcdm_scores = run_doc2vec_model(msr_docs_list, msr_modeldc_dm)
msr_dcdbow0_scores = run_doc2vec_model(msr_docs_list, msr_modeldc_dbow0)
msr_dcdbow1_scores = run_doc2vec_model(msr_docs_list, msr_modeldc_dbow1)

In [332]:
# Convert scores to binary

msr_dcdm_binscores = convert_scores_to_binary(msr_dcdm_scores)
msr_dcdbow0_binscores = convert_scores_to_binary(msr_dcdbow0_scores)
msr_dcdbow1_binscores = convert_scores_to_binary(msr_dcdbow1_scores)

In [343]:
len(sick_dcdm_scores)

9840

In [336]:
df_sick_d2v = pd.DataFrame({'Relatedness': sick_scores, 'Doc2Vec_dm': sick_dcdm_scores, 'Doc2Vec_dbow0': sick_dcdbow0_scores,
                              'Doc2Vec_dbow1': sick_dcdbow1_scores})
df_sick_d2v

Unnamed: 0,Relatedness,Doc2Vec_dm,Doc2Vec_dbow0,Doc2Vec_dbow1
0,4.500,0.971067,0.999253,0.959893
1,3.200,0.985530,0.999053,0.937778
2,4.700,0.980343,0.999020,0.931985
3,3.600,0.971163,0.998503,0.959802
4,3.400,0.979536,0.998689,0.965554
5,3.300,0.967686,0.998315,0.944917
6,3.700,0.979914,0.998909,0.965592
7,3.000,0.959374,0.999186,0.966088
8,3.700,0.974922,0.998949,0.924644
9,4.900,0.984301,0.999608,0.973844


In [338]:
df_sick_d2v.corr(method='pearson')

Unnamed: 0,Relatedness,Doc2Vec_dm,Doc2Vec_dbow0,Doc2Vec_dbow1
Relatedness,1.0,0.044616,0.042688,0.050674
Doc2Vec_dm,0.044616,1.0,0.995455,0.996388
Doc2Vec_dbow0,0.042688,0.995455,1.0,0.994614
Doc2Vec_dbow1,0.050674,0.996388,0.994614,1.0


In [340]:
df_sick_d2v.corr(method='spearman')

Unnamed: 0,Relatedness,Doc2Vec_dm,Doc2Vec_dbow0,Doc2Vec_dbow1
Relatedness,1.0,0.048746,0.096212,0.103158
Doc2Vec_dm,0.048746,1.0,0.731054,0.70425
Doc2Vec_dbow0,0.096212,0.731054,1.0,0.735948
Doc2Vec_dbow1,0.103158,0.70425,0.735948,1.0


In [337]:
df_msr_d2v = pd.DataFrame({'Similarity': msr_scores, 'Doc2Vec_dm': msr_dcdm_binscores, 'Doc2Vec_dbow0': msr_dcdbow0_binscores,
                              'Doc2Vec_dbow1': msr_dcdbow1_binscores})
df_msr_d2v

Unnamed: 0,Similarity,Doc2Vec_dm,Doc2Vec_dbow0,Doc2Vec_dbow1
0,1,1,1,1
1,0,1,1,1
2,1,1,1,1
3,0,1,1,1
4,1,1,1,1
5,1,1,1,1
6,0,1,1,1
7,1,1,1,1
8,0,1,1,1
9,1,1,1,1


In [344]:
# Confusion matrix of Doc2Vec scores

#plt.figure(figsize=(6, 4))

msr_dcdm_cf = pd.DataFrame(confusion_matrix(df_msr_d2v['Similarity'], df_msr_d2v['Doc2Vec_dm']),  
                      columns=['D2V_dm 0', 'D2V_dm 1'], index=['MSR 0', 'MSR 1'])

msr_dcdbow0_cf = pd.DataFrame(confusion_matrix(df_msr_d2v['Similarity'], df_msr_d2v['Doc2Vec_dbow0']),  
                      columns=['D2V_dbow0 0', 'D2V_dbow0 1'], index=['MSR 0', 'MSR 1'])

msr_dcdbow1_cf = pd.DataFrame(confusion_matrix(df_msr_d2v['Similarity'], df_msr_d2v['Doc2Vec_dbow1']),  
                      columns=['D2V_dbow1 0', 'D2V_dbow1 1'], index=['MSR 0', 'MSR 1'])

#sns.heatmap(msr_dcdm_cf, annot=True, cmap='Blues')
#plt.show()
print(msr_dcdm_cf, '\n')
print(msr_dcdbow0_cf, '\n')
print(msr_dcdbow1_cf, '\n')

       D2V_dm 0  D2V_dm 1
MSR 0         0      1901
MSR 1         0      3900 

       D2V_dbow0 0  D2V_dbow0 1
MSR 0            0         1901
MSR 1            0         3900 

       D2V_dbow1 0  D2V_dbow1 1
MSR 0            4         1897
MSR 1            0         3900 



## fastText

In [349]:
from gensim.models import FastText

# billwords_train = preprocess_training_data(billwords_corpus) 

In [350]:
#modelft_train = FastText(documents, size=300, window=5, min_count=1, workers=10, negative=5, sg=0)

# modelft_train.train(documents, total_examples=len(documents), epochs=20)