In [1]:
import pandas as pd
import json
import warnings

In [2]:
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides library deprecation warnings;
# consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings(action = 'ignore')

In [3]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Warm-up example: fit a Word2Vec model on gensim's bundled `common_texts`
# corpus and save it to "word2vec.model" in the current working directory.
# NOTE(review): `w2v_model` is not referenced again in the visible notebook.
w2v_model = Word2Vec(sentences=common_texts, window=5, min_count=1, workers=4)
w2v_model.save("word2vec.model")

In [4]:
#doc2vec method

In [5]:
#def for tagged document ->
def tag_doc(list_of_low):
    """Lazily wrap each token list in a gensim ``TaggedDocument``.

    Relies on the module-level ``gensim`` import being in place by the time
    the generator is consumed.

    Args:
        list_of_low: Iterable of token lists ("list of list-of-words").

    Yields:
        One ``TaggedDocument`` per input item, tagged with a single-element
        list holding the item's zero-based position in ``list_of_low``.
    """
    yield from (
        gensim.models.doc2vec.TaggedDocument(tokens, [position])
        for position, tokens in enumerate(list_of_low)
    )

In [6]:
#data from a book: Life of James Mars

In [7]:
def getlistfromdef(sentence):
    """Split a line of text into lowercase, punctuation-free word tokens.

    Alphanumeric characters are kept (lowercased), any whitespace character
    acts as a word separator, and every other character is dropped.

    Args:
        sentence: A string, typically one line read from a text file.

    Returns:
        A list of lowercase word strings in order of appearance. Empty,
        whitespace-only or punctuation-only input yields an empty list.
    """
    kept = []
    for ch in sentence:
        if ch.isalnum():
            kept.append(ch.lower())
        elif ch.isspace():
            # Normalise tabs, newlines, etc. to a plain space so that words
            # separated by them are not glued together once the other
            # characters are stripped.
            kept.append(" ")
    # split() with no argument collapses runs of spaces and trims the ends,
    # so no explicit rstrip of the trailing newline is needed.
    return "".join(kept).split()

In [8]:
#test
getlistfromdef("A B; c d e \n")

['a', 'b', 'c', 'd', 'e']

In [9]:
# Read the book as UTF-8 explicitly. Without an `encoding` argument, open()
# falls back to the platform default (often not UTF-8 on Windows), which can
# raise a decode error or garble non-ASCII characters.

with open("Life_of_James_Mars.txt", "r", encoding="utf-8") as myfile:
    file = myfile.readlines()

In [10]:
# Flatten the book into one list of cleaned, lowercase word tokens,
# tokenising line by line.
bookdata = []
for ln in file:
    bookdata.extend(getlistfromdef(ln))

In [11]:
bookdata

['introduction',
 'when',
 'i',
 'made',
 'up',
 'my',
 'mind',
 'to',
 'write',
 'this',
 'story',
 'it',
 'was',
 'not',
 'to',
 'publish',
 'it',
 'but',
 'it',
 'was',
 'at',
 'the',
 'request',
 'of',
 'my',
 'sister',
 'that',
 'lived',
 'in',
 'africa',
 'and',
 'has',
 'lived',
 'there',
 'more',
 'than',
 'thirty',
 'years',
 'she',
 'had',
 'heard',
 ...]

In [12]:
import gensim

In [13]:
# Fit Word2Vec on the book, ignoring words that occur fewer than 2 times.
# The whole book is passed as ONE sentence (`[bookdata]`).
# NOTE(review): per the gensim docs, texts longer than 10,000 words are
# truncated by the optimised training code, so the tail of the book may be
# ignored -- consider splitting `bookdata` into shorter sentences. TODO confirm.
# NOTE(review): per the gensim docs, passing the corpus to the constructor
# already builds the vocabulary and trains the model.
word2vec_model = gensim.models.Word2Vec([bookdata],min_count=2)

In [14]:
# Run training over the book for `word2vec_model.epochs` epochs.
# NOTE(review): the model was constructed with the corpus, which (per the
# gensim docs) already trains it, so this is an additional training pass --
# confirm that is intended.
word2vec_model.train([bookdata], total_examples=word2vec_model.corpus_count, epochs=word2vec_model.epochs)

(30409, 55280)

In [15]:
# 'vexed' is not in the vocabulary, so the call below would raise an error
# word2vec_model.n_similarity(['vexed'], ['annoyed'])

In [16]:
word2vec_model.n_similarity(['people'], ['man'])

0.9997852

In [17]:
word2vec_model.n_similarity(['horses'], ['man'])

0.99926007

In [18]:
# I think the text is a bit small, which makes it hard to differentiate between words,
# but at least the output still works.

In [19]:
# Because the text was too small and the model was not trained very well, I wanted
# to try a different, larger dataset as well as the doc2vec method.
# Dataset provided by gensim: text8

In [20]:
import gensim.downloader as api

In [21]:
# Fetch the text8 corpus through the gensim downloader and materialise it
# into a list so it can be iterated more than once.
dataset = api.load("text8")
data = list(dataset)

In [22]:
# tag_doc() is a one-shot generator; materialise it into a list because the
# tagged corpus is iterated several times below (build_vocab and train, for
# both Doc2Vec models).
train = list(tag_doc(data))

In [23]:
# print(data)

In [24]:
# vector_size = dimensionality of the feature vectors
# min_count = ignores words whose total frequency is lower than the given value
# epochs = number of iterations (passes) over the corpus
# dm = training algorithm (1 is 'distributed memory' (PV-DM), 0 is 'distributed bag of words' (PV-DBOW))

model_dbow = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30, dm=0)
model_pvdm = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30, dm=1)
# NOTE(review): this rebinds `word2vec_model`, replacing the model that was
# trained on the book earlier in the notebook. Because the corpus is passed to
# the constructor, gensim (per its docs) builds the vocabulary and trains here.
word2vec_model = gensim.models.Word2Vec(data,min_count=2)

In [25]:
# The Doc2Vec models were created without a corpus, so each one needs its
# vocabulary built from the tagged documents before train() is called.
model_pvdm.build_vocab(train)
model_dbow.build_vocab(train)

In [26]:
# Train the PV-DM model on the tagged documents for the number of epochs it
# was configured with (epochs=30 at construction).
model_pvdm.train(train, total_examples=model_pvdm.corpus_count, epochs=model_pvdm.epochs)

In [27]:
# Train the DBOW model on the tagged documents for the number of epochs it
# was configured with (epochs=30 at construction).
model_dbow.train(train, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs)

In [28]:
# Run training over text8 for `word2vec_model.epochs` epochs.
# NOTE(review): the model was constructed with `data`, which (per the gensim
# docs) already trains it, so this is an additional pass -- confirm intended.
word2vec_model.train(data, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.epochs)

(63449552, 85026035)

In [29]:
print(model_pvdm.infer_vector(['violent', 'means', 'to', 'destroy', 'the','organization']))

[-0.0266399   0.09813191 -0.22447698  0.25955695  0.18271853 -0.04699322
  0.14097512 -0.12786591 -0.16622266  0.2731424   0.08852278  0.06481127
  0.04294713  0.08524445 -0.07564342  0.01464169  0.13549389 -0.11494784
 -0.11806548 -0.09634089  0.2606743   0.08735646 -0.10390906  0.25909486
  0.02576874 -0.2787384   0.23608638  0.00932794  0.11527827  0.07946284
  0.19009067  0.26584113 -0.16609377 -0.04089054 -0.02549952  0.13503678
 -0.2255434  -0.06137215  0.306121   -0.23324919]


In [30]:
#test case
model_pvdm.n_similarity(['sushi'], ['japanese'])

0.39376003

In [31]:
#building the actual synonym detector

In [32]:
def synonym_pvdm(key, a, b, c, d):
    """Pick the candidate most similar to ``key`` according to the PV-DM model.

    Similarities are read from ``model_pvdm.wv`` (the model's word vectors);
    the model-level ``n_similarity`` shortcut is deprecated in gensim 3.x and
    removed in gensim 4.x, whereas ``.wv`` is available in both.

    Args:
        key: The word whose synonym is sought.
        a, b, c, d: The four candidate words.

    Returns:
        The candidate with the highest similarity to ``key``. If several
        candidates tie, the earliest one wins. A candidate is always returned,
        even when every similarity is negative.

    Raises:
        Whatever ``n_similarity`` raises for out-of-vocabulary words
        (presumably ``KeyError`` -- verify against the installed gensim).
    """
    word_vectors = model_pvdm.wv
    # max() keeps the first maximal element, matching the original strict '>'
    # comparison, but does not assume that the best score is above zero.
    return max(
        (a, b, c, d),
        key=lambda candidate: word_vectors.n_similarity([key], [candidate]),
    )

In [33]:
# 1. vexed
#    a. annoyed <- correct answer
#    b. amused
#    c. frightened
#    d. excited

synonym_pvdm('vexed', 'annoyed', 'amused', 'frightened', 'excited')

'amused'

In [34]:
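#similarity that pvdm assigns to the correct answer 'annoyed': 0.30836374
#(pvdm picked the correct answer in the original run; the output recorded above
# shows 'amused', so the chosen word may vary between training runs)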

model_pvdm.n_similarity(['vexed'], ['annoyed'])

0.30836374

In [35]:
def synonym_dbow(key, a, b, c, d):
    """Pick the candidate most similar to ``key`` according to the DBOW model.

    Similarities are read from ``model_dbow.wv`` (the model's word vectors);
    the model-level ``n_similarity`` shortcut is deprecated in gensim 3.x and
    removed in gensim 4.x, whereas ``.wv`` is available in both.

    Args:
        key: The word whose synonym is sought.
        a, b, c, d: The four candidate words.

    Returns:
        The candidate with the highest similarity to ``key``. If several
        candidates tie, the earliest one wins. A candidate is always returned,
        even when every similarity is negative.

    Raises:
        Whatever ``n_similarity`` raises for out-of-vocabulary words
        (presumably ``KeyError`` -- verify against the installed gensim).
    """
    word_vectors = model_dbow.wv
    # max() keeps the first maximal element, matching the original strict '>'
    # comparison, but does not assume that the best score is above zero.
    return max(
        (a, b, c, d),
        key=lambda candidate: word_vectors.n_similarity([key], [candidate]),
    )

In [36]:
synonym_dbow('vexed', 'annoyed', 'amused', 'frightened', 'excited')

'frightened'

In [37]:
#dbow got a similarity score of 0.072231844 for 'vexed' vs 'excited'

model_dbow.n_similarity(['vexed'], ['excited'])

0.072231844

In [38]:
def synonym_w2v(key, a, b, c, d):
    """Pick the candidate most similar to ``key`` according to the Word2Vec model.

    Each word is wrapped in a one-element list before being handed to
    ``n_similarity``: that method compares two *collections* of words, so a
    bare string would be iterated character by character and the score would
    compare the words' letters rather than the words themselves.

    Similarities are read from ``word2vec_model.wv``; the model-level
    ``n_similarity`` shortcut is deprecated in gensim 3.x and removed in
    gensim 4.x, whereas ``.wv`` is available in both.

    Args:
        key: The word whose synonym is sought.
        a, b, c, d: The four candidate words.

    Returns:
        The candidate with the highest similarity to ``key``. If several
        candidates tie, the earliest one wins. A candidate is always returned,
        even when every similarity is negative.

    Raises:
        Whatever ``n_similarity`` raises for out-of-vocabulary words
        (presumably ``KeyError`` -- verify against the installed gensim).
    """
    word_vectors = word2vec_model.wv
    # max() keeps the first maximal element, matching the original strict '>'
    # comparison, but does not assume that the best score is above zero.
    return max(
        (a, b, c, d),
        key=lambda candidate: word_vectors.n_similarity([key], [candidate]),
    )

In [39]:
synonym_w2v('vexed', 'annoyed', 'amused', 'frightened', 'excited')

'excited'

In [40]:
#w2v got a similarity score of 0.28602237 for 'vexed' vs 'excited' (might vary)

word2vec_model.n_similarity(['vexed'], ['excited'])

0.28602237

In [41]:
# Note: dbow did not do as well as pvdm, probably because it uses a method that samples
# entire phrases. On the other hand, pvdm uses a method that samples specific words,
# similar to the word2vec methods. The word2vec method also did not work well; perhaps
# this is an issue with our data.

In [42]:
# other tests

In [43]:
synonym_pvdm('enterprise', 'want', 'venture', 'offer', 'shorten')

'venture'

In [44]:
synonym_pvdm('affinity', 'dispatch', 'connection', 'hoax', 'conviction')

'connection'

In [45]:
synonym_pvdm('imperious', 'royal', 'friendly', 'insightful', 'arrogant')

'arrogant'

In [46]:
#pvdm seems to work well :)