## Semantic Analysis

In [28]:
import string
import time
import numpy as np
import gensim
from nltk.tokenize import WordPunctTokenizer

from nose.tools import (
    assert_equal,
    assert_is_instance,
    assert_almost_equal,
    assert_true
    )

from numpy.testing import assert_array_equal


### WordNet

In [30]:
from nltk.corpus import wordnet as wn

In [32]:
def find_number_of_entries(word):
    '''
    Counts the WordNet synsets that the given word belongs to.
    
    Parameters
    ----------
    word: A string; the word to look up in WordNet.
    
    Returns
    -------
    An int; the length of the list returned by wn.synsets(word).
    '''
    
    return len(wn.synsets(word))

In [33]:
# Report how many WordNet synsets find_number_of_entries counts for 'love'.
the_word = 'love'
n_entries = find_number_of_entries(the_word)
print('{0} total entries in synonym ring for {1}. '.format(n_entries, the_word))

10 total entries in synonym ring for love. 


In [34]:
# Same report for 'live'; note this rebinds the_word and n_entries from the previous cell.
the_word = 'live'
n_entries = find_number_of_entries(the_word)
print('{0} total entries in synonym ring for {1}. '.format(n_entries, the_word))

19 total entries in synonym ring for live. 


In [35]:
# The result must be a plain int, and must match the known synset counts.
assert_is_instance(find_number_of_entries('love'), int)
for checked_word, expected_count in (('love', 10), ('live', 19)):
    assert_equal(find_number_of_entries(checked_word), expected_count)

### Word Similarities

In [39]:
def get_path_similarity(word1, word2):
    '''
    Computes the WordNet path similarity between word1 and word2.
    
    Each word is looked up as its first noun synset ('<word>.n.01').
    
    Parameters
    ----------
    word1: A string.
    word2: A string.
    
    Returns
    -------
    A float.
    '''
    
    # Resolve both words to their '<word>.n.01' synsets, in argument order.
    first_noun_synsets = [wn.synset(word + '.n.01') for word in (word1, word2)]
    return wn.path_similarity(*first_noun_synsets)

In [40]:
fmt_str = '{1} to {2}: {0:4.3f}'

# Each row is labelled with the same words that are passed to
# get_path_similarity, so a label cannot drift from the pair being compared
# (the import/export row was previously printed as "mean to average").
path_word_pairs = [
    ('excess', 'surplus'),
    ('trade', 'economy'),
    ('mean', 'average'),
    ('import', 'export'),
    ('excess', 'excess'),
]

print('Path Similarity:')
print(40*'-')
for first_word, second_word in path_word_pairs:
    print(fmt_str.format(get_path_similarity(first_word, second_word), first_word, second_word))

Path Similarity:
----------------------------------------
excess to surplus: 1.000
trade to economy: 0.100
mean to average: 0.500
mean to average: 0.333
excess to excess: 1.000


In [41]:
# The result must be a float, and must match the known path similarities.
assert_is_instance(get_path_similarity('excess', 'surplus'), float)

expected_path_similarities = [
    ('excess', 'surplus', 1.0),
    ('trade', 'economy', 0.1),
    ('mean', 'average', 0.5),
    ('import', 'export', 0.3333333333333333),
    ('excess', 'excess', 1.0),
]
for first_word, second_word, expected in expected_path_similarities:
    assert_almost_equal(get_path_similarity(first_word, second_word), expected)

### Word2Vec

In [42]:
# Training data for the Word2Vec model built below: only the first 20000
# sentences of the NLTK Reuters corpus are kept.
from nltk.corpus import reuters
sentences = reuters.sents()[:20000]

In [43]:
def get_model(sentences, window=10, min_count=6):
    '''
    Builds a Word2Vec model from sentences in corpus.
    
    Parameters
    ----------
    sentences: A list of lists(sentences); each sentence is a list of strings(words).
    window: An int; forwarded to Word2Vec as its "window" argument. Defaults to 10.
    min_count: An int; forwarded to Word2Vec as its "min_count" argument. Defaults to 6.
    
    Returns
    -------
    A Word2Vec instance.
    '''
    
    # The defaults reproduce the original hard-coded settings, so existing
    # get_model(sentences) calls build exactly the same model configuration.
    return gensim.models.Word2Vec(sentences, window=window, min_count=min_count)

In [44]:
# time.clock() was deprecated in Python 3.3 and removed in 3.8.
# time.perf_counter() is the supported high-resolution clock for measuring
# how long the training call takes.
start_time = time.perf_counter()
model = get_model(sentences)
print(time.perf_counter() - start_time, "seconds")

33.050535000000025 seconds


In [45]:
# The trained model must be a Word2Vec instance with the required settings.
assert_is_instance(model, gensim.models.Word2Vec)
for setting_name, expected_value in (('window', 10), ('min_count', 6)):
    assert_equal(getattr(model, setting_name), expected_value)

### Cosine Similarity

In [46]:
def get_cosine_similarity(model, word1, word2):
    '''
    Computes cosine similarity between "word1" and "word2" using a Word2Vec model.
    
    Parameters
    ----------
    model: A gensim.Word2Vec model.
    word1: A string.
    word2: A string.
    
    Returns
    -------
    A float.
    '''
    
    # gensim >= 1.0 exposes similarity queries on the model's `wv` attribute;
    # the model-level `similarity` shortcut was deprecated and removed in 4.0.
    # Older releases have no `wv`, so fall back to querying the model itself.
    word_vectors = getattr(model, 'wv', model)
    # Cast so callers always get a built-in float rather than a NumPy scalar.
    return float(word_vectors.similarity(word1, word2))

In [47]:
fmt_str = '{1} to {2}: {0:4.3f}'

# Each row is labelled with the same words that are passed to
# get_cosine_similarity, so a label cannot drift from the pair being compared
# (the import/export row was previously printed as "mean to average").
cosine_word_pairs = [
    ('excess', 'surplus'),
    ('trade', 'economy'),
    ('mean', 'average'),
    ('import', 'export'),
    ('excess', 'excess'),
]

print('Cosine Similarity:')
print(40*'-')
for first_word, second_word in cosine_word_pairs:
    print(fmt_str.format(get_cosine_similarity(model, first_word, second_word), first_word, second_word))

Cosine Similarity:
----------------------------------------
excess to surplus: 0.488
trade to economy: 0.822
mean to average: 0.148
mean to average: 0.881
excess to excess: 1.000


In [48]:
# The result must be a float and agree with the model's own similarity value.
assert_is_instance(get_cosine_similarity(model, 'excess', 'surplus'), float)

compared_pairs = [
    ('excess', 'surplus'),
    ('trade', 'economy'),
    ('mean', 'average'),
    ('import', 'export'),
]
for first_word, second_word in compared_pairs:
    assert_almost_equal(
        get_cosine_similarity(model, first_word, second_word),
        model.similarity(first_word, second_word)
    )

# A word compared with itself must have similarity 1.
assert_almost_equal(get_cosine_similarity(model, 'excess', 'excess'), 1.0)

### Most similar words

In [49]:
def find_most_similar_words(model):
    '''
    Find the top 5 most similar words,
    where "price", "economy", and "trade" contribute positively towards the similarity,
    and "law" and "legal" contribute negatively.
    
    Parameters
    ----------
    model: A gensim.Word2Vec model.
    
    Returns
    -------
    A list of tuples (word, similarity).
    word: A string.
    similarity: A float.
    '''
    
    # gensim >= 1.0 exposes most_similar on the model's `wv` attribute; the
    # model-level shortcut was deprecated and removed in 4.0. Older releases
    # have no `wv`, so fall back to querying the model itself.
    word_vectors = getattr(model, 'wv', model)
    return word_vectors.most_similar(
        positive=['price', 'economy', 'trade'],
        negative=['law', 'legal'],
        topn=5
    )

In [50]:
# Two-column table: the word, then its similarity score.
header_line = '{0:14s}: {1}'.format('Word', 'Cosine Similarity')
print(header_line)
print('-' * 40)
for val in find_most_similar_words(model):
    similar_word, score = val[0], val[1]
    print('{0:14s}: {1:6.3f}'.format(similar_word, score))

Word          : Cosine Similarity
----------------------------------------
double        :  0.791
yen           :  0.768
current       :  0.763
account       :  0.745
rise          :  0.736


In [51]:
# Shape of the result: a list of five (str, float) tuples.
assert_is_instance(find_most_similar_words(model), list)
for position, expected_type in enumerate((str, float)):
    assert_true(all(isinstance(t[position], expected_type) for t in find_most_similar_words(model)))
assert_equal(len(find_most_similar_words(model)), 5)

# Reference answer queried directly from the model.
words = [
    t[0]
    for t in model.most_similar(positive=['price', 'economy', 'trade'], negative=['law', 'legal'], topn=5)
]
similarities = [
    t[1]
    for t in model.most_similar(positive=['price', 'economy', 'trade'], negative=['law', 'legal'], topn=5)
]

# The words must match in order, and each similarity must match its rank.
assert_equal([t[0] for t in find_most_similar_words(model)], words)
for rank in range(5):
    assert_almost_equal(find_most_similar_words(model)[rank][1], similarities[rank])