In [142]:
import numpy as np
from nltk.corpus import wordnet
from collections import OrderedDict
from itertools import combinations
import string

# Cosine Similarity

The notion of `cosine similarity` has become the standard measure of similarity between elements in vector space.

In [None]:
# TODO explain cosine similarity

## Normalize a vector

To normalize a vector, we rescale it so that its length (Euclidean norm) is $1$; for a vector with no negative values, every value then falls between $0$ and $1$. <br>
$vector_{normalized} = \frac{vector}{\sqrt{vector \cdot vector}}$ (where $\cdot$ represents the *dot product*)

In [181]:
# to normalize a vector
def normalize_vector(vector):
    """
    Rescales a vector to unit length by dividing it by its norm
    (for a 1-D vector this is sqrt(vector . vector)).
    For a vector without negative values, every component of the result lies between 0 and 1.
    :param: vector: a `numpy` vector
    :return: the rescaled `numpy` vector, or the input itself when its norm is 0
    """
    length = np.linalg.norm(vector)
    # a norm of 0 means the vector is all 0s: there is nothing to scale (and we avoid dividing by 0)
    if not length:
        return vector
    return vector / length

In [182]:
# show a small vector before and after normalization
vector_3d = np.array([1, 2, 3])
vector_3d_normalized = normalize_vector(vector_3d)
print("original vector", vector_3d)
print("normalized vector", vector_3d_normalized)

original vector [1 2 3]
normalized vector [ 0.26726124  0.53452248  0.80178373]


Normalizing a vector maintains the relationship between the values. <br>
$0.267$ is $\frac{1}{3}$ of $0.801$, just as $1$ is $\frac{1}{3}$ of $3$.

## Cosine Similarity

`Cosine similarity` has become the standard metric for determining "similarity" of elements in vector space. <br>
It is simply the `dot product` of two normalized vectors. <br>
$cosSim = vector_{normalized}^a \cdot vector_{normalized}^b$ <br>
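For vectors with no negative values (like the count vectors used in this notebook), this score will be between $0$ (representing absolutely no similarity) and $1$ (representing the same direction); in general it ranges from $-1$ to $1$. <br>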
**Note:** `cosine similarity` is a symmetric measurement, so $vector_{normalized}^a \cdot vector_{normalized}^b = vector_{normalized}^b \cdot vector_{normalized}^a$

In [30]:
# calculate cosine similarity of two vectors
def cos_sim(vector_one, vector_two):
    """
    Calculate the cosine similarity of two `numpy` vectors
    param: vector_one: a `numpy` vector
    param: vector_two: a `numpy` vector
    return: the dot product of the two normalized vectors: a score between -1 and 1
            (between 0 and 1 when neither vector has negative values);
            0 when either vector is all 0s
    """
    # scale both inputs to unit length so that only their direction matters
    unit_one, unit_two = normalize_vector(vector_one), normalize_vector(vector_two)
    return unit_one.dot(unit_two)

In [33]:
# sample vectors; vector_four is vector_three multiplied by 10
vector_one = np.array([1, 1, 1, 1, 1])
vector_two = np.array([1, 1, 1, 1, 2])
vector_three = np.array([1, 2, 3, 4, 5])
vector_four = np.array([10, 20, 30, 40, 50])

# compare vector_one against each of the other vectors
sim_one_two = cos_sim(vector_one, vector_two)
sim_one_three = cos_sim(vector_one, vector_three)
sim_one_four = cos_sim(vector_one, vector_four)

print("cosine similarity of vector_one and vector_two", sim_one_two)
print("cosine similarity of vector_one and vector_three", sim_one_three)
print("cosine similarity of vector_one and vector_four", sim_one_four)

cosine similarity of vector_one and vector_two 0.948683298051
cosine similarity of vector_one and vector_three 0.904534033733
cosine similarity of vector_one and vector_four 0.904534033733


### Why not just use Euclidean Distance?

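Euclidean distance depends on the magnitude of the vectors as well as their direction, whereas cosine similarity depends only on direction: in the example above, `vector_three` and `vector_four` point the same way and get the same cosine similarity to `vector_one`, even though they are far apart in Euclidean terms. <br>
See http://stackoverflow.com/questions/9314576/calculate-distance-between-two-vectors-of-different-length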

In [183]:
# calculate euclidean distance


## Measuring the "Similarity" of Words

Now that we have a metric to measure "similarity" we can use it to calculate (computationally) the "similarity" between two words....

...as long as we project those words into vector space.

### Option 1: One-hot vectors

1. Create an indexed list of your vocabulary
2. Generate the "one-hot" (vector with only one $1$ in it and the rest $0$s) for any word in your vocabulary

In [54]:
# the toy vocabulary; a word's position in this list becomes its index
vocabulary = [
    'apple',
    'banana',
    'orange',
    'cantaloupe',
    'peach',
]

In [148]:
# generate vocabulary lookup
def build_voc_lookup(list_of_voc):
    """
    Generates a dictionary where the key is the word and the value is its index.
    Indices are consecutive (0 .. number of distinct words - 1) in order of first
    appearance; a word that appears more than once keeps the index of its first occurrence.
    :param: list_of_voc: list of vocabulary words
    :return: `OrderedDict` mapping each distinct word to its index
    """
    lookup_dict = OrderedDict()
    for word in list_of_voc:
        # skip repeats: re-indexing a word would leave a gap and push later
        # indices past the size of the vocabulary
        if word not in lookup_dict:
            # the next free index is always the current number of entries
            lookup_dict[word] = len(lookup_dict)
    return lookup_dict

# lookup word
def lookup_word(lookup_dict, word):
    """
    Finds the index of a word in the vocabulary dictionary
    :param: lookup_dict: lookup-dictionary built with build_voc_lookup()
    :param: word: word to index
    :returns: index of word in vocabulary, or None if the word is not in the vocabulary
    """
    # .get() returns None by default when the key is missing
    return lookup_dict.get(word)

In [184]:
# build the lookup once, then try one in-vocabulary and one out-of-vocabulary word
lookup_dict = build_voc_lookup(vocabulary)
for query in ('peach', 'hashbrown'):
    print(lookup_word(lookup_dict, query))

4
None


In [185]:
# build one-hot vector for word
def make_one_hot(lookup_dict, word):
    """
    Builds a one-hot numpy vector for a word
    :param: lookup_dict: lookup-dictionary built with build_voc_lookup()
    :param: word: word to convert to one-hot
    :returns: numpy vector with dimension equal to size of vocabulary;
              all zeros if the word is not in the vocabulary
    """
    # one dimension per vocabulary entry, all initially zero
    one_hot = np.zeros(len(lookup_dict))
    # get index of word (or None if not in vocabulary)
    word_index = lookup_word(lookup_dict, word)
    # compare against None explicitly: index 0 is a valid position but is falsy
    if word_index is not None:
        one_hot[word_index] = 1
    # if word not in vocabulary, the one-hot will remain zeros
    return one_hot

In [186]:
# every vocabulary word plus two words that are not in the vocabulary
words_to_encode = vocabulary + ['hashbrown', 'Capizzi']
for word in words_to_encode:
    label = "one-hot vector for '{:>11}'".format(word)
    print(label, make_one_hot(lookup_dict, word))

one-hot vector for '      apple' [ 1.  0.  0.  0.  0.]
one-hot vector for '     banana' [ 0.  1.  0.  0.  0.]
one-hot vector for '     orange' [ 0.  0.  1.  0.  0.]
one-hot vector for ' cantaloupe' [ 0.  0.  0.  1.  0.]
one-hot vector for '      peach' [ 0.  0.  0.  0.  1.]
one-hot vector for '  hashbrown' [ 0.  0.  0.  0.  0.]
one-hot vector for '    Capizzi' [ 0.  0.  0.  0.  0.]


#### The problem with one-hot vectors

In [187]:
# extend the vocabulary with two out-of-vocabulary (OOV) words
vocabulary_plus_oov = vocabulary + ["Capizzi", "Phoenix"]
# every unordered pair of words
all_combinations = combinations(vocabulary_plus_oov, 2)
# compare the one-hot vectors of each pair
for word1, word2 in all_combinations:
    similarity = cos_sim(
        make_one_hot(lookup_dict, word1),
        make_one_hot(lookup_dict, word2),
    )
    print("cosine similarity between {} and {}".format(word1, word2), similarity)

cosine similarity between apple and banana 0.0
cosine similarity between apple and orange 0.0
cosine similarity between apple and cantaloupe 0.0
cosine similarity between apple and peach 0.0
cosine similarity between apple and Capizzi 0.0
cosine similarity between apple and Phoenix 0.0
cosine similarity between banana and orange 0.0
cosine similarity between banana and cantaloupe 0.0
cosine similarity between banana and peach 0.0
cosine similarity between banana and Capizzi 0.0
cosine similarity between banana and Phoenix 0.0
cosine similarity between orange and cantaloupe 0.0
cosine similarity between orange and peach 0.0
cosine similarity between orange and Capizzi 0.0
cosine similarity between orange and Phoenix 0.0
cosine similarity between cantaloupe and peach 0.0
cosine similarity between cantaloupe and Capizzi 0.0
cosine similarity between cantaloupe and Phoenix 0.0
cosine similarity between peach and Capizzi 0.0
cosine similarity between peach and Phoenix 0.0
cosine similarity 

### Option 2: Encode spelling

Following a similar pattern to the one-hot encoding of a word over a vocabulary, let's build word vectors from the frequency of each letter in the word.

In [188]:
# the lowercase ASCII letters as a list, so that list.index() gives each letter's position
alphabet = [letter for letter in string.ascii_lowercase]

In [189]:
# since we don't need to worry about "out-of-vocabulary" now, we can just use alphabet.index([letter])
def lookup_letter(letter):
    """
    Finds the position of a letter in `alphabet`, ignoring case
    :param: letter: letter to index
    :returns: index of the lowercased letter in `alphabet`
    :raises: ValueError if the lowercased letter is not in `alphabet`
    """
    lowered = letter.lower()
    return alphabet.index(lowered)

In [190]:
# upper- and lowercase forms of a letter share one index
for letter in ("a", "A"):
    print(letter, lookup_letter(letter))

a 0
A 0


In [191]:
def make_spelling_vector(word):
    """
    Builds a letter-count vector for a word: one cell per letter of the alphabet,
    counted case-insensitively. Characters that are not ASCII letters are ignored.
    :param: word: word to vectorize
    :returns: numpy vector of 26 dimensions
    """
    # start with a count of zero for every letter
    counts = np.zeros(26)
    for character in word:
        # skip digits, punctuation, whitespace and anything else that is not an ASCII letter
        if character not in string.ascii_letters:
            continue
        counts[lookup_letter(character)] += 1
    return counts

In [192]:
# letter counts for "apple": 1 each for a, e and l, and 2 for p
make_spelling_vector("apple")

array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [193]:
# combinations() returns a one-shot iterator, so build a fresh one before looping again
all_combinations = combinations(vocabulary_plus_oov, 2)
# compare the spelling vectors of each pair of words
for word1, word2 in all_combinations:
    similarity = cos_sim(
        make_spelling_vector(word1),
        make_spelling_vector(word2),
    )
    print("cosine similarity between {} and {}".format(word1, word2), similarity)

cosine similarity between apple and banana 0.303045763366
cosine similarity between apple and orange 0.308606699924
cosine similarity between apple and cantaloupe 0.654653670708
cosine similarity between apple and peach 0.676123403783
cosine similarity between apple and Capizzi 0.341881729379
cosine similarity between apple and Phoenix 0.428571428571
cosine similarity between banana and orange 0.54554472559
cosine similarity between banana and cantaloupe 0.617213399848
cosine similarity between banana and peach 0.3585685828
cosine similarity between banana and Capizzi 0.241746889208
cosine similarity between banana and Phoenix 0.20203050891
cosine similarity between orange and cantaloupe 0.589255650989
cosine similarity between orange and peach 0.36514837167
cosine similarity between orange and Capizzi 0.123091490979
cosine similarity between orange and Phoenix 0.462910049886
cosine similarity between cantaloupe and peach 0.645497224368
cosine similarity between cantaloupe and Capizzi 

We've successfully generated similarity scores!  But...

Do they really reflect anything semantic?  In other words, does it make sense that "peach" and "Phoenix" (`cosine similarity = 0.507`) are more similar than "peach" and "orange" (`cosine similarity = 0.365`)?

### Option 3: Word Embeddings

In [None]:
# TODO load word2vec vectors

In [None]:
# TODO method for evaluation