# Requirements for this notebook:
1. Internet connection (to download corpora and tokenizer data with calls to nltk.download())
2. The following packages:
  1. nltk (Anaconda or PIP command line install : pip install -U nltk OR conda install nltk)
  2. gensim (pip install -U "gensim<4" -- the code below uses the pre-4.0 gensim API, e.g. the `size=` argument and `wv.vocab`)
  3. scikit-learn v0.18.1 (pip install -U scikit-learn)
  4. matplotlib (pip install -U matplotlib)
  5. numpy (pip install -U numpy)

# Objectives of this notebook are to illustrate how we can do the following with word embeddings:
1. Train some embeddings from scratch
2. Explore embeddings vectors
3. Use these for an NLP task

In [None]:
import os
import pickle
import time
import logging
from collections import defaultdict

import nltk
import gensim
import numpy as np
import sklearn

from gensim.models import Word2Vec
from sklearn.manifold import TSNE

# A bare `module.__version__` expression is only echoed when it is the last
# line of a cell, so print the versions explicitly to record the environment.
print('nltk version : ', nltk.__version__)
print('gensim version : ', gensim.__version__)
print('numpy version : ', np.__version__)
print('scikit-learn version : ', sklearn.__version__)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

# Now that we're done with imports, let's download a few datasets
## First some document sets (corpora)
## Then some resources for wordlists and tokenization

In [None]:
# Download the three corpora used in this notebook; %time reports how long each call takes
%time nltk.download('brown')
%time nltk.download('movie_reviews')
%time nltk.download('treebank')

# Let's download the PUNKT tokenizer so that we can tokenize words and sentences
%time nltk.download('punkt')

# Let's download stopwords so we can plot them later
%time nltk.download('stopwords')

In [None]:
#  Now we can import these
from nltk.corpus import brown
from nltk.corpus import movie_reviews
from nltk.corpus import treebank

# Let's look at a few sentences from each of the 3 document sets
* Brown -- First million word corpus created in 1961 at Brown University
* Movie Reviews -- Reviews of movies with sentiment labels
* Penn Treebank -- Widely used dataset for part-of-speech tagging and other NLP tasks

In [None]:
# Show the first sentence of each corpus, in the order Brown, Movie Reviews,
# Treebank.
for corpus in (brown, movie_reviews, treebank):
    print(corpus.sents()[0])

# Pro Tip : These corpora are relatively small -- nowhere near as large as GoogleNews, Wikipedia, PubMed, etc. -- so we may not get great results.  Our mileage may vary

In [None]:
# Report the number of sentences available in each corpus.
for label, corpus in (
    ('Brown sentence count : ', brown),
    ('Movie Review sentence count : ', movie_reviews),
    ('Treebank sentence count : ', treebank)
):
    print(label, len(corpus.sents()))

# Pro Tip : Before we go on, it's good to know about the GenSim documentation for models, training, querying, etc:
https://radimrehurek.com/gensim/models/word2vec.html

# Establish some parameters for training word vectors

In [None]:
# Word2vec has two flavors -- Skip-gram (SG) and continuous bag of words (CBOW)
## Skip-gram takes longer to train, so we disable it here (0 = off; passed as `sg` when training)
W2V_SKIP_GRAM = 0
# Number of dimensions each trained word vector will have (passed as `size` when training)
W2V_DIMENSIONS = 200
# Minimum number of times a token must occur in a corpus to be considered for training
W2V_MIN_COUNT = 3

# How many worker threads should we use to train?  Depends on your hardware...
W2V_WORKERS = 1

# Now we can train some word2vec models and time them.
## This doesn't take too long since the document sets are relatively small and we can easily work with the sentences

In [None]:
# Train word vectors on the Brown sentences with the shared W2V_* settings; %time reports the training time.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size` in gensim 4) -- confirm the installed version.
%time brown_model = Word2Vec(brown.sents(), \
                             sg = W2V_SKIP_GRAM, \
                             size = W2V_DIMENSIONS, \
                             min_count = W2V_MIN_COUNT, \
                             workers = W2V_WORKERS)

In [None]:
# Train word vectors on the Movie Review sentences with the shared W2V_* settings; %time reports the training time.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size` in gensim 4) -- confirm the installed version.
%time movie_model = Word2Vec(movie_reviews.sents(), \
                             sg = W2V_SKIP_GRAM, \
                             size = W2V_DIMENSIONS, \
                             min_count = W2V_MIN_COUNT, \
                             workers = W2V_WORKERS)

In [None]:
# Train word vectors on the Treebank sentences with the shared W2V_* settings; %time reports the training time.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size` in gensim 4) -- confirm the installed version.
%time treebank_model = Word2Vec(treebank.sents(), \
                                sg = W2V_SKIP_GRAM, \
                                size = W2V_DIMENSIONS, \
                                min_count = W2V_MIN_COUNT, \
                                workers = W2V_WORKERS)

In [None]:
# With training finished, report how many distinct tokens each model kept.
for label, trained_model in (
    ('Brown Model vocab size : ', brown_model),
    ('Movie Review Model vocab size : ', movie_model),
    ('Treebank Model vocab size : ', treebank_model)
):
    print(label, len(trained_model.wv.vocab))

# Question : Given these vocabulary sizes, which one do we expect would be the worst in translating to other tasks like text classification, information extraction, etc?
* Brown?
* Movie Reviews?
* Treebank?

# Let's see which words are in-vocabulary and out-of-vocabulary for each set

In [None]:
# Do you think the word "Spielberg" made it into each model's vocabulary?
VOCABULARY_WORD_1 = 'spielberg'

# One line per model: report whether the word is in that model's vocabulary.
print('Word found in Brown corpus'
      if VOCABULARY_WORD_1 in brown_model.wv.vocab
      else 'Word NOT found in Brown corpus')

print('Word found in Movie Review corpus'
      if VOCABULARY_WORD_1 in movie_model.wv.vocab
      else 'Word NOT found in Movie Review corpus')

print('Word found in Treebank corpus'
      if VOCABULARY_WORD_1 in treebank_model.wv.vocab
      else 'Word NOT found in Treebank corpus')

# Now choose another word of your own to see if it's in the vocabulary for each model

In [None]:
# Replace the placeholder below with a word of your own, then check whether
# each model's vocabulary contains it.
VOCABULARY_WORD_2 = 'CHOOSE_YOUR_WORD_HERE'

# One line per model: report whether the word is in that model's vocabulary.
print('Word found in Brown corpus'
      if VOCABULARY_WORD_2 in brown_model.wv.vocab
      else 'Word NOT found in Brown corpus')

print('Word found in Movie Review corpus'
      if VOCABULARY_WORD_2 in movie_model.wv.vocab
      else 'Word NOT found in Movie Review corpus')

print('Word found in Treebank corpus'
      if VOCABULARY_WORD_2 in treebank_model.wv.vocab
      else 'Word NOT found in Treebank corpus')

# Save / Load models -- This is what allows us to train them and then transfer them to be portable for other tasks

In [None]:
# Before we go any further, we can save one of our models to disk.
# The file name is 'brown_' followed by today's date formatted as month_day_year.
brown_file_name = 'brown_' + time.strftime("%m_%d_%Y")
brown_model.save(brown_file_name)
print('Saved Model to : ' + brown_file_name)

In [None]:
# ...and here's how we can load a model back in: read the file named by
# brown_file_name and confirm the loaded model's vocabulary size.
brown_loaded_model = Word2Vec.load(brown_file_name)
print('Brown LOADED Model vocab size : ', len(brown_loaded_model.wv.vocab))

# Let's start inspecting our newly trained vectors

In [None]:
# Now let's inspect what one of these vectors looks like:
# print the raw embedding the Brown model holds for the word 'business'
print(brown_model.wv['business'])

# Let's ask the model which words have the most similar vectors to a few query words

In [None]:
# Ten terms whose vectors are closest to "movie" in the Movie Review model.
print(
    movie_model.wv.most_similar(positive=['movie'], topn=10)
)

# Tom Hanks is a film actor -- what other words behave in a similar way in the movie corpus?

In [None]:
# Ten terms whose vectors are closest to "hanks" in the Movie Review model.
print(
    movie_model.wv.most_similar(positive=['hanks'], topn=10)
)

# What about a similar word like 'talk' in a generic corpus like the Brown corpus?

In [None]:
# Ten terms whose vectors are closest to "talk" in the Brown model.
print(
    brown_model.wv.most_similar(positive=['talk'], topn=10)
)

# Now try a few of your own in each model

In [None]:
# Replace the placeholder with a word of your own.
# most_similar() raises a KeyError for a word the model never saw, so check
# the vocabulary first; this keeps "Run All" from stopping at this cell.
BROWN_QUERY_WORD = 'YOUR_OWN_WORD_HERE'

if BROWN_QUERY_WORD in brown_model.wv.vocab:
    print(brown_model.wv.most_similar(positive=[BROWN_QUERY_WORD], topn=10))
else:
    print('Word NOT found in Brown corpus : ' + BROWN_QUERY_WORD)

In [None]:
# Replace the placeholder with a word of your own.
# most_similar() raises a KeyError for a word the model never saw, so check
# the vocabulary first; this keeps "Run All" from stopping at this cell.
MOVIE_QUERY_WORD = 'YOUR_OWN_WORD_HERE'

if MOVIE_QUERY_WORD in movie_model.wv.vocab:
    print(movie_model.wv.most_similar(positive=[MOVIE_QUERY_WORD], topn=10))
else:
    print('Word NOT found in Movie Review corpus : ' + MOVIE_QUERY_WORD)

In [None]:
# Note : This model is small with a small vocabulary, so many words will be
# missing.  Querying a missing word directly raises a KeyError, so check the
# vocabulary first; this keeps "Run All" from stopping at this cell.
TREEBANK_QUERY_WORD = 'YOUR_OWN_WORD_HERE'

if TREEBANK_QUERY_WORD in treebank_model.wv.vocab:
    print(treebank_model.wv.most_similar(positive=[TREEBANK_QUERY_WORD], topn=10))
else:
    print('Word NOT found in Treebank corpus : ' + TREEBANK_QUERY_WORD)

# Another thing we can do for each model is ask how similar two words are

In [None]:
# Now let's see how similar certain word pairs might be
TERM_SIMILARITY_1 = 'movie'
TERM_SIMILARITY_2 = 'film'

print(brown_model.wv.similarity(TERM_SIMILARITY_1, TERM_SIMILARITY_2))

In [None]:
# A second pair for comparison: ("computer", "life"), again in the Brown model.
TERM_SIMILARITY_3, TERM_SIMILARITY_4 = 'computer', 'life'

print(
    brown_model.wv.similarity(TERM_SIMILARITY_3, TERM_SIMILARITY_4)
)

![Vector Composition](vector_composition.png)

# Now let's see if we can do some vector arithmetic to see if our model can perform well on analogy tasks

# Question : vec(families) + vec(city) - vec(family) = ???

In [None]:
# Analogy query against the BROWN model:
#   vec(families) + vec(city) - vec(family)
RELATIONSHIP_WORD_1, RELATIONSHIP_WORD_2, RELATIONSHIP_WORD_3 = 'families', 'city', 'family'

# Terms in `positive` are added and terms in `negative` are subtracted.
# To compare corpora, run the same query on movie_model.wv instead.
print(
    brown_model.wv.most_similar(
        positive=[RELATIONSHIP_WORD_1, RELATIONSHIP_WORD_2],
        negative=[RELATIONSHIP_WORD_3]
    )
)

# Question : vec(films) + vec(movie) - vec(film) = ???

In [None]:
# Analogy query against the MOVIE model:
#   vec(films) + vec(movie) - vec(film)
RELATIONSHIP_WORD_1, RELATIONSHIP_WORD_2, RELATIONSHIP_WORD_3 = 'films', 'movie', 'film'

print(
    movie_model.wv.most_similar(
        positive=[RELATIONSHIP_WORD_1, RELATIONSHIP_WORD_2],
        negative=[RELATIONSHIP_WORD_3]
    )
)

# Can we replicate the famous example from this paper:
## If not, why not?

In [None]:
# The classic analogy: "King - Man + Woman ~~ Queen"
# http://www.aclweb.org/anthology/N13-1#page=784
RELATIONSHIP_WORD_1, RELATIONSHIP_WORD_2, RELATIONSHIP_WORD_3 = 'woman', 'king', 'man'

# Ask the Movie Review model first, then the Brown model.
# The Treebank model is left out: apparently its vocabulary does not contain
# one of our target words (KING), so the same query would fail there.
for analogy_model in (movie_model, brown_model):
    print(
        analogy_model.wv.most_similar(
            positive=[RELATIONSHIP_WORD_1, RELATIONSHIP_WORD_2],
            negative=[RELATIONSHIP_WORD_3]
        )
    )

# Now let's try to visualize some of the embeddings vectors with reduced dimensions by using a visualization method called t-distributed stochastic neighbor embedding (t-SNE)

In [None]:
def plot_embeddings(w2v_model, target_terms):
    """Scatter-plot a 2-D t-SNE projection of the vectors for ``target_terms``.

    Terms that are not in the model's vocabulary are skipped.  Every plotted
    point is annotated with the term it belongs to.

    Args:
        w2v_model: trained model exposing ``wv.vocab`` and ``wv[term]``.
        target_terms: iterable of candidate terms to plot.

    Raises:
        ValueError: if none of ``target_terms`` is in the model's vocabulary.
    """
    np.set_printoptions(suppress=True)

    # Keep the surviving terms next to their vectors.  Labelling the points
    # from the unfiltered `target_terms` would shift every label after the
    # first skipped term onto the wrong point.
    plotted_terms = [term for term in target_terms if term in w2v_model.wv.vocab]
    if not plotted_terms:
        raise ValueError('None of the target terms are in the model vocabulary')
    term_vectors = np.array([w2v_model.wv[term] for term in plotted_terms])

    tsne = TSNE(n_components=2, random_state=0)
    Y = tsne.fit_transform(term_vectors)

    # let's make this plot a decent size (width, height)...
    plt.rcParams["figure.figsize"] = [15, 9]

    plt.scatter(Y[:, 0], Y[:, 1])
    for label, x, y in zip(plotted_terms, Y[:, 0], Y[:, 1]):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.show()
    
# Collect the distinct tokens from the first few movie-review sentences.
max_movie_sentences_to_visualize = 8
visualization_terms = set()
for sentence in movie_reviews.sents()[:max_movie_sentences_to_visualize]:
    visualization_terms.update(sentence)

# Pass the terms in sorted order: the iteration order of a set of strings can
# differ between kernel sessions, which would change the t-SNE layout from
# run to run even though random_state is fixed.
plot_embeddings(movie_model, sorted(visualization_terms))

In [None]:
def plot_embeddings_3d(w2v_model, target_terms, min_word_length = 2):
    """Plot a 3-D t-SNE projection of the vectors for ``target_terms``.

    A term is plotted only if it passes the length filter and is in the
    model's vocabulary.  Every plotted point is labelled with its term.

    Args:
        w2v_model: trained model exposing ``wv.vocab`` and ``wv[term]``.
        target_terms: iterable of candidate terms to plot.
        min_word_length: terms whose length is not greater than this value
            are dropped; a negative value disables the length filter.

    Raises:
        ValueError: if no term survives the filters.
    """
    np.set_printoptions(suppress=True)

    # Keep terms and vectors aligned so each label lands on its own point.
    filtered_terms = [
        term for term in target_terms
        if (min_word_length < 0 or len(term) > min_word_length)
        and term in w2v_model.wv.vocab
    ]
    if not filtered_terms:
        raise ValueError('None of the target terms are in the model vocabulary')
    term_vectors = np.array([w2v_model.wv[term] for term in filtered_terms])

    tsne = TSNE(n_components=3, random_state=0)
    Y = tsne.fit_transform(term_vectors)

    # let's make this plot a decent size (width, height)...
    plt.rcParams["figure.figsize"] = [15, 9]

    # Imported for its side effect: presumably needed so that older
    # matplotlib versions know the '3d' projection requested below.
    from mpl_toolkits.mplot3d import Axes3D

    fig = plt.gcf()
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(Y[:, 0], Y[:, 1], Y[:, 2], marker='.')
    for i, word in enumerate(filtered_terms):
        ax.text(x=Y[i, 0], y=Y[i, 1], z=Y[i, 2], s=word)
    plt.show()
    
# NOTE: this value is not used by the hard-coded slice below; the assignment
# is kept as-is in case other cells read it.
max_movie_sentences_to_visualize = 5

# Collect the distinct tokens from movie-review sentences 105 and 106.
visualization_terms = set()
for sentence in movie_reviews.sents()[105:107]:
    visualization_terms.update(sentence)

# Pass the terms in sorted order: the iteration order of a set of strings can
# differ between kernel sessions, which would change the t-SNE layout from
# run to run even though random_state is fixed.
plot_embeddings_3d(movie_model, sorted(visualization_terms))