# Requirements for this notebook:
1. Internet connection (to download corpora and tokenizer data with calls to nltk.download())
2. The following packages:
  1. nltk (Anaconda or PIP command line install : pip install -U nltk OR conda install nltk)
  2. gensim (pip install -U gensim)
  3. scikit-learn v0.18.1 (pip install scikit-learn==0.18.1 -- this notebook imports sklearn.cross_validation, which was removed in scikit-learn 0.20)
  4. matplotlib (pip install -U matplotlib)
  5. numpy (pip install -U numpy)

# Objectives of this notebook are to illustrate how we can do the following with word embeddings:
1. Introduce Document Vectors (Doc2Vec)
2. Compare sentence vectors
3. Use these for an NLP task -- Text classification

# OK, so having a vector to represent a sequence of words (sentence, paragraph or document) would be useful, but how does it work?  
## It's very similar to word2vec.  Word2vec predicts a word based on its neighbors or neighbors based on a word.  Doc2Vec is extremely similar but it adds the concept of a "Document ID" which can represent any sequence of text (sentence, paragraph or document)

![Brief Description of Doc2Vec](doc2vec.png)

Le, Q., & Mikolov, T. (2014, January). Distributed representations of sentences and documents. In International Conference on Machine Learning (pp. 1188-1196).

In [None]:
import time
import nltk
nltk.__version__
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
gensim.__version__

import sklearn
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
sklearn.__version__
import numpy as np
np.__version__

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Download the NLTK corpora imported in the next cell (each call is timed with %time)
%time nltk.download('brown')
%time nltk.download('movie_reviews')
%time nltk.download('treebank')
# Download the PUNKT tokenizer data first so that we can tokenize words and sentences
%time nltk.download('punkt')
# Download the stopword lists as well
%time nltk.download('stopwords')

In [None]:
from nltk.corpus import brown
from nltk.corpus import movie_reviews
from nltk.corpus import treebank

# Setting up parameters for Doc2Vec much like we did previously with Word2Vec...

In [None]:
# Number of dimensions we want for our sentence/document vectors
D2V_DIMENSIONS = 100
# Minimum number of times a word must occur (passed to Doc2Vec as min_count)
D2V_MIN_COUNT = 3
# How many worker threads should we use to train?  Depends on your hardware...
D2V_WORKERS = 1

# Don't worry about understanding the code and class below.  It's not usually this complicated to train Doc2Vec, but it helps us with an experiment at the end of the notebook
## For now, understand that this allows us to "wrap" each sentence/document so that each one can be treated as distinct during training time.

In [None]:
# PRO TIP : This is a generator class (notice the __iter__() and the yield)
# This makes setting up a "TaggedDocument" class much easier and it becomes almost essential when training large 
# models for either Word2Vec or Doc2Vec since some corpora are so large that we cannot keep them in memory while training
# the vector model.  Therefore, you can "iterate" through files or database rows and only keep batches of them 
# in memory at a time
class TaggedSentenceGenerator(object):
    """Re-iterable stream of TaggedDocument objects, one per corpus sentence.

    Every call to iter() starts again from the first sentence, so the same
    instance can be consumed several times (Doc2Vec makes more than one pass
    over its training data).

    Parameters
    ----------
    nltk_corpus : object exposing ``sents()``
        Corpus reader whose ``sents()`` returns a sliceable sequence of
        token lists.
    max_training_documents : int or None, optional
        Use only the first ``max_training_documents`` sentences.  Any value
        <= 0 (the default, -1) or None means "use every sentence".
    """

    def __init__(self, nltk_corpus, max_training_documents = -1):
        self.nltk_corpus = nltk_corpus
        self.max_training_documents = max_training_documents
        # The "smaller training set" message is reported once per instance,
        # not once per pass over the data.
        self._size_reported = False

    def __iter__(self):
        """Yield ``TaggedDocument(words=<tokens>, tags=['SENT_<n>'])`` with n starting at 1."""
        sentences = self.nltk_corpus.sents()

        limit = self.max_training_documents
        if limit is not None and limit > 0:
            if self._size_reported:
                sentences = sentences[:limit]
            else:
                # len() on a lazy corpus view may have to scan the corpus, so
                # only pay for it the first time we report the reduction.
                sentence_count_before = len(sentences)
                sentences = sentences[:limit]
                print('Using a smaller training set.  Reducing from size : {0} to {1}'.format(sentence_count_before, len(sentences)))
                self._size_reported = True

        for sent_idx, sent in enumerate(sentences, 1):
            # NOTE : Each tag names the document vector that Doc2Vec learns for this
            # sentence; it can later be used to look up or query similar
            # sentences/paragraphs/documents in GenSim
            yield TaggedDocument(words=sent, tags=['SENT_%s' % sent_idx])

# Now we can train a Doc2Vec model

In [None]:
%%time 

# Fit Doc2Vec on every sentence of the movie_reviews corpus; each sentence is
# wrapped as its own TaggedDocument by TaggedSentenceGenerator.
# NOTE(review): newer gensim releases renamed `size` to `vector_size` -- confirm
# against the installed gensim version before changing it.
print('Training a Doc2Vec model.  This can take some time...')
movie_d2v_model = Doc2Vec(
    documents=TaggedSentenceGenerator(movie_reviews),
    size=D2V_DIMENSIONS,
    min_count=D2V_MIN_COUNT,
    workers=D2V_WORKERS,
)

print('Done training Doc2Vec model.')

# Now that we have a model, let's try it out by comparing some sentences by their vectors using infer_vector()

In [None]:
# sentence_1 and sentence_2 express the same positive opinion using different words, while
# sentence_3 is negative.  Doc2Vec infers one vector per sentence, so we can compare whole
# sentences directly; ideally the first pair scores higher than either pair involving sentence_3.

sentence_1 = 'I loved this great movie'
sentence_2 = 'I loved this fantastic film'
sentence_3 = 'boring and terrible'
sentence_texts = [sentence_1, sentence_2, sentence_3]

sentence_vectors = []
for sentence_text in sentence_texts:
    # The movie_reviews text the model was trained on is lower-cased, so lower-case the
    # input too; otherwise capitalised tokens such as 'I' are not found in the vocabulary.
    sentence_tokens = nltk.tokenize.word_tokenize(sentence_text.lower())
    sentence_vec = movie_d2v_model.infer_vector(sentence_tokens)
    # cosine_similarity expects 2-D input: one row per sample
    sentence_vectors.append(sentence_vec.reshape(1, -1))

# Cosine similarity is symmetric, so each unordered pair is reported once.
for i in range(len(sentence_vectors)):
    for j in range(i + 1, len(sentence_vectors)):
        # the result is a 1x1 matrix; pull out the scalar for readable output
        similarity = sklearn.metrics.pairwise.cosine_similarity(sentence_vectors[i], sentence_vectors[j])[0][0]
        print('*************')
        print('{0} similarity score comparing [{1}] to [{2}]'.format(
             similarity,
             sentence_texts[i],
             sentence_texts[j]))

print('DONE comparing sentences')

# Now let's try to visualize some sentence vectors with t-SNE

In [None]:
def plot_sentences(d2v_model, target_sentences):
    """Project inferred sentence vectors to 2-D with t-SNE and draw a labelled scatter plot.

    Parameters
    ----------
    d2v_model : trained Doc2Vec model
        Used only through ``infer_vector(tokens)``.
    target_sentences : list of str
        Sentences to embed; each is lower-cased and split on whitespace for
        inference, and the original text is used as the point label.
    """
    # Lower-case before inference: the model in this notebook is trained on
    # lower-cased movie_reviews text, so capitalised tokens would be unknown.
    sentence_vectors = np.array([
        d2v_model.infer_vector(target_sentence.lower().split())
        for target_sentence in target_sentences
    ])

    # Newer scikit-learn requires perplexity < n_samples; the default (30) is
    # kept whenever there are enough sentences for it.
    perplexity = min(30.0, max(1.0, len(target_sentences) - 1.0))
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=0)
    Y = tsne.fit_transform(sentence_vectors)

    # Size this figure only, instead of changing the global matplotlib defaults.
    fig, ax = plt.subplots(figsize=(15, 9))
    ax.scatter(Y[:, 0], Y[:, 1])
    for label, x, y in zip(target_sentences, Y[:, 0], Y[:, 1]):
        ax.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    ax.set_title('t-SNE projection of inferred sentence vectors')
    ax.set_xlabel('t-SNE dimension 1')
    ax.set_ylabel('t-SNE dimension 2')
    plt.show()

target_plot_sentences = ['I absolutely adored the movie', 'I loved it more than any other movie', 'I hated it']
plot_sentences(movie_d2v_model, target_plot_sentences)

# Now let's use these sentence vectors in a text classification task

In [None]:
# One sample per review file: the file's word sequence is the input document
X = list(map(movie_reviews.words, movie_reviews.fileids()))
# The first category listed for each file is its label
y_categories = [movie_reviews.categories(fileid)[0] for fileid in movie_reviews.fileids()]
# Binary target: 1 for 'pos', 0 for anything else
y = [int(category == 'pos') for category in y_categories]

# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=777,
)

print('TRAIN set size : {}'.format(len(X_train)))
print('TEST set size : {}'.format(len(X_test)))

# Let's transform all of our reviews (documents) into vectors to use them in text classification

In [None]:
%%time

# Infer one vector per review with the Doc2Vec model trained on the full corpus
X_train_vectors = list(map(movie_d2v_model.infer_vector, X_train))
X_test_vectors = list(map(movie_d2v_model.infer_vector, X_test))

# These report how many vectors were produced for each split
print('TRAIN transformed vector size : {}'.format(len(X_train_vectors)))
print('TEST transformed vector size : {}'.format(len(X_test_vectors)))

# Train a model using these transformed vectors and compute our F1 score (balanced metric of Precision and Recall)

In [None]:
%%time

print('Training the model...  this could take some time')
# fit() returns the estimator itself, so construction and training can be chained
lr = LogisticRegression().fit(X_train_vectors, y_train)

# Score the classifier on the held-out reviews
d2v_f1 = f1_score(y_true=y_test, y_pred=lr.predict(X_test_vectors))
print('Doc2Vec F1 : {}'.format(d2v_f1))

# As one more experiment, let's evaluate the impact of how many documents we use for Doc2Vec training
## What happens if we train Doc2Vec on only the first 500 sentences (each sentence is one Doc2Vec "document") instead of the whole corpus?

In [None]:
%%time 

# Upper bound on how many corpus sentences the reduced model is trained on
SMALL_DOCUMENT_COUNT = 500

print('Training a smaller Doc2Vec model.  This can take some time...')
# Same hyper-parameters as the full model; only the amount of training text differs
movie_d2v_model_small = Doc2Vec(
    documents=TaggedSentenceGenerator(movie_reviews, max_training_documents=SMALL_DOCUMENT_COUNT),
    size=D2V_DIMENSIONS,
    min_count=D2V_MIN_COUNT,
    workers=D2V_WORKERS,
)

print('Done training Doc2Vec model.')

In [None]:
%%time

# Infer one vector per review with the Doc2Vec model trained on the reduced sentence set
X_train_vectors_small = [movie_d2v_model_small.infer_vector(doc) for doc in X_train]
X_test_vectors_small = [movie_d2v_model_small.infer_vector(doc) for doc in X_test]

# Report the lists produced in this cell (not the full-model vectors from the earlier cell)
print('TRAIN smaller transformed vector size : {}'.format(len(X_train_vectors_small)))
print('TEST smaller transformed vector size : {}'.format(len(X_test_vectors_small)))

In [None]:
%%time

print('Training the smaller model...  this could take some time')
# fit() returns the estimator itself, so construction and training can be chained
lr_small = LogisticRegression().fit(X_train_vectors_small, y_train)

# Score on the same held-out reviews so the result is comparable with the full model
d2v_f1_small = f1_score(y_true=y_test, y_pred=lr_small.predict(X_test_vectors_small))
print('Doc2Vec (smaller training size) F1 : {}'.format(d2v_f1_small))