Source: https://www.kaggle.com/c/word2vec-nlp-tutorial#part-2-word-vectors

### Preparing to Train a Model

In [25]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktSentenceTokenizer
import logging
from gensim.models import word2vec

In [2]:
# Load the labeled, unlabeled, and test review sets. Every file is
# tab-separated with a header row; quoting=3 (csv.QUOTE_NONE) makes pandas
# treat quote characters inside the review text as ordinary characters.
train = pd.read_csv(
    'word2vec-nlp-tutorial/labeledTrainData.tsv',
    delimiter='\t', header=0, quoting=3,
)
unlabeled_train = pd.read_csv(
    'word2vec-nlp-tutorial/unlabeledTrainData.tsv',
    delimiter='\t', header=0, quoting=3,
)
test = pd.read_csv(
    'word2vec-nlp-tutorial/testData.tsv',
    delimiter='\t', header=0, quoting=3,
)

# Sanity check: report how many reviews were read (100,000 in total)
print(
    'Labeled Train Reviews: {}\n'
    'Unlabeled Train Reviews: {}\n'
    'Labeled Test Reviews: {}'.format(
        train.review.size, unlabeled_train.review.size, test.review.size
    )
)

Labeled Train Reviews: 25000
Unlabeled Train Reviews: 50000
Labeled Test Reviews: 25000


In [12]:
def review_to_wordlist(review, remove_stopwords=False):
    """Convert a document into a list of lowercase words.

    Parameters
    ----------
    review : str
        Raw text of the document; may contain HTML markup.
    remove_stopwords : bool, optional
        When True, words found in NLTK's English stop-word list are
        dropped from the result. Defaults to False.

    Returns
    -------
    list of str
        The words of the document, in order, lower-cased.
    """
    # Strip HTML markup, keeping only the text content
    plain_text = BeautifulSoup(review, 'lxml').get_text()

    # Replace every character that is not an ASCII letter with a space
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)

    # Lower-case the text and split it on whitespace
    tokens = letters_only.lower().split()

    if not remove_stopwords:
        return tokens

    # A set gives constant-time membership checks while filtering
    stop_set = set(stopwords.words("english"))
    return [token for token in tokens if token not in stop_set]

Word2Vec expects its input as individual sentences, each represented as a list of words. In other words, the input format is a list of lists.

Here, we'll use NLTK's punkt tokenizer for sentence splitting.

In [10]:
# Create the Punkt sentence tokenizer used to split reviews into sentences.
# NOTE(review): PunktSentenceTokenizer() is constructed here with no
# training text and no pre-trained model, so nothing is actually "loaded";
# confirm whether the pre-trained English Punkt model was intended instead.
tokenizer = PunktSentenceTokenizer()

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences, each given as a list of words.

    Parameters
    ----------
    review : str
        Raw text of the review.
    tokenizer : object
        Sentence tokenizer exposing a ``tokenize(text)`` method that
        returns the sentences of ``text`` as strings.
    remove_stopwords : bool, optional
        Forwarded to ``review_to_wordlist``. Defaults to False.

    Returns
    -------
    list of list of str
        One word list per non-empty raw sentence, in order of appearance.
    """
    # Split the stripped review into raw sentence strings
    raw_sentences = tokenizer.tokenize(review.strip())

    # Convert each non-empty raw sentence into its list of words
    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

Now we can apply this function to prepare our data for input to Word2Vec:

In [13]:
# Each entry pairs the progress message with the reviews to parse
corpora = (
    ('Parsing sentences from training set...\n', train.review),
    ('Parsing sentences from unlabeled set...\n', unlabeled_train.review),
)

# Collect the sentences of every review into one flat list of word lists
sentences = []
for banner, reviews in corpora:
    print(banner)
    for review in reviews:
        sentences.extend(review_to_sentences(review, tokenizer))


Parsing sentences from training set...



  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set...



  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


### Training and Saving Your Model

In [23]:
# Configure the logging module so that Word2Vec's progress messages are
# shown, each prefixed with a timestamp and its severity level
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [24]:
# Hyperparameters handed to Word2Vec when the model is trained

# Word vector dimensionality
num_features = 300
# Minimum word count
min_word_count = 40
# Number of threads to run in parallel
num_workers = 4
# Context window size
context = 10
# Downsample setting for frequent words
downsampling = 0.001

In [27]:
# Build and fit the Word2Vec model on the parsed sentences.
# NOTE(review): the `size` keyword matches the gensim release this notebook
# was run with; confirm the keyword name before upgrading gensim.
print('Training model...')
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

2018-03-20 22:53:40,955 : INFO : collecting all words and their counts
2018-03-20 22:53:40,957 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-03-20 22:53:41,049 : INFO : PROGRESS: at sentence #10000, processed 224638 words, keeping 17724 word types
2018-03-20 22:53:41,119 : INFO : PROGRESS: at sentence #20000, processed 449777 words, keeping 24881 word types


Training model...


2018-03-20 22:53:41,191 : INFO : PROGRESS: at sentence #30000, processed 667303 words, keeping 29952 word types
2018-03-20 22:53:41,267 : INFO : PROGRESS: at sentence #40000, processed 892329 words, keeping 34260 word types
2018-03-20 22:53:41,338 : INFO : PROGRESS: at sentence #50000, processed 1110245 words, keeping 37669 word types
2018-03-20 22:53:41,421 : INFO : PROGRESS: at sentence #60000, processed 1330403 words, keeping 40634 word types
2018-03-20 22:53:41,523 : INFO : PROGRESS: at sentence #70000, processed 1551611 words, keeping 43214 word types
2018-03-20 22:53:41,606 : INFO : PROGRESS: at sentence #80000, processed 1769343 words, keeping 45636 word types
2018-03-20 22:53:41,702 : INFO : PROGRESS: at sentence #90000, processed 1993234 words, keeping 47996 word types
2018-03-20 22:53:41,782 : INFO : PROGRESS: at sentence #100000, processed 2213376 words, keeping 50106 word types
2018-03-20 22:53:41,853 : INFO : PROGRESS: at sentence #110000, processed 2431812 words, keeping 

In [28]:
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# Derive the file name from the training parameters so that it always
# describes the model actually being saved (a hard-coded name would
# silently go stale if a parameter were changed). With the values used
# here this yields "300features_40minwords_10context".
# The saved model can be loaded later using Word2Vec.load().
model_name = "{}features_{}minwords_{}context".format(
    num_features, min_word_count, context
)
model.save(model_name)

2018-03-20 23:01:05,426 : INFO : precomputing L2-norms of word weight vectors
2018-03-20 23:01:05,809 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2018-03-20 23:01:05,810 : INFO : not storing attribute vectors_norm
2018-03-20 23:01:05,813 : INFO : not storing attribute cum_table
2018-03-20 23:01:10,967 : INFO : saved 300features_40minwords_10context


### Numeric Representations of Words