<center><h1>Benchmarking Word2Vec Classification - Amazon Book Reviews Dataset</h1>
<h3>By Mike Kane, Neal Staalberg, Majoni Maumbe, and Bhavani Shankar</h3></center>

In [1]:
import pandas as pd
import seaborn as sns
import gzip
import logging
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [2]:
# Read in stratified sample (you're a life saver, Neal)
raw_data = pd.read_parquet('stratifiedSample.parquet')

In [3]:
raw_data.dtypes

overall           float64
helpful            object
summary            object
reviewText         object
reviewTime         object
unixReviewTime      int64
dtype: object

In [5]:
# Separate reviews for preprocessing
raw_reviews = list(raw_data['reviewText'])

In [6]:
# Create a generator that processes all of the reviews.  Nothing fancy, will only use gensim's simple_preprocess method 
# to tokenize
def process_all_reviews(raw_reviews):
    logging.info("Begin processing reviews...")
    
    for ind, review in enumerate(raw_reviews):
        if ind % 10000 == 0:
            logging.info("Read {} reviews".format(ind))
        
        yield gensim.utils.simple_preprocess(review)

processed_reviews = list(process_all_reviews(raw_reviews))

2018-06-27 16:13:27,426 : INFO : Begin processing reviews...
2018-06-27 16:13:27,427 : INFO : Read 0 reviews
2018-06-27 16:13:31,388 : INFO : Read 10000 reviews
2018-06-27 16:13:35,424 : INFO : Read 20000 reviews
2018-06-27 16:13:39,428 : INFO : Read 30000 reviews
2018-06-27 16:13:43,640 : INFO : Read 40000 reviews
2018-06-27 16:13:47,940 : INFO : Read 50000 reviews
2018-06-27 16:13:51,957 : INFO : Read 60000 reviews
2018-06-27 16:13:56,234 : INFO : Read 70000 reviews
2018-06-27 16:14:00,041 : INFO : Read 80000 reviews
2018-06-27 16:14:04,250 : INFO : Read 90000 reviews
2018-06-27 16:14:07,874 : INFO : Read 100000 reviews
2018-06-27 16:14:11,881 : INFO : Read 110000 reviews
2018-06-27 16:14:16,048 : INFO : Read 120000 reviews
2018-06-27 16:14:19,445 : INFO : Read 130000 reviews
2018-06-27 16:14:23,157 : INFO : Read 140000 reviews
2018-06-27 16:14:27,156 : INFO : Read 150000 reviews
2018-06-27 16:14:30,920 : INFO : Read 160000 reviews
2018-06-27 16:14:34,412 : INFO : Read 170000 reviews

In [7]:
# Examine a sample processed review
print(processed_reviews[0:1])

[['for', 'those', 'who', 'don', 'know', 'gibran', 'get', 'to', 'know', 'his', 'work', 'the', 'prophet', 'is', 'must', 'read', 'his', 'outlook', 'on', 'life', 'is', 'truly', 'an', 'inspiring', 'guide', 'on', 'how', 'to', 'approach', 'major', 'life', 'decisions', 'this', 'is', 'relationship', 'with', 'man', 'work', 'you', 'won', 'regret', 'having']]


In [34]:
# Get a list of all unique words contained in reviews sample
all_words = set(word for review in processed_reviews for word in review)

In [36]:
# That's a lot of words
len(all_words)

289883

In [9]:
# Import and split GloVe dataset.  Will use this and pull only the words that exist in the sample dataset. Otherwise,
# this would take much too long
glove_big = {}
with open("glove.840B.300d.txt", "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in all_words:
            nums=np.array(parts[1:], dtype=np.float32)
            glove_big[word] = nums

In [10]:
# Get only the data we'll use for building our models.  
# We'll use the ratings contained in 'overall' for our target.
# We don't actually need reviewText category since we've already preprocessed the reviews, but keeping them together 
# just in case we think of a use for this DataFrame at a later time.
raw_data = raw_data[['overall', 'reviewText']]
y_5_class = np.array(raw_data['overall'])
X = processed_reviews

In [11]:
# Create function for binning 5 star ratings system into three levels, which will represent good, netural, and bad.  
# Almost named this function move_goalpost()
def convert_to_3_class(labels):
    new_labels_dict = {
        5: 3,
        4: 3,
        3: 2,
        2: 1,
        1: 1
    }
    transformed_data = []
    
    for i in labels:
        transformed_data.append(new_labels_dict[int(i)])
    
    return np.array(transformed_data)

# Get 3 class version of labels
y_3_class = convert_to_3_class(raw_data['overall'])

In [12]:
# Check size of total vocabulary for this sample
len(all_words)

289883

In [38]:
# Estimate Number of trainable parameters
len(all_words) * 150

43482450

In [13]:
# Full disclosure--borrowed this code from DS Lore tutorial to avoid getting caught in misshaped tensor hell.  
# Website is cited as used for general understanding and debugging in the paper. 
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove_big))])
    
    # Note from Mike: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # It can't be used in a sklearn Pipeline. 
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [14]:
# Same as above--borrowed this code from DS Lore tutorial.  See citations page of report for link to website.  
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(glove_big))])
        else:
            self.dim=0
        
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])


In [27]:
# Create Word2Vec model.  
# Size is the dimensionality of the feature vectors
# Window is the output size the model is trained to predict.  For a dataset this large, we can choose a small number like 5. 
# Min_count ignores words with a frequency count lower than 2
# Uses 10 threads to train the workers. We can go multicore since we have a decent GPU backing this. 

model = gensim.models.Word2Vec(processed_reviews, 
                               size=150, window=5, 
                               min_count=2, 
                               workers=10)

2018-06-27 19:23:44,858 : INFO : collecting all words and their counts
2018-06-27 19:23:44,860 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-27 19:23:45,372 : INFO : PROGRESS: at sentence #10000, processed 1716701 words, keeping 45280 word types
2018-06-27 19:23:45,926 : INFO : PROGRESS: at sentence #20000, processed 3475642 words, keeping 62402 word types
2018-06-27 19:23:46,445 : INFO : PROGRESS: at sentence #30000, processed 5257190 words, keeping 74103 word types
2018-06-27 19:23:47,050 : INFO : PROGRESS: at sentence #40000, processed 7090850 words, keeping 87292 word types
2018-06-27 19:23:47,629 : INFO : PROGRESS: at sentence #50000, processed 8967488 words, keeping 99861 word types
2018-06-27 19:23:48,147 : INFO : PROGRESS: at sentence #60000, processed 10696996 words, keeping 107756 word types
2018-06-27 19:23:48,685 : INFO : PROGRESS: at sentence #70000, processed 12485781 words, keeping 116376 word types
2018-06-27 19:23:49,225 : INFO : PR

2018-06-27 19:24:33,643 : INFO : EPOCH 1 - PROGRESS: at 40.30% examples, 1342902 words/s, in_qsize 20, out_qsize 1
2018-06-27 19:24:34,653 : INFO : EPOCH 1 - PROGRESS: at 42.77% examples, 1343791 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:24:35,652 : INFO : EPOCH 1 - PROGRESS: at 45.25% examples, 1344735 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:24:36,656 : INFO : EPOCH 1 - PROGRESS: at 47.60% examples, 1344307 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:24:37,683 : INFO : EPOCH 1 - PROGRESS: at 49.75% examples, 1342006 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:24:38,700 : INFO : EPOCH 1 - PROGRESS: at 52.02% examples, 1342801 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:24:39,688 : INFO : EPOCH 1 - PROGRESS: at 54.79% examples, 1342835 words/s, in_qsize 15, out_qsize 4
2018-06-27 19:24:40,695 : INFO : EPOCH 1 - PROGRESS: at 57.63% examples, 1342754 words/s, in_qsize 20, out_qsize 3
2018-06-27 19:24:41,693 : INFO : EPOCH 1 - PROGRESS: at 59.86% examples, 1343213

2018-06-27 19:25:31,788 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-27 19:25:31,789 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-27 19:25:31,799 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-27 19:25:31,803 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-27 19:25:31,804 : INFO : EPOCH - 2 : training on 66577500 raw words (50580357 effective words) took 37.9s, 1335757 effective words/s
2018-06-27 19:25:32,824 : INFO : EPOCH 3 - PROGRESS: at 2.33% examples, 1329833 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:25:33,815 : INFO : EPOCH 3 - PROGRESS: at 4.60% examples, 1344922 words/s, in_qsize 16, out_qsize 3
2018-06-27 19:25:34,819 : INFO : EPOCH 3 - PROGRESS: at 6.87% examples, 1352076 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:25:35,834 : INFO : EPOCH 3 - PROGRESS: at 9.12% examples, 1354943 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:25:36,848 : INFO : EPOCH 

2018-06-27 19:26:31,363 : INFO : EPOCH 4 - PROGRESS: at 52.18% examples, 1348185 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:26:32,359 : INFO : EPOCH 4 - PROGRESS: at 55.06% examples, 1349509 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:26:33,360 : INFO : EPOCH 4 - PROGRESS: at 57.81% examples, 1350058 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:26:34,366 : INFO : EPOCH 4 - PROGRESS: at 60.06% examples, 1349937 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:26:35,370 : INFO : EPOCH 4 - PROGRESS: at 62.44% examples, 1349663 words/s, in_qsize 19, out_qsize 3
2018-06-27 19:26:36,385 : INFO : EPOCH 4 - PROGRESS: at 65.59% examples, 1350767 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:26:37,386 : INFO : EPOCH 4 - PROGRESS: at 69.37% examples, 1351493 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:26:38,400 : INFO : EPOCH 4 - PROGRESS: at 73.05% examples, 1353768 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:26:39,400 : INFO : EPOCH 4 - PROGRESS: at 75.44% examples, 1353617

2018-06-27 19:27:24,581 : INFO : training on a 332887500 raw words (252900832 effective words) took 188.2s, 1344142 effective words/s


In [28]:
# Train the model for further epochs on the processed_reviews
model.train(processed_reviews, total_examples=len(processed_reviews), epochs=40)

2018-06-27 19:27:24,671 : INFO : training model with 10 workers on 144950 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2018-06-27 19:27:25,698 : INFO : EPOCH 1 - PROGRESS: at 2.29% examples, 1304568 words/s, in_qsize 20, out_qsize 2
2018-06-27 19:27:26,701 : INFO : EPOCH 1 - PROGRESS: at 4.42% examples, 1290886 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:27:27,713 : INFO : EPOCH 1 - PROGRESS: at 6.56% examples, 1283378 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:27:28,715 : INFO : EPOCH 1 - PROGRESS: at 8.64% examples, 1280142 words/s, in_qsize 20, out_qsize 1
2018-06-27 19:27:29,721 : INFO : EPOCH 1 - PROGRESS: at 10.64% examples, 1277584 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:27:30,718 : INFO : EPOCH 1 - PROGRESS: at 12.68% examples, 1271257 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:27:31,733 : INFO : EPOCH 1 - PROGRESS: at 15.01% examples, 1283047 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:27:32,748 : INFO : EPOCH 1 - PROG

2018-06-27 19:28:27,860 : INFO : EPOCH 2 - PROGRESS: at 58.65% examples, 1315325 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:28:28,854 : INFO : EPOCH 2 - PROGRESS: at 60.83% examples, 1314767 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:28:29,867 : INFO : EPOCH 2 - PROGRESS: at 63.26% examples, 1315062 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:28:30,882 : INFO : EPOCH 2 - PROGRESS: at 66.34% examples, 1311904 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:28:31,881 : INFO : EPOCH 2 - PROGRESS: at 70.13% examples, 1313162 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:28:32,888 : INFO : EPOCH 2 - PROGRESS: at 73.37% examples, 1314100 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:28:33,878 : INFO : EPOCH 2 - PROGRESS: at 75.69% examples, 1314346 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:28:34,897 : INFO : EPOCH 2 - PROGRESS: at 77.76% examples, 1313426 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:28:35,903 : INFO : EPOCH 2 - PROGRESS: at 80.43% examples, 1314292

2018-06-27 19:29:22,057 : INFO : EPOCH 4 - PROGRESS: at 2.17% examples, 1242802 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:29:23,079 : INFO : EPOCH 4 - PROGRESS: at 4.21% examples, 1226413 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:29:24,081 : INFO : EPOCH 4 - PROGRESS: at 6.23% examples, 1226870 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:29:25,083 : INFO : EPOCH 4 - PROGRESS: at 8.32% examples, 1228337 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:29:26,086 : INFO : EPOCH 4 - PROGRESS: at 10.18% examples, 1217276 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:29:27,098 : INFO : EPOCH 4 - PROGRESS: at 12.18% examples, 1226322 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:29:28,103 : INFO : EPOCH 4 - PROGRESS: at 14.36% examples, 1231997 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:29:29,116 : INFO : EPOCH 4 - PROGRESS: at 16.49% examples, 1236573 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:29:30,103 : INFO : EPOCH 4 - PROGRESS: at 19.01% examples, 1243805 wor

2018-06-27 19:30:25,657 : INFO : EPOCH 5 - PROGRESS: at 50.84% examples, 1207785 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:30:26,672 : INFO : EPOCH 5 - PROGRESS: at 53.03% examples, 1206456 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:30:27,693 : INFO : EPOCH 5 - PROGRESS: at 55.86% examples, 1207799 words/s, in_qsize 20, out_qsize 1
2018-06-27 19:30:28,686 : INFO : EPOCH 5 - PROGRESS: at 58.16% examples, 1209518 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:30:29,694 : INFO : EPOCH 5 - PROGRESS: at 60.23% examples, 1210788 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:30:30,706 : INFO : EPOCH 5 - PROGRESS: at 62.52% examples, 1212244 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:30:31,732 : INFO : EPOCH 5 - PROGRESS: at 65.32% examples, 1213180 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:30:32,721 : INFO : EPOCH 5 - PROGRESS: at 68.70% examples, 1214439 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:30:33,768 : INFO : EPOCH 5 - PROGRESS: at 72.23% examples, 1215664

2018-06-27 19:31:23,921 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-27 19:31:23,927 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-27 19:31:23,932 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-27 19:31:23,935 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-27 19:31:23,936 : INFO : EPOCH - 6 : training on 66577500 raw words (50579341 effective words) took 41.1s, 1231880 effective words/s
2018-06-27 19:31:24,949 : INFO : EPOCH 7 - PROGRESS: at 2.11% examples, 1194961 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:31:25,950 : INFO : EPOCH 7 - PROGRESS: at 4.15% examples, 1206474 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:31:26,956 : INFO : EPOCH 7 - PROGRESS: at 6.16% examples, 1214519 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:31:27,968 : INFO : EPOCH 7 - PROGRESS: at 8.35% examples, 1233884 words/s, in_qsize 20, out_qsize 1
2018-06-27 19:31:28,981 : INFO : EPOCH 

2018-06-27 19:32:24,256 : INFO : EPOCH 8 - PROGRESS: at 40.65% examples, 1217909 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:32:25,271 : INFO : EPOCH 8 - PROGRESS: at 42.85% examples, 1217577 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:32:26,265 : INFO : EPOCH 8 - PROGRESS: at 45.12% examples, 1217442 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:32:27,274 : INFO : EPOCH 8 - PROGRESS: at 47.21% examples, 1217519 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:32:28,301 : INFO : EPOCH 8 - PROGRESS: at 49.22% examples, 1216523 words/s, in_qsize 20, out_qsize 3
2018-06-27 19:32:29,299 : INFO : EPOCH 8 - PROGRESS: at 51.28% examples, 1217620 words/s, in_qsize 20, out_qsize 1
2018-06-27 19:32:30,323 : INFO : EPOCH 8 - PROGRESS: at 53.49% examples, 1216632 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:32:31,345 : INFO : EPOCH 8 - PROGRESS: at 56.41% examples, 1216577 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:32:32,342 : INFO : EPOCH 8 - PROGRESS: at 58.55% examples, 1217932

2018-06-27 19:33:27,828 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-06-27 19:33:27,828 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-06-27 19:33:27,837 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-06-27 19:33:27,840 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-06-27 19:33:27,849 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-06-27 19:33:27,852 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-06-27 19:33:27,853 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-27 19:33:27,855 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-27 19:33:27,865 : INFO : EPOCH 9 - PROGRESS: at 99.98% examples, 1222703 words/s, in_qsize 1, out_qsize 1
2018-06-27 19:33:27,866 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-27 19:33:27,871 : INFO : worker thread finished; awaiting f

2018-06-27 19:34:21,600 : INFO : EPOCH 11 - PROGRESS: at 25.02% examples, 1212417 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:34:22,597 : INFO : EPOCH 11 - PROGRESS: at 27.08% examples, 1214478 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:34:23,600 : INFO : EPOCH 11 - PROGRESS: at 29.48% examples, 1212890 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:34:24,612 : INFO : EPOCH 11 - PROGRESS: at 31.58% examples, 1208275 words/s, in_qsize 20, out_qsize 1
2018-06-27 19:34:25,623 : INFO : EPOCH 11 - PROGRESS: at 33.45% examples, 1201126 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:34:26,638 : INFO : EPOCH 11 - PROGRESS: at 35.54% examples, 1198307 words/s, in_qsize 16, out_qsize 3
2018-06-27 19:34:27,658 : INFO : EPOCH 11 - PROGRESS: at 37.80% examples, 1196756 words/s, in_qsize 20, out_qsize 2
2018-06-27 19:34:28,670 : INFO : EPOCH 11 - PROGRESS: at 40.09% examples, 1196394 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:34:29,687 : INFO : EPOCH 11 - PROGRESS: at 42.27% examples

2018-06-27 19:35:24,901 : INFO : EPOCH 12 - PROGRESS: at 74.67% examples, 1213913 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:35:25,902 : INFO : EPOCH 12 - PROGRESS: at 76.73% examples, 1214246 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:35:26,919 : INFO : EPOCH 12 - PROGRESS: at 78.73% examples, 1213872 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:35:27,913 : INFO : EPOCH 12 - PROGRESS: at 81.15% examples, 1213679 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:35:28,921 : INFO : EPOCH 12 - PROGRESS: at 83.52% examples, 1214409 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:35:29,918 : INFO : EPOCH 12 - PROGRESS: at 87.45% examples, 1215725 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:35:30,922 : INFO : EPOCH 12 - PROGRESS: at 91.73% examples, 1216257 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:35:31,945 : INFO : EPOCH 12 - PROGRESS: at 95.76% examples, 1215696 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:35:32,948 : INFO : EPOCH 12 - PROGRESS: at 99.52% examples

2018-06-27 19:36:18,852 : INFO : EPOCH 14 - PROGRESS: at 7.06% examples, 1039950 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:36:19,865 : INFO : EPOCH 14 - PROGRESS: at 8.94% examples, 1060799 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:36:20,866 : INFO : EPOCH 14 - PROGRESS: at 10.70% examples, 1071015 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:36:21,887 : INFO : EPOCH 14 - PROGRESS: at 12.63% examples, 1084272 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:36:22,893 : INFO : EPOCH 14 - PROGRESS: at 14.58% examples, 1089089 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:36:23,900 : INFO : EPOCH 14 - PROGRESS: at 16.35% examples, 1088406 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:36:24,900 : INFO : EPOCH 14 - PROGRESS: at 18.61% examples, 1094322 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:36:25,909 : INFO : EPOCH 14 - PROGRESS: at 20.57% examples, 1100338 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:36:26,918 : INFO : EPOCH 14 - PROGRESS: at 22.83% examples, 

2018-06-27 19:37:21,586 : INFO : EPOCH 15 - PROGRESS: at 45.12% examples, 1158409 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:37:22,590 : INFO : EPOCH 15 - PROGRESS: at 47.18% examples, 1159296 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:37:23,618 : INFO : EPOCH 15 - PROGRESS: at 48.96% examples, 1156223 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:37:24,633 : INFO : EPOCH 15 - PROGRESS: at 50.93% examples, 1157351 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:37:25,642 : INFO : EPOCH 15 - PROGRESS: at 53.06% examples, 1157471 words/s, in_qsize 16, out_qsize 3
2018-06-27 19:37:26,659 : INFO : EPOCH 15 - PROGRESS: at 55.59% examples, 1156133 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:37:27,673 : INFO : EPOCH 15 - PROGRESS: at 57.92% examples, 1158024 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:37:28,689 : INFO : EPOCH 15 - PROGRESS: at 59.94% examples, 1158373 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:37:29,704 : INFO : EPOCH 15 - PROGRESS: at 62.05% examples

2018-06-27 19:38:24,462 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-06-27 19:38:24,494 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-06-27 19:38:24,505 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-06-27 19:38:24,508 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-06-27 19:38:24,509 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-06-27 19:38:24,510 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-06-27 19:38:24,520 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-27 19:38:24,522 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-27 19:38:24,524 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-27 19:38:24,525 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-27 19:38:24,526 : INFO : EPOCH - 16 : training on 66577500 raw words (50580355 eff

2018-06-27 19:39:19,225 : INFO : EPOCH 18 - PROGRESS: at 21.18% examples, 1122413 words/s, in_qsize 16, out_qsize 3
2018-06-27 19:39:20,246 : INFO : EPOCH 18 - PROGRESS: at 23.33% examples, 1131365 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:39:21,274 : INFO : EPOCH 18 - PROGRESS: at 25.41% examples, 1132255 words/s, in_qsize 16, out_qsize 3
2018-06-27 19:39:22,274 : INFO : EPOCH 18 - PROGRESS: at 27.65% examples, 1137378 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:39:23,276 : INFO : EPOCH 18 - PROGRESS: at 29.88% examples, 1141450 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:39:24,280 : INFO : EPOCH 18 - PROGRESS: at 32.14% examples, 1145294 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:39:25,283 : INFO : EPOCH 18 - PROGRESS: at 34.16% examples, 1148311 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:39:26,280 : INFO : EPOCH 18 - PROGRESS: at 36.23% examples, 1149566 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:39:27,298 : INFO : EPOCH 18 - PROGRESS: at 38.78% examples

2018-06-27 19:40:22,386 : INFO : EPOCH 19 - PROGRESS: at 69.07% examples, 1216627 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:40:23,384 : INFO : EPOCH 19 - PROGRESS: at 72.30% examples, 1216152 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:40:24,389 : INFO : EPOCH 19 - PROGRESS: at 74.62% examples, 1215459 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:40:25,418 : INFO : EPOCH 19 - PROGRESS: at 76.66% examples, 1214427 words/s, in_qsize 20, out_qsize 2
2018-06-27 19:40:26,435 : INFO : EPOCH 19 - PROGRESS: at 78.56% examples, 1212295 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:40:27,464 : INFO : EPOCH 19 - PROGRESS: at 80.94% examples, 1210436 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:40:28,478 : INFO : EPOCH 19 - PROGRESS: at 83.27% examples, 1211213 words/s, in_qsize 16, out_qsize 3
2018-06-27 19:40:29,494 : INFO : EPOCH 19 - PROGRESS: at 87.08% examples, 1212422 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:40:30,494 : INFO : EPOCH 19 - PROGRESS: at 91.24% examples

2018-06-27 19:41:16,698 : INFO : EPOCH 21 - PROGRESS: at 4.01% examples, 1172168 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:41:17,698 : INFO : EPOCH 21 - PROGRESS: at 6.08% examples, 1202766 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:41:18,713 : INFO : EPOCH 21 - PROGRESS: at 8.06% examples, 1191990 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:41:19,740 : INFO : EPOCH 21 - PROGRESS: at 10.04% examples, 1197249 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:41:20,738 : INFO : EPOCH 21 - PROGRESS: at 11.92% examples, 1201143 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:41:21,745 : INFO : EPOCH 21 - PROGRESS: at 14.03% examples, 1203694 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:41:22,757 : INFO : EPOCH 21 - PROGRESS: at 16.08% examples, 1205630 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:41:23,752 : INFO : EPOCH 21 - PROGRESS: at 18.41% examples, 1205562 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:41:24,774 : INFO : EPOCH 21 - PROGRESS: at 20.46% examples, 1

2018-06-27 19:42:19,763 : INFO : EPOCH 22 - PROGRESS: at 48.53% examples, 1194186 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:42:20,790 : INFO : EPOCH 22 - PROGRESS: at 50.56% examples, 1193822 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:42:21,801 : INFO : EPOCH 22 - PROGRESS: at 52.73% examples, 1195897 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:42:22,804 : INFO : EPOCH 22 - PROGRESS: at 55.31% examples, 1195580 words/s, in_qsize 20, out_qsize 3
2018-06-27 19:42:23,809 : INFO : EPOCH 22 - PROGRESS: at 57.74% examples, 1195696 words/s, in_qsize 16, out_qsize 3
2018-06-27 19:42:24,817 : INFO : EPOCH 22 - PROGRESS: at 59.77% examples, 1196170 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:42:25,827 : INFO : EPOCH 22 - PROGRESS: at 61.80% examples, 1197152 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:42:26,839 : INFO : EPOCH 22 - PROGRESS: at 64.33% examples, 1197271 words/s, in_qsize 20, out_qsize 3
2018-06-27 19:42:27,839 : INFO : EPOCH 22 - PROGRESS: at 67.59% examples

2018-06-27 19:43:20,390 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-06-27 19:43:20,394 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-06-27 19:43:20,399 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-06-27 19:43:20,406 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-27 19:43:20,413 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-27 19:43:20,417 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-27 19:43:20,418 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-27 19:43:20,420 : INFO : EPOCH - 23 : training on 66577500 raw words (50577081 effective words) took 41.9s, 1208236 effective words/s
2018-06-27 19:43:21,424 : INFO : EPOCH 24 - PROGRESS: at 2.00% examples, 1148763 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:43:22,438 : INFO : EPOCH 24 - PROGRESS: at 3.57% examples, 1031684 words/s, in_qsize 19, out

2018-06-27 19:44:17,339 : INFO : EPOCH 25 - PROGRESS: at 29.46% examples, 1212379 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:44:18,345 : INFO : EPOCH 25 - PROGRESS: at 31.77% examples, 1214331 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:44:19,358 : INFO : EPOCH 25 - PROGRESS: at 33.83% examples, 1214690 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:44:20,359 : INFO : EPOCH 25 - PROGRESS: at 36.02% examples, 1214335 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:44:21,387 : INFO : EPOCH 25 - PROGRESS: at 38.59% examples, 1214034 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:44:22,387 : INFO : EPOCH 25 - PROGRESS: at 40.58% examples, 1213386 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:44:23,401 : INFO : EPOCH 25 - PROGRESS: at 42.75% examples, 1210769 words/s, in_qsize 16, out_qsize 3
2018-06-27 19:44:24,402 : INFO : EPOCH 25 - PROGRESS: at 45.09% examples, 1213349 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:44:25,403 : INFO : EPOCH 25 - PROGRESS: at 47.13% examples

2018-06-27 19:45:20,292 : INFO : EPOCH 26 - PROGRESS: at 78.47% examples, 1210952 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:45:21,306 : INFO : EPOCH 26 - PROGRESS: at 80.95% examples, 1211457 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:45:22,307 : INFO : EPOCH 26 - PROGRESS: at 83.21% examples, 1211709 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:45:23,312 : INFO : EPOCH 26 - PROGRESS: at 86.82% examples, 1211663 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:45:24,313 : INFO : EPOCH 26 - PROGRESS: at 90.98% examples, 1212018 words/s, in_qsize 19, out_qsize 2
2018-06-27 19:45:25,321 : INFO : EPOCH 26 - PROGRESS: at 95.30% examples, 1212966 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:45:26,337 : INFO : EPOCH 26 - PROGRESS: at 98.88% examples, 1212859 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:45:26,597 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-06-27 19:45:26,613 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-0

2018-06-27 19:46:14,384 : INFO : EPOCH 28 - PROGRESS: at 12.12% examples, 1215374 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:46:15,388 : INFO : EPOCH 28 - PROGRESS: at 14.18% examples, 1213070 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:46:16,398 : INFO : EPOCH 28 - PROGRESS: at 16.19% examples, 1212218 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:46:17,405 : INFO : EPOCH 28 - PROGRESS: at 18.66% examples, 1217323 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:46:18,419 : INFO : EPOCH 28 - PROGRESS: at 20.76% examples, 1217835 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:46:19,444 : INFO : EPOCH 28 - PROGRESS: at 23.04% examples, 1220563 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:46:20,450 : INFO : EPOCH 28 - PROGRESS: at 25.20% examples, 1220987 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:46:21,466 : INFO : EPOCH 28 - PROGRESS: at 27.41% examples, 1217743 words/s, in_qsize 16, out_qsize 3
2018-06-27 19:46:22,487 : INFO : EPOCH 28 - PROGRESS: at 29.73% examples

2018-06-27 19:47:17,181 : INFO : EPOCH 29 - PROGRESS: at 58.28% examples, 1210957 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:47:18,195 : INFO : EPOCH 29 - PROGRESS: at 60.28% examples, 1210901 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:47:19,205 : INFO : EPOCH 29 - PROGRESS: at 62.51% examples, 1210947 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:47:20,220 : INFO : EPOCH 29 - PROGRESS: at 65.26% examples, 1211872 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:47:21,236 : INFO : EPOCH 29 - PROGRESS: at 68.59% examples, 1211856 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:47:22,235 : INFO : EPOCH 29 - PROGRESS: at 72.02% examples, 1213016 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:47:23,258 : INFO : EPOCH 29 - PROGRESS: at 74.42% examples, 1211739 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:47:24,269 : INFO : EPOCH 29 - PROGRESS: at 76.49% examples, 1211853 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:47:25,271 : INFO : EPOCH 29 - PROGRESS: at 78.54% examples

2018-06-27 19:48:13,384 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-27 19:48:13,387 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-27 19:48:13,389 : INFO : EPOCH - 30 : training on 66577500 raw words (50580899 effective words) took 41.7s, 1211606 effective words/s
2018-06-27 19:48:14,389 : INFO : EPOCH 31 - PROGRESS: at 2.14% examples, 1226728 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:48:15,401 : INFO : EPOCH 31 - PROGRESS: at 4.16% examples, 1225379 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:48:16,405 : INFO : EPOCH 31 - PROGRESS: at 6.18% examples, 1224308 words/s, in_qsize 20, out_qsize 1
2018-06-27 19:48:17,410 : INFO : EPOCH 31 - PROGRESS: at 8.30% examples, 1226959 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:48:18,422 : INFO : EPOCH 31 - PROGRESS: at 10.18% examples, 1220434 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:48:19,450 : INFO : EPOCH 31 - PROGRESS: at 12.01% examples, 1207576 words/s, in_qsize 

2018-06-27 19:49:14,210 : INFO : EPOCH 32 - PROGRESS: at 40.97% examples, 1219367 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:49:15,219 : INFO : EPOCH 32 - PROGRESS: at 43.16% examples, 1220070 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:49:16,211 : INFO : EPOCH 32 - PROGRESS: at 45.41% examples, 1221171 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:49:17,223 : INFO : EPOCH 32 - PROGRESS: at 47.59% examples, 1220691 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:49:18,225 : INFO : EPOCH 32 - PROGRESS: at 49.50% examples, 1219588 words/s, in_qsize 16, out_qsize 3
2018-06-27 19:49:19,251 : INFO : EPOCH 32 - PROGRESS: at 51.55% examples, 1219326 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:49:20,248 : INFO : EPOCH 32 - PROGRESS: at 53.83% examples, 1218671 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:49:21,270 : INFO : EPOCH 32 - PROGRESS: at 56.74% examples, 1218988 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:49:22,263 : INFO : EPOCH 32 - PROGRESS: at 58.78% examples

2018-06-27 19:50:16,769 : INFO : EPOCH 33 - PROGRESS: at 95.72% examples, 1216547 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:50:17,783 : INFO : EPOCH 33 - PROGRESS: at 99.30% examples, 1216483 words/s, in_qsize 19, out_qsize 2
2018-06-27 19:50:17,921 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-06-27 19:50:17,936 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-06-27 19:50:17,936 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-06-27 19:50:17,958 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-06-27 19:50:17,971 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-06-27 19:50:17,976 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-06-27 19:50:17,977 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-27 19:50:17,983 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-27 19:50:17,986 : INFO : worker 

2018-06-27 19:51:10,562 : INFO : EPOCH 35 - PROGRESS: at 22.91% examples, 1215656 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:51:11,570 : INFO : EPOCH 35 - PROGRESS: at 25.04% examples, 1213849 words/s, in_qsize 19, out_qsize 1
2018-06-27 19:51:12,564 : INFO : EPOCH 35 - PROGRESS: at 27.06% examples, 1213660 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:51:13,577 : INFO : EPOCH 35 - PROGRESS: at 29.61% examples, 1217342 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:51:14,582 : INFO : EPOCH 35 - PROGRESS: at 31.83% examples, 1216266 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:51:15,592 : INFO : EPOCH 35 - PROGRESS: at 33.82% examples, 1215083 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:51:16,617 : INFO : EPOCH 35 - PROGRESS: at 36.08% examples, 1215628 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:51:17,609 : INFO : EPOCH 35 - PROGRESS: at 38.64% examples, 1216598 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:51:18,618 : INFO : EPOCH 35 - PROGRESS: at 40.64% examples

2018-06-27 19:52:13,468 : INFO : EPOCH 36 - PROGRESS: at 72.05% examples, 1212492 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:52:14,478 : INFO : EPOCH 36 - PROGRESS: at 74.50% examples, 1212432 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:52:15,479 : INFO : EPOCH 36 - PROGRESS: at 76.53% examples, 1212078 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:52:16,484 : INFO : EPOCH 36 - PROGRESS: at 78.54% examples, 1212054 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:52:17,491 : INFO : EPOCH 36 - PROGRESS: at 80.97% examples, 1212259 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:52:18,501 : INFO : EPOCH 36 - PROGRESS: at 83.19% examples, 1211141 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:52:19,516 : INFO : EPOCH 36 - PROGRESS: at 86.67% examples, 1210180 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:52:20,515 : INFO : EPOCH 36 - PROGRESS: at 90.93% examples, 1211266 words/s, in_qsize 20, out_qsize 1
2018-06-27 19:52:21,530 : INFO : EPOCH 36 - PROGRESS: at 95.19% examples

2018-06-27 19:53:07,399 : INFO : EPOCH 38 - PROGRESS: at 6.15% examples, 1221542 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:53:08,416 : INFO : EPOCH 38 - PROGRESS: at 8.22% examples, 1218650 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:53:09,418 : INFO : EPOCH 38 - PROGRESS: at 10.13% examples, 1215291 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:53:10,423 : INFO : EPOCH 38 - PROGRESS: at 12.05% examples, 1217408 words/s, in_qsize 16, out_qsize 3
2018-06-27 19:53:11,430 : INFO : EPOCH 38 - PROGRESS: at 14.17% examples, 1219941 words/s, in_qsize 20, out_qsize 0
2018-06-27 19:53:12,442 : INFO : EPOCH 38 - PROGRESS: at 16.17% examples, 1216423 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:53:13,453 : INFO : EPOCH 38 - PROGRESS: at 18.54% examples, 1215292 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:53:14,468 : INFO : EPOCH 38 - PROGRESS: at 20.49% examples, 1209469 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:53:15,503 : INFO : EPOCH 38 - PROGRESS: at 22.68% examples, 

2018-06-27 19:54:10,672 : INFO : EPOCH 39 - PROGRESS: at 50.86% examples, 1204499 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:54:11,686 : INFO : EPOCH 39 - PROGRESS: at 53.09% examples, 1205065 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:54:12,695 : INFO : EPOCH 39 - PROGRESS: at 55.78% examples, 1204290 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:54:13,708 : INFO : EPOCH 39 - PROGRESS: at 57.98% examples, 1203114 words/s, in_qsize 19, out_qsize 0
2018-06-27 19:54:14,697 : INFO : EPOCH 39 - PROGRESS: at 60.00% examples, 1202964 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:54:15,710 : INFO : EPOCH 39 - PROGRESS: at 62.07% examples, 1203336 words/s, in_qsize 18, out_qsize 1
2018-06-27 19:54:16,714 : INFO : EPOCH 39 - PROGRESS: at 64.69% examples, 1204338 words/s, in_qsize 20, out_qsize 1
2018-06-27 19:54:17,714 : INFO : EPOCH 39 - PROGRESS: at 67.92% examples, 1204924 words/s, in_qsize 17, out_qsize 2
2018-06-27 19:54:18,717 : INFO : EPOCH 39 - PROGRESS: at 71.16% examples

2018-06-27 19:55:10,444 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-06-27 19:55:10,452 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-06-27 19:55:10,463 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-06-27 19:55:10,464 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-27 19:55:10,467 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-27 19:55:10,468 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-27 19:55:10,477 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-27 19:55:10,479 : INFO : EPOCH - 40 : training on 66577500 raw words (50582501 effective words) took 41.5s, 1217800 effective words/s
2018-06-27 19:55:10,481 : INFO : training on a 2663100000 raw words (2023226349 effective words) took 1665.8s, 1214561 effective words/s


(2023226349, 2663100000)

In [17]:
model.save('w2v_project_final.h5')
# Use this to save the weights as an h5 file so that we can just load the model in next time rather than train it

In [18]:
# Uncomment this to load the file in 
model = gensim.models.Word2Vec.load('w2v_project_final.h5')

2018-06-27 16:17:04,747 : INFO : loading Word2Vec object from w2v_project_final.h5
2018-06-27 16:17:06,164 : INFO : loading wv recursively from w2v_project_final.h5.wv.* with mmap=None
2018-06-27 16:17:06,164 : INFO : loading vectors from w2v_project_final.h5.wv.vectors.npy with mmap=None
2018-06-27 16:17:06,425 : INFO : setting ignored attribute vectors_norm to None
2018-06-27 16:17:06,425 : INFO : loading vocabulary recursively from w2v_project_final.h5.vocabulary.* with mmap=None
2018-06-27 16:17:06,425 : INFO : loading trainables recursively from w2v_project_final.h5.trainables.* with mmap=None
2018-06-27 16:17:06,425 : INFO : loading syn1neg from w2v_project_final.h5.trainables.syn1neg.npy with mmap=None
2018-06-27 16:17:06,680 : INFO : setting ignored attribute cum_table to None
2018-06-27 16:17:06,680 : INFO : loaded w2v_project_final.h5


In [30]:
# Test model loaded correctly 
w1 = 'walmart'
model.wv.most_similar(positive=w1)

[('store', 0.6826429963111877),
 ('mcdonalds', 0.6786150932312012),
 ('costco', 0.6713492274284363),
 ('lowes', 0.6474838256835938),
 ('starbucks', 0.6396681070327759),
 ('stores', 0.6294941306114197),
 ('supermarket', 0.6140614748001099),
 ('bookstore', 0.6098101139068604),
 ('sale', 0.5992050170898438),
 ('mall', 0.5950944423675537)]

In [32]:
# Get words most dissimilar to walmart
model.wv.most_similar(negative=[w1])

[('unravelling', 0.3367311954498291),
 ('unraveled', 0.3316466510295868),
 ('spoilerssome', 0.3251951336860657),
 ('chronicler', 0.32300466299057007),
 ('unraveling', 0.32052502036094666),
 ('edric', 0.3175353407859802),
 ('crescendos', 0.31333744525909424),
 ('foretold', 0.31330209970474243),
 ('shevaun', 0.30688461661338806),
 ('unquestionably', 0.3023051917552948)]

In [21]:
# Create dictionary of words and their corresponding weights
w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}

  """Entry point for launching an IPython kernel.


In [20]:
# Create Modeling pipelines that create mean embedding vectors and then run them through the specified classifiers
rf_5 = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove_big)),
              ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True))])
e_trees_5 = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(w2v)),
                    ("Extra Trees Classifier", ExtraTreesClassifier(n_estimators=200, verbose=True))])
bnb_5 = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(w2v)),
                ("Bernoulli Naive Bayesian Classifier", BernoulliNB(verbose))])
lr_5 = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(w2v)),
                ("Logistic Regression", LogisticRegression())])

rf_3 = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove_big)),
              ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True))])
e_trees_3 = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(w2v)),
                    ("Extra Trees Classifier", ExtraTreesClassifier(n_estimators=200))])
bnb_3 = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(w2v)),
                ("Bernoulli Naive Bayesian Classifier", BernoulliNB())])
lr_3 = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(w2v)),
                ("Logistic Regression", LogisticRegression())])


In [33]:
# Create list of models for cross validation

models_5_class = [
    ("Random Forest", rf_5),
    ("Extra Trees", e_trees_5),
    ("Bernoulli Naive Bayes", bnb_5),
    ('Logistic Regression', lr_5)
    
]

unsorted_scores_5_class = [(name, cross_val_score(model, X, y_5_class, cv=3).mean()) for name, model, in models_5_class]


In [25]:
# Sort models by score and create DataFrame of Results
scores_5_class  = sorted(unsorted_scores_5_class, key=lambda x: -x[1])

scores_df_5_class = pd.DataFrame(scores_5_class, columns=["Model", "Mean Score"])
scores_df_5_class

Unnamed: 0,Model,Mean Score
0,Logistic Regression,0.50453
1,Extra Trees,0.461925
2,Random Forest,0.421484
3,Bernoulli Naive Bayes,0.39015


In [26]:
# Repeat of above, but with classifies for 3_class prediction instead of 5_class
models_3_class = [
    ("Random Forest", rf_3),
    ("Extra Trees", e_trees_3),
    ("Bernoulli Naive Bayes", bnb_3),
    ('Logistic Regression', lr_3)
    
]

# Same as above, run pipeline sthrough cross validation and then store results in a dataframe sorted by accuracy descending
unsorted_scores_3_class = [(name, cross_val_score(model, X, y_3_class, cv=3).mean()) for name, model, in models_3_class]
scores_3_class  = sorted(unsorted_scores_3_class, key=lambda x: -x[1])

scores_df_3_class = pd.DataFrame(scores_3_class, columns=["Model", "Mean Score"])
scores_df_3_class

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 11.9min finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    7.1s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 11.5min finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    8.6s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 13.1min finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    7.0s finished


Unnamed: 0,Model,Mean Score
0,Logistic Regression,0.716621
1,Extra Trees,0.686111
2,Random Forest,0.654829
3,Bernoulli Naive Bayes,0.592287
