In [1]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')

logger.info("testing logger")

2017-12-19 20:41:35 INFO     testing logger


In [2]:
import time
time.ctime()

'Tue Dec 19 20:41:38 2017'

In [3]:
from contextlib import contextmanager
from timeit import default_timer

In [4]:
import tqdm
import re
from sys import getsizeof
import os
import pickle

import gensim
from gensim.models import Doc2Vec
from gensim.models.phrases import Phrases,Phraser
from gensim.models.doc2vec import TaggedDocument

from collections import OrderedDict
import multiprocessing

import numpy as np
from random import sample,shuffle

2017-12-19 20:41:40 INFO     'pattern' package not found; tag filters are not available for English


In [37]:
%%time
if False:
    file_path = []
    directories = list( os.scandir('case-data') )
    for d in directories:
        if d.is_dir():
            file_path.extend( list( os.scandir(d.path)) )

    all_path = [ (file.path,index) for index,file in enumerate(file_path)]
    file_path = all_path
else:
    with open("file_path.pkl",'rb') as f:
        file_path = pickle.load(f)
        all_path  = file_path

CPU times: user 360 ms, sys: 152 ms, total: 512 ms
Wall time: 486 ms


In [26]:
len(file_path)

1357687

In [5]:
%%time
shuffle(file_path)

CPU times: user 1.07 s, sys: 0 ns, total: 1.07 s
Wall time: 1.07 s


In [6]:
def tokenizer(txt):
    txt = re.sub('U. S. C.', 'U.S.C.', txt)
    txt = re.split(" *\)| *\(| |\]|\[|\.\s+|,\s+|;\s*|:\s+|\?\s+|!\s+|'s|\"|'|\.$" , txt)
    ret = [token.lower() for token in txt if len(token)>0]
    return ret

In [7]:
def get_tokens(file):
    with open( file , 'rb' ) as f:
        ret = pickle.load(f)
    text = " ".join(ret['content'] )
    return tokenizer(text)

In [8]:
class corpus_iterator(object):
    
    def __init__(self,size=-1):
        self.size = size
        
    def __iter__(self):
            
        for sz,(i,_) in enumerate(file_path):
            if sz==self.size:
                break
            yield get_tokens(i)

In [10]:
%%time
if 1==0:
    bigram = Phrases( corpus_iterator() , progress_per=10000 )
    bigram_phraser = Phraser(bigram)
    
    
    class bigram_generator(object):
        def __iter__(self):
            for txt in corpus_iterator():
                yield bigram_phraser[txt]
    bigram_phraser.save( "model_bigram_phraser"+time.ctime() )

    tokens_ = bigram_generator()

    trigram = Phrases(tokens_ , progress_per=10000 )
    trigram_phraser = Phraser(trigram)
    trigram_phraser.save( "model_trigram_phraser"+time.ctime() )
    
else:
    bigram_phraser = Phraser.load("model/Final/model_bigram_phraser_2")
    trigram_phraser= Phraser.load("model/Final/model_trigram_phraser_2")
    
class trigram_generator(object):
    def __iter__(self):
        for txt in corpus_iterator():
            yield trigram_phraser[ bigram_phraser[txt] ]

tokens  = trigram_generator()

2017-12-19 03:20:46 INFO     loading Phraser object from model/Final/model_bigram_phraser_2
2017-12-19 03:20:47 INFO     loaded model/Final/model_bigram_phraser_2
2017-12-19 03:20:47 INFO     loading Phraser object from model/Final/model_trigram_phraser_2
2017-12-19 03:20:50 INFO     loaded model/Final/model_trigram_phraser_2


CPU times: user 3.81 s, sys: 536 ms, total: 4.34 s
Wall time: 4.43 s


In [11]:
for _,i in enumerate(tokens):
    print(i)
    if _==10:
        break

['samuel_conti', 'district', 'judge']
['goldberg', 'judge', 'usinor_sacilor', 'unimetal', 'and', 'ascometal', 'usinor_sacilor', 'or', 'usinor', 'and', 'inland_steel', 'bar', 'company', 'inland_steel', 'bring', 'this', 'consolidated', 'action', 'pursuant', 'to', '19', 'u.s.c', '§', '1516a', 'a', '2', 'b', 'iii', '1988', 'to', 'contest', 'the', 'final', 'determination', 'by', 'the', 'international', 'trade', 'administration', 'u.s', 'department', 'of', 'commerce', 'ita', 'or', 'commerce', 'in', 'certain', 'hot_rolled_lead', 'and', 'bismuth_carbon_steel', 'products', 'from', 'france', '58', 'fed.reg', '6221', 'jan', '27', '1993', 'final', 'determination', 'in', 'this', 'consolidated', 'action', 'usinor_sacilor', 'and', 'inland_steel', 'each', 'seek', 'judgment', 'upon', 'the', 'agency', 'record', 'pursuant', 'to', 'uscit', 'rule', '56.2', 'the', 'court', 'exercises', 'its', 'jurisdiction', 'under', '28', 'u.s.c', '§', '1581', 'c', '1988', 'in', 'general', 'terms', 'commerce', 'investigati

In [12]:
class corpus_tagged_generator(object):
    
    def __init__(self,size=-1):
        self.size = size
        
    def __iter__(self):
            
        for sz,(i,index) in enumerate(file_path):
            if sz==self.size:
                break
            yield TaggedDocument( trigram_phraser[ bigram_phraser[get_tokens(i)] ] , [index] )

In [13]:
cores = multiprocessing.cpu_count()

In [13]:
@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start

In [48]:
doc2vec_model = Doc2Vec(dm=1, dm_concat=1, size=300, window=10, min_count=4, workers=cores , compute_loss = True)
doc2vec_model.build_vocab( corpus_tagged_generator(10000),progress_per=1000 )

In [14]:
doc2vec_model = Doc2Vec.load("model/model_doc2vec_Mon Dec 18 19:43:54 2017")

2017-12-19 03:22:15 INFO     loading Doc2Vec object from model/model_doc2vec_Mon Dec 18 19:43:54 2017
2017-12-19 03:22:16 INFO     loading wv recursively from model/model_doc2vec_Mon Dec 18 19:43:54 2017.wv.* with mmap=None
2017-12-19 03:22:16 INFO     loading syn0 from model/model_doc2vec_Mon Dec 18 19:43:54 2017.wv.syn0.npy with mmap=None
2017-12-19 03:22:16 INFO     setting ignored attribute syn0norm to None
2017-12-19 03:22:16 INFO     loading docvecs recursively from model/model_doc2vec_Mon Dec 18 19:43:54 2017.docvecs.* with mmap=None
2017-12-19 03:22:16 INFO     loading doctag_syn0 from model/model_doc2vec_Mon Dec 18 19:43:54 2017.docvecs.doctag_syn0.npy with mmap=None
2017-12-19 03:22:17 INFO     loading doctag_syn0norm from model/model_doc2vec_Mon Dec 18 19:43:54 2017.docvecs.doctag_syn0norm.npy with mmap=None
2017-12-19 03:22:17 INFO     loading syn1neg from model/model_doc2vec_Mon Dec 18 19:43:54 2017.syn1neg.npy with mmap=None
2017-12-19 03:22:18 INFO     setting ignored at

In [50]:
alpha, min_alpha, passes = (0.025, 0.001, 10)
alpha_delta = (alpha - min_alpha) / passes

In [52]:
print("START %s" % time.ctime() )
for epoch in range(passes):
    
    duration = 'na'
    doc2vec_model.alpha, doc2vec_model.min_alpha = alpha, alpha
    
    with elapsed_timer() as elapsed:
        loss=doc2vec_model.train( corpus_tagged_generator(10000), total_examples=doc2vec_model.corpus_count, epochs=1, compute_loss=True )
        print("loss = ",loss)
        duration = '%.1f' % elapsed()
        
    print('Completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta  


print("END %s" % str(time.ctime()))

2017-12-11 09:25:34 INFO     training model with 8 workers on 104001 vocabulary and 6300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10


START Mon Dec 11 09:25:34 2017


2017-12-11 09:25:35 INFO     PROGRESS: at 0.31% examples, 64679 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:25:36 INFO     PROGRESS: at 0.73% examples, 82209 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:25:37 INFO     PROGRESS: at 1.14% examples, 83084 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:25:38 INFO     PROGRESS: at 1.55% examples, 86149 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:25:39 INFO     PROGRESS: at 1.84% examples, 82930 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:25:40 INFO     PROGRESS: at 2.38% examples, 88558 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:25:41 INFO     PROGRESS: at 2.84% examples, 88986 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:25:42 INFO     PROGRESS: at 3.28% examples, 89542 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:25:43 INFO     PROGRESS: at 3.70% examples, 89379 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:25:44 INFO     PROGRESS: at 4.22% examples, 90258 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:25:45 INFO     P

2017-12-11 09:27:03 INFO     PROGRESS: at 38.23% examples, 94930 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:27:04 INFO     PROGRESS: at 38.71% examples, 94892 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:27:05 INFO     PROGRESS: at 39.09% examples, 94804 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:27:06 INFO     PROGRESS: at 39.45% examples, 94587 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:27:07 INFO     PROGRESS: at 39.82% examples, 94485 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:27:08 INFO     PROGRESS: at 40.34% examples, 94582 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:27:09 INFO     PROGRESS: at 40.65% examples, 94508 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:27:10 INFO     PROGRESS: at 41.16% examples, 94659 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:27:11 INFO     PROGRESS: at 41.62% examples, 94607 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:27:12 INFO     PROGRESS: at 42.14% examples, 94624 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:27:13 

2017-12-11 09:28:31 INFO     PROGRESS: at 76.61% examples, 96508 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:28:32 INFO     PROGRESS: at 77.22% examples, 96600 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:28:33 INFO     PROGRESS: at 77.75% examples, 96672 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:28:34 INFO     PROGRESS: at 78.26% examples, 96734 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:28:35 INFO     PROGRESS: at 78.73% examples, 96663 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:28:36 INFO     PROGRESS: at 79.22% examples, 96661 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:28:37 INFO     PROGRESS: at 79.67% examples, 96617 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:28:38 INFO     PROGRESS: at 80.19% examples, 96658 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:28:39 INFO     PROGRESS: at 80.64% examples, 96637 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:28:40 INFO     PROGRESS: at 81.22% examples, 96762 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:28:41 

loss =  22268004
Completed pass 1 at alpha 0.025000


2017-12-11 09:29:24 INFO     PROGRESS: at 0.47% examples, 102279 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:29:25 INFO     PROGRESS: at 0.89% examples, 98904 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:29:26 INFO     PROGRESS: at 1.36% examples, 100831 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:29:27 INFO     PROGRESS: at 1.83% examples, 98023 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:29:28 INFO     PROGRESS: at 2.30% examples, 99979 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:29:29 INFO     PROGRESS: at 2.77% examples, 99530 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:29:30 INFO     PROGRESS: at 3.22% examples, 99559 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:29:31 INFO     PROGRESS: at 3.70% examples, 98987 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:29:32 INFO     PROGRESS: at 4.28% examples, 100957 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:29:33 INFO     PROGRESS: at 4.69% examples, 100245 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:29:34 INFO  

2017-12-11 09:30:51 INFO     PROGRESS: at 41.54% examples, 104239 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:30:52 INFO     PROGRESS: at 42.11% examples, 104256 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:30:53 INFO     PROGRESS: at 42.62% examples, 104259 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:30:54 INFO     PROGRESS: at 43.15% examples, 104351 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:30:55 INFO     PROGRESS: at 43.48% examples, 104205 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:30:56 INFO     PROGRESS: at 44.06% examples, 104368 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:30:57 INFO     PROGRESS: at 44.40% examples, 104192 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:30:58 INFO     PROGRESS: at 44.82% examples, 104171 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:30:59 INFO     PROGRESS: at 45.47% examples, 104252 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:31:01 INFO     PROGRESS: at 45.91% examples, 104247 words/s, in_qsize 0, out_qsize 0
2017-12-11

2017-12-11 09:32:18 INFO     PROGRESS: at 82.55% examples, 104685 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:32:19 INFO     PROGRESS: at 83.19% examples, 104752 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:32:20 INFO     PROGRESS: at 83.69% examples, 104728 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:32:21 INFO     PROGRESS: at 84.15% examples, 104701 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:32:22 INFO     PROGRESS: at 84.67% examples, 104726 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:32:23 INFO     PROGRESS: at 85.38% examples, 104780 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:32:24 INFO     PROGRESS: at 85.86% examples, 104790 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:32:25 INFO     PROGRESS: at 86.41% examples, 104859 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:32:26 INFO     PROGRESS: at 86.87% examples, 104884 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:32:27 INFO     PROGRESS: at 87.39% examples, 104943 words/s, in_qsize 1, out_qsize 0
2017-12-11

loss =  22269054
Completed pass 2 at alpha 0.022600


2017-12-11 09:32:55 INFO     PROGRESS: at 0.47% examples, 102840 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:32:56 INFO     PROGRESS: at 0.96% examples, 104674 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:32:57 INFO     PROGRESS: at 1.48% examples, 106296 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:32:58 INFO     PROGRESS: at 1.83% examples, 101311 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:32:59 INFO     PROGRESS: at 2.27% examples, 101799 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:33:00 INFO     PROGRESS: at 2.76% examples, 100645 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:33:01 INFO     PROGRESS: at 3.22% examples, 101737 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:33:02 INFO     PROGRESS: at 3.76% examples, 102766 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:33:03 INFO     PROGRESS: at 4.30% examples, 104110 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:33:04 INFO     PROGRESS: at 4.79% examples, 103974 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:33:05 

2017-12-11 09:34:21 INFO     PROGRESS: at 42.49% examples, 107606 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:34:22 INFO     PROGRESS: at 42.99% examples, 107621 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:34:23 INFO     PROGRESS: at 43.40% examples, 107479 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:34:25 INFO     PROGRESS: at 43.91% examples, 107571 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:34:26 INFO     PROGRESS: at 44.36% examples, 107573 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:34:27 INFO     PROGRESS: at 44.72% examples, 107581 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:34:28 INFO     PROGRESS: at 45.38% examples, 107668 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:34:29 INFO     PROGRESS: at 45.82% examples, 107528 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:34:30 INFO     PROGRESS: at 46.35% examples, 107311 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:34:31 INFO     PROGRESS: at 46.80% examples, 107298 words/s, in_qsize 1, out_qsize 0
2017-12-11

2017-12-11 09:35:48 INFO     PROGRESS: at 84.72% examples, 107976 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:35:49 INFO     PROGRESS: at 85.44% examples, 108073 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:35:50 INFO     PROGRESS: at 85.91% examples, 108032 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:35:51 INFO     PROGRESS: at 86.41% examples, 108068 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:35:52 INFO     PROGRESS: at 86.87% examples, 108084 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:35:53 INFO     PROGRESS: at 87.38% examples, 108131 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:35:54 INFO     PROGRESS: at 87.93% examples, 108173 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:35:55 INFO     PROGRESS: at 88.39% examples, 108210 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:35:56 INFO     PROGRESS: at 88.88% examples, 108234 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:35:57 INFO     PROGRESS: at 89.38% examples, 108244 words/s, in_qsize 0, out_qsize 0
2017-12-11

loss =  22267022
Completed pass 3 at alpha 0.020200


2017-12-11 09:36:22 INFO     PROGRESS: at 0.31% examples, 65598 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:36:23 INFO     PROGRESS: at 0.79% examples, 90040 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:36:24 INFO     PROGRESS: at 1.36% examples, 102942 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:36:25 INFO     PROGRESS: at 1.83% examples, 100096 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:36:26 INFO     PROGRESS: at 2.30% examples, 102634 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:36:27 INFO     PROGRESS: at 2.81% examples, 102067 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:36:28 INFO     PROGRESS: at 3.31% examples, 102695 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:36:29 INFO     PROGRESS: at 3.89% examples, 105051 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:36:30 INFO     PROGRESS: at 4.42% examples, 104274 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:36:31 INFO     PROGRESS: at 4.82% examples, 103257 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:36:32 IN

2017-12-11 09:37:49 INFO     PROGRESS: at 42.71% examples, 107325 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:37:50 INFO     PROGRESS: at 43.12% examples, 107097 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:37:51 INFO     PROGRESS: at 43.46% examples, 106903 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:37:52 INFO     PROGRESS: at 43.95% examples, 106855 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:37:53 INFO     PROGRESS: at 44.35% examples, 106635 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:37:54 INFO     PROGRESS: at 44.62% examples, 106554 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:37:55 INFO     PROGRESS: at 45.15% examples, 106410 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:37:56 INFO     PROGRESS: at 45.69% examples, 106352 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:37:57 INFO     PROGRESS: at 46.17% examples, 106236 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:37:58 INFO     PROGRESS: at 46.67% examples, 106073 words/s, in_qsize 0, out_qsize 0
2017-12-11

2017-12-11 09:39:15 INFO     PROGRESS: at 84.49% examples, 107222 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:39:16 INFO     PROGRESS: at 85.09% examples, 107249 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:17 INFO     PROGRESS: at 85.62% examples, 107258 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:18 INFO     PROGRESS: at 86.17% examples, 107254 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:19 INFO     PROGRESS: at 86.52% examples, 107169 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:20 INFO     PROGRESS: at 86.99% examples, 107115 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:39:21 INFO     PROGRESS: at 87.44% examples, 107095 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:22 INFO     PROGRESS: at 87.91% examples, 107055 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:23 INFO     PROGRESS: at 88.39% examples, 107099 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:39:24 INFO     PROGRESS: at 88.91% examples, 107200 words/s, in_qsize 0, out_qsize 0
2017-12-11

loss =  22267395
Completed pass 4 at alpha 0.017800


2017-12-11 09:39:49 INFO     PROGRESS: at 0.31% examples, 64501 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:50 INFO     PROGRESS: at 0.77% examples, 86123 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:39:51 INFO     PROGRESS: at 1.28% examples, 91738 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:52 INFO     PROGRESS: at 1.69% examples, 93596 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:53 INFO     PROGRESS: at 2.14% examples, 95319 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:54 INFO     PROGRESS: at 2.74% examples, 99438 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:55 INFO     PROGRESS: at 3.28% examples, 102577 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:56 INFO     PROGRESS: at 3.85% examples, 105201 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:39:57 INFO     PROGRESS: at 4.48% examples, 106857 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:58 INFO     PROGRESS: at 4.87% examples, 105402 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:39:59 INFO  

2017-12-11 09:41:16 INFO     PROGRESS: at 42.84% examples, 107233 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:41:17 INFO     PROGRESS: at 43.39% examples, 107351 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:41:18 INFO     PROGRESS: at 43.91% examples, 107502 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:41:19 INFO     PROGRESS: at 44.35% examples, 107370 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:41:20 INFO     PROGRESS: at 44.72% examples, 107446 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:41:21 INFO     PROGRESS: at 45.38% examples, 107532 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:41:22 INFO     PROGRESS: at 45.82% examples, 107403 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:41:23 INFO     PROGRESS: at 46.35% examples, 107239 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:41:24 INFO     PROGRESS: at 46.80% examples, 107206 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:41:25 INFO     PROGRESS: at 47.12% examples, 107027 words/s, in_qsize 0, out_qsize 0
2017-12-11

2017-12-11 09:42:43 INFO     PROGRESS: at 85.71% examples, 108108 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:42:44 INFO     PROGRESS: at 86.29% examples, 108220 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:42:45 INFO     PROGRESS: at 86.77% examples, 108293 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:42:46 INFO     PROGRESS: at 87.26% examples, 108346 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:42:47 INFO     PROGRESS: at 87.78% examples, 108422 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:42:48 INFO     PROGRESS: at 88.31% examples, 108475 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:42:49 INFO     PROGRESS: at 88.73% examples, 108465 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:42:50 INFO     PROGRESS: at 89.25% examples, 108436 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:42:51 INFO     PROGRESS: at 89.87% examples, 108538 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:42:52 INFO     PROGRESS: at 90.10% examples, 108352 words/s, in_qsize 1, out_qsize 0
2017-12-11

loss =  22269590
Completed pass 5 at alpha 0.015400


2017-12-11 09:43:13 INFO     PROGRESS: at 0.39% examples, 85928 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:43:14 INFO     PROGRESS: at 0.92% examples, 104584 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:43:15 INFO     PROGRESS: at 1.39% examples, 103003 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:43:16 INFO     PROGRESS: at 1.79% examples, 100510 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:43:17 INFO     PROGRESS: at 2.14% examples, 96443 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:43:18 INFO     PROGRESS: at 2.59% examples, 97449 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:43:19 INFO     PROGRESS: at 3.12% examples, 99519 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:43:20 INFO     PROGRESS: at 3.65% examples, 100012 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:43:21 INFO     PROGRESS: at 4.12% examples, 100469 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:43:23 INFO     PROGRESS: at 4.58% examples, 99478 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:43:24 INFO 

2017-12-11 09:44:41 INFO     PROGRESS: at 43.31% examples, 108428 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:44:42 INFO     PROGRESS: at 43.77% examples, 108588 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:44:43 INFO     PROGRESS: at 44.26% examples, 108625 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:44:44 INFO     PROGRESS: at 44.60% examples, 108501 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:44:45 INFO     PROGRESS: at 45.12% examples, 108516 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:44:46 INFO     PROGRESS: at 45.67% examples, 108421 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:44:47 INFO     PROGRESS: at 46.14% examples, 108305 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:44:48 INFO     PROGRESS: at 46.67% examples, 108241 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:44:49 INFO     PROGRESS: at 47.06% examples, 108207 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:44:50 INFO     PROGRESS: at 47.53% examples, 108101 words/s, in_qsize 0, out_qsize 0
2017-12-11

2017-12-11 09:46:07 INFO     PROGRESS: at 86.87% examples, 110149 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:08 INFO     PROGRESS: at 87.39% examples, 110213 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:09 INFO     PROGRESS: at 87.97% examples, 110250 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:10 INFO     PROGRESS: at 88.41% examples, 110306 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:11 INFO     PROGRESS: at 88.93% examples, 110365 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:46:12 INFO     PROGRESS: at 89.42% examples, 110329 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:46:13 INFO     PROGRESS: at 89.87% examples, 110248 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:46:14 INFO     PROGRESS: at 90.10% examples, 110064 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:15 INFO     PROGRESS: at 90.73% examples, 110143 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:16 INFO     PROGRESS: at 91.25% examples, 110115 words/s, in_qsize 0, out_qsize 0
2017-12-11

loss =  22270440
Completed pass 6 at alpha 0.013000


2017-12-11 09:46:35 INFO     PROGRESS: at 0.31% examples, 64844 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:36 INFO     PROGRESS: at 0.72% examples, 81111 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:37 INFO     PROGRESS: at 1.20% examples, 87644 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:38 INFO     PROGRESS: at 1.65% examples, 93989 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:39 INFO     PROGRESS: at 2.09% examples, 94112 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:40 INFO     PROGRESS: at 2.54% examples, 96261 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:42 INFO     PROGRESS: at 3.07% examples, 96073 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:43 INFO     PROGRESS: at 3.62% examples, 98392 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:44 INFO     PROGRESS: at 4.08% examples, 98315 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:46:45 INFO     PROGRESS: at 4.51% examples, 96942 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:46:46 INFO     P

2017-12-11 09:48:02 INFO     PROGRESS: at 42.89% examples, 107693 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:48:03 INFO     PROGRESS: at 43.44% examples, 107769 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:48:04 INFO     PROGRESS: at 43.99% examples, 107972 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:48:05 INFO     PROGRESS: at 44.39% examples, 107947 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:48:07 INFO     PROGRESS: at 44.72% examples, 107800 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:48:08 INFO     PROGRESS: at 45.34% examples, 107787 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:48:09 INFO     PROGRESS: at 45.85% examples, 107716 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:48:10 INFO     PROGRESS: at 46.41% examples, 107727 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:48:11 INFO     PROGRESS: at 46.88% examples, 107666 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:48:12 INFO     PROGRESS: at 47.31% examples, 107653 words/s, in_qsize 1, out_qsize 0
2017-12-11

2017-12-11 09:49:30 INFO     PROGRESS: at 84.12% examples, 106296 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:49:31 INFO     PROGRESS: at 84.72% examples, 106354 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:49:32 INFO     PROGRESS: at 85.40% examples, 106418 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:49:33 INFO     PROGRESS: at 85.91% examples, 106407 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:49:34 INFO     PROGRESS: at 86.41% examples, 106453 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:49:35 INFO     PROGRESS: at 86.90% examples, 106533 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:49:36 INFO     PROGRESS: at 87.35% examples, 106495 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:49:37 INFO     PROGRESS: at 87.78% examples, 106480 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:49:38 INFO     PROGRESS: at 88.27% examples, 106497 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:49:39 INFO     PROGRESS: at 88.70% examples, 106517 words/s, in_qsize 1, out_qsize 0
2017-12-11

loss =  22269533
Completed pass 7 at alpha 0.010600


2017-12-11 09:50:04 INFO     PROGRESS: at 0.42% examples, 89914 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:50:05 INFO     PROGRESS: at 0.80% examples, 92903 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:50:06 INFO     PROGRESS: at 1.36% examples, 100697 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:50:07 INFO     PROGRESS: at 1.83% examples, 97705 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:50:08 INFO     PROGRESS: at 2.37% examples, 102521 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:50:09 INFO     PROGRESS: at 2.87% examples, 103034 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:50:11 INFO     PROGRESS: at 3.37% examples, 103256 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:50:12 INFO     PROGRESS: at 3.85% examples, 104057 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:50:13 INFO     PROGRESS: at 4.39% examples, 103913 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:50:14 INFO     PROGRESS: at 4.79% examples, 102444 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:50:15 INF

2017-12-11 09:51:31 INFO     PROGRESS: at 43.23% examples, 109280 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:51:32 INFO     PROGRESS: at 43.73% examples, 109500 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:51:33 INFO     PROGRESS: at 44.23% examples, 109438 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:51:34 INFO     PROGRESS: at 44.59% examples, 109364 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:51:35 INFO     PROGRESS: at 45.15% examples, 109459 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:51:36 INFO     PROGRESS: at 45.73% examples, 109412 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:51:37 INFO     PROGRESS: at 46.33% examples, 109356 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:51:38 INFO     PROGRESS: at 46.81% examples, 109406 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:51:39 INFO     PROGRESS: at 47.24% examples, 109455 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:51:40 INFO     PROGRESS: at 47.76% examples, 109564 words/s, in_qsize 0, out_qsize 0
2017-12-11

2017-12-11 09:52:57 INFO     PROGRESS: at 86.87% examples, 110699 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:52:58 INFO     PROGRESS: at 87.32% examples, 110639 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:52:59 INFO     PROGRESS: at 87.82% examples, 110687 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:00 INFO     PROGRESS: at 88.31% examples, 110705 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:01 INFO     PROGRESS: at 88.79% examples, 110720 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:53:02 INFO     PROGRESS: at 89.25% examples, 110731 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:03 INFO     PROGRESS: at 89.83% examples, 110771 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:04 INFO     PROGRESS: at 90.09% examples, 110575 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:05 INFO     PROGRESS: at 90.65% examples, 110579 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:06 INFO     PROGRESS: at 91.24% examples, 110497 words/s, in_qsize 0, out_qsize 0
2017-12-11

loss =  22271399
Completed pass 8 at alpha 0.008200


2017-12-11 09:53:26 INFO     PROGRESS: at 0.47% examples, 103202 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:27 INFO     PROGRESS: at 1.02% examples, 112831 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:28 INFO     PROGRESS: at 1.55% examples, 113574 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:29 INFO     PROGRESS: at 1.99% examples, 111157 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:30 INFO     PROGRESS: at 2.53% examples, 110611 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:53:31 INFO     PROGRESS: at 3.14% examples, 114060 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:32 INFO     PROGRESS: at 3.72% examples, 114899 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:53:33 INFO     PROGRESS: at 4.28% examples, 114876 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:34 INFO     PROGRESS: at 4.71% examples, 113394 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:53:35 INFO     PROGRESS: at 5.21% examples, 111983 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:53:36 

2017-12-11 09:54:52 INFO     PROGRESS: at 43.31% examples, 109527 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:54:53 INFO     PROGRESS: at 43.73% examples, 109589 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:54:54 INFO     PROGRESS: at 44.26% examples, 109629 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:54:55 INFO     PROGRESS: at 44.60% examples, 109487 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:54:56 INFO     PROGRESS: at 45.19% examples, 109631 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:54:57 INFO     PROGRESS: at 45.76% examples, 109694 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:54:58 INFO     PROGRESS: at 46.40% examples, 109731 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:54:59 INFO     PROGRESS: at 46.88% examples, 109778 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:55:00 INFO     PROGRESS: at 47.33% examples, 109797 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:55:01 INFO     PROGRESS: at 47.91% examples, 109887 words/s, in_qsize 0, out_qsize 0
2017-12-11

2017-12-11 09:56:19 INFO     PROGRESS: at 86.23% examples, 109592 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:20 INFO     PROGRESS: at 86.63% examples, 109547 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:21 INFO     PROGRESS: at 87.09% examples, 109497 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:56:22 INFO     PROGRESS: at 87.50% examples, 109373 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:56:23 INFO     PROGRESS: at 87.97% examples, 109297 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:24 INFO     PROGRESS: at 88.36% examples, 109225 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:25 INFO     PROGRESS: at 88.88% examples, 109280 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:26 INFO     PROGRESS: at 89.38% examples, 109329 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:27 INFO     PROGRESS: at 89.91% examples, 109290 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:28 INFO     PROGRESS: at 90.21% examples, 109158 words/s, in_qsize 0, out_qsize 0
2017-12-11

loss =  22268888
Completed pass 9 at alpha 0.005800


2017-12-11 09:56:49 INFO     PROGRESS: at 0.31% examples, 61897 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:50 INFO     PROGRESS: at 0.79% examples, 87664 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:56:51 INFO     PROGRESS: at 1.36% examples, 101396 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:52 INFO     PROGRESS: at 1.83% examples, 100421 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:53 INFO     PROGRESS: at 2.38% examples, 105186 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:54 INFO     PROGRESS: at 2.98% examples, 107470 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:55 INFO     PROGRESS: at 3.59% examples, 110263 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:57 INFO     PROGRESS: at 4.10% examples, 109695 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:58 INFO     PROGRESS: at 4.58% examples, 108170 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:56:59 INFO     PROGRESS: at 5.15% examples, 109071 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:57:00 IN

2017-12-11 09:58:16 INFO     PROGRESS: at 42.75% examples, 108073 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:58:17 INFO     PROGRESS: at 43.34% examples, 108214 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:58:18 INFO     PROGRESS: at 43.73% examples, 108181 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:58:19 INFO     PROGRESS: at 44.18% examples, 108014 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:58:20 INFO     PROGRESS: at 44.53% examples, 107881 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:58:21 INFO     PROGRESS: at 44.95% examples, 107691 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:58:22 INFO     PROGRESS: at 45.53% examples, 107663 words/s, in_qsize 1, out_qsize 0
2017-12-11 09:58:23 INFO     PROGRESS: at 46.01% examples, 107679 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:58:24 INFO     PROGRESS: at 46.60% examples, 107663 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:58:25 INFO     PROGRESS: at 47.06% examples, 107801 words/s, in_qsize 0, out_qsize 0
2017-12-11

2017-12-11 09:59:42 INFO     PROGRESS: at 84.93% examples, 108382 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:59:43 INFO     PROGRESS: at 85.50% examples, 108336 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:59:44 INFO     PROGRESS: at 86.02% examples, 108362 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:59:45 INFO     PROGRESS: at 86.47% examples, 108380 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:59:46 INFO     PROGRESS: at 86.95% examples, 108378 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:59:47 INFO     PROGRESS: at 87.44% examples, 108411 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:59:48 INFO     PROGRESS: at 88.03% examples, 108463 words/s, in_qsize 0, out_qsize 1
2017-12-11 09:59:49 INFO     PROGRESS: at 88.48% examples, 108546 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:59:50 INFO     PROGRESS: at 88.99% examples, 108619 words/s, in_qsize 0, out_qsize 0
2017-12-11 09:59:51 INFO     PROGRESS: at 89.51% examples, 108632 words/s, in_qsize 0, out_qsize 0
2017-12-11

loss =  22269655
Completed pass 10 at alpha 0.003400
END Mon Dec 11 10:00:13 2017


In [31]:
doc_id = np.random.randint(doc2vec_model.docvecs.count)
doc_id

300357

In [32]:
sims = doc2vec_model.docvecs.most_similar(doc_id, topn=3) 
sims

[(478196, 0.8871992230415344),
 (1331101, 0.8761962652206421),
 (1017814, 0.8719441294670105)]

In [33]:
print( "actual document ")
with open( all_path[doc_id][0] ,'rb' ) as f:
        print( pickle.load(f)['url'] )
print( "similar documents are")
for i,score in sims:
    with open( all_path[i][0] ,'rb' ) as f:
        print( pickle.load(f)['url'] , score )

actual document 
https://www.leagle.com/decision/194141538fsupp3771310
similar documents are
https://www.leagle.com/decision/1950763181f2d5821560 0.8871992230415344
https://www.leagle.com/decision/197341232citcm3801326 0.8761962652206421
https://www.leagle.com/decision/20051648360fsupp2d128811519 0.8719441294670105


In [71]:
doc2vec_model.save("model/model_doc2vec_" + str(time.ctime()))

2017-12-11 13:06:36 INFO     saving Doc2Vec object under model/model_doc2vec_Mon Dec 11 13:06:36 2017, separately None
2017-12-11 13:06:36 INFO     storing np array 'syn0' to model/model_doc2vec_Mon Dec 11 13:06:36 2017.wv.syn0.npy
2017-12-11 13:06:36 INFO     not storing attribute syn0norm
2017-12-11 13:06:36 INFO     storing np array 'syn1neg' to model/model_doc2vec_Mon Dec 11 13:06:36 2017.syn1neg.npy
2017-12-11 13:06:37 INFO     not storing attribute cum_table
2017-12-11 13:06:38 INFO     saved model/model_doc2vec_Mon Dec 11 13:06:36 2017


In [77]:
doc_id = np.random.randint(doc2v.docvecs.count)
doc_id

947

In [79]:
sims = doc2v.docvecs.most_similar(doc_id, topn=3) 
sims

[(942, 0.8239321708679199),
 (977, 0.8048499822616577),
 (442, 0.8022733330726624)]

In [78]:
print( "actual document ")
with open( all_path[doc_id][0] ,'rb' ) as f:
        print( pickle.load(f)['url'] )
print( "similar documents are")
for i,score in sims:
    with open( all_path[i][0] ,'rb' ) as f:
        print( pickle.load(f)['url'] , score )

2017-12-11 13:14:46 INFO     precomputing L2-norms of doc weight vectors


actual document 
https://www.leagle.com/decision/19871022659fsupp3631936
similar documents are
https://www.leagle.com/decision/19981694993fsupp70111596 0.8239321708679199
https://www.leagle.com/decision/19811363521fsupp84211267 0.8048499822616577
https://www.leagle.com/decision/19841967594fsupp137311797 0.8022733330726624


In [26]:
import random
from IPython.display import HTML

while True:
    word = random.choice(doc2vec_model.wv.index2word)
    if doc2vec_model.wv.vocab[word].count > 10:
        break
word = 'cocaine'
# or uncomment below line, to just pick a word from the relevant domain:
#word = 'comedy/drama'
similars_per_model = [str(model.most_similar(word, topn=20)).replace('), ','),<br>\n') for model in [doc2vec_model]]
similar_table = ("<table><tr><th>" +
    "</th><th>".join([str(model) for model in [doc2vec_model]]) + 
    "</th></tr><tr><td>" +
    "</td><td>".join(similars_per_model) +
    "</td></tr></table>")
print("most similar words for '%s' (%d occurences)" % (word, doc2vec_model.wv.vocab[word].count))
HTML(similar_table)

2017-12-19 03:30:36 INFO     precomputing L2-norms of word weight vectors


most similar words for 'cocaine' (168046 occurences)


"Doc2Vec(dm/c,d64,n5,w16,mc50,s0.001,t16)"
"[('methamphetamine', 0.8328279852867126), ('marijuana', 0.8260446190834045), ('heroin', 0.8131932020187378), ('crack_cocaine', 0.7398765683174133), ('drugs', 0.7237692475318909), ('grams', 0.7220239043235779), ('crack', 0.7128248810768127), ('firearm', 0.7010208368301392), ('narcotics', 0.6627141833305359), ('ammunition', 0.6568285226821899), ('robbery', 0.6552248001098633), ('contraband', 0.6496333479881287), ('gun', 0.6460971832275391), ('trafficking', 0.6436505913734436), ('cocaine_base', 0.6436289548873901), ('murder', 0.6420984268188477), ('firearms', 0.6318500638008118), ('child_pornography', 0.624148428440094), ('stolen', 0.6223495602607727), ('importation', 0.6134322881698608)]"


In [27]:
doc2vec_model.most_similar_cosmul( ['crack_cocaine','841',] , topn = 20)

[('cocaine', 0.6530447006225586),
 ('grams', 0.646716296672821),
 ('1261_1270', 0.6349310278892517),
 ('methamphetamine', 0.6191443204879761),
 ('marijuana', 0.5994399189949036),
 ('924', 0.5964365005493164),
 ('liner_yankelevitz', 0.5931196212768555),
 ('2k2.1', 0.5863490104675293),
 ('shakespeare', 0.5781325697898865),
 ('cocaine_base', 0.5768197178840637),
 ('mallery', 0.5762568712234497),
 ('2010.1', 0.5708507299423218),
 ('firearm', 0.5648546814918518),
 ('robbery', 0.5648131370544434),
 ('crime', 0.562721848487854),
 ('photostatic_copies', 0.5585120320320129),
 ('deland', 0.554549515247345),
 ('heroin', 0.5540895462036133),
 ('938', 0.5535390377044678),
 ('shenango', 0.5517335534095764)]

In [48]:
original = 355839
with open( all_path[original][0] ,'rb' ) as f:
    print( pickle.load(f)['url'] )
print("\n\n*******\n\n")

for i,score in doc2vec_model.docvecs.most_similar(positive=original):
    with open( all_path[i][0] ,'rb' ) as f:
        print( pickle.load(f)['url'] , score )

https://www.leagle.com/decision/1987978482us4961958


*******


https://www.leagle.com/decision/19871494659fsupp83511346 0.80198073387146
https://www.leagle.com/decision/20081334576bkfsupp2d75811272 0.801685631275177
https://www.leagle.com/decision/2006886414fsupp2d4721848 0.7938035726547241
https://www.leagle.com/decision/infdco20140829j09 0.7920225262641907
https://www.leagle.com/decision/inadvfdco110310000015 0.7896073460578918
https://www.leagle.com/decision/1989952723fsupp2291916 0.7890070676803589
https://www.leagle.com/decision/19981577999fsupp57811524 0.7882194519042969
https://www.leagle.com/decision/20041234313fsupp2d92111141 0.7845861315727234
https://www.leagle.com/decision/20041267347fsupp2d92011169 0.780868649482727
https://www.leagle.com/decision/1986825645fsupp1801771 0.7770735025405884


In [47]:
for index,(i,j) in enumerate(all_path):
    if( 'BOOTH v. MARYLAND' in i ):
        print(index , i)

166087 case-data/F.Supp - Federal Supplement Reports/207 F.Supp.2d 394 - BOOTH v. MARYLAND, United States District Court, D. Maryland..pickle
355839 case-data/U.S. - U.S. Official Reports/482 U.S. 496 - BOOTH v. MARYLAND, Supreme Court of United States..pickle
1008533 case-data/F.Supp.2d - Federal Supplement Reporters 2d series/207 F.Supp.2d 394 - BOOTH v. MARYLAND, United States District Court, D. Maryland..pickle
1146333 case-data/F.3d - Federal Reporter 3d Series/327 F.3d 377 - BOOTH v. MARYLAND, United States Court of Appeals, Fourth Circuit..pickle


In [45]:
all_path[0]

("case-data/F.Supp - Federal Supplement Reports/685 F.Supp.2d 1044 - HAGAN v. CALIFORNIA PHYSICIANS' SERVICE, United States District Court, N.D. California..pickle",
 0)