In [84]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import json
import logging
from tqdm import tqdm_notebook as tqdm
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

### load data

In [15]:
import csv
with open('../dataset/abcnews-date-text.csv', 'r') as f:
    reader = csv.DictReader(f)
    data = [TaggedDocument(words=word_tokenize(row['headline_text']), tags=[i]) for i, row in enumerate(reader)]
    
    raw_data = [row for row in reader]

In [43]:
print(data[:5])

[TaggedDocument(words=['aba', 'decides', 'against', 'community', 'broadcasting', 'licence'], tags=[0]), TaggedDocument(words=['act', 'fire', 'witnesses', 'must', 'be', 'aware', 'of', 'defamation'], tags=[1]), TaggedDocument(words=['a', 'g', 'calls', 'for', 'infrastructure', 'protection', 'summit'], tags=[2]), TaggedDocument(words=['air', 'nz', 'staff', 'in', 'aust', 'strike', 'for', 'pay', 'rise'], tags=[3]), TaggedDocument(words=['air', 'nz', 'strike', 'to', 'affect', 'australian', 'travellers'], tags=[4])]


# Train model

In [63]:
def train_and_get_model(vector_size, tagged_data):
    max_epochs = 10
    
    model = Doc2Vec(vector_size=vector_size, min_count=5, epochs=max_epochs)
    model.build_vocab(tagged_data)
    
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    
    return model

## train model (vector size:25)

In [64]:
model = train_and_get_model(25, data)
model.save('d2v_vec25.model')

2022-01-22 18:36:46,826 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d25,n5,w5,mc5,s0.001,t3)', 'datetime': '2022-01-22T18:36:46.826488', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
2022-01-22 18:36:46,827 : INFO : collecting all words and their counts
2022-01-22 18:36:46,828 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2022-01-22 18:36:46,857 : INFO : PROGRESS: at example #10000, processed 64407 words (2254773/s), 10057 word types, 0 tags
2022-01-22 18:36:46,888 : INFO : PROGRESS: at example #20000, processed 127677 words (2152998/s), 14092 word types, 0 tags
2022-01-22 18:36:46,916 : INFO : PROGRESS: at example #30000, processed 190647 words (2280189/s), 17073 word types, 0 tags
2022-01-22 18:36:46,941 : INFO : PROGRESS: at example #40000, processed 253604 words (2590070/s), 19307 word types, 0 tags
2022-01-2

2022-01-22 18:36:48,644 : INFO : PROGRESS: at example #630000, processed 3879565 words (2640369/s), 60483 word types, 0 tags
2022-01-22 18:36:48,667 : INFO : PROGRESS: at example #640000, processed 3933954 words (2493273/s), 61163 word types, 0 tags
2022-01-22 18:36:48,689 : INFO : PROGRESS: at example #650000, processed 3988612 words (2524355/s), 61851 word types, 0 tags
2022-01-22 18:36:48,712 : INFO : PROGRESS: at example #660000, processed 4045225 words (2585197/s), 62471 word types, 0 tags
2022-01-22 18:36:48,735 : INFO : PROGRESS: at example #670000, processed 4101596 words (2531025/s), 63054 word types, 0 tags
2022-01-22 18:36:48,758 : INFO : PROGRESS: at example #680000, processed 4158093 words (2490159/s), 63672 word types, 0 tags
2022-01-22 18:36:48,784 : INFO : PROGRESS: at example #690000, processed 4214205 words (2277637/s), 64323 word types, 0 tags
2022-01-22 18:36:48,810 : INFO : PROGRESS: at example #700000, processed 4270917 words (2222560/s), 64889 word types, 0 tags


2022-01-22 18:36:50,723 : INFO : deleting the raw counts dictionary of 106758 items
2022-01-22 18:36:50,726 : INFO : sample=0.001 downsamples 20 most-common words
2022-01-22 18:36:50,727 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 7295791.508223403 word corpus (91.9%% of prior 7935096)', 'datetime': '2022-01-22T18:36:50.727631', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2022-01-22 18:36:51,171 : INFO : estimated required memory for 39772 words and 25 dimensions: 395717800 bytes
2022-01-22 18:36:51,171 : INFO : resetting layer weights
2022-01-22 18:36:51,342 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 39772 vocabulary and 25 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-01-22T18:36:51.342464', 'gensim': '4.1.2', 'python': '3.8.1 (default, Ju

2022-01-22 18:37:54,227 : INFO : EPOCH 2 - PROGRESS: at 3.60% examples, 138306 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:37:55,360 : INFO : EPOCH 2 - PROGRESS: at 5.65% examples, 142452 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:37:56,426 : INFO : EPOCH 2 - PROGRESS: at 7.60% examples, 144358 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:37:57,474 : INFO : EPOCH 2 - PROGRESS: at 9.33% examples, 142140 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:37:58,573 : INFO : EPOCH 2 - PROGRESS: at 11.30% examples, 142811 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:37:59,592 : INFO : EPOCH 2 - PROGRESS: at 13.12% examples, 143344 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:38:00,614 : INFO : EPOCH 2 - PROGRESS: at 14.93% examples, 143653 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:38:01,725 : INFO : EPOCH 2 - PROGRESS: at 16.61% examples, 141502 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:38:02,866 : INFO : EPOCH 2 - PROGRESS: at 18.45% examples, 140444 words/s, in_qsize 5,

2022-01-22 18:39:06,496 : INFO : EPOCH 3 - PROGRESS: at 25.11% examples, 140928 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:39:07,711 : INFO : EPOCH 3 - PROGRESS: at 26.94% examples, 139558 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:39:08,713 : INFO : EPOCH 3 - PROGRESS: at 28.51% examples, 138852 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:39:09,776 : INFO : EPOCH 3 - PROGRESS: at 30.21% examples, 138353 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:39:10,849 : INFO : EPOCH 3 - PROGRESS: at 32.16% examples, 138970 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:39:11,927 : INFO : EPOCH 3 - PROGRESS: at 34.00% examples, 138960 words/s, in_qsize 5, out_qsize 0
2022-01-22 18:39:12,955 : INFO : EPOCH 3 - PROGRESS: at 35.42% examples, 137748 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:39:14,041 : INFO : EPOCH 3 - PROGRESS: at 37.22% examples, 137734 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:39:15,098 : INFO : EPOCH 3 - PROGRESS: at 38.99% examples, 137502 words/s, in_qsiz

2022-01-22 18:40:19,508 : INFO : EPOCH 4 - PROGRESS: at 41.10% examples, 131549 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:40:20,636 : INFO : EPOCH 4 - PROGRESS: at 43.09% examples, 131682 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:40:21,691 : INFO : EPOCH 4 - PROGRESS: at 44.75% examples, 131343 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:40:22,699 : INFO : EPOCH 4 - PROGRESS: at 46.41% examples, 131235 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:40:23,761 : INFO : EPOCH 4 - PROGRESS: at 48.19% examples, 131243 words/s, in_qsize 5, out_qsize 0
2022-01-22 18:40:24,852 : INFO : EPOCH 4 - PROGRESS: at 49.97% examples, 131130 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:40:25,853 : INFO : EPOCH 4 - PROGRESS: at 51.67% examples, 131091 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:40:26,923 : INFO : EPOCH 4 - PROGRESS: at 53.44% examples, 130809 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:40:27,942 : INFO : EPOCH 4 - PROGRESS: at 55.32% examples, 131035 words/s, in_qsiz

2022-01-22 18:41:33,391 : INFO : EPOCH 5 - PROGRESS: at 65.60% examples, 135767 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:41:34,418 : INFO : EPOCH 5 - PROGRESS: at 67.49% examples, 136255 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:41:35,448 : INFO : EPOCH 5 - PROGRESS: at 69.24% examples, 136437 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:41:36,503 : INFO : EPOCH 5 - PROGRESS: at 71.09% examples, 136778 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:41:37,617 : INFO : EPOCH 5 - PROGRESS: at 72.93% examples, 136912 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:41:38,684 : INFO : EPOCH 5 - PROGRESS: at 74.78% examples, 137383 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:41:39,711 : INFO : EPOCH 5 - PROGRESS: at 76.56% examples, 137945 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:41:40,715 : INFO : EPOCH 5 - PROGRESS: at 78.29% examples, 138544 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:41:41,756 : INFO : EPOCH 5 - PROGRESS: at 79.99% examples, 138990 words/s, in_qsiz

2022-01-22 18:42:45,905 : INFO : EPOCH 6 - PROGRESS: at 88.96% examples, 142080 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:42:47,022 : INFO : EPOCH 6 - PROGRESS: at 90.52% examples, 142006 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:42:48,068 : INFO : EPOCH 6 - PROGRESS: at 91.96% examples, 141932 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:42:49,205 : INFO : EPOCH 6 - PROGRESS: at 93.54% examples, 141815 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:42:50,311 : INFO : EPOCH 6 - PROGRESS: at 95.41% examples, 142331 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:42:51,340 : INFO : EPOCH 6 - PROGRESS: at 97.07% examples, 142657 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:42:52,455 : INFO : EPOCH 6 - PROGRESS: at 98.94% examples, 143153 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:42:52,937 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-01-22 18:42:52,947 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-01-22 18:42:52,955 : I

2022-01-22 18:43:56,886 : INFO : EPOCH 8 - PROGRESS: at 1.66% examples, 122842 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:43:57,929 : INFO : EPOCH 8 - PROGRESS: at 3.47% examples, 132311 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:43:58,944 : INFO : EPOCH 8 - PROGRESS: at 5.01% examples, 130200 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:44:00,009 : INFO : EPOCH 8 - PROGRESS: at 6.43% examples, 125152 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:44:01,035 : INFO : EPOCH 8 - PROGRESS: at 8.00% examples, 125129 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:44:02,140 : INFO : EPOCH 8 - PROGRESS: at 9.59% examples, 123623 words/s, in_qsize 5, out_qsize 0
2022-01-22 18:44:03,160 : INFO : EPOCH 8 - PROGRESS: at 11.44% examples, 126815 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:44:04,167 : INFO : EPOCH 8 - PROGRESS: at 12.99% examples, 126872 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:44:05,396 : INFO : EPOCH 8 - PROGRESS: at 14.28% examples, 121735 words/s, in_qsize 5, o

2022-01-22 18:45:10,655 : INFO : EPOCH 9 - PROGRESS: at 14.41% examples, 122971 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:45:11,688 : INFO : EPOCH 9 - PROGRESS: at 16.35% examples, 126092 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:45:12,722 : INFO : EPOCH 9 - PROGRESS: at 18.05% examples, 126844 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:45:13,786 : INFO : EPOCH 9 - PROGRESS: at 19.90% examples, 128051 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:45:14,883 : INFO : EPOCH 9 - PROGRESS: at 21.87% examples, 129527 words/s, in_qsize 5, out_qsize 0
2022-01-22 18:45:15,965 : INFO : EPOCH 9 - PROGRESS: at 23.82% examples, 130873 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:45:17,019 : INFO : EPOCH 9 - PROGRESS: at 25.90% examples, 132930 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:45:18,051 : INFO : EPOCH 9 - PROGRESS: at 27.85% examples, 134266 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:45:19,165 : INFO : EPOCH 9 - PROGRESS: at 29.81% examples, 134853 words/s, in_qsiz

2022-01-22 18:46:24,222 : INFO : EPOCH 10 - PROGRESS: at 42.94% examples, 132160 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:46:25,249 : INFO : EPOCH 10 - PROGRESS: at 44.61% examples, 131940 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:46:26,364 : INFO : EPOCH 10 - PROGRESS: at 46.28% examples, 131317 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:46:27,473 : INFO : EPOCH 10 - PROGRESS: at 48.05% examples, 131114 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:46:28,481 : INFO : EPOCH 10 - PROGRESS: at 49.83% examples, 131360 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:46:29,566 : INFO : EPOCH 10 - PROGRESS: at 51.52% examples, 130957 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:46:30,568 : INFO : EPOCH 10 - PROGRESS: at 52.86% examples, 129964 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:46:31,649 : INFO : EPOCH 10 - PROGRESS: at 54.46% examples, 129341 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:46:32,755 : INFO : EPOCH 10 - PROGRESS: at 56.19% examples, 128975 words/s

## train model (vector size:50)

In [65]:
model = train_and_get_model(50, data)
model.save('d2v_vec50.model')

2022-01-22 18:46:59,449 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d50,n5,w5,mc5,s0.001,t3)', 'datetime': '2022-01-22T18:46:59.449158', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
2022-01-22 18:46:59,450 : INFO : collecting all words and their counts
2022-01-22 18:46:59,452 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2022-01-22 18:46:59,484 : INFO : PROGRESS: at example #10000, processed 64407 words (2041978/s), 10057 word types, 0 tags
2022-01-22 18:46:59,519 : INFO : PROGRESS: at example #20000, processed 127677 words (1952956/s), 14092 word types, 0 tags
2022-01-22 18:46:59,547 : INFO : PROGRESS: at example #30000, processed 190647 words (2285886/s), 17073 word types, 0 tags
2022-01-22 18:46:59,581 : INFO : PROGRESS: at example #40000, processed 253604 words (1929963/s), 19307 word types, 0 tags
2022-01-2

2022-01-22 18:47:01,112 : INFO : PROGRESS: at example #630000, processed 3879565 words (2620414/s), 60483 word types, 0 tags
2022-01-22 18:47:01,134 : INFO : PROGRESS: at example #640000, processed 3933954 words (2626183/s), 61163 word types, 0 tags
2022-01-22 18:47:01,156 : INFO : PROGRESS: at example #650000, processed 3988612 words (2542056/s), 61851 word types, 0 tags
2022-01-22 18:47:01,178 : INFO : PROGRESS: at example #660000, processed 4045225 words (2634567/s), 62471 word types, 0 tags
2022-01-22 18:47:01,201 : INFO : PROGRESS: at example #670000, processed 4101596 words (2601182/s), 63054 word types, 0 tags
2022-01-22 18:47:01,224 : INFO : PROGRESS: at example #680000, processed 4158093 words (2518449/s), 63672 word types, 0 tags
2022-01-22 18:47:01,247 : INFO : PROGRESS: at example #690000, processed 4214205 words (2571774/s), 64323 word types, 0 tags
2022-01-22 18:47:01,272 : INFO : PROGRESS: at example #700000, processed 4270917 words (2391521/s), 64889 word types, 0 tags


2022-01-22 18:47:03,126 : INFO : deleting the raw counts dictionary of 106758 items
2022-01-22 18:47:03,129 : INFO : sample=0.001 downsamples 20 most-common words
2022-01-22 18:47:03,130 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 7295791.508223403 word corpus (91.9%% of prior 7935096)', 'datetime': '2022-01-22T18:47:03.130857', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2022-01-22 18:47:03,585 : INFO : estimated required memory for 39772 words and 50 dimensions: 526298000 bytes
2022-01-22 18:47:03,586 : INFO : resetting layer weights
2022-01-22 18:47:03,931 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 39772 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-01-22T18:47:03.931789', 'gensim': '4.1.2', 'python': '3.8.1 (default, Ju

2022-01-22 18:48:07,070 : INFO : EPOCH 2 - PROGRESS: at 9.07% examples, 136458 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:48:08,103 : INFO : EPOCH 2 - PROGRESS: at 10.91% examples, 137845 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:48:09,128 : INFO : EPOCH 2 - PROGRESS: at 12.61% examples, 137521 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:48:10,137 : INFO : EPOCH 2 - PROGRESS: at 14.28% examples, 137489 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:48:11,254 : INFO : EPOCH 2 - PROGRESS: at 16.09% examples, 137070 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:48:12,284 : INFO : EPOCH 2 - PROGRESS: at 17.92% examples, 137897 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:48:13,329 : INFO : EPOCH 2 - PROGRESS: at 19.51% examples, 136590 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:48:14,442 : INFO : EPOCH 2 - PROGRESS: at 21.48% examples, 137285 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:48:15,462 : INFO : EPOCH 2 - PROGRESS: at 23.31% examples, 137997 words/s, in_qsize

2022-01-22 18:49:20,697 : INFO : EPOCH 3 - PROGRESS: at 22.66% examples, 132627 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:49:21,700 : INFO : EPOCH 3 - PROGRESS: at 24.21% examples, 132278 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:49:22,749 : INFO : EPOCH 3 - PROGRESS: at 25.77% examples, 131620 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:49:23,805 : INFO : EPOCH 3 - PROGRESS: at 27.59% examples, 132223 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:49:24,817 : INFO : EPOCH 3 - PROGRESS: at 29.16% examples, 131899 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:49:25,830 : INFO : EPOCH 3 - PROGRESS: at 31.00% examples, 132748 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:49:26,862 : INFO : EPOCH 3 - PROGRESS: at 32.81% examples, 133369 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:49:28,105 : INFO : EPOCH 3 - PROGRESS: at 34.52% examples, 132101 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:49:29,176 : INFO : EPOCH 3 - PROGRESS: at 36.06% examples, 131500 words/s, in_qsiz

2022-01-22 18:50:34,868 : INFO : EPOCH 4 - PROGRESS: at 36.32% examples, 119542 words/s, in_qsize 5, out_qsize 0
2022-01-22 18:50:35,887 : INFO : EPOCH 4 - PROGRESS: at 37.89% examples, 119794 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:50:37,026 : INFO : EPOCH 4 - PROGRESS: at 39.69% examples, 119951 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:50:38,036 : INFO : EPOCH 4 - PROGRESS: at 41.39% examples, 120276 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:50:39,147 : INFO : EPOCH 4 - PROGRESS: at 43.23% examples, 120530 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:50:40,235 : INFO : EPOCH 4 - PROGRESS: at 45.03% examples, 120833 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:50:41,236 : INFO : EPOCH 4 - PROGRESS: at 46.96% examples, 121781 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:50:42,272 : INFO : EPOCH 4 - PROGRESS: at 48.73% examples, 122194 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:50:43,280 : INFO : EPOCH 4 - PROGRESS: at 50.52% examples, 122695 words/s, in_qsiz

2022-01-22 18:51:48,656 : INFO : EPOCH 5 - PROGRESS: at 57.94% examples, 133574 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:51:49,755 : INFO : EPOCH 5 - PROGRESS: at 59.93% examples, 133694 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:51:50,870 : INFO : EPOCH 5 - PROGRESS: at 61.86% examples, 133726 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:51:52,017 : INFO : EPOCH 5 - PROGRESS: at 64.02% examples, 134187 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:51:53,108 : INFO : EPOCH 5 - PROGRESS: at 66.11% examples, 134793 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:51:54,189 : INFO : EPOCH 5 - PROGRESS: at 68.12% examples, 135377 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:51:55,275 : INFO : EPOCH 5 - PROGRESS: at 70.23% examples, 136154 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:51:56,368 : INFO : EPOCH 5 - PROGRESS: at 72.20% examples, 136629 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:51:57,382 : INFO : EPOCH 5 - PROGRESS: at 74.10% examples, 137298 words/s, in_qsiz

2022-01-22 18:53:01,570 : INFO : EPOCH 6 - PROGRESS: at 92.79% examples, 155710 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:53:02,577 : INFO : EPOCH 6 - PROGRESS: at 94.69% examples, 156287 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:53:03,587 : INFO : EPOCH 6 - PROGRESS: at 96.56% examples, 156827 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:53:04,664 : INFO : EPOCH 6 - PROGRESS: at 98.43% examples, 157179 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:53:05,405 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-01-22 18:53:05,466 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-01-22 18:53:05,484 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-01-22 18:53:05,485 : INFO : EPOCH - 6 : training on 8042881 raw words (8522028 effective words) took 54.0s, 157790 effective words/s
2022-01-22 18:53:06,548 : INFO : EPOCH 7 - PROGRESS: at 1.79% examples, 140376 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:53:07,608 :

2022-01-22 18:54:11,870 : INFO : EPOCH 8 - PROGRESS: at 13.90% examples, 135497 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:54:12,909 : INFO : EPOCH 8 - PROGRESS: at 15.70% examples, 136393 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:54:14,029 : INFO : EPOCH 8 - PROGRESS: at 17.54% examples, 136155 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:54:15,047 : INFO : EPOCH 8 - PROGRESS: at 19.24% examples, 136208 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:54:16,055 : INFO : EPOCH 8 - PROGRESS: at 20.96% examples, 136405 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:54:17,056 : INFO : EPOCH 8 - PROGRESS: at 22.53% examples, 135820 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:54:18,130 : INFO : EPOCH 8 - PROGRESS: at 24.21% examples, 135313 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:54:19,207 : INFO : EPOCH 8 - PROGRESS: at 25.90% examples, 134863 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:54:20,350 : INFO : EPOCH 8 - PROGRESS: at 27.85% examples, 135215 words/s, in_qsiz

2022-01-22 18:55:25,091 : INFO : EPOCH 9 - PROGRESS: at 44.89% examples, 148525 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:55:26,150 : INFO : EPOCH 9 - PROGRESS: at 46.96% examples, 148686 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:55:27,252 : INFO : EPOCH 9 - PROGRESS: at 48.86% examples, 148181 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:55:28,370 : INFO : EPOCH 9 - PROGRESS: at 50.92% examples, 148027 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:55:29,374 : INFO : EPOCH 9 - PROGRESS: at 52.86% examples, 147802 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:55:30,433 : INFO : EPOCH 9 - PROGRESS: at 54.74% examples, 147287 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:55:31,435 : INFO : EPOCH 9 - PROGRESS: at 56.62% examples, 147073 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:55:32,466 : INFO : EPOCH 9 - PROGRESS: at 58.66% examples, 147080 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:55:33,556 : INFO : EPOCH 9 - PROGRESS: at 60.62% examples, 146786 words/s, in_qsiz

2022-01-22 18:56:38,849 : INFO : EPOCH 10 - PROGRESS: at 71.96% examples, 132670 words/s, in_qsize 5, out_qsize 0
2022-01-22 18:56:39,850 : INFO : EPOCH 10 - PROGRESS: at 73.64% examples, 132990 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:56:40,858 : INFO : EPOCH 10 - PROGRESS: at 75.23% examples, 133242 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:56:41,858 : INFO : EPOCH 10 - PROGRESS: at 76.78% examples, 133504 words/s, in_qsize 5, out_qsize 0
2022-01-22 18:56:42,904 : INFO : EPOCH 10 - PROGRESS: at 77.97% examples, 132965 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:56:44,002 : INFO : EPOCH 10 - PROGRESS: at 79.36% examples, 132723 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:56:45,090 : INFO : EPOCH 10 - PROGRESS: at 80.31% examples, 131691 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:56:46,148 : INFO : EPOCH 10 - PROGRESS: at 81.88% examples, 131984 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:56:47,177 : INFO : EPOCH 10 - PROGRESS: at 83.45% examples, 132351 words/s

## train model (vector size:100)

In [66]:
model = train_and_get_model(100, data)
model.save('d2v_vec100.model')

2022-01-22 18:57:00,170 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d100,n5,w5,mc5,s0.001,t3)', 'datetime': '2022-01-22T18:57:00.170643', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
2022-01-22 18:57:00,172 : INFO : collecting all words and their counts
2022-01-22 18:57:00,173 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2022-01-22 18:57:00,195 : INFO : PROGRESS: at example #10000, processed 64407 words (3141313/s), 10057 word types, 0 tags
2022-01-22 18:57:00,216 : INFO : PROGRESS: at example #20000, processed 127677 words (3070042/s), 14092 word types, 0 tags
2022-01-22 18:57:00,238 : INFO : PROGRESS: at example #30000, processed 190647 words (3087133/s), 17073 word types, 0 tags
2022-01-22 18:57:00,259 : INFO : PROGRESS: at example #40000, processed 253604 words (3068590/s), 19307 word types, 0 tags
2022-01-

2022-01-22 18:57:01,703 : INFO : PROGRESS: at example #630000, processed 3879565 words (2509223/s), 60483 word types, 0 tags
2022-01-22 18:57:01,726 : INFO : PROGRESS: at example #640000, processed 3933954 words (2404429/s), 61163 word types, 0 tags
2022-01-22 18:57:01,748 : INFO : PROGRESS: at example #650000, processed 3988612 words (2654625/s), 61851 word types, 0 tags
2022-01-22 18:57:01,769 : INFO : PROGRESS: at example #660000, processed 4045225 words (2704701/s), 62471 word types, 0 tags
2022-01-22 18:57:01,791 : INFO : PROGRESS: at example #670000, processed 4101596 words (2693639/s), 63054 word types, 0 tags
2022-01-22 18:57:01,813 : INFO : PROGRESS: at example #680000, processed 4158093 words (2708488/s), 63672 word types, 0 tags
2022-01-22 18:57:01,837 : INFO : PROGRESS: at example #690000, processed 4214205 words (2419384/s), 64323 word types, 0 tags
2022-01-22 18:57:01,861 : INFO : PROGRESS: at example #700000, processed 4270917 words (2490930/s), 64889 word types, 0 tags


2022-01-22 18:57:03,829 : INFO : deleting the raw counts dictionary of 106758 items
2022-01-22 18:57:03,833 : INFO : sample=0.001 downsamples 20 most-common words
2022-01-22 18:57:03,833 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 7295791.508223403 word corpus (91.9%% of prior 7935096)', 'datetime': '2022-01-22T18:57:03.833958', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2022-01-22 18:57:04,263 : INFO : estimated required memory for 39772 words and 100 dimensions: 787458400 bytes
2022-01-22 18:57:04,264 : INFO : resetting layer weights
2022-01-22 18:57:05,005 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 39772 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-01-22T18:57:05.005428', 'gensim': '4.1.2', 'python': '3.8.1 (default, 

2022-01-22 18:58:09,033 : INFO : EPOCH 2 - PROGRESS: at 8.93% examples, 138282 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:58:10,078 : INFO : EPOCH 2 - PROGRESS: at 10.78% examples, 139163 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:58:11,152 : INFO : EPOCH 2 - PROGRESS: at 12.61% examples, 139151 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:58:12,179 : INFO : EPOCH 2 - PROGRESS: at 14.54% examples, 141161 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:58:13,192 : INFO : EPOCH 2 - PROGRESS: at 16.35% examples, 141855 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:58:14,253 : INFO : EPOCH 2 - PROGRESS: at 18.18% examples, 141811 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:58:15,374 : INFO : EPOCH 2 - PROGRESS: at 20.04% examples, 141069 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:58:16,534 : INFO : EPOCH 2 - PROGRESS: at 22.00% examples, 140857 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:58:17,613 : INFO : EPOCH 2 - PROGRESS: at 23.95% examples, 141443 words/s, in_qsize

2022-01-22 18:59:21,941 : INFO : EPOCH 3 - PROGRESS: at 35.42% examples, 143564 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:59:22,941 : INFO : EPOCH 3 - PROGRESS: at 37.22% examples, 143842 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:59:23,976 : INFO : EPOCH 3 - PROGRESS: at 38.99% examples, 143452 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:59:25,098 : INFO : EPOCH 3 - PROGRESS: at 40.96% examples, 143058 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:59:26,138 : INFO : EPOCH 3 - PROGRESS: at 42.80% examples, 142749 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:59:27,215 : INFO : EPOCH 3 - PROGRESS: at 44.61% examples, 142229 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:59:28,382 : INFO : EPOCH 3 - PROGRESS: at 46.68% examples, 142056 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:59:29,473 : INFO : EPOCH 3 - PROGRESS: at 48.73% examples, 142270 words/s, in_qsize 6, out_qsize 0
2022-01-22 18:59:30,537 : INFO : EPOCH 3 - PROGRESS: at 50.52% examples, 141870 words/s, in_qsiz

2022-01-22 19:00:34,956 : INFO : EPOCH 4 - PROGRESS: at 59.37% examples, 133853 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:00:35,979 : INFO : EPOCH 4 - PROGRESS: at 60.75% examples, 133047 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:00:37,008 : INFO : EPOCH 4 - PROGRESS: at 62.26% examples, 132539 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:00:38,174 : INFO : EPOCH 4 - PROGRESS: at 64.16% examples, 132421 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:00:39,200 : INFO : EPOCH 4 - PROGRESS: at 66.11% examples, 133017 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:00:40,209 : INFO : EPOCH 4 - PROGRESS: at 67.99% examples, 133613 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:00:41,252 : INFO : EPOCH 4 - PROGRESS: at 69.85% examples, 134062 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:00:42,272 : INFO : EPOCH 4 - PROGRESS: at 71.59% examples, 134321 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:00:43,381 : INFO : EPOCH 4 - PROGRESS: at 73.53% examples, 134761 words/s, in_qsiz

2022-01-22 19:01:48,679 : INFO : EPOCH 5 - PROGRESS: at 83.45% examples, 137672 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:01:49,708 : INFO : EPOCH 5 - PROGRESS: at 85.24% examples, 138340 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:01:50,757 : INFO : EPOCH 5 - PROGRESS: at 87.00% examples, 138922 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:01:51,784 : INFO : EPOCH 5 - PROGRESS: at 88.76% examples, 139537 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:01:52,821 : INFO : EPOCH 5 - PROGRESS: at 90.62% examples, 140298 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:01:53,850 : INFO : EPOCH 5 - PROGRESS: at 92.38% examples, 140862 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:01:54,902 : INFO : EPOCH 5 - PROGRESS: at 94.27% examples, 141533 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:01:55,929 : INFO : EPOCH 5 - PROGRESS: at 96.04% examples, 142068 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:01:56,972 : INFO : EPOCH 5 - PROGRESS: at 97.91% examples, 142736 words/s, in_qsiz

2022-01-22 19:02:58,406 : INFO : EPOCH - 6 : training on 8042881 raw words (8521556 effective words) took 60.3s, 141255 effective words/s
2022-01-22 19:02:59,462 : INFO : EPOCH 7 - PROGRESS: at 1.66% examples, 131418 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:03:00,482 : INFO : EPOCH 7 - PROGRESS: at 3.60% examples, 143580 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:03:01,532 : INFO : EPOCH 7 - PROGRESS: at 5.39% examples, 142964 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:03:02,546 : INFO : EPOCH 7 - PROGRESS: at 7.08% examples, 141472 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:03:03,611 : INFO : EPOCH 7 - PROGRESS: at 8.93% examples, 141351 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:03:04,659 : INFO : EPOCH 7 - PROGRESS: at 10.78% examples, 141666 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:03:05,693 : INFO : EPOCH 7 - PROGRESS: at 12.61% examples, 142048 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:03:06,710 : INFO : EPOCH 7 - PROGRESS: at 14.15% examples, 140

2022-01-22 19:04:11,884 : INFO : EPOCH 8 - PROGRESS: at 20.43% examples, 140057 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:04:12,946 : INFO : EPOCH 8 - PROGRESS: at 22.27% examples, 140169 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:04:13,994 : INFO : EPOCH 8 - PROGRESS: at 24.08% examples, 140350 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:04:15,100 : INFO : EPOCH 8 - PROGRESS: at 25.77% examples, 139247 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:04:16,235 : INFO : EPOCH 8 - PROGRESS: at 27.59% examples, 138721 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:04:17,251 : INFO : EPOCH 8 - PROGRESS: at 29.03% examples, 137352 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:04:18,258 : INFO : EPOCH 8 - PROGRESS: at 30.73% examples, 137389 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:04:19,322 : INFO : EPOCH 8 - PROGRESS: at 32.55% examples, 137560 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:04:20,328 : INFO : EPOCH 8 - PROGRESS: at 34.26% examples, 137581 words/s, in_qsiz

2022-01-22 19:05:26,038 : INFO : EPOCH 9 - PROGRESS: at 41.53% examples, 130174 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:05:27,242 : INFO : EPOCH 9 - PROGRESS: at 43.65% examples, 130389 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:05:28,404 : INFO : EPOCH 9 - PROGRESS: at 45.73% examples, 130744 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:05:29,413 : INFO : EPOCH 9 - PROGRESS: at 47.64% examples, 131375 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:05:30,478 : INFO : EPOCH 9 - PROGRESS: at 49.41% examples, 131364 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:05:31,595 : INFO : EPOCH 9 - PROGRESS: at 51.21% examples, 131153 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:05:32,653 : INFO : EPOCH 9 - PROGRESS: at 52.86% examples, 130593 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:05:33,699 : INFO : EPOCH 9 - PROGRESS: at 54.60% examples, 130404 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:05:34,806 : INFO : EPOCH 9 - PROGRESS: at 56.48% examples, 130310 words/s, in_qsiz

2022-01-22 19:06:39,241 : INFO : EPOCH 10 - PROGRESS: at 71.71% examples, 148472 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:06:40,312 : INFO : EPOCH 10 - PROGRESS: at 73.64% examples, 148705 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:06:41,386 : INFO : EPOCH 10 - PROGRESS: at 75.67% examples, 149373 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:06:42,388 : INFO : EPOCH 10 - PROGRESS: at 77.54% examples, 150015 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:06:43,404 : INFO : EPOCH 10 - PROGRESS: at 79.36% examples, 150557 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:06:44,455 : INFO : EPOCH 10 - PROGRESS: at 81.25% examples, 151173 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:06:45,466 : INFO : EPOCH 10 - PROGRESS: at 83.14% examples, 151903 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:06:46,475 : INFO : EPOCH 10 - PROGRESS: at 84.93% examples, 152387 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:06:47,484 : INFO : EPOCH 10 - PROGRESS: at 86.80% examples, 153073 words/s

## train  model (vector size:200)

In [67]:
model = train_and_get_model(200, data)
model.save('d2v_vec200.model')

2022-01-22 19:06:57,077 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d200,n5,w5,mc5,s0.001,t3)', 'datetime': '2022-01-22T19:06:57.077848', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
2022-01-22 19:06:57,080 : INFO : collecting all words and their counts
2022-01-22 19:06:57,082 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2022-01-22 19:06:57,108 : INFO : PROGRESS: at example #10000, processed 64407 words (2785501/s), 10057 word types, 0 tags
2022-01-22 19:06:57,137 : INFO : PROGRESS: at example #20000, processed 127677 words (2183768/s), 14092 word types, 0 tags
2022-01-22 19:06:57,162 : INFO : PROGRESS: at example #30000, processed 190647 words (2685784/s), 17073 word types, 0 tags
2022-01-22 19:06:57,191 : INFO : PROGRESS: at example #40000, processed 253604 words (2231440/s), 19307 word types, 0 tags
2022-01-

2022-01-22 19:06:58,732 : INFO : PROGRESS: at example #630000, processed 3879565 words (2312544/s), 60483 word types, 0 tags
2022-01-22 19:06:58,757 : INFO : PROGRESS: at example #640000, processed 3933954 words (2229861/s), 61163 word types, 0 tags
2022-01-22 19:06:58,783 : INFO : PROGRESS: at example #650000, processed 3988612 words (2243119/s), 61851 word types, 0 tags
2022-01-22 19:06:58,810 : INFO : PROGRESS: at example #660000, processed 4045225 words (2117599/s), 62471 word types, 0 tags
2022-01-22 19:06:58,837 : INFO : PROGRESS: at example #670000, processed 4101596 words (2195817/s), 63054 word types, 0 tags
2022-01-22 19:06:58,862 : INFO : PROGRESS: at example #680000, processed 4158093 words (2275954/s), 63672 word types, 0 tags
2022-01-22 19:06:58,888 : INFO : PROGRESS: at example #690000, processed 4214205 words (2227518/s), 64323 word types, 0 tags
2022-01-22 19:06:58,914 : INFO : PROGRESS: at example #700000, processed 4270917 words (2294272/s), 64889 word types, 0 tags


2022-01-22 19:07:00,957 : INFO : deleting the raw counts dictionary of 106758 items
2022-01-22 19:07:00,960 : INFO : sample=0.001 downsamples 20 most-common words
2022-01-22 19:07:00,961 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 7295791.508223403 word corpus (91.9%% of prior 7935096)', 'datetime': '2022-01-22T19:07:00.961405', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2022-01-22 19:07:01,383 : INFO : estimated required memory for 39772 words and 200 dimensions: 1309779200 bytes
2022-01-22 19:07:01,384 : INFO : resetting layer weights
2022-01-22 19:07:02,958 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 39772 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-01-22T19:07:02.958314', 'gensim': '4.1.2', 'python': '3.8.1 (default,

2022-01-22 19:08:05,774 : INFO : EPOCH 2 - PROGRESS: at 15.70% examples, 151330 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:08:06,798 : INFO : EPOCH 2 - PROGRESS: at 17.66% examples, 151944 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:08:07,919 : INFO : EPOCH 2 - PROGRESS: at 19.64% examples, 151037 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:08:08,987 : INFO : EPOCH 2 - PROGRESS: at 21.74% examples, 151910 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:08:10,006 : INFO : EPOCH 2 - PROGRESS: at 23.56% examples, 151511 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:08:11,020 : INFO : EPOCH 2 - PROGRESS: at 25.51% examples, 151973 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:08:12,048 : INFO : EPOCH 2 - PROGRESS: at 27.33% examples, 151515 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:08:13,053 : INFO : EPOCH 2 - PROGRESS: at 29.29% examples, 152008 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:08:14,109 : INFO : EPOCH 2 - PROGRESS: at 31.26% examples, 152007 words/s, in_qsiz

2022-01-22 19:09:19,924 : INFO : EPOCH 3 - PROGRESS: at 29.81% examples, 151542 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:09:20,941 : INFO : EPOCH 3 - PROGRESS: at 31.65% examples, 151288 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:09:21,974 : INFO : EPOCH 3 - PROGRESS: at 33.60% examples, 151495 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:09:23,091 : INFO : EPOCH 3 - PROGRESS: at 35.55% examples, 151026 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:09:24,184 : INFO : EPOCH 3 - PROGRESS: at 37.62% examples, 151309 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:09:25,273 : INFO : EPOCH 3 - PROGRESS: at 39.69% examples, 151197 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:09:26,283 : INFO : EPOCH 3 - PROGRESS: at 41.67% examples, 151165 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:09:27,344 : INFO : EPOCH 3 - PROGRESS: at 43.65% examples, 150804 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:09:28,482 : INFO : EPOCH 3 - PROGRESS: at 45.73% examples, 150393 words/s, in_qsiz

2022-01-22 19:10:32,894 : INFO : EPOCH 4 - PROGRESS: at 64.83% examples, 149057 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:10:34,034 : INFO : EPOCH 4 - PROGRESS: at 66.87% examples, 149052 words/s, in_qsize 5, out_qsize 0
2022-01-22 19:10:35,179 : INFO : EPOCH 4 - PROGRESS: at 68.99% examples, 149291 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:10:36,198 : INFO : EPOCH 4 - PROGRESS: at 70.97% examples, 149720 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:10:37,226 : INFO : EPOCH 4 - PROGRESS: at 72.93% examples, 150098 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:10:38,286 : INFO : EPOCH 4 - PROGRESS: at 74.90% examples, 150549 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:10:39,308 : INFO : EPOCH 4 - PROGRESS: at 76.78% examples, 151103 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:10:40,380 : INFO : EPOCH 4 - PROGRESS: at 78.72% examples, 151677 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:10:41,453 : INFO : EPOCH 4 - PROGRESS: at 80.73% examples, 152455 words/s, in_qsiz

2022-01-22 19:11:45,648 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-01-22 19:11:45,709 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-01-22 19:11:45,734 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-01-22 19:11:45,735 : INFO : EPOCH - 5 : training on 8042881 raw words (8522128 effective words) took 53.8s, 158500 effective words/s
2022-01-22 19:11:46,830 : INFO : EPOCH 6 - PROGRESS: at 1.92% examples, 147253 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:11:47,914 : INFO : EPOCH 6 - PROGRESS: at 3.98% examples, 152108 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:11:48,939 : INFO : EPOCH 6 - PROGRESS: at 5.78% examples, 149940 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:11:50,047 : INFO : EPOCH 6 - PROGRESS: at 7.87% examples, 151099 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:11:51,092 : INFO : EPOCH 6 - PROGRESS: at 9.86% examples, 151632 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:11:52,222 : INF

2022-01-22 19:12:56,382 : INFO : EPOCH 7 - PROGRESS: at 31.13% examples, 152321 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:12:57,438 : INFO : EPOCH 7 - PROGRESS: at 33.08% examples, 152288 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:12:58,612 : INFO : EPOCH 7 - PROGRESS: at 35.16% examples, 151870 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:12:59,661 : INFO : EPOCH 7 - PROGRESS: at 37.22% examples, 152437 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:13:00,665 : INFO : EPOCH 7 - PROGRESS: at 38.99% examples, 151820 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:13:01,703 : INFO : EPOCH 7 - PROGRESS: at 41.10% examples, 152058 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:13:02,723 : INFO : EPOCH 7 - PROGRESS: at 42.94% examples, 151456 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:13:03,751 : INFO : EPOCH 7 - PROGRESS: at 44.89% examples, 151262 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:13:04,793 : INFO : EPOCH 7 - PROGRESS: at 46.82% examples, 150987 words/s, in_qsiz

2022-01-22 19:14:09,375 : INFO : EPOCH 8 - PROGRESS: at 66.87% examples, 148984 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:14:10,393 : INFO : EPOCH 8 - PROGRESS: at 68.87% examples, 149456 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:14:11,432 : INFO : EPOCH 8 - PROGRESS: at 70.72% examples, 149528 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:14:12,478 : INFO : EPOCH 8 - PROGRESS: at 72.69% examples, 149845 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:14:13,499 : INFO : EPOCH 8 - PROGRESS: at 74.55% examples, 150201 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:14:14,559 : INFO : EPOCH 8 - PROGRESS: at 76.56% examples, 150880 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:14:15,566 : INFO : EPOCH 8 - PROGRESS: at 78.50% examples, 151701 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:14:16,620 : INFO : EPOCH 8 - PROGRESS: at 80.42% examples, 152305 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:14:17,701 : INFO : EPOCH 8 - PROGRESS: at 82.30% examples, 152767 words/s, in_qsiz

2022-01-22 19:15:25,390 : INFO : EPOCH 9 - PROGRESS: at 93.01% examples, 137555 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:15:26,403 : INFO : EPOCH 9 - PROGRESS: at 94.90% examples, 138365 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:15:27,417 : INFO : EPOCH 9 - PROGRESS: at 96.76% examples, 139145 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:15:28,439 : INFO : EPOCH 9 - PROGRESS: at 98.63% examples, 139902 words/s, in_qsize 5, out_qsize 0
2022-01-22 19:15:29,069 : INFO : worker thread finished; awaiting finish of 2 more threads
2022-01-22 19:15:29,107 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-01-22 19:15:29,119 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-01-22 19:15:29,120 : INFO : EPOCH - 9 : training on 8042881 raw words (8521230 effective words) took 60.6s, 140661 effective words/s
2022-01-22 19:15:30,161 : INFO : EPOCH 10 - PROGRESS: at 1.79% examples, 143315 words/s, in_qsize 6, out_qsize 0
2022-01-22 19:15:31,171 

##  convert  to json

In [110]:
def conv2json(model, data, output_file_path):
    with open(output_file_path, 'w') as f:
        for i, row in tqdm(enumerate(data)):
            record = {}
            record['id'] = i
            record['original_text'] = row['headline_text']
            record['vector'] = model.docvecs[i].tolist()
            
            f.write("{}\r\n".format(json.dumps(record, ensure_ascii=False)))

In [111]:
# vec_dims = [25, 50, 100, 200]
vec_dims = [25]
file_path = "../rally-config/race_dim_{}/document.json"
for dim in vec_dims:
    model = Doc2Vec.load('d2v_vec{}.model'.format(dim))
    file_name = file_path.format(dim)
    
    conv2json(model, raw_data, file_name)

2022-01-23 17:58:32,978 : INFO : loading Doc2Vec object from d2v_vec25.model
2022-01-23 17:58:33,132 : INFO : loading dv recursively from d2v_vec25.model.dv.* with mmap=None
2022-01-23 17:58:33,133 : INFO : loading vectors from d2v_vec25.model.dv.vectors.npy with mmap=None
2022-01-23 17:58:33,285 : INFO : loading wv recursively from d2v_vec25.model.wv.* with mmap=None
2022-01-23 17:58:33,286 : INFO : setting ignored attribute cum_table to None
2022-01-23 17:58:33,695 : INFO : Doc2Vec lifecycle event {'fname': 'd2v_vec25.model', 'datetime': '2022-01-23T17:58:33.695462', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'loaded'}
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, row in tqdm(enumerate(data)):


0it [00:00, ?it/s]

  record['vector'] = model.docvecs[i].tolist()


In [107]:
def conv2vec_top1000(model, data, output_file_path):
    with open(output_file_path, 'w') as f:
        f.write('[')
        for i, row in tqdm(enumerate(data[:1000])):
            if i != 0:
                f.write(',')
            f.write("{}".format(model.dv[i].tolist()))
        f.write(']')

In [108]:
vec_dims = [25, 50, 100, 200]
file_path = "../rally-config/race_dim_{}/search_vector_top1000.json"
for dim in vec_dims:
    model = Doc2Vec.load('d2v_vec{}.model'.format(dim))
    file_name = file_path.format(dim)
    
    conv2vec_top1000(model, raw_data, file_name)

2022-01-23 16:04:43,064 : INFO : loading Doc2Vec object from d2v_vec25.model
2022-01-23 16:04:43,153 : INFO : loading dv recursively from d2v_vec25.model.dv.* with mmap=None
2022-01-23 16:04:43,154 : INFO : loading vectors from d2v_vec25.model.dv.vectors.npy with mmap=None
2022-01-23 16:04:43,247 : INFO : loading wv recursively from d2v_vec25.model.wv.* with mmap=None
2022-01-23 16:04:43,248 : INFO : setting ignored attribute cum_table to None
2022-01-23 16:04:43,628 : INFO : Doc2Vec lifecycle event {'fname': 'd2v_vec25.model', 'datetime': '2022-01-23T16:04:43.628351', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'loaded'}
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, row in tqdm(enumerate(data[:1000])):


0it [00:00, ?it/s]

2022-01-23 16:04:43,822 : INFO : loading Doc2Vec object from d2v_vec50.model
2022-01-23 16:04:43,895 : INFO : loading dv recursively from d2v_vec50.model.dv.* with mmap=None
2022-01-23 16:04:43,914 : INFO : loading vectors from d2v_vec50.model.dv.vectors.npy with mmap=None
2022-01-23 16:04:44,067 : INFO : loading wv recursively from d2v_vec50.model.wv.* with mmap=None
2022-01-23 16:04:44,068 : INFO : setting ignored attribute cum_table to None
2022-01-23 16:04:44,491 : INFO : Doc2Vec lifecycle event {'fname': 'd2v_vec50.model', 'datetime': '2022-01-23T16:04:44.491951', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'loaded'}
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, row in tqdm(enumerate(data[:1000])):


0it [00:00, ?it/s]

2022-01-23 16:04:44,605 : INFO : loading Doc2Vec object from d2v_vec100.model
2022-01-23 16:04:44,694 : INFO : loading dv recursively from d2v_vec100.model.dv.* with mmap=None
2022-01-23 16:04:44,694 : INFO : loading vectors from d2v_vec100.model.dv.vectors.npy with mmap=None
2022-01-23 16:04:45,111 : INFO : loading wv recursively from d2v_vec100.model.wv.* with mmap=None
2022-01-23 16:04:45,112 : INFO : setting ignored attribute cum_table to None
2022-01-23 16:04:45,564 : INFO : Doc2Vec lifecycle event {'fname': 'd2v_vec100.model', 'datetime': '2022-01-23T16:04:45.564516', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'loaded'}
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, row in tqdm(enumerate(data[:1000])):


0it [00:00, ?it/s]

2022-01-23 16:04:45,758 : INFO : loading Doc2Vec object from d2v_vec200.model
2022-01-23 16:04:45,861 : INFO : loading dv recursively from d2v_vec200.model.dv.* with mmap=None
2022-01-23 16:04:45,862 : INFO : loading vectors from d2v_vec200.model.dv.vectors.npy with mmap=None
2022-01-23 16:04:46,550 : INFO : loading wv recursively from d2v_vec200.model.wv.* with mmap=None
2022-01-23 16:04:46,550 : INFO : setting ignored attribute cum_table to None
2022-01-23 16:04:46,990 : INFO : Doc2Vec lifecycle event {'fname': 'd2v_vec200.model', 'datetime': '2022-01-23T16:04:46.990819', 'gensim': '4.1.2', 'python': '3.8.1 (default, Jun  6 2020, 13:30:44) \n[Clang 11.0.3 (clang-1103.0.32.59)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'loaded'}
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, row in tqdm(enumerate(data[:1000])):


0it [00:00, ?it/s]