In [1]:
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py
# https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import os
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import multiprocessing
cores = multiprocessing.cpu_count()
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim.test.utils import get_tmpfile
from sklearn.metrics import accuracy_score, f1_score
import collections


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nsuse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nsuse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2023-05-01 13:22:27,759 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2023-05-01 13:22:27,761 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2023-05-01 13:22:27,762 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)", 'datetime': '2023-05-01T13:22:27.762886', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [3]:
# Number of logical CPUs reported by the OS; passed to Doc2Vec as `workers`
# in the training cells of this notebook.
cores = multiprocessing.cpu_count()
cores

20

In [4]:
# Load the non-EA papers (the negative class). Each line of the file is
# expected to hold one record in the form "<bibcode>,<text>".
completed_non_ea_bibs = []
completed_non_ea_text = []
with open('.//data//non_ea_papers_text.txt', 'r', encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing with an IndexError.
            continue
        # Split on the FIRST comma only: the bibcode is the leading field and
        # everything after it is the paper text, so a comma inside the text
        # can no longer truncate it.
        bib, sep, text = line.partition(",")
        if not sep:
            raise ValueError(
                f"non_ea_papers_text.txt line {line_no}: expected '<bib>,<text>' but found no comma"
            )
        completed_non_ea_bibs.append(bib.strip())
        completed_non_ea_text.append(text.strip())
n_neg = len(completed_non_ea_bibs)

In [5]:
# Load the EA papers (the positive class). Each line of the file is expected
# to hold one record in the form "<bibcode>,<text>".
completed_ea_bibs = []
completed_ea_text = []
with open('.//data//ea_papers_text.txt', 'r', encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing with an IndexError.
            continue
        # Split on the FIRST comma only: the bibcode is the leading field and
        # everything after it is the paper text, so a comma inside the text
        # can no longer truncate it.
        bib, sep, text = line.partition(",")
        if not sep:
            raise ValueError(
                f"ea_papers_text.txt line {line_no}: expected '<bib>,<text>' but found no comma"
            )
        completed_ea_bibs.append(bib.strip())
        completed_ea_text.append(text.strip())
n_pos = len(completed_ea_bibs)

In [6]:
# Positive examples (EA papers) carry label 1, negative examples (non-EA
# papers) carry label 0.
pos_training_corpus = pd.DataFrame(
    {
        "text": completed_ea_text,
        "bib": completed_ea_bibs,
        "label": np.full(len(completed_ea_bibs), 1),
    }
)
neg_training_corpus = pd.DataFrame(
    {
        "text": completed_non_ea_text,
        "bib": completed_non_ea_bibs,
        "label": np.full(len(completed_non_ea_bibs), 0),
    }
)

# Stack both classes, shuffle with a fixed seed, renumber the rows 0..N-1 and
# expose that row number as an explicit "index" column (used later as the
# Doc2Vec document tag).
training_corpus = (
    pd.concat([pos_training_corpus, neg_training_corpus])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .assign(index=lambda frame: frame.index)
)
training_corpus

Unnamed: 0,text,bib,label,index
0,arxivastroph0403324v4 17 jul 2022dark matter ...,2004LNP...653..141S,0,0
1,mon not r astron soc 000 19 0000 printed 20 au...,2013MNRAS.428.1077S,1,1
2,mon not r astron soc 427 3435–3467 2012 doi101...,2012MNRAS.427.3435A,0,2
3,astronomy astrophysics manuscript no hd20v2 ce...,2019arXiv190808754S,1,3
4,the microphysics of collisionless shock wavesa...,2016RPPh...79d6901M,0,4
...,...,...,...,...
7558,mon not r astron soc 360 869–891 2005 doi10111...,2005MNRAS.360..869L,0,7558
7559,arxivastroph0508228v1 10 aug 2005adarkjetdomi...,2005Natur.436..819G,0,7559
7560,1accepted for publication in nature20 october ...,2006Natur.444.1044G,0,7560
7561,arxiv11045230v3 astrophep 7 jul 2011accepted...,2011ApJ...737L..18W,1,7561


In [9]:
# Save the bibcodes in their shuffled corpus order (one per row, no index
# column) to data/training_bibs.csv.
training_corpus["bib"].to_csv(".//data//training_bibs.csv", index=False)

In [25]:
# Class sizes (negatives first, then positives), followed by the
# negative-to-positive ratio as the cell's displayed value.
print(n_neg, n_pos, sep="\n")
n_neg / n_pos

4025
3538


1.137648388920294

In [27]:
# Positive-to-negative class ratio (the inverse of n_neg/n_pos).
n_pos/n_neg

0.8790062111801242

In [6]:
# (Disabled) Optional class balancing: down-sample the negatives to n_pos rows, then reshuffle.
# neg_training_corpus_subset = neg_training_corpus.sample(n=n_pos, random_state=42).reset_index(drop=True)

# balanced_training_corpus = pd.concat([pos_training_corpus, neg_training_corpus_subset])
# balanced_training_corpus = balanced_training_corpus.sample(frac=1, random_state=42).reset_index(drop=True)

# len(balanced_training_corpus[balanced_training_corpus["label"] == 0].index) == len(balanced_training_corpus[balanced_training_corpus["label"] == 1].index)

In [7]:
# NLTK's English stop-word list; its entries are lower case.
stopWords = set(stopwords.words('english'))

def tokenize_text(text):
    """Split ``text`` into lower-cased word tokens suitable for Doc2Vec.

    The text is sentence-tokenized and then word-tokenized with NLTK. Tokens
    shorter than two characters and English stop words are dropped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order.
    """
    tokens = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # Lower-case BEFORE the stop-word test: the stop-word set is lower
            # case, so testing the raw token would let "The"/"And" through and
            # then emit them as "the"/"and".
            word = word.lower()
            if len(word) < 2 or word in stopWords:
                continue
            tokens.append(word)
    return tokens

# punctuation was removed and text was set to lower case earlier (Convert PDFs to Text.ipynb)

In [8]:
# One TaggedDocument per paper: the tokenized text, tagged with the paper's
# integer "index" value so a document vector can be mapped back to its row.
training_corpus_tagged = [
    TaggedDocument(words=tokenize_text(text), tags=[tag])
    for text, tag in zip(training_corpus["text"], training_corpus["index"])
]

In [10]:
# Hold out 10% of the tagged documents for evaluation; the fixed random_state
# keeps the split reproducible.
train_subset_tagged, test_subset_tagged = train_test_split(training_corpus_tagged, test_size=0.1, random_state=42)

In [11]:
# Background on the `negative` / `sample` parameters:
# https://stackoverflow.com/questions/69762635/what-are-the-negative-sample-parameters

# dm=0 selects the PV-DBOW training mode (the log line reports "dbow").
model = Doc2Vec(
    dm=0,
    vector_size=600,
    negative=5,
    sample=0,
    workers=cores,
    epochs=30,
)

# Build the vocabulary from the training subset only, then fit on that same
# subset for the configured number of epochs.
model.build_vocab(train_subset_tagged)
model.train(
    train_subset_tagged,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

2023-02-23 11:02:30,597 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dbow,d600,n5,mc5,t20)', 'datetime': '2023-02-23T11:02:30.597754', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
2023-02-23 11:02:30,601 : INFO : collecting all words and their counts
2023-02-23 11:02:30,601 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\nsuse\anaconda3\lib\logging\__init__.py", line 1083, in emit
    msg = self.format(record)
  File "C:\Users\nsuse\anaconda3\lib\logging\__init__.py", line 927, in format
    return fmt.format(record)
  File "C:\Users\nsuse\anaconda3\lib\logging\__init__.py", line 663, in format
    record.message = record.getMessage()
  File "C:\Users\nsuse\anaconda3\lib\logging\__init__.py", line 367, in getMessage
    msg = msg % self.args
TypeError: not enough

2023-02-23 11:03:14,820 : INFO : EPOCH 1 - PROGRESS: at 47.94% examples, 1211556 words/s, in_qsize 40, out_qsize 0
2023-02-23 11:03:15,845 : INFO : EPOCH 1 - PROGRESS: at 50.63% examples, 1199717 words/s, in_qsize 40, out_qsize 29
2023-02-23 11:03:16,926 : INFO : EPOCH 1 - PROGRESS: at 54.38% examples, 1212519 words/s, in_qsize 40, out_qsize 19
2023-02-23 11:03:17,879 : INFO : EPOCH 1 - PROGRESS: at 57.67% examples, 1213822 words/s, in_qsize 40, out_qsize 6
2023-02-23 11:03:18,884 : INFO : EPOCH 1 - PROGRESS: at 61.03% examples, 1215330 words/s, in_qsize 37, out_qsize 4
2023-02-23 11:03:19,904 : INFO : EPOCH 1 - PROGRESS: at 64.18% examples, 1216128 words/s, in_qsize 40, out_qsize 7
2023-02-23 11:03:20,908 : INFO : EPOCH 1 - PROGRESS: at 67.57% examples, 1218583 words/s, in_qsize 39, out_qsize 0
2023-02-23 11:03:21,910 : INFO : EPOCH 1 - PROGRESS: at 70.63% examples, 1215518 words/s, in_qsize 40, out_qsize 9
2023-02-23 11:03:22,927 : INFO : EPOCH 1 - PROGRESS: at 74.11% examples, 12199

2023-02-23 11:04:01,820 : INFO : worker thread finished; awaiting finish of 8 more threads
2023-02-23 11:04:01,820 : INFO : worker thread finished; awaiting finish of 7 more threads
2023-02-23 11:04:01,824 : INFO : worker thread finished; awaiting finish of 6 more threads
2023-02-23 11:04:01,824 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 11:04:01,824 : INFO : worker thread finished; awaiting finish of 4 more threads
2023-02-23 11:04:01,824 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-02-23 11:04:01,840 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-02-23 11:04:01,844 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-02-23 11:04:01,848 : INFO : worker thread finished; awaiting finish of 0 more threads
2023-02-23 11:04:01,852 : INFO : EPOCH - 2 : training on 47823808 raw words (37665975 effective words) took 31.2s, 1208843 effective words/s
2023-02-23 11:04:02,887 : INFO : EPOCH 3

2023-02-23 11:04:50,314 : INFO : EPOCH 4 - PROGRESS: at 50.26% examples, 1151845 words/s, in_qsize 39, out_qsize 0
2023-02-23 11:04:51,319 : INFO : EPOCH 4 - PROGRESS: at 53.45% examples, 1153708 words/s, in_qsize 39, out_qsize 0
2023-02-23 11:04:52,325 : INFO : EPOCH 4 - PROGRESS: at 56.46% examples, 1153862 words/s, in_qsize 12, out_qsize 0
2023-02-23 11:04:53,334 : INFO : EPOCH 4 - PROGRESS: at 59.64% examples, 1154913 words/s, in_qsize 38, out_qsize 1
2023-02-23 11:04:54,381 : INFO : EPOCH 4 - PROGRESS: at 62.52% examples, 1152546 words/s, in_qsize 36, out_qsize 3
2023-02-23 11:04:55,434 : INFO : EPOCH 4 - PROGRESS: at 65.72% examples, 1152405 words/s, in_qsize 40, out_qsize 11
2023-02-23 11:04:56,451 : INFO : EPOCH 4 - PROGRESS: at 68.95% examples, 1154459 words/s, in_qsize 40, out_qsize 3
2023-02-23 11:04:57,451 : INFO : EPOCH 4 - PROGRESS: at 72.04% examples, 1155478 words/s, in_qsize 37, out_qsize 0
2023-02-23 11:04:58,469 : INFO : EPOCH 4 - PROGRESS: at 75.10% examples, 115495

2023-02-23 11:05:38,322 : INFO : worker thread finished; awaiting finish of 10 more threads
2023-02-23 11:05:38,323 : INFO : worker thread finished; awaiting finish of 9 more threads
2023-02-23 11:05:38,324 : INFO : worker thread finished; awaiting finish of 8 more threads
2023-02-23 11:05:38,324 : INFO : worker thread finished; awaiting finish of 7 more threads
2023-02-23 11:05:38,326 : INFO : worker thread finished; awaiting finish of 6 more threads
2023-02-23 11:05:38,327 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 11:05:38,329 : INFO : worker thread finished; awaiting finish of 4 more threads
2023-02-23 11:05:38,330 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-02-23 11:05:38,334 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-02-23 11:05:38,335 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-02-23 11:05:38,338 : INFO : worker thread finished; awaiting finish of 0 more thread

2023-02-23 11:06:25,784 : INFO : EPOCH 7 - PROGRESS: at 54.39% examples, 1178026 words/s, in_qsize 40, out_qsize 2
2023-02-23 11:06:26,791 : INFO : EPOCH 7 - PROGRESS: at 57.46% examples, 1178311 words/s, in_qsize 35, out_qsize 4
2023-02-23 11:06:27,800 : INFO : EPOCH 7 - PROGRESS: at 60.76% examples, 1180180 words/s, in_qsize 40, out_qsize 3
2023-02-23 11:06:28,806 : INFO : EPOCH 7 - PROGRESS: at 63.80% examples, 1179866 words/s, in_qsize 40, out_qsize 3
2023-02-23 11:06:29,816 : INFO : EPOCH 7 - PROGRESS: at 66.94% examples, 1180405 words/s, in_qsize 40, out_qsize 3
2023-02-23 11:06:30,835 : INFO : EPOCH 7 - PROGRESS: at 70.09% examples, 1180204 words/s, in_qsize 40, out_qsize 5
2023-02-23 11:06:31,842 : INFO : EPOCH 7 - PROGRESS: at 73.23% examples, 1180657 words/s, in_qsize 38, out_qsize 1
2023-02-23 11:06:32,854 : INFO : EPOCH 7 - PROGRESS: at 76.36% examples, 1180016 words/s, in_qsize 40, out_qsize 1
2023-02-23 11:06:33,860 : INFO : EPOCH 7 - PROGRESS: at 79.61% examples, 1180851

2023-02-23 11:07:11,438 : INFO : worker thread finished; awaiting finish of 7 more threads
2023-02-23 11:07:11,438 : INFO : worker thread finished; awaiting finish of 6 more threads
2023-02-23 11:07:11,438 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 11:07:11,442 : INFO : worker thread finished; awaiting finish of 4 more threads
2023-02-23 11:07:11,442 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-02-23 11:07:11,446 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-02-23 11:07:11,450 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-02-23 11:07:11,454 : INFO : worker thread finished; awaiting finish of 0 more threads
2023-02-23 11:07:11,454 : INFO : EPOCH - 8 : training on 47823808 raw words (37665975 effective words) took 31.2s, 1207781 effective words/s
2023-02-23 11:07:12,482 : INFO : EPOCH 9 - PROGRESS: at 3.26% examples, 1139840 words/s, in_qsize 35, out_qsize 4
2023-02-23 11:07:

2023-02-23 11:08:01,459 : INFO : EPOCH 10 - PROGRESS: at 64.93% examples, 1260045 words/s, in_qsize 29, out_qsize 45
2023-02-23 11:08:02,467 : INFO : EPOCH 10 - PROGRESS: at 69.16% examples, 1274493 words/s, in_qsize 37, out_qsize 2
2023-02-23 11:08:03,598 : INFO : EPOCH 10 - PROGRESS: at 72.58% examples, 1268121 words/s, in_qsize 23, out_qsize 25
2023-02-23 11:08:04,667 : INFO : EPOCH 10 - PROGRESS: at 76.33% examples, 1270857 words/s, in_qsize 40, out_qsize 15
2023-02-23 11:08:05,688 : INFO : EPOCH 10 - PROGRESS: at 80.09% examples, 1275848 words/s, in_qsize 35, out_qsize 4
2023-02-23 11:08:06,735 : INFO : EPOCH 10 - PROGRESS: at 83.41% examples, 1272818 words/s, in_qsize 39, out_qsize 10
2023-02-23 11:08:07,742 : INFO : EPOCH 10 - PROGRESS: at 86.82% examples, 1270291 words/s, in_qsize 8, out_qsize 18
2023-02-23 11:08:08,803 : INFO : EPOCH 10 - PROGRESS: at 89.73% examples, 1260861 words/s, in_qsize 8, out_qsize 56
2023-02-23 11:08:09,819 : INFO : EPOCH 10 - PROGRESS: at 93.89% exam

2023-02-23 11:08:41,304 : INFO : EPOCH - 11 : training on 47823808 raw words (37665975 effective words) took 29.8s, 1262756 effective words/s
2023-02-23 11:08:42,320 : INFO : EPOCH 12 - PROGRESS: at 2.66% examples, 923427 words/s, in_qsize 33, out_qsize 6
2023-02-23 11:08:43,322 : INFO : EPOCH 12 - PROGRESS: at 5.36% examples, 948274 words/s, in_qsize 40, out_qsize 4
2023-02-23 11:08:44,361 : INFO : EPOCH 12 - PROGRESS: at 7.82% examples, 946343 words/s, in_qsize 30, out_qsize 19
2023-02-23 11:08:45,368 : INFO : EPOCH 12 - PROGRESS: at 10.98% examples, 996042 words/s, in_qsize 36, out_qsize 1
2023-02-23 11:08:46,395 : INFO : EPOCH 12 - PROGRESS: at 13.50% examples, 986077 words/s, in_qsize 20, out_qsize 19
2023-02-23 11:08:47,392 : INFO : EPOCH 12 - PROGRESS: at 16.51% examples, 1006570 words/s, in_qsize 40, out_qsize 0
2023-02-23 11:08:48,399 : INFO : EPOCH 12 - PROGRESS: at 19.39% examples, 1018584 words/s, in_qsize 40, out_qsize 0
2023-02-23 11:08:49,403 : INFO : EPOCH 12 - PROGRESS

2023-02-23 11:09:36,990 : INFO : EPOCH 13 - PROGRESS: at 85.69% examples, 1310656 words/s, in_qsize 40, out_qsize 60
2023-02-23 11:09:37,993 : INFO : EPOCH 13 - PROGRESS: at 90.60% examples, 1329500 words/s, in_qsize 40, out_qsize 2
2023-02-23 11:09:39,017 : INFO : EPOCH 13 - PROGRESS: at 94.06% examples, 1329038 words/s, in_qsize 30, out_qsize 9
2023-02-23 11:09:40,022 : INFO : EPOCH 13 - PROGRESS: at 97.90% examples, 1332730 words/s, in_qsize 36, out_qsize 3
2023-02-23 11:09:40,477 : INFO : worker thread finished; awaiting finish of 19 more threads
2023-02-23 11:09:40,477 : INFO : worker thread finished; awaiting finish of 18 more threads
2023-02-23 11:09:40,481 : INFO : worker thread finished; awaiting finish of 17 more threads
2023-02-23 11:09:40,485 : INFO : worker thread finished; awaiting finish of 16 more threads
2023-02-23 11:09:40,489 : INFO : worker thread finished; awaiting finish of 15 more threads
2023-02-23 11:09:40,494 : INFO : worker thread finished; awaiting finish of

2023-02-23 11:10:16,424 : INFO : EPOCH 15 - PROGRESS: at 23.41% examples, 1203932 words/s, in_qsize 40, out_qsize 5
2023-02-23 11:10:17,666 : INFO : EPOCH 15 - PROGRESS: at 26.65% examples, 1171596 words/s, in_qsize 40, out_qsize 53
2023-02-23 11:10:18,677 : INFO : EPOCH 15 - PROGRESS: at 30.88% examples, 1222890 words/s, in_qsize 40, out_qsize 3
2023-02-23 11:10:19,680 : INFO : EPOCH 15 - PROGRESS: at 34.26% examples, 1230713 words/s, in_qsize 40, out_qsize 0
2023-02-23 11:10:20,711 : INFO : EPOCH 15 - PROGRESS: at 37.82% examples, 1233151 words/s, in_qsize 40, out_qsize 5
2023-02-23 11:10:21,716 : INFO : EPOCH 15 - PROGRESS: at 41.58% examples, 1242925 words/s, in_qsize 39, out_qsize 0
2023-02-23 11:10:22,861 : INFO : EPOCH 15 - PROGRESS: at 44.81% examples, 1227911 words/s, in_qsize 40, out_qsize 37
2023-02-23 11:10:23,866 : INFO : EPOCH 15 - PROGRESS: at 48.96% examples, 1249549 words/s, in_qsize 39, out_qsize 0
2023-02-23 11:10:24,904 : INFO : EPOCH 15 - PROGRESS: at 52.00% exampl

2023-02-23 11:11:08,656 : INFO : worker thread finished; awaiting finish of 14 more threads
2023-02-23 11:11:08,656 : INFO : worker thread finished; awaiting finish of 13 more threads
2023-02-23 11:11:08,660 : INFO : worker thread finished; awaiting finish of 12 more threads
2023-02-23 11:11:08,665 : INFO : worker thread finished; awaiting finish of 11 more threads
2023-02-23 11:11:08,667 : INFO : worker thread finished; awaiting finish of 10 more threads
2023-02-23 11:11:08,669 : INFO : worker thread finished; awaiting finish of 9 more threads
2023-02-23 11:11:08,673 : INFO : worker thread finished; awaiting finish of 8 more threads
2023-02-23 11:11:08,675 : INFO : worker thread finished; awaiting finish of 7 more threads
2023-02-23 11:11:08,677 : INFO : worker thread finished; awaiting finish of 6 more threads
2023-02-23 11:11:08,681 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 11:11:08,685 : INFO : worker thread finished; awaiting finish of 4 more th

2023-02-23 11:11:52,813 : INFO : EPOCH 18 - PROGRESS: at 49.50% examples, 1273499 words/s, in_qsize 40, out_qsize 8
2023-02-23 11:11:53,859 : INFO : EPOCH 18 - PROGRESS: at 52.76% examples, 1267872 words/s, in_qsize 40, out_qsize 21
2023-02-23 11:11:54,949 : INFO : EPOCH 18 - PROGRESS: at 56.01% examples, 1259052 words/s, in_qsize 40, out_qsize 43
2023-02-23 11:11:56,026 : INFO : EPOCH 18 - PROGRESS: at 60.36% examples, 1274780 words/s, in_qsize 40, out_qsize 16
2023-02-23 11:11:57,104 : INFO : EPOCH 18 - PROGRESS: at 63.71% examples, 1273649 words/s, in_qsize 40, out_qsize 22
2023-02-23 11:11:58,109 : INFO : EPOCH 18 - PROGRESS: at 67.13% examples, 1273214 words/s, in_qsize 40, out_qsize 19
2023-02-23 11:11:59,113 : INFO : EPOCH 18 - PROGRESS: at 71.03% examples, 1282047 words/s, in_qsize 40, out_qsize 2
2023-02-23 11:12:00,124 : INFO : EPOCH 18 - PROGRESS: at 74.29% examples, 1280080 words/s, in_qsize 40, out_qsize 4
2023-02-23 11:12:01,155 : INFO : EPOCH 18 - PROGRESS: at 77.74% exa

2023-02-23 11:12:37,072 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 11:12:37,072 : INFO : worker thread finished; awaiting finish of 4 more threads
2023-02-23 11:12:37,076 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-02-23 11:12:37,076 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-02-23 11:12:37,080 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-02-23 11:12:37,088 : INFO : worker thread finished; awaiting finish of 0 more threads
2023-02-23 11:12:37,088 : INFO : EPOCH - 19 : training on 47823808 raw words (37665975 effective words) took 29.6s, 1274513 effective words/s
2023-02-23 11:12:38,146 : INFO : EPOCH 20 - PROGRESS: at 3.32% examples, 1131879 words/s, in_qsize 40, out_qsize 11
2023-02-23 11:12:39,179 : INFO : EPOCH 20 - PROGRESS: at 7.01% examples, 1225805 words/s, in_qsize 31, out_qsize 8
2023-02-23 11:12:40,179 : INFO : EPOCH 20 - PROGRESS: at 10.64% examples, 1265454

2023-02-23 11:13:28,167 : INFO : EPOCH 21 - PROGRESS: at 73.80% examples, 1283319 words/s, in_qsize 37, out_qsize 2
2023-02-23 11:13:29,183 : INFO : EPOCH 21 - PROGRESS: at 77.05% examples, 1281076 words/s, in_qsize 40, out_qsize 9
2023-02-23 11:13:30,193 : INFO : EPOCH 21 - PROGRESS: at 80.52% examples, 1279795 words/s, in_qsize 26, out_qsize 13
2023-02-23 11:13:31,226 : INFO : EPOCH 21 - PROGRESS: at 84.19% examples, 1282021 words/s, in_qsize 35, out_qsize 4
2023-02-23 11:13:32,261 : INFO : EPOCH 21 - PROGRESS: at 87.67% examples, 1279296 words/s, in_qsize 29, out_qsize 10
2023-02-23 11:13:33,304 : INFO : EPOCH 21 - PROGRESS: at 91.11% examples, 1278663 words/s, in_qsize 18, out_qsize 14
2023-02-23 11:13:34,344 : INFO : EPOCH 21 - PROGRESS: at 94.65% examples, 1278905 words/s, in_qsize 40, out_qsize 17
2023-02-23 11:13:35,367 : INFO : EPOCH 21 - PROGRESS: at 98.22% examples, 1280942 words/s, in_qsize 34, out_qsize 5
2023-02-23 11:13:35,721 : INFO : worker thread finished; awaiting fi

2023-02-23 11:14:07,279 : INFO : EPOCH 23 - PROGRESS: at 6.35% examples, 1142256 words/s, in_qsize 39, out_qsize 3
2023-02-23 11:14:08,349 : INFO : EPOCH 23 - PROGRESS: at 9.68% examples, 1151725 words/s, in_qsize 28, out_qsize 21
2023-02-23 11:14:09,349 : INFO : EPOCH 23 - PROGRESS: at 13.47% examples, 1227937 words/s, in_qsize 40, out_qsize 0
2023-02-23 11:14:10,364 : INFO : EPOCH 23 - PROGRESS: at 16.91% examples, 1235330 words/s, in_qsize 37, out_qsize 6
2023-02-23 11:14:11,374 : INFO : EPOCH 23 - PROGRESS: at 20.48% examples, 1249889 words/s, in_qsize 38, out_qsize 1
2023-02-23 11:14:12,373 : INFO : EPOCH 23 - PROGRESS: at 23.80% examples, 1247576 words/s, in_qsize 40, out_qsize 10
2023-02-23 11:14:13,415 : INFO : EPOCH 23 - PROGRESS: at 27.30% examples, 1251888 words/s, in_qsize 40, out_qsize 7
2023-02-23 11:14:14,415 : INFO : EPOCH 23 - PROGRESS: at 30.87% examples, 1268980 words/s, in_qsize 39, out_qsize 0
2023-02-23 11:14:15,418 : INFO : EPOCH 23 - PROGRESS: at 34.03% examples

2023-02-23 11:15:03,229 : INFO : worker thread finished; awaiting finish of 19 more threads
2023-02-23 11:15:03,241 : INFO : worker thread finished; awaiting finish of 18 more threads
2023-02-23 11:15:03,241 : INFO : worker thread finished; awaiting finish of 17 more threads
2023-02-23 11:15:03,245 : INFO : worker thread finished; awaiting finish of 16 more threads
2023-02-23 11:15:03,245 : INFO : worker thread finished; awaiting finish of 15 more threads
2023-02-23 11:15:03,245 : INFO : worker thread finished; awaiting finish of 14 more threads
2023-02-23 11:15:03,249 : INFO : worker thread finished; awaiting finish of 13 more threads
2023-02-23 11:15:03,253 : INFO : worker thread finished; awaiting finish of 12 more threads
2023-02-23 11:15:03,259 : INFO : worker thread finished; awaiting finish of 11 more threads
2023-02-23 11:15:03,259 : INFO : worker thread finished; awaiting finish of 10 more threads
2023-02-23 11:15:03,264 : INFO : worker thread finished; awaiting finish of 9 mo

2023-02-23 11:15:42,336 : INFO : EPOCH 26 - PROGRESS: at 36.19% examples, 1316375 words/s, in_qsize 40, out_qsize 5
2023-02-23 11:15:43,340 : INFO : EPOCH 26 - PROGRESS: at 39.82% examples, 1316119 words/s, in_qsize 40, out_qsize 5
2023-02-23 11:15:44,373 : INFO : EPOCH 26 - PROGRESS: at 43.48% examples, 1313734 words/s, in_qsize 40, out_qsize 10
2023-02-23 11:15:45,437 : INFO : EPOCH 26 - PROGRESS: at 46.80% examples, 1306432 words/s, in_qsize 40, out_qsize 28
2023-02-23 11:15:46,439 : INFO : EPOCH 26 - PROGRESS: at 50.71% examples, 1318014 words/s, in_qsize 39, out_qsize 0
2023-02-23 11:15:47,449 : INFO : EPOCH 26 - PROGRESS: at 54.20% examples, 1318962 words/s, in_qsize 40, out_qsize 1
2023-02-23 11:15:48,454 : INFO : EPOCH 26 - PROGRESS: at 57.60% examples, 1317164 words/s, in_qsize 40, out_qsize 6
2023-02-23 11:15:49,535 : INFO : EPOCH 26 - PROGRESS: at 60.67% examples, 1303994 words/s, in_qsize 40, out_qsize 48
2023-02-23 11:15:50,565 : INFO : EPOCH 26 - PROGRESS: at 64.55% examp

2023-02-23 11:16:31,247 : INFO : worker thread finished; awaiting finish of 10 more threads
2023-02-23 11:16:31,248 : INFO : worker thread finished; awaiting finish of 9 more threads
2023-02-23 11:16:31,249 : INFO : worker thread finished; awaiting finish of 8 more threads
2023-02-23 11:16:31,250 : INFO : worker thread finished; awaiting finish of 7 more threads
2023-02-23 11:16:31,251 : INFO : worker thread finished; awaiting finish of 6 more threads
2023-02-23 11:16:31,252 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 11:16:31,254 : INFO : worker thread finished; awaiting finish of 4 more threads
2023-02-23 11:16:31,257 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-02-23 11:16:31,260 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-02-23 11:16:31,261 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-02-23 11:16:31,269 : INFO : worker thread finished; awaiting finish of 0 more thread

2023-02-23 11:17:18,759 : INFO : EPOCH 29 - PROGRESS: at 63.08% examples, 1274747 words/s, in_qsize 40, out_qsize 4
2023-02-23 11:17:19,805 : INFO : EPOCH 29 - PROGRESS: at 66.53% examples, 1274189 words/s, in_qsize 20, out_qsize 19
2023-02-23 11:17:20,845 : INFO : EPOCH 29 - PROGRESS: at 70.47% examples, 1281156 words/s, in_qsize 30, out_qsize 9
2023-02-23 11:17:21,905 : INFO : EPOCH 29 - PROGRESS: at 73.88% examples, 1280049 words/s, in_qsize 40, out_qsize 17
2023-02-23 11:17:22,927 : INFO : EPOCH 29 - PROGRESS: at 77.62% examples, 1285410 words/s, in_qsize 25, out_qsize 7
2023-02-23 11:17:23,929 : INFO : EPOCH 29 - PROGRESS: at 81.38% examples, 1290124 words/s, in_qsize 38, out_qsize 3
2023-02-23 11:17:24,934 : INFO : EPOCH 29 - PROGRESS: at 84.98% examples, 1290781 words/s, in_qsize 40, out_qsize 2
2023-02-23 11:17:25,970 : INFO : EPOCH 29 - PROGRESS: at 88.58% examples, 1291327 words/s, in_qsize 40, out_qsize 6
2023-02-23 11:17:26,986 : INFO : EPOCH 29 - PROGRESS: at 91.87% exampl

2023-02-23 11:17:57,399 : INFO : EPOCH - 30 : training on 47823808 raw words (37665975 effective words) took 28.3s, 1332628 effective words/s
2023-02-23 11:17:57,399 : INFO : Doc2Vec lifecycle event {'msg': 'training on 1434714240 raw words (1129979250 effective words) took 898.0s, 1258336 effective words/s', 'datetime': '2023-02-23T11:17:57.399018', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'train'}


In [12]:
# Number of word vectors (vocabulary entries) held by the model.
len(model.wv)

339129

In [13]:
# Number of document vectors held by the model.
# NOTE(review): the recorded value (7563) equals the size of the full corpus,
# not the 90% training subset — presumably because integer tags reserve a
# slot for every id up to the largest tag; confirm against gensim's behavior.
len(model.dv)

7563

In [15]:
# Sanity check on the training subset: re-infer a vector for every training
# document and find the position of the document's own tag in the similarity
# ranking over all stored document vectors. Rank 0 means the document is most
# similar to itself.
ranks = []
for doc in train_subset_tagged:
    inferred_vector = model.infer_vector(doc.words)
    tag = doc.tags[0]
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(tag)
    ranks.append(rank)

counter = collections.Counter(ranks)
print(counter)
# Share of documents ranked first against themselves. The denominator is the
# number of evaluated documents: summing counter[i] for i in
# range(len(counter)) would only cover ranks 0..k-1 (k = number of distinct
# ranks) and silently drop any higher rank, inflating the accuracy.
ranking_accuracy = counter[0] / len(ranks) if ranks else float("nan")
print(ranking_accuracy)

Counter({0: 6753, 1: 53})
0.9922127534528358


In [16]:
# Held-out evaluation: classify each test document by comparing the mean
# similarity of its inferred vector to label-0 documents against the mean
# similarity to label-1 documents, and predict the label with the higher mean.

# Build the tag -> label lookup once. Filtering `training_corpus` with a
# boolean mask for every similarity hit rescans the whole frame inside the
# inner loop, which is quadratic work per test document.
label_by_index = dict(zip(training_corpus["index"], training_corpus["label"]))

correct = 0
for doc in test_subset_tagged:
    inferred_vector = model.infer_vector(doc.words)
    true_label = label_by_index[doc.tags[0]]

    # NOTE(review): topn=len(model.dv) ranks every stored document vector. If
    # model.dv also holds slots for the held-out documents (len(model.dv) was
    # 7563, the full corpus size), those vectors are averaged in as well —
    # confirm whether they should be excluded.
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    scores0 = []
    scores1 = []
    for docid, weight in sims:
        label = label_by_index[docid]
        if label == 0:
            scores0.append(weight)
        elif label == 1:
            scores1.append(weight)

    predicted_label = np.argmax([np.mean(scores0), np.mean(scores1)])
    correct += (predicted_label == true_label)

print(correct/len(test_subset_tagged))

0.964332892998679


In [17]:
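# doc2vec model assessments
# (train accuracy = share of training docs whose inferred vector ranks their own tag first;
#  test accuracy = share of held-out docs whose label is predicted correctly)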
# dm \\ vector_size \\ negative \\ sample \\ train accuracy     \\ test accuracy
# 0  \\ 300         \\ 5        \\ 0      \\ 0.9922127534528358 \\ 0.9603698811096433
# 1  \\ 300         \\ 5        \\ 0      \\ 0.9908903908316191 \\ 0.9577278731836195
# 0  \\ 600         \\ 5        \\ 0      \\ 0.9925066118131061  \\ 0.964332892998679
# 1  \\ 600         \\ 5        \\ 0      \\ 0.9904496032912137  \\ 0.9590488771466315
# 0  \\ 900         \\ 5        \\ 0      \\ 0.9922127534528358 \\ 0.9616908850726552
# 0  \\ 1200        \\ 5        \\ 0      \\ 0.9920658242727005 \\ 0.9616908850726552
# 1  \\ 1200        \\ 5        \\ 0      \\ 0.9913311783720247 \\ 0.9603698811096433
# 0  \\ 600         \\ 10       \\ 0      \\ 0.9920658242727005  \\ 0.9630118890356671
# 0  \\ 600         \\ 20       \\ 0      \\ 0.9920658242727005  \\ 0.9630118890356671
# 0  \\ 600         \\ 5        \\ 1e-5   \\ 0.9920658242727005  \\ 0.9630118890356671


In [19]:
# Final model: dm=0, vector_size=600, negative=5, sample=0 — the row with the
# highest test accuracy in the assessment table.
model = Doc2Vec(
    dm=0,
    vector_size=600,
    negative=5,
    sample=0,
    workers=cores,
    epochs=30,
)

# train final model on all data
model.build_vocab(training_corpus_tagged)
model.train(
    training_corpus_tagged,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

2023-02-23 14:01:58,383 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dbow,d600,n5,mc5,t20)', 'datetime': '2023-02-23T14:01:58.383137', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
2023-02-23 14:02:00,028 : INFO : collecting all words and their counts
2023-02-23 14:02:00,028 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2023-02-23 14:02:27,650 : INFO : collected 5592890 word types and 7563 unique tags from a corpus of 7563 examples and 53124816 words
2023-02-23 14:02:27,650 : INFO : Creating a fresh vocabulary
2023-02-23 14:02:31,629 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 369198 unique words (6.6012025982989115%% of original 5592890, drops 5223692)', 'datetime': '2023-02-23T14:02:31.629467', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Wi

2023-02-23 14:03:18,347 : INFO : EPOCH - 1 : training on 53124816 raw words (41917369 effective words) took 35.0s, 1197046 effective words/s
2023-02-23 14:03:19,365 : INFO : EPOCH 2 - PROGRESS: at 2.64% examples, 1073781 words/s, in_qsize 30, out_qsize 5
2023-02-23 14:03:20,370 : INFO : EPOCH 2 - PROGRESS: at 5.71% examples, 1177800 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:03:21,382 : INFO : EPOCH 2 - PROGRESS: at 8.49% examples, 1171887 words/s, in_qsize 37, out_qsize 3
2023-02-23 14:03:22,384 : INFO : EPOCH 2 - PROGRESS: at 11.44% examples, 1176144 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:03:23,388 : INFO : EPOCH 2 - PROGRESS: at 14.31% examples, 1171967 words/s, in_qsize 26, out_qsize 4
2023-02-23 14:03:24,390 : INFO : EPOCH 2 - PROGRESS: at 17.29% examples, 1181749 words/s, in_qsize 29, out_qsize 0
2023-02-23 14:03:25,392 : INFO : EPOCH 2 - PROGRESS: at 20.14% examples, 1181025 words/s, in_qsize 34, out_qsize 0
2023-02-23 14:03:26,394 : INFO : EPOCH 2 - PROGRESS: at 2

2023-02-23 14:04:13,323 : INFO : EPOCH 3 - PROGRESS: at 57.05% examples, 1181211 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:04:14,389 : INFO : EPOCH 3 - PROGRESS: at 60.00% examples, 1180130 words/s, in_qsize 32, out_qsize 7
2023-02-23 14:04:15,389 : INFO : EPOCH 3 - PROGRESS: at 63.00% examples, 1183299 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:04:16,388 : INFO : EPOCH 3 - PROGRESS: at 65.77% examples, 1182634 words/s, in_qsize 33, out_qsize 1
2023-02-23 14:04:17,399 : INFO : EPOCH 3 - PROGRESS: at 68.56% examples, 1183978 words/s, in_qsize 28, out_qsize 2
2023-02-23 14:04:18,402 : INFO : EPOCH 3 - PROGRESS: at 71.10% examples, 1180592 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:04:19,398 : INFO : EPOCH 3 - PROGRESS: at 73.87% examples, 1178704 words/s, in_qsize 40, out_qsize 0
2023-02-23 14:04:20,399 : INFO : EPOCH 3 - PROGRESS: at 76.61% examples, 1177433 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:04:21,399 : INFO : EPOCH 3 - PROGRESS: at 79.68% examples, 1178440

2023-02-23 14:05:03,803 : INFO : worker thread finished; awaiting finish of 13 more threads
2023-02-23 14:05:03,807 : INFO : worker thread finished; awaiting finish of 12 more threads
2023-02-23 14:05:03,811 : INFO : worker thread finished; awaiting finish of 11 more threads
2023-02-23 14:05:03,818 : INFO : worker thread finished; awaiting finish of 10 more threads
2023-02-23 14:05:03,820 : INFO : worker thread finished; awaiting finish of 9 more threads
2023-02-23 14:05:03,820 : INFO : worker thread finished; awaiting finish of 8 more threads
2023-02-23 14:05:03,820 : INFO : worker thread finished; awaiting finish of 7 more threads
2023-02-23 14:05:03,832 : INFO : worker thread finished; awaiting finish of 6 more threads
2023-02-23 14:05:03,838 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 14:05:03,844 : INFO : worker thread finished; awaiting finish of 4 more threads
2023-02-23 14:05:03,846 : INFO : worker thread finished; awaiting finish of 3 more thr

2023-02-23 14:05:48,088 : INFO : EPOCH 6 - PROGRESS: at 26.62% examples, 1209070 words/s, in_qsize 38, out_qsize 5
2023-02-23 14:05:49,093 : INFO : EPOCH 6 - PROGRESS: at 29.56% examples, 1213813 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:05:50,099 : INFO : EPOCH 6 - PROGRESS: at 32.53% examples, 1214889 words/s, in_qsize 38, out_qsize 1
2023-02-23 14:05:51,103 : INFO : EPOCH 6 - PROGRESS: at 35.58% examples, 1217473 words/s, in_qsize 40, out_qsize 0
2023-02-23 14:05:52,111 : INFO : EPOCH 6 - PROGRESS: at 38.50% examples, 1217059 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:05:53,128 : INFO : EPOCH 6 - PROGRESS: at 41.25% examples, 1210501 words/s, in_qsize 40, out_qsize 7
2023-02-23 14:05:54,134 : INFO : EPOCH 6 - PROGRESS: at 44.20% examples, 1212600 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:05:55,145 : INFO : EPOCH 6 - PROGRESS: at 47.23% examples, 1216475 words/s, in_qsize 32, out_qsize 0
2023-02-23 14:05:56,157 : INFO : EPOCH 6 - PROGRESS: at 50.06% examples, 1213261

2023-02-23 14:06:44,567 : INFO : EPOCH 7 - PROGRESS: at 93.02% examples, 1240644 words/s, in_qsize 40, out_qsize 4
2023-02-23 14:06:45,599 : INFO : EPOCH 7 - PROGRESS: at 95.91% examples, 1240568 words/s, in_qsize 36, out_qsize 3
2023-02-23 14:06:46,613 : INFO : EPOCH 7 - PROGRESS: at 98.94% examples, 1242892 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:06:46,837 : INFO : worker thread finished; awaiting finish of 19 more threads
2023-02-23 14:06:46,837 : INFO : worker thread finished; awaiting finish of 18 more threads
2023-02-23 14:06:46,841 : INFO : worker thread finished; awaiting finish of 17 more threads
2023-02-23 14:06:46,845 : INFO : worker thread finished; awaiting finish of 16 more threads
2023-02-23 14:06:46,853 : INFO : worker thread finished; awaiting finish of 15 more threads
2023-02-23 14:06:46,853 : INFO : worker thread finished; awaiting finish of 14 more threads
2023-02-23 14:06:46,857 : INFO : worker thread finished; awaiting finish of 13 more threads
2023-02-23 

2023-02-23 14:07:22,394 : INFO : EPOCH 9 - PROGRESS: at 5.94% examples, 1223875 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:07:23,396 : INFO : EPOCH 9 - PROGRESS: at 9.03% examples, 1254715 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:07:24,405 : INFO : EPOCH 9 - PROGRESS: at 12.24% examples, 1263655 words/s, in_qsize 40, out_qsize 0
2023-02-23 14:07:25,445 : INFO : EPOCH 9 - PROGRESS: at 15.35% examples, 1249008 words/s, in_qsize 33, out_qsize 6
2023-02-23 14:07:26,466 : INFO : EPOCH 9 - PROGRESS: at 18.56% examples, 1255183 words/s, in_qsize 34, out_qsize 5
2023-02-23 14:07:27,467 : INFO : EPOCH 9 - PROGRESS: at 21.71% examples, 1264152 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:07:28,478 : INFO : EPOCH 9 - PROGRESS: at 24.79% examples, 1265967 words/s, in_qsize 39, out_qsize 2
2023-02-23 14:07:29,483 : INFO : EPOCH 9 - PROGRESS: at 27.77% examples, 1267095 words/s, in_qsize 39, out_qsize 4
2023-02-23 14:07:30,488 : INFO : EPOCH 9 - PROGRESS: at 30.91% examples, 1271006 w

2023-02-23 14:08:17,738 : INFO : EPOCH 10 - PROGRESS: at 72.29% examples, 1242177 words/s, in_qsize 39, out_qsize 7
2023-02-23 14:08:18,740 : INFO : EPOCH 10 - PROGRESS: at 75.38% examples, 1244254 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:08:19,742 : INFO : EPOCH 10 - PROGRESS: at 78.50% examples, 1244867 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:08:20,740 : INFO : EPOCH 10 - PROGRESS: at 81.63% examples, 1246240 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:08:21,745 : INFO : EPOCH 10 - PROGRESS: at 84.45% examples, 1245161 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:08:22,751 : INFO : EPOCH 10 - PROGRESS: at 87.48% examples, 1245042 words/s, in_qsize 40, out_qsize 1
2023-02-23 14:08:23,752 : INFO : EPOCH 10 - PROGRESS: at 90.60% examples, 1246017 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:08:24,753 : INFO : EPOCH 10 - PROGRESS: at 93.57% examples, 1245640 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:08:25,755 : INFO : EPOCH 10 - PROGRESS: at 96.17% examples

2023-02-23 14:09:01,347 : INFO : worker thread finished; awaiting finish of 6 more threads
2023-02-23 14:09:01,347 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 14:09:01,351 : INFO : worker thread finished; awaiting finish of 4 more threads
2023-02-23 14:09:01,352 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-02-23 14:09:01,353 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-02-23 14:09:01,354 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-02-23 14:09:01,356 : INFO : worker thread finished; awaiting finish of 0 more threads
2023-02-23 14:09:01,357 : INFO : EPOCH - 11 : training on 53124816 raw words (41917369 effective words) took 34.3s, 1220363 effective words/s
2023-02-23 14:09:02,375 : INFO : EPOCH 12 - PROGRESS: at 2.82% examples, 1155485 words/s, in_qsize 40, out_qsize 2
2023-02-23 14:09:03,392 : INFO : EPOCH 12 - PROGRESS: at 5.80% examples, 1188100 words/s, in_qsize 38, out

2023-02-23 14:09:50,676 : INFO : EPOCH 13 - PROGRESS: at 48.09% examples, 1244595 words/s, in_qsize 32, out_qsize 5
2023-02-23 14:09:51,677 : INFO : EPOCH 13 - PROGRESS: at 51.05% examples, 1246001 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:09:52,681 : INFO : EPOCH 13 - PROGRESS: at 54.20% examples, 1247258 words/s, in_qsize 40, out_qsize 2
2023-02-23 14:09:53,684 : INFO : EPOCH 13 - PROGRESS: at 57.25% examples, 1250451 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:09:54,689 : INFO : EPOCH 13 - PROGRESS: at 60.28% examples, 1249088 words/s, in_qsize 35, out_qsize 1
2023-02-23 14:09:55,697 : INFO : EPOCH 13 - PROGRESS: at 63.29% examples, 1250399 words/s, in_qsize 38, out_qsize 0
2023-02-23 14:09:56,704 : INFO : EPOCH 13 - PROGRESS: at 66.28% examples, 1250530 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:09:57,707 : INFO : EPOCH 13 - PROGRESS: at 69.15% examples, 1251136 words/s, in_qsize 40, out_qsize 2
2023-02-23 14:09:58,708 : INFO : EPOCH 13 - PROGRESS: at 72.22% examples

2023-02-23 14:10:41,194 : INFO : worker thread finished; awaiting finish of 14 more threads
2023-02-23 14:10:41,201 : INFO : worker thread finished; awaiting finish of 13 more threads
2023-02-23 14:10:41,207 : INFO : worker thread finished; awaiting finish of 12 more threads
2023-02-23 14:10:41,213 : INFO : worker thread finished; awaiting finish of 11 more threads
2023-02-23 14:10:41,214 : INFO : worker thread finished; awaiting finish of 10 more threads
2023-02-23 14:10:41,218 : INFO : worker thread finished; awaiting finish of 9 more threads
2023-02-23 14:10:41,222 : INFO : worker thread finished; awaiting finish of 8 more threads
2023-02-23 14:10:41,228 : INFO : worker thread finished; awaiting finish of 7 more threads
2023-02-23 14:10:41,229 : INFO : worker thread finished; awaiting finish of 6 more threads
2023-02-23 14:10:41,234 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 14:10:41,235 : INFO : worker thread finished; awaiting finish of 4 more th

2023-02-23 14:11:23,723 : INFO : EPOCH 16 - PROGRESS: at 26.31% examples, 1200403 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:11:24,723 : INFO : EPOCH 16 - PROGRESS: at 29.21% examples, 1204391 words/s, in_qsize 33, out_qsize 1
2023-02-23 14:11:25,727 : INFO : EPOCH 16 - PROGRESS: at 32.06% examples, 1204150 words/s, in_qsize 36, out_qsize 3
2023-02-23 14:11:26,742 : INFO : EPOCH 16 - PROGRESS: at 35.09% examples, 1207131 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:11:27,739 : INFO : EPOCH 16 - PROGRESS: at 38.05% examples, 1209086 words/s, in_qsize 32, out_qsize 0
2023-02-23 14:11:28,756 : INFO : EPOCH 16 - PROGRESS: at 41.06% examples, 1210643 words/s, in_qsize 38, out_qsize 4
2023-02-23 14:11:29,752 : INFO : EPOCH 16 - PROGRESS: at 43.91% examples, 1209763 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:11:30,767 : INFO : EPOCH 16 - PROGRESS: at 46.56% examples, 1202504 words/s, in_qsize 39, out_qsize 10
2023-02-23 14:11:31,788 : INFO : EPOCH 16 - PROGRESS: at 49.54% example

2023-02-23 14:12:19,309 : INFO : EPOCH 17 - PROGRESS: at 92.32% examples, 1268041 words/s, in_qsize 38, out_qsize 1
2023-02-23 14:12:20,327 : INFO : EPOCH 17 - PROGRESS: at 95.17% examples, 1266774 words/s, in_qsize 38, out_qsize 1
2023-02-23 14:12:21,329 : INFO : EPOCH 17 - PROGRESS: at 97.96% examples, 1267050 words/s, in_qsize 38, out_qsize 1
2023-02-23 14:12:21,853 : INFO : worker thread finished; awaiting finish of 19 more threads
2023-02-23 14:12:21,858 : INFO : worker thread finished; awaiting finish of 18 more threads
2023-02-23 14:12:21,884 : INFO : worker thread finished; awaiting finish of 17 more threads
2023-02-23 14:12:21,888 : INFO : worker thread finished; awaiting finish of 16 more threads
2023-02-23 14:12:21,888 : INFO : worker thread finished; awaiting finish of 15 more threads
2023-02-23 14:12:21,888 : INFO : worker thread finished; awaiting finish of 14 more threads
2023-02-23 14:12:21,888 : INFO : worker thread finished; awaiting finish of 13 more threads
2023-02-

2023-02-23 14:12:57,391 : INFO : EPOCH 19 - PROGRESS: at 9.08% examples, 1259877 words/s, in_qsize 30, out_qsize 2
2023-02-23 14:12:58,394 : INFO : EPOCH 19 - PROGRESS: at 12.46% examples, 1279622 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:12:59,407 : INFO : EPOCH 19 - PROGRESS: at 15.38% examples, 1263445 words/s, in_qsize 27, out_qsize 4
2023-02-23 14:13:00,409 : INFO : EPOCH 19 - PROGRESS: at 18.56% examples, 1265425 words/s, in_qsize 37, out_qsize 0
2023-02-23 14:13:01,419 : INFO : EPOCH 19 - PROGRESS: at 21.70% examples, 1269769 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:13:02,428 : INFO : EPOCH 19 - PROGRESS: at 24.83% examples, 1274884 words/s, in_qsize 38, out_qsize 0
2023-02-23 14:13:03,453 : INFO : EPOCH 19 - PROGRESS: at 27.87% examples, 1272237 words/s, in_qsize 37, out_qsize 6
2023-02-23 14:13:04,465 : INFO : EPOCH 19 - PROGRESS: at 30.99% examples, 1276726 words/s, in_qsize 40, out_qsize 5
2023-02-23 14:13:05,469 : INFO : EPOCH 19 - PROGRESS: at 34.23% examples,

2023-02-23 14:13:52,122 : INFO : EPOCH 20 - PROGRESS: at 76.23% examples, 1269028 words/s, in_qsize 37, out_qsize 3
2023-02-23 14:13:53,128 : INFO : EPOCH 20 - PROGRESS: at 79.49% examples, 1271047 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:13:54,145 : INFO : EPOCH 20 - PROGRESS: at 82.45% examples, 1270333 words/s, in_qsize 38, out_qsize 4
2023-02-23 14:13:55,157 : INFO : EPOCH 20 - PROGRESS: at 85.53% examples, 1269960 words/s, in_qsize 31, out_qsize 3
2023-02-23 14:13:56,161 : INFO : EPOCH 20 - PROGRESS: at 88.68% examples, 1271128 words/s, in_qsize 31, out_qsize 3
2023-02-23 14:13:57,174 : INFO : EPOCH 20 - PROGRESS: at 91.93% examples, 1271714 words/s, in_qsize 38, out_qsize 2
2023-02-23 14:13:58,173 : INFO : EPOCH 20 - PROGRESS: at 94.88% examples, 1272528 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:13:59,181 : INFO : EPOCH 20 - PROGRESS: at 97.62% examples, 1271296 words/s, in_qsize 27, out_qsize 3
2023-02-23 14:13:59,843 : INFO : worker thread finished; awaiting finish

2023-02-23 14:14:33,083 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-02-23 14:14:33,084 : INFO : worker thread finished; awaiting finish of 0 more threads
2023-02-23 14:14:33,085 : INFO : EPOCH - 21 : training on 53124816 raw words (41917369 effective words) took 33.2s, 1262682 effective words/s
2023-02-23 14:14:34,097 : INFO : EPOCH 22 - PROGRESS: at 2.95% examples, 1229446 words/s, in_qsize 40, out_qsize 0
2023-02-23 14:14:35,098 : INFO : EPOCH 22 - PROGRESS: at 5.91% examples, 1228980 words/s, in_qsize 39, out_qsize 3
2023-02-23 14:14:36,101 : INFO : EPOCH 22 - PROGRESS: at 8.95% examples, 1249977 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:14:37,125 : INFO : EPOCH 22 - PROGRESS: at 12.14% examples, 1250540 words/s, in_qsize 38, out_qsize 2
2023-02-23 14:14:38,126 : INFO : EPOCH 22 - PROGRESS: at 15.46% examples, 1263872 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:14:39,141 : INFO : EPOCH 22 - PROGRESS: at 18.45% examples, 1254013 words/s, in_qsize

2023-02-23 14:15:26,219 : INFO : EPOCH 23 - PROGRESS: at 62.30% examples, 1289921 words/s, in_qsize 37, out_qsize 0
2023-02-23 14:15:27,221 : INFO : EPOCH 23 - PROGRESS: at 65.32% examples, 1289646 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:15:28,223 : INFO : EPOCH 23 - PROGRESS: at 68.31% examples, 1289741 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:15:29,229 : INFO : EPOCH 23 - PROGRESS: at 71.37% examples, 1290312 words/s, in_qsize 39, out_qsize 3
2023-02-23 14:15:30,238 : INFO : EPOCH 23 - PROGRESS: at 74.44% examples, 1290607 words/s, in_qsize 40, out_qsize 1
2023-02-23 14:15:31,269 : INFO : EPOCH 23 - PROGRESS: at 77.67% examples, 1289094 words/s, in_qsize 34, out_qsize 5
2023-02-23 14:15:32,273 : INFO : EPOCH 23 - PROGRESS: at 80.80% examples, 1290213 words/s, in_qsize 37, out_qsize 2
2023-02-23 14:15:33,277 : INFO : EPOCH 23 - PROGRESS: at 83.97% examples, 1291509 words/s, in_qsize 40, out_qsize 0
2023-02-23 14:15:34,285 : INFO : EPOCH 23 - PROGRESS: at 87.03% examples

2023-02-23 14:16:11,255 : INFO : worker thread finished; awaiting finish of 8 more threads
2023-02-23 14:16:11,256 : INFO : worker thread finished; awaiting finish of 7 more threads
2023-02-23 14:16:11,258 : INFO : worker thread finished; awaiting finish of 6 more threads
2023-02-23 14:16:11,263 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 14:16:11,265 : INFO : worker thread finished; awaiting finish of 4 more threads
2023-02-23 14:16:11,269 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-02-23 14:16:11,274 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-02-23 14:16:11,276 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-02-23 14:16:11,277 : INFO : worker thread finished; awaiting finish of 0 more threads
2023-02-23 14:16:11,278 : INFO : EPOCH - 24 : training on 53124816 raw words (41917369 effective words) took 32.8s, 1279679 effective words/s
2023-02-23 14:16:12,290 : INFO : EPOCH 

2023-02-23 14:16:59,531 : INFO : EPOCH 26 - PROGRESS: at 50.48% examples, 1308926 words/s, in_qsize 40, out_qsize 1
2023-02-23 14:17:00,555 : INFO : EPOCH 26 - PROGRESS: at 53.31% examples, 1300510 words/s, in_qsize 36, out_qsize 10
2023-02-23 14:17:01,556 : INFO : EPOCH 26 - PROGRESS: at 56.67% examples, 1305043 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:17:02,560 : INFO : EPOCH 26 - PROGRESS: at 59.74% examples, 1304077 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:17:03,567 : INFO : EPOCH 26 - PROGRESS: at 62.75% examples, 1301525 words/s, in_qsize 37, out_qsize 2
2023-02-23 14:17:04,569 : INFO : EPOCH 26 - PROGRESS: at 65.87% examples, 1303670 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:17:05,570 : INFO : EPOCH 26 - PROGRESS: at 68.99% examples, 1303728 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:17:06,590 : INFO : EPOCH 26 - PROGRESS: at 72.01% examples, 1302735 words/s, in_qsize 40, out_qsize 2
2023-02-23 14:17:07,609 : INFO : EPOCH 26 - PROGRESS: at 75.18% example

2023-02-23 14:17:47,586 : INFO : worker thread finished; awaiting finish of 10 more threads
2023-02-23 14:17:47,589 : INFO : worker thread finished; awaiting finish of 9 more threads
2023-02-23 14:17:47,594 : INFO : worker thread finished; awaiting finish of 8 more threads
2023-02-23 14:17:47,601 : INFO : worker thread finished; awaiting finish of 7 more threads
2023-02-23 14:17:47,603 : INFO : worker thread finished; awaiting finish of 6 more threads
2023-02-23 14:17:47,604 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 14:17:47,606 : INFO : worker thread finished; awaiting finish of 4 more threads
2023-02-23 14:17:47,607 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-02-23 14:17:47,609 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-02-23 14:17:47,611 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-02-23 14:17:47,619 : INFO : worker thread finished; awaiting finish of 0 more thread

2023-02-23 14:18:33,709 : INFO : EPOCH 29 - PROGRESS: at 45.10% examples, 1327150 words/s, in_qsize 40, out_qsize 0
2023-02-23 14:18:34,742 : INFO : EPOCH 29 - PROGRESS: at 48.27% examples, 1325223 words/s, in_qsize 36, out_qsize 3
2023-02-23 14:18:35,747 : INFO : EPOCH 29 - PROGRESS: at 51.45% examples, 1327950 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:18:36,761 : INFO : EPOCH 29 - PROGRESS: at 54.75% examples, 1326319 words/s, in_qsize 40, out_qsize 3
2023-02-23 14:18:37,761 : INFO : EPOCH 29 - PROGRESS: at 57.99% examples, 1330378 words/s, in_qsize 39, out_qsize 0
2023-02-23 14:18:38,803 : INFO : EPOCH 29 - PROGRESS: at 61.15% examples, 1325193 words/s, in_qsize 39, out_qsize 9
2023-02-23 14:18:39,819 : INFO : EPOCH 29 - PROGRESS: at 64.42% examples, 1326518 words/s, in_qsize 33, out_qsize 6
2023-02-23 14:18:40,828 : INFO : EPOCH 29 - PROGRESS: at 67.67% examples, 1329375 words/s, in_qsize 40, out_qsize 1
2023-02-23 14:18:41,840 : INFO : EPOCH 29 - PROGRESS: at 70.57% examples

2023-02-23 14:19:22,788 : INFO : worker thread finished; awaiting finish of 13 more threads
2023-02-23 14:19:22,792 : INFO : worker thread finished; awaiting finish of 12 more threads
2023-02-23 14:19:22,792 : INFO : worker thread finished; awaiting finish of 11 more threads
2023-02-23 14:19:22,796 : INFO : worker thread finished; awaiting finish of 10 more threads
2023-02-23 14:19:22,796 : INFO : worker thread finished; awaiting finish of 9 more threads
2023-02-23 14:19:22,800 : INFO : worker thread finished; awaiting finish of 8 more threads
2023-02-23 14:19:22,800 : INFO : worker thread finished; awaiting finish of 7 more threads
2023-02-23 14:19:22,804 : INFO : worker thread finished; awaiting finish of 6 more threads
2023-02-23 14:19:22,808 : INFO : worker thread finished; awaiting finish of 5 more threads
2023-02-23 14:19:22,812 : INFO : worker thread finished; awaiting finish of 4 more threads
2023-02-23 14:19:22,816 : INFO : worker thread finished; awaiting finish of 3 more thr

In [21]:
# Persist the trained Doc2Vec model to disk.
# NOTE(review): the file name appears to encode the training configuration
# (presumably DBOW, vector_size=600, negative=5, sample=0) -- confirm against
# the cell that constructs the model.
fname = ".//models//doc2vec//model_dbow_vs_600_n_5_s_0.bin"

# Make sure the target folder exists before saving: on a fresh checkout the
# directory may be missing, and a failed save here would come only after the
# long training run above. exist_ok=True keeps the cell safe to re-run.
os.makedirs(os.path.dirname(fname), exist_ok=True)

# Large arrays are written to sibling "<fname>.*.npy" files next to the main
# file, so the whole folder has to be kept together when moving the model.
model.save(fname)

2023-02-23 14:20:13,746 : INFO : Doc2Vec lifecycle event {'fname_or_handle': './/models//doc2vec//model_dbow_vs_600_n_5_s_0.bin', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-02-23T14:20:13.746165', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'saving'}
2023-02-23 14:20:13,747 : INFO : storing np array 'vectors' to .//models//doc2vec//model_dbow_vs_600_n_5_s_0.bin.wv.vectors.npy
2023-02-23 14:20:14,713 : INFO : storing np array 'syn1neg' to .//models//doc2vec//model_dbow_vs_600_n_5_s_0.bin.syn1neg.npy
2023-02-23 14:20:15,819 : INFO : not storing attribute cum_table
2023-02-23 14:20:16,081 : INFO : saved .//models//doc2vec//model_dbow_vs_600_n_5_s_0.bin
