## 1 - Imports

In [None]:
import pandas as pd
import global_options
from nltk.tokenize import sent_tokenize
from pathlib import Path
import stanza
from stanza.server import CoreNLPClient
import os


## 2 - Data manipulation

In [2]:
# Folder holding the raw CSV and the two line-aligned text files written below.
input_dir = Path(global_options.DATA_FOLDER, "input")

articles_raw = pd.read_csv(input_dir / "SustainableInnovation_FinancialTimes.csv")

# Remove duplicates (rows sharing an 'Article title' are treated as one article)
articles_unique = articles_raw.drop_duplicates(subset='Article title')

# Combine title, authors and body into one 'full_text' string per article.
# Missing values are replaced with '' first: string concatenation with NaN
# yields NaN, which would otherwise be written out as the literal text "nan".
text_parts = articles_unique[
    ['Article title', 'Article authors', 'Full article text']
].fillna('').astype(str)
full_text = (
    text_parts['Article title']
    + ' ' + text_parts['Article authors']
    + ' ' + text_parts['Full article text']
)

# Replace every kind of line break with a space. documents.txt stores exactly
# one document per line, so an embedded '\n', '\r\n' or bare '\r' could split a
# document and misalign it with document_ids.txt.
full_text = full_text.str.replace(r'\r\n|\r|\n', ' ', regex=True)

# Keep only the combined text and attach a sequential, 1-based document id.
# A fresh DataFrame is built (instead of assigning into a column slice) so
# pandas does not emit SettingWithCopyWarning.
data = pd.DataFrame({'full_text': full_text})
data['index'] = range(1, len(data) + 1)

# Sanity check: each document must occupy exactly one line in documents.txt.
assert not data['full_text'].str.contains(r'[\r\n]').any()

# Write to text files: line i of documents.txt corresponds to line i of
# document_ids.txt. The encoding is explicit so the output does not depend on
# the platform default.
with open(input_dir / "documents.txt", "w", encoding="utf-8") as file_docs:
    file_docs.writelines(f"{text}\n" for text in data['full_text'])

with open(input_dir / "document_ids.txt", "w", encoding="utf-8") as file_ids:
    file_ids.writelines(f"{doc_id}\n" for doc_id in data['index'])

## 3 - Run the Python scripts to expand the dictionary

In [17]:
# Execute parse_parallel.py with IPython's %run magic.
# Per the recorded output below, the script launches a Stanford CoreNLP server
# (annotators: tokenize, ssplit, pos, lemma, ner, depparse) and prints a
# progress line every 100 input lines. The recorded run spanned roughly
# 00:43 to 09:07, so expect this cell to take several hours.
# NOTE(review): presumably it consumes the documents.txt / document_ids.txt
# files written in section 2 — confirm against parse_parallel.py.
%run parse_parallel.py

2024-03-16 00:43:46 INFO: Writing properties to tmp file: corenlp_server-aa27c3de77e641ba.props
2024-03-16 00:43:46 INFO: Starting server with command: java -Xmx8G -cp /Users/yuruchen/CoreNLP3/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9005 -timeout 12000000 -threads 2 -maxCharLength 1000000 -quiet False -serverProperties corenlp_server-aa27c3de77e641ba.props -preload -outputFormat serialized


2024-03-16 00:43:47.018371
Processing line: 100.


[main] INFO CoreNLP - --- StanfordCoreNLPServer#main() called ---
[main] INFO CoreNLP - Server default properties:
			(Note: unspecified annotator properties are English defaults)
			annotators = tokenize, ssplit, pos, lemma, ner, depparse
			inputFormat = text
			ner.applyFineGrained = false
			outputFormat = serialized
			prettyPrint = false
			threads = 2
[main] INFO CoreNLP - Threads: 2
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator pos
[main] INFO edu.stanford.nlp.tagger.maxent.MaxentTagger - Loading POS tagger from edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger ... done [0.9 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
[main] INFO edu.stanford.nlp.ie.AbstractSequenceClassifier - Loading classifier from edu/stanford/nlp/models/ner/english

2024-03-16 00:49:47.424619
Processing line: 200.
2024-03-16 00:55:37.072096
Processing line: 300.
2024-03-16 01:01:09.310444
Processing line: 400.
2024-03-16 01:06:49.155316
Processing line: 500.
2024-03-16 01:12:14.031469
Processing line: 600.
2024-03-16 01:17:30.363286
Processing line: 700.
2024-03-16 01:22:16.260370
Processing line: 800.
2024-03-16 01:27:23.632465
Processing line: 900.
2024-03-16 01:32:36.107924
Processing line: 1000.
2024-03-16 01:38:37.184010
Processing line: 1100.
2024-03-16 01:44:47.765442
Processing line: 1200.
2024-03-16 01:50:28.909348
Processing line: 1300.
2024-03-16 01:55:49.627356
Processing line: 1400.
2024-03-16 02:00:54.302903
Processing line: 1500.
2024-03-16 02:06:20.018923
Processing line: 1600.
2024-03-16 02:11:30.199081
Processing line: 1700.
2024-03-16 02:17:17.176778
Processing line: 1800.
2024-03-16 02:22:50.930291
Processing line: 1900.
2024-03-16 02:29:00.241419
Processing line: 2000.
2024-03-16 02:35:42.840330
Processing line: 2100.
2024-03-

2024-03-16 09:07:01.330132
Processing line: 16500.


[Thread-0] INFO CoreNLP - CoreNLP Server is shutting down.


In [3]:
# Execute clean_and_train.py with IPython's %run magic.
# The recorded output shows it streaming through about a million lines and then
# training gensim Phrases models ("Training phraser..."): first over
# data/processed/unigram/documents.txt (saved to models/phrases/bigram.mod),
# then over data/processed/bigram/documents.txt.
# NOTE(review): gensim/smart_open DEBUG logging and tqdm bars flood the cell
# output; consider raising the log level or clearing the output before sharing.
%run clean_and_train.py

2024-03-16 11:52:14.235905
Processing line: 200000.
2024-03-16 11:52:24.423754
Processing line: 400000.
2024-03-16 11:52:35.670688
Processing line: 600000.
2024-03-16 11:52:46.389422
Processing line: 800000.
2024-03-16 11:52:58.203864
Processing line: 1000000.
2024-03-16 11:53:02.520718
Training phraser...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/unigram/documents.txt


  0%|                                                | 0/878099 [00:00<?, ?it/s]

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/unigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/unigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types


  1%|▏                                 | 4662/878099 [00:00<00:18, 46505.18it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #10000, processed 104693 words and 84708 word types


  2%|▌                                | 15569/878099 [00:00<00:16, 51850.74it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #20000, processed 218214 words and 156883 word types


  3%|▉                                | 25916/878099 [00:00<00:18, 46678.88it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #30000, processed 332959 words and 221705 word types


  4%|█▎                               | 35721/878099 [00:00<00:17, 48056.62it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #40000, processed 447220 words and 281980 word types


  5%|█▋                               | 45358/878099 [00:00<00:17, 46335.61it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #50000, processed 558613 words and 338356 word types


  7%|██▏                              | 59728/878099 [00:01<00:19, 41380.78it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #60000, processed 670105 words and 395697 word types


  8%|██▌                              | 68701/878099 [00:01<00:18, 43070.33it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #70000, processed 788447 words and 455109 word types


  9%|██▉                              | 78027/878099 [00:01<00:17, 44777.09it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #80000, processed 901934 words and 510004 word types


 10%|███▎                             | 86795/878099 [00:02<00:22, 35862.84it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #90000, processed 1011083 words and 562562 word types


 11%|███▋                             | 99484/878099 [00:02<00:19, 39746.69it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #100000, processed 1124552 words and 619426 word types


 12%|███▉                            | 108853/878099 [00:02<00:17, 43255.90it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #110000, processed 1237503 words and 678364 word types


 13%|████▎                           | 117282/878099 [00:02<00:22, 34453.82it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #120000, processed 1349159 words and 736092 word types


 15%|████▋                           | 129985/878099 [00:03<00:22, 32851.25it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #130000, processed 1459504 words and 787598 word types


 16%|█████                           | 137965/878099 [00:03<00:21, 35176.31it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #140000, processed 1570464 words and 833498 word types


 17%|█████▍                          | 147867/878099 [00:03<00:17, 41978.18it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #150000, processed 1681022 words and 877956 word types


 18%|█████▋                          | 156886/878099 [00:03<00:16, 42821.43it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #160000, processed 1787351 words and 920598 word types


 19%|██████▏                         | 169164/878099 [00:04<00:18, 37697.67it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #170000, processed 1894541 words and 969866 word types


 20%|██████▍                         | 178069/878099 [00:04<00:16, 41290.77it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #180000, processed 1998868 words and 1011325 word types


 22%|██████▉                         | 189192/878099 [00:04<00:22, 30692.99it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #190000, processed 2110460 words and 1056821 word types


 22%|███████▏                        | 197539/878099 [00:04<00:18, 36038.77it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #200000, processed 2214717 words and 1097121 word types


 23%|███████▌                        | 205969/878099 [00:05<00:17, 39280.95it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #210000, processed 2325317 words and 1141462 word types


 25%|███████▊                        | 215797/878099 [00:05<00:14, 44155.94it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #220000, processed 2428438 words and 1184704 word types


 26%|████████▎                       | 229726/878099 [00:05<00:14, 44691.22it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #230000, processed 2538999 words and 1228802 word types


 27%|████████▋                       | 239221/878099 [00:05<00:14, 43991.01it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #240000, processed 2651202 words and 1274273 word types


 28%|█████████                       | 247791/878099 [00:06<00:17, 35404.35it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #250000, processed 2762200 words and 1319309 word types


 29%|█████████▍                      | 257967/878099 [00:06<00:20, 30612.90it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #260000, processed 2874579 words and 1365048 word types


 31%|█████████▊                      | 269051/878099 [00:06<00:20, 29211.29it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #270000, processed 2985060 words and 1416672 word types


 31%|██████████                      | 276535/878099 [00:07<00:18, 33110.17it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #280000, processed 3097558 words and 1469699 word types


 33%|██████████▌                     | 288477/878099 [00:07<00:15, 37798.79it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #290000, processed 3209575 words and 1521103 word types


 34%|██████████▊                     | 296508/878099 [00:07<00:14, 38799.17it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #300000, processed 3316288 words and 1573362 word types


 35%|███████████▏                    | 308401/878099 [00:07<00:15, 37030.17it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #310000, processed 3420206 words and 1620835 word types


 36%|███████████▌                    | 316876/878099 [00:08<00:14, 39795.00it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #320000, processed 3530317 words and 1672259 word types


 37%|███████████▉                    | 328967/878099 [00:08<00:14, 38660.22it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #330000, processed 3642384 words and 1723679 word types


 38%|████████████▎                   | 338068/878099 [00:08<00:13, 40851.90it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #340000, processed 3750758 words and 1772176 word types


 39%|████████████▋                   | 346597/878099 [00:08<00:12, 41721.23it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #350000, processed 3864840 words and 1815447 word types


 41%|█████████████                   | 358824/878099 [00:09<00:13, 38769.73it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #360000, processed 3982758 words and 1852213 word types


 42%|█████████████▎                  | 366681/878099 [00:09<00:13, 37861.27it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #370000, processed 4096141 words and 1890400 word types


 43%|█████████████▊                  | 379929/878099 [00:09<00:12, 41462.09it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #380000, processed 4204453 words and 1927118 word types


 44%|██████████████▏                 | 388742/878099 [00:09<00:11, 42949.69it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #390000, processed 4318542 words and 1965308 word types


 45%|██████████████▍                 | 396808/878099 [00:10<00:13, 35436.55it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #400000, processed 4435459 words and 2010775 word types


 47%|██████████████▉                 | 408644/878099 [00:10<00:12, 37786.07it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #410000, processed 4551508 words and 2057367 word types


 47%|███████████████▏                | 416631/878099 [00:10<00:12, 37487.37it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #420000, processed 4667989 words and 2103350 word types


 48%|███████████████▍                | 425254/878099 [00:10<00:11, 40676.07it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #430000, processed 4781991 words and 2147498 word types


 50%|███████████████▉                | 439024/878099 [00:11<00:10, 41669.71it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #440000, processed 4894259 words and 2187605 word types


 51%|████████████████▎               | 447406/878099 [00:11<00:10, 39865.60it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #450000, processed 5009382 words and 2229279 word types


 52%|████████████████▌               | 455490/878099 [00:11<00:10, 39374.89it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #460000, processed 5119987 words and 2267843 word types


 53%|█████████████████               | 468173/878099 [00:12<00:10, 38964.06it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #470000, processed 5233241 words and 2303507 word types


 54%|█████████████████▎              | 475980/878099 [00:12<00:10, 37933.87it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #480000, processed 5339512 words and 2336119 word types


 56%|█████████████████▊              | 487416/878099 [00:12<00:12, 31822.02it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #490000, processed 5447882 words and 2379980 word types


 57%|██████████████████▏             | 498785/878099 [00:12<00:10, 34965.50it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #500000, processed 5550721 words and 2419466 word types


 58%|██████████████████▍             | 507574/878099 [00:13<00:09, 39129.63it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #510000, processed 5657046 words and 2463450 word types


 59%|██████████████████▉             | 519516/878099 [00:13<00:11, 31922.34it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #520000, processed 5759442 words and 2502928 word types


 60%|███████████████████▏            | 526385/878099 [00:13<00:10, 32434.98it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #530000, processed 5868421 words and 2537999 word types


 61%|███████████████████▋            | 539181/878099 [00:14<00:08, 39381.36it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #540000, processed 5976986 words and 2572795 word types


 62%|███████████████████▉            | 547398/878099 [00:14<00:12, 26159.53it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #550000, processed 6082527 words and 2607593 word types


 64%|████████████████████▎           | 557903/878099 [00:14<00:10, 30734.51it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #560000, processed 6187193 words and 2640347 word types


 65%|████████████████████▋           | 567551/878099 [00:15<00:11, 26890.54it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #570000, processed 6295210 words and 2672903 word types


 66%|█████████████████████           | 577318/878099 [00:15<00:10, 29210.05it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #580000, processed 6398404 words and 2703745 word types


 67%|█████████████████████▎          | 586169/878099 [00:15<00:08, 36303.64it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #590000, processed 6510641 words and 2734698 word types


 68%|█████████████████████▊          | 598627/878099 [00:16<00:07, 36384.78it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #600000, processed 6625616 words and 2765884 word types


 69%|██████████████████████▏         | 608138/878099 [00:16<00:06, 41530.22it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #610000, processed 6739628 words and 2801006 word types


 71%|██████████████████████▌         | 619998/878099 [00:16<00:09, 28308.44it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #620000, processed 6840448 words and 2835050 word types


 72%|██████████████████████▉         | 629486/878099 [00:17<00:09, 26940.01it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #630000, processed 6946484 words and 2870144 word types


 73%|███████████████████████▎        | 638308/878099 [00:17<00:09, 26126.80it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #640000, processed 7059927 words and 2910141 word types


 74%|███████████████████████▌        | 647255/878099 [00:17<00:10, 22666.98it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #650000, processed 7173250 words and 2946416 word types


 75%|████████████████████████        | 658699/878099 [00:18<00:08, 27258.85it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #660000, processed 7291368 words and 2985013 word types


 76%|████████████████████████▎       | 668690/878099 [00:18<00:06, 31450.12it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #670000, processed 7407944 words and 3023920 word types


 77%|████████████████████████▋       | 678788/878099 [00:19<00:06, 30063.80it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #680000, processed 7520513 words and 3060366 word types


 78%|█████████████████████████       | 687332/878099 [00:19<00:05, 36071.42it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #690000, processed 7629195 words and 3095622 word types


 79%|█████████████████████████▍      | 696447/878099 [00:19<00:04, 41185.69it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #700000, processed 7733803 words and 3128577 word types


 81%|█████████████████████████▊      | 709350/878099 [00:19<00:04, 40988.77it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #710000, processed 7847124 words and 3163857 word types


 82%|██████████████████████████▏     | 718120/878099 [00:19<00:03, 42521.34it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #720000, processed 7959056 words and 3199677 word types


 83%|██████████████████████████▌     | 727502/878099 [00:20<00:03, 44975.57it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #730000, processed 8068621 words and 3235818 word types


 84%|██████████████████████████▊     | 736710/878099 [00:20<00:03, 45362.96it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #740000, processed 8181723 words and 3272573 word types


 85%|███████████████████████████▏    | 745868/878099 [00:20<00:02, 45393.93it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #750000, processed 8295324 words and 3307326 word types


 86%|███████████████████████████▌    | 755312/878099 [00:20<00:02, 45716.75it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #760000, processed 8414148 words and 3341883 word types


 87%|███████████████████████████▉    | 765250/878099 [00:20<00:02, 47494.87it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #770000, processed 8532313 words and 3379169 word types


 88%|████████████████████████████▏   | 775184/878099 [00:21<00:02, 48750.47it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #780000, processed 8644019 words and 3416473 word types


 89%|████████████████████████████▋   | 785825/878099 [00:21<00:01, 51015.11it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #790000, processed 8754656 words and 3450483 word types


 91%|█████████████████████████████   | 795951/878099 [00:21<00:01, 49828.03it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #800000, processed 8867048 words and 3484998 word types


 92%|█████████████████████████████▍  | 807705/878099 [00:21<00:01, 54941.90it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #810000, processed 8963007 words and 3513505 word types


 93%|█████████████████████████████▊  | 819235/878099 [00:21<00:01, 53704.68it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #820000, processed 9078562 words and 3549784 word types


 95%|██████████████████████████████▏ | 829853/878099 [00:22<00:00, 50010.67it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #830000, processed 9191042 words and 3586541 word types


 96%|██████████████████████████████▌ | 839767/878099 [00:22<00:00, 47162.85it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #840000, processed 9301362 words and 3633062 word types


 96%|██████████████████████████████▊ | 845048/878099 [00:22<00:00, 48739.63it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #850000, processed 9408468 words and 3677556 word types


 98%|███████████████████████████████▏| 857328/878099 [00:22<00:00, 37676.74it/s]

INFO:gensim.models.phrases:collected 3709981 word types from a corpus of 9488170 words (unigram + bigrams) and 857328 sentences
INFO:gensim.models.phrases:using 3709981 counts as vocab in Phrases<0 vocab, min_count=10, threshold=10, max_vocab_size=40000000>
INFO:gensim.utils:saving Phrases object under models/phrases/bigram.mod, separately None
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/bigram.mod', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}





INFO:gensim.utils:saved models/phrases/bigram.mod
INFO:gensim.utils:loading Phrases object from models/phrases/bigram.mod
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/bigram.mod', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loaded models/phrases/bigram.mod


100%|████████████████████████████████| 878099/878099 [00:52<00:00, 16780.22it/s]


2024-03-16 11:54:33.869845
Training phraser...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/bigram/documents.txt


  0%|                                                | 0/878099 [00:00<?, ?it/s]

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/bigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/bigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types


  1%|▎                                 | 9236/878099 [00:00<00:18, 47379.32it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #10000, processed 97737 words and 85965 word types


  2%|▋                                | 19348/878099 [00:00<00:17, 49489.44it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #20000, processed 202402 words and 161015 word types


  3%|█                                | 29596/878099 [00:00<00:17, 48220.11it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #30000, processed 308522 words and 229172 word types


  5%|█▍                               | 39660/878099 [00:00<00:17, 49308.57it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #40000, processed 414193 words and 292762 word types


  6%|█▊                               | 49825/878099 [00:01<00:21, 37902.33it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #50000, processed 517662 words and 352438 word types


  7%|██▏                              | 59768/878099 [00:01<00:18, 43232.21it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #60000, processed 622468 words and 412903 word types


  8%|██▌                              | 68934/878099 [00:01<00:18, 43642.86it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #70000, processed 733808 words and 475569 word types


  9%|██▉                              | 78116/878099 [00:01<00:18, 44215.00it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #80000, processed 840748 words and 533549 word types


 10%|███▎                             | 87878/878099 [00:01<00:16, 46691.11it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #90000, processed 943828 words and 588976 word types


 11%|███▋                             | 97248/878099 [00:02<00:17, 45871.76it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #100000, processed 1050551 words and 649051 word types


 12%|███▉                            | 106366/878099 [00:02<00:17, 43657.76it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #110000, processed 1157561 words and 711051 word types


 14%|████▎                           | 118856/878099 [00:02<00:24, 31559.37it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #120000, processed 1264106 words and 771645 word types


 15%|████▋                           | 127595/878099 [00:03<00:20, 37099.62it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #130000, processed 1368424 words and 826049 word types


 16%|████▉                           | 136678/878099 [00:03<00:18, 40842.33it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #140000, processed 1472097 words and 875382 word types


 17%|█████▎                          | 145656/878099 [00:03<00:17, 42884.47it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #150000, processed 1575358 words and 923202 word types


 18%|█████▊                          | 159985/878099 [00:03<00:16, 44107.00it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #160000, processed 1675176 words and 968950 word types


 19%|██████▏                         | 169575/878099 [00:04<00:15, 45904.53it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #170000, processed 1776982 words and 1020963 word types


 20%|██████▌                         | 179749/878099 [00:04<00:14, 48562.00it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #180000, processed 1875030 words and 1065521 word types


 22%|██████▉                         | 189387/878099 [00:04<00:14, 47045.62it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #190000, processed 1979437 words and 1114294 word types


 23%|███████▎                        | 199138/878099 [00:04<00:14, 47671.31it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #200000, processed 2077120 words and 1157580 word types


 24%|███████▌                        | 209015/878099 [00:04<00:13, 48535.55it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #210000, processed 2180888 words and 1205165 word types


 25%|███████▉                        | 218586/878099 [00:05<00:14, 46578.19it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #220000, processed 2278691 words and 1251067 word types


 26%|████████▎                       | 227705/878099 [00:05<00:14, 44054.98it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #230000, processed 2382345 words and 1298412 word types


 27%|████████▋                       | 236901/878099 [00:05<00:14, 44277.25it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #240000, processed 2487276 words and 1347239 word types


 28%|████████▉                       | 245980/878099 [00:05<00:14, 44782.74it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #250000, processed 2591412 words and 1395582 word types


 30%|█████████▍                      | 259984/878099 [00:06<00:16, 38396.01it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #260000, processed 2697101 words and 1444837 word types


 31%|█████████▊                      | 269030/878099 [00:06<00:14, 41588.38it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #270000, processed 2802629 words and 1499429 word types


 32%|██████████                      | 277637/878099 [00:06<00:15, 37883.03it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #280000, processed 2909626 words and 1555335 word types


 33%|██████████▍                     | 287044/878099 [00:06<00:14, 42010.70it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #290000, processed 3016705 words and 1609444 word types


 34%|██████████▊                     | 296016/878099 [00:06<00:13, 42196.73it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #300000, processed 3119372 words and 1664064 word types


 35%|███████████                     | 305210/878099 [00:07<00:12, 44249.45it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #310000, processed 3219226 words and 1714012 word types


 36%|███████████▋                    | 319374/878099 [00:07<00:12, 44860.27it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #320000, processed 3324744 words and 1767834 word types


 37%|███████████▉                    | 328540/878099 [00:07<00:12, 44952.40it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #330000, processed 3431781 words and 1822028 word types


 39%|████████████▎                   | 338430/878099 [00:07<00:11, 45773.56it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #340000, processed 3535404 words and 1873213 word types


 40%|████████████▋                   | 347707/878099 [00:08<00:11, 45003.84it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #350000, processed 3642656 words and 1919937 word types


 41%|█████████████                   | 356731/878099 [00:08<00:11, 44493.64it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #360000, processed 3751051 words and 1960918 word types


 42%|█████████████▎                  | 365603/878099 [00:08<00:11, 44095.92it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #370000, processed 3856953 words and 2002769 word types


 43%|█████████████▊                  | 378806/878099 [00:08<00:14, 34414.70it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #380000, processed 3958477 words and 2043134 word types


 44%|██████████████                  | 386768/878099 [00:09<00:13, 35906.84it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #390000, processed 4064747 words and 2085064 word types


 45%|██████████████▌                 | 398564/878099 [00:09<00:13, 35303.78it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #400000, processed 4174727 words and 2133901 word types


 46%|██████████████▊                 | 406365/878099 [00:09<00:12, 37160.06it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #410000, processed 4284644 words and 2183773 word types


 48%|███████████████▏                | 418251/878099 [00:09<00:11, 38896.79it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #420000, processed 4395575 words and 2233185 word types


 49%|███████████████▌                | 426200/878099 [00:10<00:11, 37908.87it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #430000, processed 4503678 words and 2280727 word types


 50%|███████████████▉                | 438845/878099 [00:10<00:10, 40737.15it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #440000, processed 4608749 words and 2324342 word types


 51%|████████████████▎               | 447022/878099 [00:10<00:10, 40319.62it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #450000, processed 4716874 words and 2369710 word types


 52%|████████████████▋               | 458653/878099 [00:11<00:12, 34873.66it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #460000, processed 4821251 words and 2411689 word types


 53%|████████████████▉               | 466288/878099 [00:11<00:12, 33802.96it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #470000, processed 4926621 words and 2450806 word types


 54%|█████████████████▎              | 475483/878099 [00:11<00:10, 39703.80it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #480000, processed 5025698 words and 2486539 word types


 56%|█████████████████▊              | 488739/878099 [00:11<00:09, 39742.08it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #490000, processed 5129623 words and 2532715 word types


 57%|██████████████████▏             | 498480/878099 [00:12<00:08, 44084.15it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #500000, processed 5227936 words and 2574602 word types


 58%|██████████████████▌             | 508395/878099 [00:12<00:07, 47093.79it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #510000, processed 5329840 words and 2620811 word types


 59%|██████████████████▊             | 517734/878099 [00:12<00:07, 45486.14it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #520000, processed 5428015 words and 2662452 word types


 60%|███████████████████▏            | 526640/878099 [00:12<00:08, 40010.32it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #530000, processed 5530624 words and 2700482 word types


 61%|███████████████████▋            | 538595/878099 [00:12<00:09, 37590.70it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #540000, processed 5632829 words and 2738293 word types


 63%|████████████████████            | 549880/878099 [00:13<00:08, 37112.86it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #550000, processed 5732551 words and 2776113 word types


 64%|████████████████████▎           | 557838/878099 [00:13<00:11, 28498.63it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #560000, processed 5831484 words and 2811835 word types


 65%|████████████████████▊           | 569730/878099 [00:13<00:09, 33258.79it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #570000, processed 5933307 words and 2847453 word types


 66%|█████████████████████           | 579414/878099 [00:14<00:07, 40159.09it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #580000, processed 6030496 words and 2881302 word types


 67%|█████████████████████▍          | 587850/878099 [00:14<00:07, 39908.24it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #590000, processed 6135358 words and 2915478 word types


 68%|█████████████████████▋          | 595968/878099 [00:14<00:07, 36621.19it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #600000, processed 6241375 words and 2950634 word types


 69%|██████████████████████▏         | 608052/878099 [00:14<00:06, 38887.48it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #610000, processed 6347646 words and 2989567 word types


 70%|██████████████████████▍         | 616363/878099 [00:15<00:06, 39490.50it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #620000, processed 6443423 words and 3026282 word types


 72%|██████████████████████▉         | 629789/878099 [00:15<00:05, 41442.97it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #630000, processed 6544291 words and 3064323 word types


 73%|███████████████████████▎        | 638062/878099 [00:15<00:06, 39962.12it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #640000, processed 6651819 words and 3107300 word types


 74%|███████████████████████▌        | 646071/878099 [00:15<00:05, 39488.35it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #650000, processed 6758504 words and 3146762 word types


 75%|███████████████████████▉        | 657595/878099 [00:16<00:06, 34702.99it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #660000, processed 6869853 words and 3189030 word types


 76%|████████████████████████▍       | 669034/878099 [00:16<00:06, 33227.61it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #670000, processed 6980064 words and 3231654 word types


 77%|████████████████████████▋       | 676576/878099 [00:16<00:05, 35518.43it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #680000, processed 7086955 words and 3271165 word types


 78%|█████████████████████████       | 687844/878099 [00:17<00:05, 34863.88it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #690000, processed 7190310 words and 3309166 word types


 80%|█████████████████████████▍      | 699328/878099 [00:17<00:04, 36042.62it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #700000, processed 7289821 words and 3344735 word types


 81%|█████████████████████████▊      | 706942/878099 [00:17<00:04, 36539.04it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #710000, processed 7396774 words and 3383334 word types


 82%|██████████████████████████▏     | 717876/878099 [00:17<00:04, 35404.27it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #720000, processed 7502881 words and 3422252 word types


 83%|██████████████████████████▍     | 726865/878099 [00:18<00:03, 40217.55it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #730000, processed 7607465 words and 3461569 word types


 84%|██████████████████████████▉     | 739117/878099 [00:18<00:03, 39285.20it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #740000, processed 7714590 words and 3501491 word types


 85%|███████████████████████████▏    | 746970/878099 [00:18<00:03, 37550.53it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #750000, processed 7821838 words and 3539473 word types


 86%|███████████████████████████▌    | 757636/878099 [00:19<00:04, 29034.59it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #760000, processed 7933426 words and 3577546 word types


 88%|████████████████████████████    | 769174/878099 [00:19<00:03, 33832.87it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #770000, processed 8045359 words and 3618220 word types


 88%|████████████████████████████▎   | 776949/878099 [00:19<00:03, 33698.44it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #780000, processed 8151435 words and 3658536 word types


 90%|████████████████████████████▊   | 789127/878099 [00:19<00:02, 35262.06it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #790000, processed 8255893 words and 3695777 word types


 91%|█████████████████████████████▏  | 799496/878099 [00:20<00:02, 33146.15it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #800000, processed 8362307 words and 3733605 word types


 92%|█████████████████████████████▍  | 807022/878099 [00:20<00:01, 35579.95it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #810000, processed 8453399 words and 3764561 word types


 93%|█████████████████████████████▊  | 818030/878099 [00:20<00:01, 34284.70it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #820000, processed 8562404 words and 3804095 word types


 94%|██████████████████████████████▏ | 827963/878099 [00:21<00:01, 30421.56it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #830000, processed 8669483 words and 3843916 word types


 95%|██████████████████████████████▌ | 837163/878099 [00:21<00:01, 25685.02it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #840000, processed 8775832 words and 3892673 word types


 97%|██████████████████████████████▉ | 848726/878099 [00:21<00:00, 33462.41it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #850000, processed 8879321 words and 3939249 word types


 98%|███████████████████████████████▏| 857328/878099 [00:22<00:00, 38667.76it/s]

INFO:gensim.models.phrases:collected 3973422 word types from a corpus of 8956235 words (unigram + bigrams) and 857328 sentences
INFO:gensim.models.phrases:using 3973422 counts as vocab in Phrases<0 vocab, min_count=10, threshold=10, max_vocab_size=40000000>
INFO:gensim.utils:saving Phrases object under models/phrases/trigram.mod, separately None
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/trigram.mod', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}





INFO:gensim.utils:saved models/phrases/trigram.mod
INFO:gensim.utils:loading Phrases object from models/phrases/trigram.mod
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/trigram.mod', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loaded models/phrases/trigram.mod


100%|████████████████████████████████| 878099/878099 [00:58<00:00, 15136.63it/s]


2024-03-16 11:56:15.440236
Training w2v model...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/trigram/documents.txt
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 95780 words, keeping 16704 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processe

INFO:gensim.models.word2vec:PROGRESS: at sentence #690000, processed 7062354 words, keeping 308052 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #700000, processed 7160490 words, keeping 310909 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #710000, processed 7265593 words, keeping 313970 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #720000, processed 7370033 words, keeping 317284 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #730000, processed 7473381 words, keeping 320600 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #740000, processed 7578754 words, keeping 324139 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #750000, processed 7684286 words, keeping 327436 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #760000, processed 7793770 words, keeping 330630 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #770000, processed 7903943 words, keeping 334148 word types
INFO:gensi

INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 93.42% examples, 325098 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 2 - PROGRESS: at 98.57% examples, 326635 words/s, in_qsize 3, out_qsize 0
DEBUG:gensim.models.base_any2vec:job loop exiting, total 881 jobs
DEBUG:gensim.models.base_any2vec:worker exiting, processed 441 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 440 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 2 : training on 8802600 raw words (7174476 effective words) took 21.9s, 327474 effective words/s
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 

INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 43.05% examples, 329646 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 46.65% examples, 322629 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 49.43% examples, 308564 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 52.12% examples, 299749 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 55.79% examples, 296412 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 60.84% examples, 295385 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 66.59% examples, 301550 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 71.85% examples, 305262 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 5 - PROGRESS: at 76.45% examples, 306784 words/s, in_qsize 3, out_

INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 5.54% examples, 390486 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 10.80% examples, 383337 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 15.49% examples, 369230 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 21.46% examples, 379402 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 27.09% examples, 378216 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 30.72% examples, 357944 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 36.13% examples, 361997 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 41.37% examples, 363852 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 8 - PROGRESS: at 46.96% examples, 368884 words/s, in_qsize 3, out_q

INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 443 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 10 : training on 8802600 raw words (7174385 effective words) took 20.0s, 358496 effective words/s
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.base_any2vec:EPOCH 11 - PROGRESS: at 3.71% examples, 262017 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 11 - PROGRESS: at 9.30% examples, 330046 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 11 - PROGRESS:

INFO:gensim.models.base_any2vec:EPOCH 13 - PROGRESS: at 76.66% examples, 385539 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 13 - PROGRESS: at 82.39% examples, 387402 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 13 - PROGRESS: at 87.49% examples, 386578 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 13 - PROGRESS: at 94.21% examples, 392446 words/s, in_qsize 3, out_qsize 0
DEBUG:gensim.models.base_any2vec:job loop exiting, total 881 jobs
DEBUG:gensim.models.base_any2vec:worker exiting, processed 439 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 442 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 13 : training on 8802600 raw words (7173877 effective words) took 18.0s, 398070 effective words/s
INFO:gensim.models.word2vec:

INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 30.72% examples, 361388 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 36.59% examples, 369316 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 40.71% examples, 360788 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 46.54% examples, 367096 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 52.12% examples, 371467 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 57.63% examples, 366277 words/s, in_qsize 4, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 63.99% examples, 371259 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 70.43% examples, 376867 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 75.46% examples, 375246 words/s, in_qsiz

INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.base_any2vec:EPOCH 19 - PROGRESS: at 5.54% examples, 382224 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 19 - PROGRESS: at 9.63% examples, 340410 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 19 - PROGRESS: at 15.13% examples, 359253 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 19 - PROGRESS: at 20.76% examples, 365591 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 19 - PROGRESS: at 26.52% examples, 360894 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 19 - PROGRESS: at 31.49% examples, 358333 words/s, in_qsize 3

In [1]:
# Execute the project script create_dict.py through IPython's %run magic.
# Per the output recorded below this cell, the script reports the vocabulary
# size of the w2v model, creates and deduplicates the dictionary, and saves it
# at outputs/dict/expanded_dict.csv.
# NOTE(review): presumably depends on the word2vec model trained by the
# preceding step being on disk already -- confirm in create_dict.py.
# NOTE(review): the recorded execution count (In [1]) is out of sequence with
# the earlier cells (In [2], In [17]); verify that Restart & Run All
# reproduces this step from a fresh kernel.
%run create_dict.py

Vocab size in the w2v model: 57115
Dictionary created. 
Dictionary deduplicated. 
Dictionary saved at outputs/dict/expanded_dict.csv
