## 1 - Imports

In [None]:
import pandas as pd
import global_options
from nltk.tokenize import sent_tokenize
from pathlib import Path
import stanza
from stanza.server import CoreNLPClient
import os

## 2 - Data manipulation

In [2]:
# Build the corpus files used by the scripts run later in this notebook:
#   - documents.txt    : one article per line (title + authors + body)
#   - document_ids.txt : the matching 1-based document ID per line
input_dir = Path(global_options.DATA_FOLDER, "input")
data = pd.read_csv(input_dir / "Blockchain_FinancialTimes.csv")

# Remove duplicates (rows sharing an 'Article title'; the first one is kept)
data = data.drop_duplicates(subset = 'Article title')

# Combine title, authors and body into a single text per article.
# Missing cells are turned into empty strings first: str + NaN is NaN, so a
# single missing author would otherwise erase the whole article and the
# literal text "nan" would be written to documents.txt.
text_columns = ['Article title', 'Article authors', 'Full article text']
text_parts = data[text_columns].fillna('').astype(str)
full_text = (
    text_parts['Article title']
    + ' ' + text_parts['Article authors']
    + ' ' + text_parts['Full article text']
)

# Replace the newline operator as space so every article stays on one line
full_text = full_text.str.replace('\n', ' ', regex = False)

# Repair mojibake: UTF-8 punctuation that was decoded with a single-byte
# codec shows up as 'â' followed by two control characters.
mojibake_fixes = {
    'â\u0080\u0098': "‘",
    'â\u0080\u0099': "’",
    'â\u0080\u009c': "“",
    'â\u0080\u009d': "”",
    'â\u0080\u0094': "—",
}
for garbled, repaired in mojibake_fixes.items():
    full_text = full_text.str.replace(garbled, repaired, regex=False)

# Keep only the desired column. Building a fresh frame (instead of slicing
# the original and assigning into the slice) avoids SettingWithCopyWarning.
data = full_text.to_frame(name='full_text')

# Sequential 1-based document IDs, in the same order as the texts
data['index'] = range(1, len(data)+1)

# Write to text files. The encoding is pinned to UTF-8 because the text now
# contains curly quotes and em dashes; the platform default encoding is not
# guaranteed to be UTF-8.
with open(input_dir / "documents.txt", "w", encoding="utf-8") as file_docs:
    file_docs.writelines(f"{text}\n" for text in data['full_text'])

with open(input_dir / "document_ids.txt", "w", encoding="utf-8") as file_ids:
    file_ids.writelines(f"{doc_id}\n" for doc_id in data['index'])

## 3 - Run the Python scripts to expand the dictionary

In [4]:
# Execute parse_parallel.py with IPython's %run magic.
# Per the recorded output below, the script starts a CoreNLP server
# (annotators: tokenize, ssplit, pos, lemma, ner, depparse) and prints a
# timestamped progress message every 100 lines. It is long-running: the
# recorded log covers roughly an hour for the first 2,100 lines.
# NOTE(review): presumably it consumes the documents.txt / document_ids.txt
# files written in section 2 — confirm in the script.
%run parse_parallel.py

2024-03-16 23:41:56 INFO: Writing properties to tmp file: corenlp_server-4ff0c3070de0458c.props
2024-03-16 23:41:56 INFO: Starting server with command: java -Xmx8G -cp /Users/yuruchen/CoreNLP/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9004 -timeout 12000000 -threads 2 -maxCharLength 1000000 -quiet False -serverProperties corenlp_server-4ff0c3070de0458c.props -preload -outputFormat serialized


2024-03-16 23:41:56.283366
Processing line: 100.


[main] INFO CoreNLP - --- StanfordCoreNLPServer#main() called ---
[main] INFO CoreNLP - Server default properties:
			(Note: unspecified annotator properties are English defaults)
			annotators = tokenize, ssplit, pos, lemma, ner, depparse
			inputFormat = text
			ner.applyFineGrained = false
			outputFormat = serialized
			prettyPrint = false
			threads = 2
[main] INFO CoreNLP - Threads: 2
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator pos
[main] INFO edu.stanford.nlp.tagger.maxent.MaxentTagger - Loading POS tagger from edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger ... done [0.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
[main] INFO edu.stanford.nlp.ie.AbstractSequenceClassifier - Loading classifier from edu/stanford/nlp/models/ner/english

2024-03-16 23:43:52.150247
Processing line: 200.
2024-03-16 23:46:11.234658
Processing line: 300.
2024-03-16 23:48:58.463076
Processing line: 400.
2024-03-17 00:04:15.659402
Processing line: 500.
2024-03-17 00:06:09.288758
Processing line: 600.
2024-03-17 00:11:24.049966
Processing line: 700.
2024-03-17 00:13:18.206309
Processing line: 800.
2024-03-17 00:15:39.783039
Processing line: 900.
2024-03-17 00:17:22.841182
Processing line: 1000.
2024-03-17 00:19:14.551950
Processing line: 1100.
2024-03-17 00:20:59.846853
Processing line: 1200.
2024-03-17 00:23:13.845903
Processing line: 1300.
2024-03-17 00:24:59.245850
Processing line: 1400.
2024-03-17 00:26:52.328737
Processing line: 1500.
2024-03-17 00:28:38.332137
Processing line: 1600.
2024-03-17 00:30:23.306562
Processing line: 1700.
2024-03-17 00:32:08.725556
Processing line: 1800.
2024-03-17 00:34:02.936031
Processing line: 1900.
2024-03-17 00:36:07.781849
Processing line: 2000.
2024-03-17 00:38:13.653082
Processing line: 2100.
2024-03-

[Thread-0] INFO CoreNLP - CoreNLP Server is shutting down.


In [5]:
# Execute clean_and_train.py with IPython's %run magic.
# Per the recorded output below, the script trains a phraser twice (saving
# models/phrases/bigram.mod and models/phrases/trigram.mod) and then trains
# a word2vec model on data/processed/trigram/documents.txt.
# NOTE(review): presumably it requires the output of parse_parallel.py —
# confirm in the script.
%run clean_and_train.py

2024-03-17 02:10:50.042162
Processing line: 200000.
2024-03-17 02:10:57.907009
Processing line: 400000.
2024-03-17 02:11:03.644445
Training phraser...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/unigram/documents.txt


  0%|                                                | 0/341135 [00:00<?, ?it/s]

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/unigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/unigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types


  2%|▊                                 | 8350/341135 [00:00<00:03, 83485.95it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #10000, processed 90818 words and 70981 word types


  5%|█▋                               | 17644/341135 [00:00<00:03, 89040.45it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #20000, processed 185424 words and 131273 word types


  8%|██▌                              | 26549/341135 [00:00<00:03, 87588.70it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #30000, processed 285132 words and 192197 word types


 10%|███▍                             | 35311/341135 [00:00<00:03, 87415.08it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #40000, processed 378659 words and 243789 word types


 13%|████▎                            | 44055/341135 [00:00<00:03, 85331.36it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #50000, processed 487333 words and 303281 word types


 15%|█████                            | 52597/341135 [00:00<00:03, 80807.74it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #60000, processed 595919 words and 362199 word types


 20%|██████▋                          | 68588/341135 [00:00<00:03, 77015.68it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #70000, processed 706287 words and 419365 word types


 22%|███████▍                         | 76309/341135 [00:00<00:03, 75597.01it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #80000, processed 809934 words and 468445 word types


 25%|████████▏                        | 84452/341135 [00:01<00:03, 77306.52it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #90000, processed 913909 words and 514585 word types


 29%|█████████▋                       | 99877/341135 [00:01<00:03, 76362.55it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #100000, processed 1017279 words and 561666 word types


 32%|██████████                      | 107672/341135 [00:01<00:03, 76828.71it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #110000, processed 1121098 words and 610417 word types


 34%|██████████▊                     | 115361/341135 [00:01<00:02, 75378.73it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #120000, processed 1231387 words and 663131 word types


 36%|███████████▌                    | 122907/341135 [00:01<00:02, 74105.87it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #130000, processed 1342720 words and 714610 word types


 40%|████████████▉                   | 137633/341135 [00:01<00:02, 69794.83it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #140000, processed 1446361 words and 759740 word types


 42%|█████████████▌                  | 144677/341135 [00:01<00:02, 69624.26it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #150000, processed 1561165 words and 808616 word types


 46%|██████████████▉                 | 158581/341135 [00:02<00:02, 67943.86it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #160000, processed 1679738 words and 863039 word types


 48%|███████████████▌                | 165396/341135 [00:02<00:02, 66678.55it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #170000, processed 1794559 words and 913802 word types


 52%|████████████████▊               | 178801/341135 [00:02<00:02, 66487.07it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #180000, processed 1911366 words and 964164 word types


 54%|█████████████████▍              | 185458/341135 [00:02<00:02, 66082.82it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #190000, processed 2026809 words and 1015071 word types


 58%|██████████████████▋             | 199287/341135 [00:02<00:02, 67671.88it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #200000, processed 2136251 words and 1058754 word types


 60%|███████████████████▎            | 206059/341135 [00:02<00:02, 67151.11it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #210000, processed 2253258 words and 1106543 word types


 64%|████████████████████▌           | 219553/341135 [00:03<00:01, 65624.49it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #220000, processed 2365538 words and 1156779 word types


 66%|█████████████████████▏          | 226217/341135 [00:03<00:01, 65920.96it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #230000, processed 2481135 words and 1205209 word types


 70%|██████████████████████▍         | 239410/341135 [00:03<00:01, 65477.11it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #240000, processed 2596184 words and 1251523 word types


 72%|███████████████████████         | 246231/341135 [00:03<00:01, 66283.95it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #250000, processed 2705839 words and 1296158 word types


 74%|███████████████████████▋        | 253091/341135 [00:03<00:01, 66968.86it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #260000, processed 2804425 words and 1336847 word types


 79%|█████████████████████████▏      | 268231/341135 [00:03<00:01, 71128.25it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #270000, processed 2908915 words and 1380207 word types


 81%|█████████████████████████▊      | 275348/341135 [00:03<00:01, 58302.37it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #280000, processed 3019173 words and 1425004 word types


 85%|███████████████████████████     | 288704/341135 [00:04<00:00, 62206.87it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #290000, processed 3130337 words and 1471063 word types


 87%|███████████████████████████▋    | 295396/341135 [00:04<00:00, 63519.10it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #300000, processed 3232514 words and 1514984 word types


 89%|████████████████████████████▍   | 302940/341135 [00:04<00:00, 66911.97it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #310000, processed 3329968 words and 1555480 word types


 93%|█████████████████████████████▊  | 318141/341135 [00:04<00:00, 71552.17it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #320000, processed 3424186 words and 1594182 word types


 95%|██████████████████████████████▌ | 325377/341135 [00:04<00:00, 71047.42it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #330000, processed 3533407 words and 1636374 word types


100%|███████████████████████████████▊| 339541/341135 [00:04<00:00, 68726.82it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #340000, processed 3643472 words and 1681467 word types


100%|███████████████████████████████▉| 340328/341135 [00:04<00:00, 70302.20it/s]

INFO:gensim.models.phrases:collected 1683026 word types from a corpus of 3647052 words (unigram + bigrams) and 340328 sentences
INFO:gensim.models.phrases:using 1683026 counts as vocab in Phrases<0 vocab, min_count=10, threshold=10, max_vocab_size=40000000>
INFO:gensim.utils:saving Phrases object under models/phrases/bigram.mod, separately None
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/bigram.mod', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}





INFO:gensim.utils:saved models/phrases/bigram.mod
INFO:gensim.utils:loading Phrases object from models/phrases/bigram.mod
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/bigram.mod', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loaded models/phrases/bigram.mod


100%|████████████████████████████████| 341135/341135 [00:09<00:00, 34364.60it/s]


2024-03-17 02:11:22.119791
Training phraser...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/bigram/documents.txt


  0%|                                                | 0/341135 [00:00<?, ?it/s]

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/bigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/bigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types


  3%|▉                                 | 9843/341135 [00:00<00:03, 98426.10it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #10000, processed 85724 words and 71988 word types


  6%|█▉                               | 19686/341135 [00:00<00:03, 95824.15it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #20000, processed 174994 words and 134126 word types


  9%|██▊                              | 29274/341135 [00:00<00:03, 91812.58it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #30000, processed 269372 words and 197132 word types


 11%|███▋                             | 38639/341135 [00:00<00:03, 92511.41it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #40000, processed 357696 words and 250723 word types


 14%|████▋                            | 47903/341135 [00:00<00:03, 89210.40it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #50000, processed 459978 words and 312823 word types


 17%|█████▍                           | 56846/341135 [00:00<00:03, 84084.22it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #60000, processed 563198 words and 374122 word types


 19%|██████▎                          | 65305/341135 [00:00<00:03, 82036.32it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #70000, processed 667182 words and 433901 word types


 22%|███████                          | 73541/341135 [00:00<00:03, 80293.89it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #80000, processed 764895 words and 485499 word types


 26%|████████▋                        | 89966/341135 [00:01<00:03, 81099.43it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #90000, processed 863019 words and 534078 word types


 29%|█████████▍                       | 98089/341135 [00:01<00:02, 81108.82it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #100000, processed 960968 words and 583360 word types


 31%|█████████▉                      | 106209/341135 [00:01<00:02, 80437.83it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #110000, processed 1059200 words and 634397 word types


 33%|██████████▋                     | 114259/341135 [00:01<00:02, 78498.06it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #120000, processed 1163858 words and 689745 word types


 38%|████████████▏                   | 129670/341135 [00:01<00:02, 74730.90it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #130000, processed 1269436 words and 743735 word types


 40%|████████████▉                   | 137394/341135 [00:01<00:02, 75451.92it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #140000, processed 1367658 words and 791246 word types


 42%|█████████████▌                  | 144955/341135 [00:01<00:02, 73113.55it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #150000, processed 1475613 words and 842960 word types


 47%|██████████████▉                 | 159467/341135 [00:02<00:02, 70995.67it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #160000, processed 1588420 words and 900222 word types


 49%|███████████████▋                | 166577/341135 [00:02<00:02, 69750.19it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #170000, processed 1698164 words and 953727 word types


 51%|████████████████▎               | 173560/341135 [00:02<00:02, 68670.63it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #180000, processed 1809218 words and 1006892 word types


 55%|█████████████████▌              | 187282/341135 [00:02<00:02, 68355.51it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #190000, processed 1918810 words and 1061152 word types


 57%|██████████████████▏             | 194553/341135 [00:02<00:02, 69630.75it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #200000, processed 2022278 words and 1107628 word types


 61%|███████████████████▌            | 208482/341135 [00:02<00:01, 69070.31it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #210000, processed 2133451 words and 1158398 word types


 63%|████████████████████▏           | 215392/341135 [00:02<00:01, 68114.02it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #220000, processed 2240749 words and 1211050 word types


 67%|█████████████████████▌          | 229508/341135 [00:03<00:01, 68694.72it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #230000, processed 2350578 words and 1262182 word types


 69%|██████████████████████▏         | 236381/341135 [00:03<00:01, 68380.25it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #240000, processed 2459561 words and 1310999 word types


 71%|██████████████████████▊         | 243222/341135 [00:03<00:01, 68016.63it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #250000, processed 2563859 words and 1357859 word types


 76%|████████████████████████▏       | 257771/341135 [00:03<00:01, 70567.66it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #260000, processed 2658006 words and 1400523 word types


 78%|████████████████████████▊       | 264831/341135 [00:03<00:01, 65996.37it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #270000, processed 2757693 words and 1446099 word types


 82%|██████████████████████████      | 278450/341135 [00:04<00:02, 29299.65it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #280000, processed 2862999 words and 1493208 word types


 84%|██████████████████████████▋     | 284863/341135 [00:04<00:01, 34631.26it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #290000, processed 2969503 words and 1541662 word types


 87%|███████████████████████████▉    | 297576/341135 [00:04<00:01, 42801.53it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #300000, processed 3067956 words and 1587564 word types


 89%|████████████████████████████▌   | 305113/341135 [00:04<00:00, 49844.71it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #310000, processed 3161483 words and 1629947 word types


 92%|█████████████████████████████▎  | 312475/341135 [00:04<00:00, 55456.06it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #320000, processed 3251841 words and 1670429 word types


 96%|██████████████████████████████▋ | 327240/341135 [00:04<00:00, 62900.55it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #330000, processed 3355540 words and 1715030 word types


 98%|███████████████████████████████▎| 334130/341135 [00:05<00:00, 64396.09it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #340000, processed 3460730 words and 1762368 word types


100%|███████████████████████████████▉| 340328/341135 [00:05<00:00, 66188.74it/s]

INFO:gensim.models.phrases:collected 1764008 word types from a corpus of 3464175 words (unigram + bigrams) and 340328 sentences
INFO:gensim.models.phrases:using 1764008 counts as vocab in Phrases<0 vocab, min_count=10, threshold=10, max_vocab_size=40000000>
INFO:gensim.utils:saving Phrases object under models/phrases/trigram.mod, separately None
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/trigram.mod', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}





INFO:gensim.utils:saved models/phrases/trigram.mod
INFO:gensim.utils:loading Phrases object from models/phrases/trigram.mod
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/trigram.mod', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loaded models/phrases/trigram.mod


100%|████████████████████████████████| 341135/341135 [00:09<00:00, 34441.26it/s]


2024-03-17 02:11:40.517178
Training w2v model...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/trigram/documents.txt
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 84323 words, keeping 13585 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processe

INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 24.00% examples, 623428 words/s, in_qsize 3, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 47.98% examples, 638858 words/s, in_qsize 3, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 71.49% examples, 655307 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 96.08% examples, 651459 words/s, in_qsize 4, out_qsize 0
DEBUG:gensim.models.base_any2vec:job loop exiting, total 342 jobs
DEBUG:gensim.models.base_any2vec:worker exiting, processed 170 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 172 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 3 : training on 3414983 raw words (2739900 effective words) took 4.2s, 654535 effective words/s
INFO:gensim.models.word2vec:readin

DEBUG:gensim.models.base_any2vec:job loop exiting, total 342 jobs
DEBUG:gensim.models.base_any2vec:worker exiting, processed 171 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 171 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 9 : training on 3414983 raw words (2739209 effective words) took 4.1s, 663342 effective words/s
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.base_any2vec:EPOCH 10 - PROGRESS: at 16.35% examples, 413013 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec

INFO:gensim.models.base_any2vec:EPOCH - 15 : training on 3414983 raw words (2739552 effective words) took 4.6s, 600695 effective words/s
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 25.87% examples, 663465 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 49.34% examples, 657414 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 71.49% examples, 655017 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 16 - PROGRESS: at 96.96% examples, 659011 words/s, in_qsize 4, out_qsize 0
DEBUG:gensim.models.base_any2vec:job loop exiting, total 342 jobs
DEBUG:g

In [13]:
# Execute create_dict.py with IPython's %run magic.
# Per the recorded output below, the script reports the word2vec vocabulary
# size, builds and deduplicates the dictionary, and saves it at
# outputs/dict/expanded_dict.csv.
%run create_dict.py

Vocab size in the w2v model: 29311
Dictionary created. 
Dictionary deduplicated. 
Dictionary saved at outputs/dict/expanded_dict.csv
