## 1 - Imports

In [None]:
import pandas as pd
import global_options
from nltk.tokenize import sent_tokenize
from pathlib import Path
import stanza
from stanza.server import CoreNLPClient
import os

## 2 - Data manipulation

In [3]:
# Build the corpus files from AI_FinancialTimes.csv:
#   - documents.txt     : one article per line (title + authors + body)
#   - document_ids.txt  : the matching 1-based ID for each line
# The two files must stay line-aligned, so every article has to occupy exactly
# one physical line in documents.txt.
input_dir = Path(global_options.DATA_FOLDER, "input")
text_columns = ['Article title', 'Article authors', 'Full article text']

articles_raw = pd.read_csv(input_dir / "AI_FinancialTimes.csv")

# Remove duplicates (rows sharing an 'Article title' are treated as one article)
articles_unique = articles_raw.drop_duplicates(subset='Article title')

# Combine columns into 'full_text'.
# Missing values are replaced by empty strings first: with plain `+`, a single
# NaN field (e.g. no author) makes the whole concatenation NaN, which would then
# be written to disk as the literal text "nan".
text_parts = articles_unique[text_columns].fillna('').astype(str)
full_text = (
    text_parts['Article title']
    + ' ' + text_parts['Article authors']
    + ' ' + text_parts['Full article text']
)

# Replace every line-break character with a space. '\r' is handled as well as
# '\n' because readers using universal newlines treat a bare '\r' as a line end,
# which would shift all following documents relative to their IDs.
full_text = full_text.str.replace(r'[\r\n]', ' ', regex=True)

# Build the result as a fresh frame (instead of assigning into a column slice)
# so pandas does not raise SettingWithCopyWarning.
data = pd.DataFrame({'full_text': full_text})

# Sequential 1-based document IDs, in the same order the documents are written
data['index'] = range(1, len(data) + 1)

# Sanity check: each document must fit on exactly one line
assert not data['full_text'].str.contains(r'[\r\n]').any()

# Write to text files. The encoding is explicit so the output does not depend
# on the platform's locale default.
with open(input_dir / "documents.txt", "w", encoding="utf-8") as file_docs:
    file_docs.writelines(f"{text}\n" for text in data['full_text'])

with open(input_dir / "document_ids.txt", "w", encoding="utf-8") as file_ids:
    file_ids.writelines(f"{doc_id}\n" for doc_id in data['index'])

## 3 - Run the pipeline scripts to expand the dictionary

In [6]:
# Run the parsing script. According to the log recorded below, it starts a
# Stanford CoreNLP server (annotators: tokenize, ssplit, pos, lemma, ner,
# depparse) and reports progress every 100 lines; the recorded run needed
# roughly 5-6 minutes per 100 lines, so this is the slow step of the notebook.
# NOTE(review): presumably it consumes the documents.txt / document_ids.txt
# files written above - confirm against parse_parallel.py.
%run parse_parallel.py

2024-03-16 00:39:59 INFO: Writing properties to tmp file: corenlp_server-3f2e10fe33964cb4.props
2024-03-16 00:39:59 INFO: Starting server with command: java -Xmx8G -cp /Users/yuruchen/CoreNLP1/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9003 -timeout 12000000 -threads 2 -maxCharLength 1000000 -quiet False -serverProperties corenlp_server-3f2e10fe33964cb4.props -preload -outputFormat serialized


2024-03-16 00:39:59.923125
Processing line: 100.


[main] INFO CoreNLP - --- StanfordCoreNLPServer#main() called ---
[main] INFO CoreNLP - Server default properties:
			(Note: unspecified annotator properties are English defaults)
			annotators = tokenize, ssplit, pos, lemma, ner, depparse
			inputFormat = text
			ner.applyFineGrained = false
			outputFormat = serialized
			prettyPrint = false
			threads = 2
[main] INFO CoreNLP - Threads: 2
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator pos
[main] INFO edu.stanford.nlp.tagger.maxent.MaxentTagger - Loading POS tagger from edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger ... done [0.6 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
[main] INFO edu.stanford.nlp.ie.AbstractSequenceClassifier - Loading classifier from edu/stanford/nlp/models/ner/english

2024-03-16 00:44:05.610448
Processing line: 200.
2024-03-16 00:49:52.454312
Processing line: 300.
2024-03-16 00:55:30.997653
Processing line: 400.
2024-03-16 01:00:53.280133
Processing line: 500.
2024-03-16 01:06:53.744023
Processing line: 600.
2024-03-16 01:12:24.276245
Processing line: 700.
2024-03-16 01:17:44.092840
Processing line: 800.
2024-03-16 01:22:41.227862
Processing line: 900.
2024-03-16 01:27:47.043831
Processing line: 1000.
2024-03-16 01:32:56.544332
Processing line: 1100.
2024-03-16 01:38:57.896836
Processing line: 1200.
2024-03-16 01:45:42.919293
Processing line: 1300.
2024-03-16 01:51:30.720173
Processing line: 1400.
2024-03-16 01:57:03.028715
Processing line: 1500.
2024-03-16 02:02:36.684644
Processing line: 1600.
2024-03-16 02:08:08.130982
Processing line: 1700.
2024-03-16 02:13:32.767848
Processing line: 1800.
2024-03-16 02:18:45.701248
Processing line: 1900.
2024-03-16 02:24:46.288262
Processing line: 2000.
2024-03-16 02:31:27.610431
Processing line: 2100.
2024-03-

[Thread-0] INFO CoreNLP - CoreNLP Server is shutting down.


In [2]:
# Run the cleaning and training script. According to the log recorded below, it
# trains a bigram and then a trigram phrase model (saved as
# models/phrases/bigram.mod and models/phrases/trigram.mod) and finally a
# word2vec model on data/processed/trigram/documents.txt for 20 epochs
# (saved as models/w2v/w2v.mod).
%run clean_and_train.py

2024-03-16 10:57:07.357413
Processing line: 200000.
2024-03-16 10:57:16.494286
Processing line: 400000.
2024-03-16 10:57:21.597032
Training phraser...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/unigram/documents.txt


  0%|                                                | 0/303123 [00:00<?, ?it/s]

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/unigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/unigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types


  2%|▌                                 | 5468/303123 [00:00<00:05, 54573.62it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #10000, processed 96497 words and 78812 word types


  6%|█▉                               | 18243/303123 [00:00<00:04, 60660.26it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #20000, processed 199073 words and 146239 word types


  8%|██▋                              | 24311/303123 [00:00<00:04, 56463.52it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #30000, processed 302464 words and 210179 word types


 12%|███▉                             | 36567/303123 [00:00<00:04, 59173.33it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #40000, processed 402044 words and 266829 word types


 16%|█████▎                           | 48376/303123 [00:00<00:04, 53661.90it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #50000, processed 509924 words and 323851 word types


 20%|██████▍                          | 59202/303123 [00:01<00:04, 51179.04it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #60000, processed 615759 words and 378116 word types


 21%|███████                          | 64601/303123 [00:01<00:04, 51975.15it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #70000, processed 711286 words and 431137 word types


 25%|████████▎                        | 76271/303123 [00:01<00:04, 52548.29it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #80000, processed 816444 words and 489182 word types


 29%|█████████▍                       | 86700/303123 [00:01<00:05, 39556.34it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #90000, processed 928603 words and 551329 word types


 32%|██████████▍                      | 95769/303123 [00:01<00:05, 41445.84it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #100000, processed 1037993 words and 610878 word types


 35%|███████████▏                    | 105791/303123 [00:02<00:04, 45701.00it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #110000, processed 1147420 words and 668787 word types


 39%|████████████▌                   | 119342/303123 [00:02<00:04, 37218.89it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #120000, processed 1254216 words and 725082 word types


 41%|█████████████▏                  | 124908/303123 [00:02<00:04, 41933.50it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #130000, processed 1355721 words and 777022 word types


 45%|██████████████▍                 | 136621/303123 [00:02<00:03, 49784.45it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #140000, processed 1448085 words and 821925 word types


 49%|███████████████▋                | 148812/303123 [00:03<00:02, 54994.63it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #150000, processed 1544718 words and 866207 word types


 53%|████████████████▊               | 159716/303123 [00:03<00:02, 51012.09it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #160000, processed 1655931 words and 920705 word types


 56%|█████████████████▉              | 169821/303123 [00:03<00:02, 47178.64it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #170000, processed 1764866 words and 974556 word types


 59%|██████████████████▉             | 179272/303123 [00:03<00:03, 37529.51it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #180000, processed 1872101 words and 1026980 word types


 63%|████████████████████            | 189628/303123 [00:03<00:02, 44101.37it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #190000, processed 1972616 words and 1073419 word types


 66%|█████████████████████           | 199858/303123 [00:04<00:02, 46611.64it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #200000, processed 2073767 words and 1116904 word types


 69%|██████████████████████          | 209577/303123 [00:04<00:02, 39700.91it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #210000, processed 2184476 words and 1166236 word types


 71%|██████████████████████▋         | 214762/303123 [00:04<00:02, 42791.62it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #220000, processed 2298734 words and 1216786 word types


 74%|███████████████████████▊        | 225277/303123 [00:04<00:01, 47358.34it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #230000, processed 2405248 words and 1265856 word types


 78%|████████████████████████▊       | 235437/303123 [00:04<00:01, 47956.57it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #240000, processed 2517914 words and 1316205 word types


 81%|█████████████████████████▉      | 245179/303123 [00:05<00:01, 47138.08it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #250000, processed 2624505 words and 1361544 word types


 86%|███████████████████████████▍    | 259553/303123 [00:05<00:01, 39133.03it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #260000, processed 2725145 words and 1404096 word types


 89%|████████████████████████████▌   | 269971/303123 [00:05<00:00, 44875.80it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #270000, processed 2832780 words and 1450014 word types


 92%|█████████████████████████████▌  | 279994/303123 [00:05<00:00, 47380.75it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #280000, processed 2944149 words and 1496993 word types


 96%|██████████████████████████████▌ | 289900/303123 [00:06<00:00, 48499.95it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #290000, processed 3053205 words and 1544760 word types


 97%|███████████████████████████████ | 294452/303123 [00:06<00:00, 47066.49it/s]

INFO:gensim.models.phrases:collected 1565707 word types from a corpus of 3100326 words (unigram + bigrams) and 294452 sentences
INFO:gensim.models.phrases:using 1565707 counts as vocab in Phrases<0 vocab, min_count=10, threshold=10, max_vocab_size=40000000>
INFO:gensim.utils:saving Phrases object under models/phrases/bigram.mod, separately None
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/bigram.mod', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}





INFO:gensim.utils:saved models/phrases/bigram.mod
INFO:gensim.utils:loading Phrases object from models/phrases/bigram.mod
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/bigram.mod', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loaded models/phrases/bigram.mod


100%|████████████████████████████████| 303123/303123 [00:13<00:00, 22350.93it/s]


2024-03-16 10:57:46.921195
Training phraser...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/bigram/documents.txt


  0%|                                                | 0/303123 [00:00<?, ?it/s]

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/bigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/bigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types


  2%|▋                                 | 5705/303123 [00:00<00:05, 57038.09it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #10000, processed 92000 words and 79750 word types


  6%|██                               | 19200/303123 [00:00<00:04, 65070.59it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #20000, processed 189530 words and 148769 word types


  8%|██▊                              | 25708/303123 [00:00<00:04, 63500.10it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #30000, processed 288277 words and 214449 word types


 13%|████▏                            | 38444/303123 [00:00<00:04, 62321.56it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #40000, processed 383316 words and 272958 word types


 15%|████▊                            | 44686/303123 [00:00<00:04, 60553.38it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #50000, processed 485916 words and 332070 word types


 19%|██████▏                          | 56714/303123 [00:00<00:04, 57569.14it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #60000, processed 586794 words and 388380 word types


 23%|███████▍                         | 68228/303123 [00:01<00:04, 56771.96it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #70000, processed 678992 words and 443091 word types


 25%|████████                         | 74447/303123 [00:01<00:03, 58364.19it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #80000, processed 780120 words and 502999 word types


 28%|█████████▎                       | 85783/303123 [00:01<00:04, 52841.52it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #90000, processed 888066 words and 567110 word types


 32%|██████████▌                      | 96699/303123 [00:01<00:03, 53696.92it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #100000, processed 993717 words and 628591 word types


 35%|███████████▎                    | 107405/303123 [00:01<00:03, 51743.95it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #110000, processed 1099487 words and 688590 word types


 39%|████████████▍                   | 117577/303123 [00:02<00:03, 47511.51it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #120000, processed 1202928 words and 746783 word types


 42%|█████████████▌                  | 128608/303123 [00:02<00:03, 51303.69it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #130000, processed 1301097 words and 800476 word types


 44%|██████████████▏                 | 134277/303123 [00:02<00:03, 52859.20it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #140000, processed 1390486 words and 846990 word types


 48%|███████████████▍                | 146047/303123 [00:02<00:02, 55740.24it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #150000, processed 1483859 words and 892838 word types


 52%|████████████████▌               | 157211/303123 [00:02<00:02, 52654.59it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #160000, processed 1591188 words and 949400 word types


 55%|█████████████████▋              | 167820/303123 [00:03<00:02, 50949.38it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #170000, processed 1696456 words and 1005205 word types


 59%|██████████████████▊             | 178688/303123 [00:03<00:02, 52087.43it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #180000, processed 1799907 words and 1059614 word types


 61%|███████████████████▍            | 184085/303123 [00:03<00:02, 52634.94it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #190000, processed 1896784 words and 1107985 word types


 65%|████████████████████▋           | 195889/303123 [00:03<00:01, 55744.94it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #200000, processed 1993719 words and 1153631 word types


 68%|█████████████████████▊          | 207132/303123 [00:03<00:01, 55703.59it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #210000, processed 2099784 words and 1205271 word types


 72%|██████████████████████▉         | 217756/303123 [00:04<00:01, 49160.91it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #220000, processed 2209349 words and 1258199 word types


 75%|████████████████████████        | 227852/303123 [00:04<00:01, 49284.80it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #230000, processed 2311996 words and 1308999 word types


 78%|█████████████████████████       | 237810/303123 [00:04<00:01, 44585.97it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #240000, processed 2419957 words and 1361532 word types


 82%|██████████████████████████      | 247427/303123 [00:04<00:01, 45842.71it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #250000, processed 2521978 words and 1408959 word types


 85%|███████████████████████████▏    | 257328/303123 [00:04<00:01, 41215.27it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #260000, processed 2618716 words and 1453398 word types


 88%|████████████████████████████▏   | 267282/303123 [00:05<00:00, 45070.48it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #270000, processed 2722156 words and 1501266 word types


 92%|█████████████████████████████▎  | 277695/303123 [00:05<00:00, 48505.69it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #280000, processed 2828961 words and 1550273 word types


 95%|██████████████████████████████▍ | 288243/303123 [00:05<00:00, 50132.46it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #290000, processed 2933932 words and 1600200 word types


 97%|███████████████████████████████ | 294452/303123 [00:05<00:00, 51901.51it/s]

INFO:gensim.models.phrases:collected 1622145 word types from a corpus of 2979547 words (unigram + bigrams) and 294452 sentences
INFO:gensim.models.phrases:using 1622145 counts as vocab in Phrases<0 vocab, min_count=10, threshold=10, max_vocab_size=40000000>
INFO:gensim.utils:saving Phrases object under models/phrases/trigram.mod, separately None
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/trigram.mod', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}





INFO:gensim.utils:saved models/phrases/trigram.mod
INFO:gensim.utils:loading Phrases object from models/phrases/trigram.mod
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/trigram.mod', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loaded models/phrases/trigram.mod


100%|████████████████████████████████| 303123/303123 [00:12<00:00, 24205.90it/s]


2024-03-16 10:58:11.037366
Training w2v model...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/trigram/documents.txt
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 90817 words, keeping 15131 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processe

INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 38.68% examples, 424605 words/s, in_qsize 4, out_qsize 1
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 63.71% examples, 476048 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 3 - PROGRESS: at 87.74% examples, 486419 words/s, in_qsize 3, out_qsize 0
DEBUG:gensim.models.base_any2vec:job loop exiting, total 296 jobs
DEBUG:gensim.models.base_any2vec:worker exiting, processed 148 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 148 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 3 : training on 2951787 raw words (2388388 effective words) took 4.8s, 495110 effective words/s
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/docu

INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 19.41% examples, 401526 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 37.70% examples, 420267 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 59.04% examples, 444187 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 75.33% examples, 429807 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 9 - PROGRESS: at 95.98% examples, 440116 words/s, in_qsize 3, out_qsize 0
DEBUG:gensim.models.base_any2vec:job loop exiting, total 296 jobs
DEBUG:gensim.models.base_any2vec:worker exiting, processed 148 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 148 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 9 : training on 29

DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.base_any2vec:EPOCH 15 - PROGRESS: at 17.64% examples, 403977 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 15 - PROGRESS: at 34.51% examples, 403036 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 15 - PROGRESS: at 55.70% examples, 434486 words/s, in_qsize 3, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 15 - PROGRESS: at 74.08% examples, 437597 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.base_any2vec:EPOCH 15 - PROGRESS: at 92.42% examples, 436497 words/s, in_qsize 3, out_qsize 0
DEBUG:gensim.models.base_any2vec:job loop exiting, total 296 jobs
DEBUG:gensim.models.base_any2vec:worker exiting, processed 148 jobs
INFO:gensim.models.base_any2vec:worker th

INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 20 : training on 2951787 raw words (2389125 effective words) took 6.0s, 400979 effective words/s
INFO:gensim.models.base_any2vec:training on a 59035740 raw words (47772471 effective words) took 112.2s, 425746 effective words/s
INFO:gensim.utils:saving Word2Vec object under models/w2v/w2v.mod, separately None
INFO:gensim.utils:not storing attribute vectors_norm
INFO:gensim.utils:not storing attribute cum_table
DEBUG:smart_open.smart_open_lib:{'uri': 'models/w2v/w2v.mod', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:saved models/w2v/w2v.mod


In [3]:
# Run the dictionary-expansion script. According to the output recorded below,
# it reports the word2vec vocabulary size, creates and deduplicates the
# dictionary, and saves it to outputs/dict/expanded_dict.csv.
%run create_dict.py

Vocab size in the w2v model: 26710
Dictionary created. 
Dictionary deduplicated. 
Dictionary saved at outputs/dict/expanded_dict.csv
