## 1 - Imports

In [None]:
import pandas as pd
import global_options
from nltk.tokenize import sent_tokenize
from pathlib import Path
import stanza
from stanza.server import CoreNLPClient
import os

## 2 - Data manipulation

In [2]:
# Build the two line-aligned input files used by the scripts run later in
# this notebook:
#   documents.txt    - one article per line (title + authors + body)
#   document_ids.txt - the matching 1-based document id, one per line
input_dir = Path(global_options.DATA_FOLDER, "input")
data = pd.read_csv(input_dir / "cat-1.csv")

# Remove duplicates: only the first row for each article title is kept
data = data.drop_duplicates(subset='Article title')

# Combine title, authors and body into a single text per article.
# Missing cells become empty strings first; otherwise one missing field would
# turn the whole concatenation into NaN and "nan" would be written to disk.
text_parts = data[['Article title', 'Article authors', 'Full article text']].fillna('').astype(str)
full_text = (
    text_parts['Article title']
    + ' ' + text_parts['Article authors']
    + ' ' + text_parts['Full article text']
)

# Replace every kind of line break (\r\n, \r, \n) with a space so that each
# article occupies exactly one line and the two output files stay aligned.
full_text = full_text.str.replace(r'\r\n|\r|\n', ' ', regex=True)

# Keep only the combined text. to_frame() builds a fresh DataFrame, so adding
# a column below does not write into a slice of the original frame.
data = full_text.to_frame('full_text')

# Sequential 1-based document ids, in the same order as the rows
data['index'] = range(1, len(data) + 1)

# Write to text files. UTF-8 is set explicitly so the output does not depend
# on the platform's default encoding.
# NOTE(review): confirm the downstream scripts read these files as UTF-8.
with open(input_dir / "documents.txt", "w", encoding="utf-8") as file_docs:
    file_docs.writelines(f"{text}\n" for text in data['full_text'])

with open(input_dir / "document_ids.txt", "w", encoding="utf-8") as file_ids:
    file_ids.writelines(f"{doc_id}\n" for doc_id in data['index'])

## 3 - Run py files to expand the dictionary

In [6]:
# Runs the project script parse_parallel.py. Judging by the log captured below,
# it starts a local Stanford CoreNLP server (a java command line is shown) and
# processes the documents, reporting progress every 100 lines. In the recorded
# run each 100 lines took roughly two minutes.
%run parse_parallel.py

2024-03-16 10:47:56 INFO: Writing properties to tmp file: corenlp_server-25fbad575db0453c.props
2024-03-16 10:47:56 INFO: Starting server with command: java -Xmx8G -cp /Users/yuruchen/CoreNLP/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9003 -timeout 12000000 -threads 8 -maxCharLength 1000000 -quiet False -serverProperties corenlp_server-25fbad575db0453c.props -preload -outputFormat serialized


2024-03-16 10:47:56.203613
Processing line: 100.


[main] INFO CoreNLP - --- StanfordCoreNLPServer#main() called ---
[main] INFO CoreNLP - Server default properties:
			(Note: unspecified annotator properties are English defaults)
			annotators = tokenize, ssplit, pos, lemma, ner, depparse
			inputFormat = text
			ner.applyFineGrained = false
			outputFormat = serialized
			prettyPrint = false
			threads = 8
[main] INFO CoreNLP - Threads: 8
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator pos
[main] INFO edu.stanford.nlp.tagger.maxent.MaxentTagger - Loading POS tagger from edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger ... done [0.8 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
[main] INFO edu.stanford.nlp.ie.AbstractSequenceClassifier - Loading classifier from edu/stanford/nlp/models/ner/english

2024-03-16 10:50:24.880591
Processing line: 200.
2024-03-16 10:52:41.867479
Processing line: 300.
2024-03-16 10:54:36.678857
Processing line: 400.
2024-03-16 10:56:39.953159
Processing line: 500.
2024-03-16 10:58:35.075827
Processing line: 600.
2024-03-16 11:00:58.674102
Processing line: 700.
2024-03-16 11:03:07.051575
Processing line: 800.
2024-03-16 11:05:14.518816
Processing line: 900.
2024-03-16 11:07:07.357357
Processing line: 1000.
2024-03-16 11:09:13.899252
Processing line: 1100.
2024-03-16 11:11:25.429357
Processing line: 1200.
2024-03-16 11:13:39.651115
Processing line: 1300.
2024-03-16 11:15:47.228514
Processing line: 1400.
2024-03-16 11:17:53.393650
Processing line: 1500.
2024-03-16 11:19:55.368456
Processing line: 1600.
2024-03-16 11:22:08.439339
Processing line: 1700.
2024-03-16 11:24:29.696391
Processing line: 1800.
2024-03-16 11:26:53.855858
Processing line: 1900.
2024-03-16 11:29:00.690857
Processing line: 2000.
2024-03-16 11:31:13.401129
Processing line: 2100.
2024-03-

[Thread-0] INFO CoreNLP - CoreNLP Server is shutting down.


In [3]:
# Runs the project script clean_and_train.py. Judging by the log captured
# below, it trains bigram and trigram phrase models (saved under
# models/phrases/) and then trains a word2vec model on
# data/processed/trigram/documents.txt.
%run clean_and_train.py

2024-03-16 12:52:59.124269
Processing line: 200000.
2024-03-16 12:53:06.490929
Processing line: 400000.
2024-03-16 12:53:11.486978
Training phraser...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/unigram/documents.txt


  0%|                                                | 0/335476 [00:00<?, ?it/s]

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/unigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/unigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types


  2%|▋                                 | 6859/335476 [00:00<00:04, 68574.37it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #10000, processed 117543 words and 99576 word types


  4%|█▎                               | 13798/335476 [00:00<00:04, 69050.47it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #20000, processed 230331 words and 180340 word types


  8%|██▊                              | 27985/335476 [00:00<00:04, 70439.81it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #30000, processed 342257 words and 252140 word types


 11%|███▌                             | 35712/335476 [00:00<00:04, 72898.35it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #40000, processed 451055 words and 317904 word types


 13%|████▏                            | 43002/335476 [00:00<00:04, 71011.78it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #50000, processed 564270 words and 383988 word types


 17%|█████▌                           | 57028/335476 [00:00<00:04, 68562.27it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #60000, processed 678940 words and 450835 word types


 19%|██████▎                          | 63977/335476 [00:00<00:03, 68839.95it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #70000, processed 792329 words and 515916 word types


 23%|███████▋                         | 77668/335476 [00:01<00:03, 67521.00it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #80000, processed 906385 words and 578838 word types


 25%|████████▎                        | 84463/335476 [00:01<00:03, 67646.31it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #90000, processed 1021351 words and 642130 word types


 29%|█████████▋                       | 98090/335476 [00:01<00:03, 67497.60it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #100000, processed 1130976 words and 701045 word types


 31%|██████████                      | 104843/335476 [00:01<00:03, 64041.15it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #110000, processed 1236671 words and 756487 word types


 35%|███████████▎                    | 119053/335476 [00:01<00:03, 67711.25it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #120000, processed 1340110 words and 807711 word types


 38%|████████████                    | 126100/335476 [00:01<00:03, 68520.77it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #130000, processed 1448347 words and 860168 word types


 42%|█████████████▎                  | 139837/335476 [00:02<00:02, 67723.75it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #140000, processed 1559257 words and 914180 word types


 44%|██████████████                  | 146810/335476 [00:02<00:02, 68312.69it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #150000, processed 1667932 words and 968521 word types


 46%|██████████████▋                 | 153651/335476 [00:02<00:02, 67646.76it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #160000, processed 1775015 words and 1023744 word types


 50%|███████████████▉                | 167298/335476 [00:02<00:02, 67526.20it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #170000, processed 1884040 words and 1081229 word types


 52%|████████████████▌               | 174140/335476 [00:02<00:02, 67789.22it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #180000, processed 1987716 words and 1129288 word types


 56%|█████████████████▉              | 188181/335476 [00:02<00:02, 68243.23it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #190000, processed 2098246 words and 1177323 word types


 58%|██████████████████▌             | 195009/335476 [00:02<00:02, 67546.64it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #200000, processed 2206323 words and 1222175 word types


 62%|███████████████████▉            | 208719/335476 [00:03<00:01, 67973.54it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #210000, processed 2312976 words and 1265111 word types


 64%|████████████████████▌           | 215519/335476 [00:03<00:01, 67176.41it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #220000, processed 2423795 words and 1313026 word types


 68%|█████████████████████▊          | 228933/335476 [00:03<00:01, 66637.04it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #230000, processed 2532084 words and 1365995 word types


 70%|██████████████████████▍         | 235787/335476 [00:03<00:01, 67200.25it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #240000, processed 2638922 words and 1417306 word types


 74%|███████████████████████▋        | 248876/335476 [00:03<00:01, 59786.30it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #250000, processed 2753087 words and 1470334 word types


 76%|████████████████████████▎       | 255043/335476 [00:03<00:01, 60307.85it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #260000, processed 2872431 words and 1513387 word types


 80%|█████████████████████████▍      | 267331/335476 [00:04<00:01, 60124.58it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #270000, processed 2989185 words and 1554561 word types


 82%|██████████████████████████▏     | 274167/335476 [00:04<00:00, 62507.47it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #280000, processed 3089565 words and 1600422 word types


 86%|███████████████████████████▌    | 288366/335476 [00:04<00:00, 66744.14it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #290000, processed 3187938 words and 1647586 word types


 88%|████████████████████████████▏   | 295142/335476 [00:04<00:00, 67041.72it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #300000, processed 3294515 words and 1699618 word types


 92%|█████████████████████████████▍  | 308631/335476 [00:04<00:00, 67238.92it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #310000, processed 3401791 words and 1749464 word types


 94%|██████████████████████████████  | 315367/335476 [00:04<00:00, 66689.81it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #320000, processed 3509334 words and 1796084 word types


 98%|███████████████████████████████▎| 327722/335476 [00:04<00:00, 66604.18it/s]

INFO:gensim.models.phrases:collected 1832234 word types from a corpus of 3591847 words (unigram + bigrams) and 327722 sentences
INFO:gensim.models.phrases:using 1832234 counts as vocab in Phrases<0 vocab, min_count=10, threshold=10, max_vocab_size=40000000>
INFO:gensim.utils:saving Phrases object under models/phrases/bigram.mod, separately None
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/bigram.mod', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}





INFO:gensim.utils:saved models/phrases/bigram.mod
INFO:gensim.utils:loading Phrases object from models/phrases/bigram.mod
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/bigram.mod', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loaded models/phrases/bigram.mod


100%|████████████████████████████████| 335476/335476 [00:09<00:00, 35993.29it/s]


2024-03-16 12:53:28.986403
Training phraser...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/bigram/documents.txt


  0%|                                                | 0/335476 [00:00<?, ?it/s]

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/bigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/bigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types


  2%|▊                                 | 7629/335476 [00:00<00:04, 76276.43it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #10000, processed 111372 words and 101025 word types


  5%|█▌                               | 15257/335476 [00:00<00:04, 76256.54it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #20000, processed 218897 words and 183837 word types


  7%|██▎                              | 22989/335476 [00:00<00:04, 76737.82it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #30000, processed 325054 words and 258103 word types


 12%|███▊                             | 38774/335476 [00:00<00:03, 78181.49it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #40000, processed 428295 words and 326146 word types


 14%|████▌                            | 46593/335476 [00:00<00:03, 75227.90it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #50000, processed 535899 words and 394692 word types


 16%|█████▎                           | 54136/335476 [00:00<00:03, 73621.45it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #60000, processed 645851 words and 463823 word types


 21%|██████▊                          | 68840/335476 [00:00<00:03, 73134.34it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #70000, processed 754566 words and 530985 word types


 23%|███████▍                         | 76161/335476 [00:01<00:03, 72706.82it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #80000, processed 863706 words and 596178 word types


 25%|████████▏                        | 83437/335476 [00:01<00:03, 71886.05it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #90000, processed 973771 words and 661852 word types


 29%|█████████▌                       | 97771/335476 [00:01<00:03, 69964.51it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #100000, processed 1079227 words and 722807 word types


 31%|██████████                      | 105339/335476 [00:01<00:03, 71639.98it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #110000, processed 1180698 words and 780389 word types


 34%|██████████▋                     | 112512/335476 [00:01<00:03, 71094.61it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #120000, processed 1280027 words and 833662 word types


 38%|████████████▏                   | 127574/335476 [00:01<00:02, 72653.12it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #130000, processed 1383883 words and 888282 word types


 40%|████████████▉                   | 135002/335476 [00:01<00:02, 73133.21it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #140000, processed 1490328 words and 944563 word types


 45%|██████████████▎                 | 149470/335476 [00:02<00:02, 71309.88it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #150000, processed 1594599 words and 1001012 word types


 47%|██████████████▉                 | 156609/335476 [00:02<00:02, 70989.57it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #160000, processed 1697955 words and 1058194 word types


 49%|███████████████▌                | 163714/335476 [00:02<00:02, 70609.63it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #170000, processed 1803589 words and 1117522 word types


 53%|████████████████▉               | 178157/335476 [00:02<00:02, 71442.16it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #180000, processed 1902572 words and 1167901 word types


 55%|█████████████████▋              | 185304/335476 [00:02<00:02, 70845.33it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #190000, processed 2007625 words and 1218434 word types


 60%|███████████████████             | 199702/335476 [00:02<00:01, 71424.58it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #200000, processed 2109923 words and 1266088 word types


 62%|███████████████████▋            | 206847/335476 [00:02<00:01, 71036.83it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #210000, processed 2210968 words and 1311427 word types


 64%|████████████████████▍           | 213953/335476 [00:02<00:01, 70988.39it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #220000, processed 2316260 words and 1361914 word types


 68%|█████████████████████▊          | 228112/335476 [00:03<00:01, 66282.32it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #230000, processed 2420406 words and 1416783 word types


 70%|██████████████████████▍         | 235258/335476 [00:03<00:01, 67755.96it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #240000, processed 2523423 words and 1470034 word types


 74%|███████████████████████▋        | 248885/335476 [00:03<00:01, 67516.28it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #250000, processed 2633571 words and 1525298 word types


 76%|████████████████████████▍       | 255658/335476 [00:03<00:01, 66958.62it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #260000, processed 2746914 words and 1571092 word types


 80%|█████████████████████████▋      | 269095/335476 [00:03<00:00, 66518.53it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #270000, processed 2857273 words and 1615174 word types


 82%|██████████████████████████▎     | 276396/335476 [00:03<00:00, 68430.65it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #280000, processed 2953988 words and 1662815 word types


 85%|███████████████████████████     | 283941/335476 [00:03<00:00, 70508.45it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #290000, processed 3049257 words and 1711519 word types


 89%|████████████████████████████▍   | 298641/335476 [00:04<00:00, 71001.77it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #300000, processed 3152409 words and 1765287 word types


 91%|█████████████████████████████▏  | 305749/335476 [00:04<00:00, 70519.24it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #310000, processed 3255879 words and 1816948 word types


 95%|██████████████████████████████▌ | 319806/335476 [00:04<00:00, 69194.31it/s]

INFO:gensim.models.phrases:PROGRESS: at sentence #320000, processed 3359202 words and 1865572 word types


 98%|███████████████████████████████▎| 327722/335476 [00:04<00:00, 70920.99it/s]

INFO:gensim.models.phrases:collected 1903345 word types from a corpus of 3438588 words (unigram + bigrams) and 327722 sentences
INFO:gensim.models.phrases:using 1903345 counts as vocab in Phrases<0 vocab, min_count=10, threshold=10, max_vocab_size=40000000>
INFO:gensim.utils:saving Phrases object under models/phrases/trigram.mod, separately None
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/trigram.mod', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}





INFO:gensim.utils:saved models/phrases/trigram.mod
INFO:gensim.utils:loading Phrases object from models/phrases/trigram.mod
DEBUG:smart_open.smart_open_lib:{'uri': 'models/phrases/trigram.mod', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loaded models/phrases/trigram.mod


100%|████████████████████████████████| 335476/335476 [00:09<00:00, 36326.20it/s]


2024-03-16 12:53:46.572869
Training w2v model...
DEBUG:gensim.models.word2vec:single file given as source, rather than a directory of files
DEBUG:gensim.models.word2vec:consider using models.word2vec.LineSentence for a single file
INFO:gensim.models.word2vec:files read into PathLineSentences:data/processed/trigram/documents.txt
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 109969 words, keeping 18669 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, process

INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 6 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 44 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 5 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 43 jobs
DEBUG:gensim.models.base_any2vec:worker exiting, processed 44 jobs
DEBUG:gensim.models.base_any2vec:worker exiting, processed 42 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 4 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 45 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPO

DEBUG:gensim.models.base_any2vec:worker exiting, processed 44 jobs
DEBUG:gensim.models.base_any2vec:worker exiting, processed 43 jobs
DEBUG:gensim.models.base_any2vec:worker exiting, processed 41 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 4 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 46 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 42 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 6 : training on 3402500 raw words (2753742 effective words) took 2.8s, 995088 effective words/s
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
D

DEBUG:gensim.models.base_any2vec:worker exiting, processed 45 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 43 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 42 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 44 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 10 : training on 3402500 raw words (2753808 effective words) took 2.9s, 961515 effective words/s
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': N

INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 4 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 42 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 14 : training on 3402500 raw words (2754057 effective words) took 2.9s, 950927 effective words/s
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gen

INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 2 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 42 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 1 more threads
DEBUG:gensim.models.base_any2vec:worker exiting, processed 43 jobs
INFO:gensim.models.base_any2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.base_any2vec:EPOCH - 18 : training on 3402500 raw words (2753923 effective words) took 2.9s, 965823 effective words/s
INFO:gensim.models.word2vec:reading file data/processed/trigram/documents.txt
DEBUG:smart_open.smart_open_lib:{'uri': 'data/processed/trigram/documents.txt', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.models.base_any2vec:EPOCH 19 - PROGRESS: at 32.41% examples, 913300 words/s, in_qsize 15, out_qsize 0
INFO:ge

In [4]:
# Runs the project script create_dict.py. Per the log captured below, it uses
# the trained w2v model to create and deduplicate the expanded dictionary and
# saves it at outputs/dict/expanded_dict.csv.
%run create_dict.py

Vocab size in the w2v model: 30787
Dictionary created. 
Dictionary deduplicated. 
Dictionary saved at outputs/dict/expanded_dict.csv
