# Word2Vec Trained

In [17]:
# imports needed and set up logging
import gensim 
import logging
import numpy as np

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [69]:
def read_input(datafile):
    f = open(datafile,'rb')
    for line in f:
        yield gensim.utils.simple_preprocess(line)

In [70]:
data_file="reviews_data.txt"
documents = list (read_input (data_file))

In [75]:
documents[1][:10]

['sep',
 'great',
 'budget',
 'hotel',
 'stayed',
 'two',
 'nights',
 'at',
 'aloft',
 'on']

In [72]:
#Parameters 

#size
#The size of the dense vector to represent each token or word. If you have very limited data, 
#then size should be a much smaller value. If you have lots of data, its good to experiment 
#with various sizes. A value of 100-150 has worked well for me.

#window
#The maximum distance between the target word and its neighboring word. If your neighbor's 
#position is greater than the maximum window width to the left and the right, 
#then, some neighbors are not considered as being related to the target word.
#In theory, a smaller window should give you terms that are more related. If you have lots of data,
#then the window size should not matter too much, as long as its a decent sized window.

#min_count
#Minimium frequency count of words. The model would ignore words that do not statisfy the min_count.
#Extremely infrequent words are usually unimportant, so its best to get rid of those. 
#Unless your dataset is really tiny, this does not really affect the model.

model = gensim.models.Word2Vec (documents, size=150, window=10, min_count=2, workers=10)
model.train(documents,total_examples=len(documents),epochs=10)

2019-03-03 18:13:55,155 : INFO : collecting all words and their counts
2019-03-03 18:13:55,155 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-03-03 18:13:55,588 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2019-03-03 18:13:55,881 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2019-03-03 18:13:56,209 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2019-03-03 18:13:56,508 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2019-03-03 18:13:56,900 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2019-03-03 18:13:57,221 : INFO : PROGRESS: at sentence #60000, processed 11013723 words, keeping 76780 word types
2019-03-03 18:13:57,558 : INFO : PROGRESS: at sentence #70000, processed 12637525 words, keeping 83193 word types
2019-03-03 18:13:57,832 : INFO : PROG

2019-03-03 18:14:45,104 : INFO : EPOCH 1 - PROGRESS: at 80.80% examples, 655298 words/s, in_qsize 17, out_qsize 2
2019-03-03 18:14:46,111 : INFO : EPOCH 1 - PROGRESS: at 81.99% examples, 647412 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:14:47,123 : INFO : EPOCH 1 - PROGRESS: at 83.13% examples, 639382 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:14:48,128 : INFO : EPOCH 1 - PROGRESS: at 84.14% examples, 631290 words/s, in_qsize 17, out_qsize 2
2019-03-03 18:14:49,178 : INFO : EPOCH 1 - PROGRESS: at 85.13% examples, 623240 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:14:50,183 : INFO : EPOCH 1 - PROGRESS: at 88.24% examples, 629137 words/s, in_qsize 20, out_qsize 2
2019-03-03 18:14:51,179 : INFO : EPOCH 1 - PROGRESS: at 92.85% examples, 644891 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:14:52,195 : INFO : EPOCH 1 - PROGRESS: at 97.35% examples, 659502 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:14:53,182 : INFO : worker thread finished; awaiting finish of 9 more threa

2019-03-03 18:15:43,939 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-03-03 18:15:43,943 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-03-03 18:15:43,943 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-03-03 18:15:43,951 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-03-03 18:15:43,951 : INFO : EPOCH - 2 : training on 41519355 raw words (30347448 effective words) took 50.6s, 599548 effective words/s
2019-03-03 18:15:44,964 : INFO : EPOCH 3 - PROGRESS: at 3.87% examples, 1191976 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:15:45,965 : INFO : EPOCH 3 - PROGRESS: at 7.14% examples, 1094854 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:15:47,001 : INFO : EPOCH 3 - PROGRESS: at 8.23% examples, 835417 words/s, in_qsize 16, out_qsize 3
2019-03-03 18:15:48,048 : INFO : EPOCH 3 - PROGRESS: at 9.28% examples, 708135 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:15:49,090 : INFO : EPOCH 3 -

2019-03-03 18:16:46,004 : INFO : EPOCH 4 - PROGRESS: at 17.71% examples, 527583 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:16:47,024 : INFO : EPOCH 4 - PROGRESS: at 18.52% examples, 506759 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:16:48,084 : INFO : EPOCH 4 - PROGRESS: at 19.07% examples, 483107 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:16:49,171 : INFO : EPOCH 4 - PROGRESS: at 19.79% examples, 466000 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:16:50,203 : INFO : EPOCH 4 - PROGRESS: at 20.42% examples, 450673 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:16:51,207 : INFO : EPOCH 4 - PROGRESS: at 21.47% examples, 441405 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:16:52,231 : INFO : EPOCH 4 - PROGRESS: at 22.22% examples, 431590 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:16:53,238 : INFO : EPOCH 4 - PROGRESS: at 23.66% examples, 438511 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:16:54,242 : INFO : EPOCH 4 - PROGRESS: at 27.79% examples, 477996 words/s,

2019-03-03 18:17:50,762 : INFO : EPOCH 5 - PROGRESS: at 34.81% examples, 616926 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:17:51,770 : INFO : EPOCH 5 - PROGRESS: at 35.82% examples, 599984 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:17:52,784 : INFO : EPOCH 5 - PROGRESS: at 36.95% examples, 585326 words/s, in_qsize 17, out_qsize 2
2019-03-03 18:17:53,785 : INFO : EPOCH 5 - PROGRESS: at 40.01% examples, 598340 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:17:54,792 : INFO : EPOCH 5 - PROGRESS: at 43.91% examples, 618775 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:17:55,802 : INFO : EPOCH 5 - PROGRESS: at 47.91% examples, 640954 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:17:56,862 : INFO : EPOCH 5 - PROGRESS: at 49.97% examples, 637001 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:17:57,870 : INFO : EPOCH 5 - PROGRESS: at 50.90% examples, 622314 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:17:58,914 : INFO : EPOCH 5 - PROGRESS: at 51.81% examples, 608114 words/s,

2019-03-03 18:18:52,397 : INFO : EPOCH 1 - PROGRESS: at 47.37% examples, 539533 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:18:53,400 : INFO : EPOCH 1 - PROGRESS: at 49.43% examples, 540765 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:18:54,415 : INFO : EPOCH 1 - PROGRESS: at 53.53% examples, 564446 words/s, in_qsize 20, out_qsize 0
2019-03-03 18:18:55,430 : INFO : EPOCH 1 - PROGRESS: at 58.00% examples, 587571 words/s, in_qsize 17, out_qsize 2
2019-03-03 18:18:56,474 : INFO : EPOCH 1 - PROGRESS: at 61.59% examples, 600920 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:18:57,524 : INFO : EPOCH 1 - PROGRESS: at 62.95% examples, 593518 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:18:58,561 : INFO : EPOCH 1 - PROGRESS: at 64.48% examples, 586821 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:18:59,597 : INFO : EPOCH 1 - PROGRESS: at 65.52% examples, 578649 words/s, in_qsize 20, out_qsize 1
2019-03-03 18:19:00,609 : INFO : EPOCH 1 - PROGRESS: at 66.53% examples, 570886 words/s,

2019-03-03 18:19:57,737 : INFO : EPOCH 2 - PROGRESS: at 60.51% examples, 539985 words/s, in_qsize 20, out_qsize 0
2019-03-03 18:19:58,741 : INFO : EPOCH 2 - PROGRESS: at 61.66% examples, 534329 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:19:59,776 : INFO : EPOCH 2 - PROGRESS: at 62.97% examples, 529472 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:20:00,792 : INFO : EPOCH 2 - PROGRESS: at 64.25% examples, 523611 words/s, in_qsize 20, out_qsize 1
2019-03-03 18:20:01,820 : INFO : EPOCH 2 - PROGRESS: at 65.53% examples, 519580 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:20:02,872 : INFO : EPOCH 2 - PROGRESS: at 66.80% examples, 515846 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:20:03,881 : INFO : EPOCH 2 - PROGRESS: at 69.61% examples, 523588 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:20:04,909 : INFO : EPOCH 2 - PROGRESS: at 72.69% examples, 533005 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:20:05,917 : INFO : EPOCH 2 - PROGRESS: at 75.59% examples, 540456 words/s,

2019-03-03 18:21:03,084 : INFO : EPOCH 3 - PROGRESS: at 78.88% examples, 562568 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:21:04,124 : INFO : EPOCH 3 - PROGRESS: at 79.94% examples, 556656 words/s, in_qsize 20, out_qsize 0
2019-03-03 18:21:05,262 : INFO : EPOCH 3 - PROGRESS: at 81.09% examples, 550598 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:21:06,254 : INFO : EPOCH 3 - PROGRESS: at 84.12% examples, 558106 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:21:07,271 : INFO : EPOCH 3 - PROGRESS: at 88.22% examples, 571690 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:21:08,276 : INFO : EPOCH 3 - PROGRESS: at 91.78% examples, 580834 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:21:09,298 : INFO : EPOCH 3 - PROGRESS: at 94.05% examples, 582381 words/s, in_qsize 16, out_qsize 3
2019-03-03 18:21:10,331 : INFO : EPOCH 3 - PROGRESS: at 95.30% examples, 577441 words/s, in_qsize 19, out_qsize 1
2019-03-03 18:21:11,378 : INFO : EPOCH 3 - PROGRESS: at 96.45% examples, 572288 words/s,

2019-03-03 18:22:07,236 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-03-03 18:22:07,244 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-03-03 18:22:07,248 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-03-03 18:22:07,248 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-03-03 18:22:07,252 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-03-03 18:22:07,260 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-03-03 18:22:07,272 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-03-03 18:22:07,276 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-03-03 18:22:07,280 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-03-03 18:22:07,284 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-03-03 18:22:07,284 : INFO : EPOCH - 4 : training on 41519355 raw words (30350534 effe

2019-03-03 18:23:06,625 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-03-03 18:23:06,629 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-03-03 18:23:06,633 : INFO : EPOCH - 5 : training on 41519355 raw words (30347717 effective words) took 59.3s, 511417 effective words/s
2019-03-03 18:23:07,689 : INFO : EPOCH 6 - PROGRESS: at 0.89% examples, 292488 words/s, in_qsize 17, out_qsize 2
2019-03-03 18:23:08,712 : INFO : EPOCH 6 - PROGRESS: at 2.06% examples, 319063 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:23:09,732 : INFO : EPOCH 6 - PROGRESS: at 5.48% examples, 552177 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:23:10,736 : INFO : EPOCH 6 - PROGRESS: at 9.03% examples, 693911 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:23:11,752 : INFO : EPOCH 6 - PROGRESS: at 12.08% examples, 781277 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:23:12,791 : INFO : EPOCH 6 - PROGRESS: at 13.44% examples, 722808 words/s, in_qsize 19, out_qsize 

2019-03-03 18:24:09,485 : INFO : EPOCH 7 - PROGRESS: at 10.52% examples, 407759 words/s, in_qsize 15, out_qsize 4
2019-03-03 18:24:10,488 : INFO : EPOCH 7 - PROGRESS: at 13.55% examples, 481741 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:24:11,501 : INFO : EPOCH 7 - PROGRESS: at 16.80% examples, 545960 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:24:12,522 : INFO : EPOCH 7 - PROGRESS: at 18.94% examples, 566107 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:24:13,530 : INFO : EPOCH 7 - PROGRESS: at 19.85% examples, 548759 words/s, in_qsize 17, out_qsize 2
2019-03-03 18:24:14,534 : INFO : EPOCH 7 - PROGRESS: at 20.75% examples, 534289 words/s, in_qsize 20, out_qsize 1
2019-03-03 18:24:15,549 : INFO : EPOCH 7 - PROGRESS: at 22.04% examples, 523312 words/s, in_qsize 20, out_qsize 0
2019-03-03 18:24:16,581 : INFO : EPOCH 7 - PROGRESS: at 22.76% examples, 505062 words/s, in_qsize 17, out_qsize 2
2019-03-03 18:24:17,594 : INFO : EPOCH 7 - PROGRESS: at 23.51% examples, 492708 words/s,

2019-03-03 18:25:14,635 : INFO : EPOCH 8 - PROGRESS: at 26.87% examples, 642897 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:25:15,642 : INFO : EPOCH 8 - PROGRESS: at 30.53% examples, 667566 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:25:16,646 : INFO : EPOCH 8 - PROGRESS: at 32.88% examples, 665453 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:25:17,654 : INFO : EPOCH 8 - PROGRESS: at 33.66% examples, 640233 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:25:18,681 : INFO : EPOCH 8 - PROGRESS: at 34.45% examples, 616752 words/s, in_qsize 20, out_qsize 0
2019-03-03 18:25:19,701 : INFO : EPOCH 8 - PROGRESS: at 35.50% examples, 600240 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:25:20,729 : INFO : EPOCH 8 - PROGRESS: at 36.56% examples, 583950 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:25:21,753 : INFO : EPOCH 8 - PROGRESS: at 37.43% examples, 568026 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:25:22,872 : INFO : EPOCH 8 - PROGRESS: at 38.36% examples, 551156 words/s,

2019-03-03 18:26:19,825 : INFO : EPOCH 9 - PROGRESS: at 52.30% examples, 611058 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:26:20,837 : INFO : EPOCH 9 - PROGRESS: at 53.31% examples, 600595 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:26:21,880 : INFO : EPOCH 9 - PROGRESS: at 54.20% examples, 587819 words/s, in_qsize 20, out_qsize 1
2019-03-03 18:26:22,896 : INFO : EPOCH 9 - PROGRESS: at 55.26% examples, 577528 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:26:23,928 : INFO : EPOCH 9 - PROGRESS: at 56.13% examples, 566110 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:26:24,931 : INFO : EPOCH 9 - PROGRESS: at 57.14% examples, 557372 words/s, in_qsize 17, out_qsize 2
2019-03-03 18:26:25,959 : INFO : EPOCH 9 - PROGRESS: at 59.01% examples, 556625 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:26:26,975 : INFO : EPOCH 9 - PROGRESS: at 61.86% examples, 564547 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:26:28,022 : INFO : EPOCH 9 - PROGRESS: at 65.07% examples, 572375 words/s,

2019-03-03 18:27:24,284 : INFO : EPOCH 10 - PROGRESS: at 65.77% examples, 567371 words/s, in_qsize 18, out_qsize 1
2019-03-03 18:27:25,345 : INFO : EPOCH 10 - PROGRESS: at 67.90% examples, 568010 words/s, in_qsize 15, out_qsize 4
2019-03-03 18:27:26,345 : INFO : EPOCH 10 - PROGRESS: at 69.90% examples, 569149 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:27:27,345 : INFO : EPOCH 10 - PROGRESS: at 71.70% examples, 568802 words/s, in_qsize 20, out_qsize 0
2019-03-03 18:27:28,346 : INFO : EPOCH 10 - PROGRESS: at 73.66% examples, 568237 words/s, in_qsize 19, out_qsize 0
2019-03-03 18:27:29,391 : INFO : EPOCH 10 - PROGRESS: at 75.82% examples, 570131 words/s, in_qsize 20, out_qsize 3
2019-03-03 18:27:30,424 : INFO : EPOCH 10 - PROGRESS: at 77.61% examples, 569979 words/s, in_qsize 16, out_qsize 3
2019-03-03 18:27:31,462 : INFO : EPOCH 10 - PROGRESS: at 79.65% examples, 570797 words/s, in_qsize 20, out_qsize 3
2019-03-03 18:27:32,476 : INFO : EPOCH 10 - PROGRESS: at 81.56% examples, 571066

(303502323, 415193550)

In [73]:
w1 = "dirty"
model.wv.most_similar (positive=w1)

2019-03-03 18:33:50,858 : INFO : precomputing L2-norms of word weight vectors


[('filthy', 0.8692812919616699),
 ('unclean', 0.7802474498748779),
 ('stained', 0.7744835019111633),
 ('smelly', 0.753758430480957),
 ('grubby', 0.749112606048584),
 ('dusty', 0.7469067573547363),
 ('dingy', 0.7293612957000732),
 ('soiled', 0.7234719395637512),
 ('gross', 0.7164075374603271),
 ('disgusting', 0.71483314037323)]

In [74]:
w1 = "rome"
model.wv.most_similar (positive=w1)

[('barcelona', 0.750475287437439),
 ('paris', 0.670874297618866),
 ('madrid', 0.6648043394088745),
 ('vienna', 0.6618359684944153),
 ('prague', 0.6611852645874023),
 ('mexico', 0.6363353133201599),
 ('edinburgh', 0.6255320906639099),
 ('singapore', 0.6241265535354614),
 ('greece', 0.621086061000824),
 ('crillon', 0.6115080118179321)]