In [1]:
# imports needed and logging
import gzip
import logging
import os
from pprint import pprint

import gensim
import pandas as pd

In [2]:
# Emit INFO-level (and above) records with a timestamped "time : level : message" layout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

## Import Dataset

In [3]:
input_file = "./reviews_data.txt.gz"

# Single pass over the gzip archive: echo the first raw (bytes) line as a
# sample and count every line, one review per line.
total_line = 0
with gzip.open(input_file, 'rb') as reviews:
    for raw_line in reviews:
        if total_line == 0:
            print(raw_line)
        total_line += 1

b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

In [4]:
# Report the line count gathered while scanning the archive.
review_count_message = "There are %s reviews in this dataset." % total_line
print(review_count_message)

There are 255404 reviews in this dataset.


## Preprocess Dataset

In [5]:
def read_input(input_file):
    """Lazily yield one tokenized review per line of a gzip-compressed file.

    Args:
        input_file: Path to the gzip file. It is opened in binary mode and
            consumed line by line.

    Yields:
        The result of ``gensim.utils.simple_preprocess`` applied to each raw
        line, i.e. a list of words for that review.
    """
    # Message restored to "...this may take a while" (the source had the typo
    # "... his"); arguments are passed lazily so logging does the formatting.
    logging.info("reading file %s...this may take a while", input_file)

    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            # Progress heartbeat every 10,000 lines, starting with line 0.
            if i % 10000 == 0:
                logging.info("read %s reviews", i)

            # Pre-process the raw line into a list of words for this review.
            yield gensim.utils.simple_preprocess(line)
            
# Drain the generator so the whole tokenized corpus is held in memory:
# one inner list of words per review, i.e. a list of lists.
documents = [review_tokens for review_tokens in read_input(input_file)]
logging.info("Done reading data file")

2018-11-24 11:31:45,878 : INFO : reading file ./reviews_data.txt.gz...this may take a while
2018-11-24 11:31:45,887 : INFO : read 0 reviews
2018-11-24 11:31:48,845 : INFO : read 10000 reviews
2018-11-24 11:31:51,907 : INFO : read 20000 reviews
2018-11-24 11:31:55,368 : INFO : read 30000 reviews
2018-11-24 11:31:58,583 : INFO : read 40000 reviews
2018-11-24 11:32:02,301 : INFO : read 50000 reviews
2018-11-24 11:32:05,941 : INFO : read 60000 reviews
2018-11-24 11:32:09,027 : INFO : read 70000 reviews
2018-11-24 11:32:11,768 : INFO : read 80000 reviews
2018-11-24 11:32:14,866 : INFO : read 90000 reviews
2018-11-24 11:32:18,098 : INFO : read 100000 reviews
2018-11-24 11:32:20,896 : INFO : read 110000 reviews
2018-11-24 11:32:25,515 : INFO : read 120000 reviews
2018-11-24 11:32:28,387 : INFO : read 130000 reviews
2018-11-24 11:32:32,903 : INFO : read 140000 reviews
2018-11-24 11:32:36,024 : INFO : read 150000 reviews
2018-11-24 11:32:39,919 : INFO : read 160000 reviews
2018-11-24 11:32:43,4

## Train The Word2Vec Model

In [6]:
# Passing the corpus to the constructor scans the vocabulary and trains
# immediately (see the "collecting all words" and EPOCH log lines below).
word2vec_model = gensim.models.Word2Vec(
    documents,
    size=150,
    window=10,
    min_count=2,
    workers=10,
)

2018-11-24 13:03:20,739 : INFO : collecting all words and their counts
2018-11-24 13:03:20,742 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-24 13:03:21,369 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2018-11-24 13:03:21,871 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2018-11-24 13:03:22,894 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2018-11-24 13:03:23,639 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2018-11-24 13:03:24,180 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2018-11-24 13:03:24,724 : INFO : PROGRESS: at sentence #60000, processed 11013723 words, keeping 76781 word types
2018-11-24 13:03:25,196 : INFO : PROGRESS: at sentence #70000, processed 12637525 words, keeping 83194 word types
2018-11-24 13:03:25,662 : INFO : PROG

2018-11-24 13:04:13,890 : INFO : EPOCH 1 - PROGRESS: at 78.81% examples, 642839 words/s, in_qsize 16, out_qsize 4
2018-11-24 13:04:14,902 : INFO : EPOCH 1 - PROGRESS: at 81.21% examples, 645202 words/s, in_qsize 18, out_qsize 2
2018-11-24 13:04:15,921 : INFO : EPOCH 1 - PROGRESS: at 83.63% examples, 646809 words/s, in_qsize 17, out_qsize 2
2018-11-24 13:04:16,926 : INFO : EPOCH 1 - PROGRESS: at 86.08% examples, 649552 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:04:17,926 : INFO : EPOCH 1 - PROGRESS: at 88.52% examples, 649899 words/s, in_qsize 17, out_qsize 2
2018-11-24 13:04:18,938 : INFO : EPOCH 1 - PROGRESS: at 90.83% examples, 649904 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:04:19,951 : INFO : EPOCH 1 - PROGRESS: at 92.96% examples, 649184 words/s, in_qsize 20, out_qsize 0
2018-11-24 13:04:20,970 : INFO : EPOCH 1 - PROGRESS: at 95.06% examples, 648118 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:04:21,986 : INFO : EPOCH 1 - PROGRESS: at 97.29% examples, 647747 words/s,

2018-11-24 13:05:11,757 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-24 13:05:11,769 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-11-24 13:05:11,773 : INFO : EPOCH - 2 : training on 41519355 raw words (30350511 effective words) took 48.5s, 625787 effective words/s
2018-11-24 13:05:12,816 : INFO : EPOCH 3 - PROGRESS: at 2.21% examples, 687730 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:05:13,828 : INFO : EPOCH 3 - PROGRESS: at 4.65% examples, 707486 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:05:14,851 : INFO : EPOCH 3 - PROGRESS: at 7.08% examples, 713019 words/s, in_qsize 17, out_qsize 2
2018-11-24 13:05:15,872 : INFO : EPOCH 3 - PROGRESS: at 9.22% examples, 705319 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:05:16,874 : INFO : EPOCH 3 - PROGRESS: at 11.10% examples, 707633 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:05:17,907 : INFO : EPOCH 3 - PROGRESS: at 13.14% examples, 708191 words/s, in_qsize 16, out_qsize 

2018-11-24 13:06:13,721 : INFO : EPOCH 4 - PROGRESS: at 23.33% examples, 719238 words/s, in_qsize 17, out_qsize 2
2018-11-24 13:06:14,736 : INFO : EPOCH 4 - PROGRESS: at 25.37% examples, 716548 words/s, in_qsize 16, out_qsize 4
2018-11-24 13:06:15,754 : INFO : EPOCH 4 - PROGRESS: at 27.94% examples, 713307 words/s, in_qsize 20, out_qsize 0
2018-11-24 13:06:16,762 : INFO : EPOCH 4 - PROGRESS: at 30.22% examples, 708801 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:06:17,790 : INFO : EPOCH 4 - PROGRESS: at 32.66% examples, 704502 words/s, in_qsize 17, out_qsize 2
2018-11-24 13:06:18,819 : INFO : EPOCH 4 - PROGRESS: at 35.07% examples, 704014 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:06:19,822 : INFO : EPOCH 4 - PROGRESS: at 37.60% examples, 704843 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:06:20,829 : INFO : EPOCH 4 - PROGRESS: at 40.01% examples, 703035 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:06:21,864 : INFO : EPOCH 4 - PROGRESS: at 42.49% examples, 700384 words/s,

2018-11-24 13:07:18,881 : INFO : EPOCH 5 - PROGRESS: at 73.89% examples, 668037 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:07:19,916 : INFO : EPOCH 5 - PROGRESS: at 75.36% examples, 661623 words/s, in_qsize 20, out_qsize 1
2018-11-24 13:07:20,934 : INFO : EPOCH 5 - PROGRESS: at 77.13% examples, 658702 words/s, in_qsize 17, out_qsize 3
2018-11-24 13:07:21,941 : INFO : EPOCH 5 - PROGRESS: at 79.15% examples, 657859 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:07:22,952 : INFO : EPOCH 5 - PROGRESS: at 81.31% examples, 657784 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:07:23,968 : INFO : EPOCH 5 - PROGRESS: at 83.59% examples, 658211 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:07:25,016 : INFO : EPOCH 5 - PROGRESS: at 85.21% examples, 653876 words/s, in_qsize 14, out_qsize 5
2018-11-24 13:07:26,019 : INFO : EPOCH 5 - PROGRESS: at 87.21% examples, 651591 words/s, in_qsize 19, out_qsize 1
2018-11-24 13:07:27,047 : INFO : EPOCH 5 - PROGRESS: at 89.10% examples, 648192 words/s,

In [9]:
# NOTE(review): the constructor call has already run training epochs on
# `documents`; this continues training for another 10 epochs on the same corpus.
word2vec_model.train(
    documents,
    total_examples=len(documents),
    epochs=10,
)

2018-11-24 13:09:43,673 : INFO : training model with 10 workers on 70538 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2018-11-24 13:09:44,691 : INFO : EPOCH 1 - PROGRESS: at 2.10% examples, 661895 words/s, in_qsize 20, out_qsize 0
2018-11-24 13:09:45,707 : INFO : EPOCH 1 - PROGRESS: at 4.16% examples, 636344 words/s, in_qsize 20, out_qsize 0
2018-11-24 13:09:46,727 : INFO : EPOCH 1 - PROGRESS: at 6.45% examples, 654860 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:09:47,755 : INFO : EPOCH 1 - PROGRESS: at 8.62% examples, 655579 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:09:48,763 : INFO : EPOCH 1 - PROGRESS: at 10.54% examples, 664226 words/s, in_qsize 20, out_qsize 1
2018-11-24 13:09:49,787 : INFO : EPOCH 1 - PROGRESS: at 12.52% examples, 678656 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:09:50,790 : INFO : EPOCH 1 - PROGRESS: at 14.52% examples, 675952 words/s, in_qsize 20, out_qsize 6
2018-11-24 13:09:51,818 : INFO : EPOCH 1 - PROGRESS: a

2018-11-24 13:10:46,844 : INFO : EPOCH 2 - PROGRESS: at 26.51% examples, 680210 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:10:47,896 : INFO : EPOCH 2 - PROGRESS: at 29.09% examples, 679628 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:10:48,899 : INFO : EPOCH 2 - PROGRESS: at 31.67% examples, 681281 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:10:49,900 : INFO : EPOCH 2 - PROGRESS: at 34.12% examples, 684165 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:10:50,908 : INFO : EPOCH 2 - PROGRESS: at 36.76% examples, 687696 words/s, in_qsize 17, out_qsize 5
2018-11-24 13:10:51,909 : INFO : EPOCH 2 - PROGRESS: at 39.46% examples, 691414 words/s, in_qsize 16, out_qsize 3
2018-11-24 13:10:52,935 : INFO : EPOCH 2 - PROGRESS: at 42.12% examples, 691948 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:10:53,941 : INFO : EPOCH 2 - PROGRESS: at 44.70% examples, 692844 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:10:54,972 : INFO : EPOCH 2 - PROGRESS: at 47.35% examples, 695452 words/s,

2018-11-24 13:11:51,586 : INFO : EPOCH 3 - PROGRESS: at 77.00% examples, 662575 words/s, in_qsize 20, out_qsize 0
2018-11-24 13:11:52,600 : INFO : EPOCH 3 - PROGRESS: at 79.40% examples, 664652 words/s, in_qsize 20, out_qsize 0
2018-11-24 13:11:53,612 : INFO : EPOCH 3 - PROGRESS: at 81.72% examples, 665716 words/s, in_qsize 19, out_qsize 1
2018-11-24 13:11:54,613 : INFO : EPOCH 3 - PROGRESS: at 83.67% examples, 663575 words/s, in_qsize 16, out_qsize 3
2018-11-24 13:11:55,639 : INFO : EPOCH 3 - PROGRESS: at 85.94% examples, 664156 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:11:56,643 : INFO : EPOCH 3 - PROGRESS: at 88.63% examples, 666034 words/s, in_qsize 20, out_qsize 0
2018-11-24 13:11:57,649 : INFO : EPOCH 3 - PROGRESS: at 91.11% examples, 666973 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:11:58,664 : INFO : EPOCH 3 - PROGRESS: at 93.38% examples, 667001 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:11:59,690 : INFO : EPOCH 3 - PROGRESS: at 95.60% examples, 665882 words/s,

2018-11-24 13:12:47,496 : INFO : EPOCH 5 - PROGRESS: at 6.69% examples, 677569 words/s, in_qsize 20, out_qsize 1
2018-11-24 13:12:48,507 : INFO : EPOCH 5 - PROGRESS: at 8.63% examples, 657676 words/s, in_qsize 20, out_qsize 1
2018-11-24 13:12:49,524 : INFO : EPOCH 5 - PROGRESS: at 10.43% examples, 657609 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:12:50,534 : INFO : EPOCH 5 - PROGRESS: at 12.15% examples, 657015 words/s, in_qsize 20, out_qsize 2
2018-11-24 13:12:51,557 : INFO : EPOCH 5 - PROGRESS: at 14.32% examples, 665687 words/s, in_qsize 19, out_qsize 1
2018-11-24 13:12:52,583 : INFO : EPOCH 5 - PROGRESS: at 16.31% examples, 666949 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:12:53,598 : INFO : EPOCH 5 - PROGRESS: at 18.15% examples, 666926 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:12:54,599 : INFO : EPOCH 5 - PROGRESS: at 20.01% examples, 671700 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:12:55,670 : INFO : EPOCH 5 - PROGRESS: at 22.33% examples, 677228 words/s, i

2018-11-24 13:13:52,643 : INFO : EPOCH 6 - PROGRESS: at 34.54% examples, 693348 words/s, in_qsize 20, out_qsize 0
2018-11-24 13:13:53,644 : INFO : EPOCH 6 - PROGRESS: at 36.89% examples, 692408 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:13:54,667 : INFO : EPOCH 6 - PROGRESS: at 39.38% examples, 691904 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:13:55,685 : INFO : EPOCH 6 - PROGRESS: at 42.26% examples, 696062 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:13:56,719 : INFO : EPOCH 6 - PROGRESS: at 44.92% examples, 696524 words/s, in_qsize 15, out_qsize 4
2018-11-24 13:13:57,725 : INFO : EPOCH 6 - PROGRESS: at 47.22% examples, 695051 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:13:58,767 : INFO : EPOCH 6 - PROGRESS: at 49.66% examples, 693911 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:13:59,770 : INFO : EPOCH 6 - PROGRESS: at 52.22% examples, 696442 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:14:00,770 : INFO : EPOCH 6 - PROGRESS: at 54.63% examples, 698401 words/s,

2018-11-24 13:14:57,429 : INFO : EPOCH 7 - PROGRESS: at 65.86% examples, 535242 words/s, in_qsize 20, out_qsize 0
2018-11-24 13:14:58,475 : INFO : EPOCH 7 - PROGRESS: at 68.20% examples, 538363 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:14:59,496 : INFO : EPOCH 7 - PROGRESS: at 70.17% examples, 540263 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:15:00,509 : INFO : EPOCH 7 - PROGRESS: at 72.47% examples, 543501 words/s, in_qsize 16, out_qsize 3
2018-11-24 13:15:01,522 : INFO : EPOCH 7 - PROGRESS: at 74.74% examples, 545633 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:15:02,559 : INFO : EPOCH 7 - PROGRESS: at 75.86% examples, 541018 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:15:03,568 : INFO : EPOCH 7 - PROGRESS: at 77.44% examples, 540211 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:15:04,588 : INFO : EPOCH 7 - PROGRESS: at 79.06% examples, 539291 words/s, in_qsize 20, out_qsize 0
2018-11-24 13:15:05,623 : INFO : EPOCH 7 - PROGRESS: at 80.78% examples, 538611 words/s,

2018-11-24 13:16:02,390 : INFO : EPOCH 8 - PROGRESS: at 96.29% examples, 642023 words/s, in_qsize 20, out_qsize 0
2018-11-24 13:16:03,406 : INFO : EPOCH 8 - PROGRESS: at 98.66% examples, 642671 words/s, in_qsize 20, out_qsize 1
2018-11-24 13:16:03,860 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-11-24 13:16:03,863 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-11-24 13:16:03,874 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-11-24 13:16:03,892 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-11-24 13:16:03,908 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-11-24 13:16:03,912 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-11-24 13:16:03,917 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-24 13:16:03,929 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-11-24 13:16:03,936 : INFO : worker thre

2018-11-24 13:16:58,650 : INFO : EPOCH 10 - PROGRESS: at 1.55% examples, 489453 words/s, in_qsize 17, out_qsize 2
2018-11-24 13:16:59,667 : INFO : EPOCH 10 - PROGRESS: at 3.12% examples, 478129 words/s, in_qsize 17, out_qsize 2
2018-11-24 13:17:00,670 : INFO : EPOCH 10 - PROGRESS: at 5.25% examples, 536692 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:17:01,713 : INFO : EPOCH 10 - PROGRESS: at 7.30% examples, 552809 words/s, in_qsize 18, out_qsize 1
2018-11-24 13:17:02,737 : INFO : EPOCH 10 - PROGRESS: at 9.16% examples, 560216 words/s, in_qsize 19, out_qsize 0
2018-11-24 13:17:03,741 : INFO : EPOCH 10 - PROGRESS: at 10.81% examples, 569469 words/s, in_qsize 17, out_qsize 1
2018-11-24 13:17:04,752 : INFO : EPOCH 10 - PROGRESS: at 12.26% examples, 568247 words/s, in_qsize 17, out_qsize 3
2018-11-24 13:17:05,774 : INFO : EPOCH 10 - PROGRESS: at 14.20% examples, 577331 words/s, in_qsize 20, out_qsize 1
2018-11-24 13:17:06,783 : INFO : EPOCH 10 - PROGRESS: at 15.98% examples, 580774 word

(303485913, 415193550)

## Save The Model

In [11]:
# Persist the trained model. The target directory is created first so that a
# fresh checkout without a ./weights folder does not fail at save time, after
# the long training cells have already run.
model_path = "./weights/word2vec.model"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
word2vec_model.save(model_path)

2018-11-24 13:38:46,474 : INFO : saving Word2Vec object under ./weights/word2vec.model, separately None
2018-11-24 13:38:46,478 : INFO : storing np array 'vectors' to ./weights/word2vec.model.wv.vectors.npy
2018-11-24 13:38:46,773 : INFO : not storing attribute vectors_norm
2018-11-24 13:38:46,785 : INFO : storing np array 'syn1neg' to ./weights/word2vec.model.trainables.syn1neg.npy
2018-11-24 13:38:48,093 : INFO : saved ./weights/word2vec.model


## Load The Model

In [13]:
# Restore the persisted model from disk so the cells below can run from it.
saved_model_path = "./weights/word2vec.model"
word2vec_model = gensim.models.Word2Vec.load(saved_model_path)

2018-11-24 13:41:29,226 : INFO : loading Word2Vec object from ./weights/word2vec.model
2018-11-24 13:41:40,199 : INFO : loading wv recursively from ./weights/word2vec.model.wv.* with mmap=None
2018-11-24 13:41:40,201 : INFO : loading vectors from ./weights/word2vec.model.wv.vectors.npy with mmap=None
2018-11-24 13:41:40,375 : INFO : setting ignored attribute vectors_norm to None
2018-11-24 13:41:40,386 : INFO : loading vocabulary recursively from ./weights/word2vec.model.vocabulary.* with mmap=None
2018-11-24 13:41:40,393 : INFO : loading trainables recursively from ./weights/word2vec.model.trainables.* with mmap=None
2018-11-24 13:41:40,396 : INFO : loading syn1neg from ./weights/word2vec.model.trainables.syn1neg.npy with mmap=None
2018-11-24 13:41:40,570 : INFO : loaded ./weights/word2vec.model


#### Showing model info

In [80]:
# Collect a few attributes of the loaded model into a single record for display.
model_info = dict(
    epoch=word2vec_model.epochs,
    total_corpus=word2vec_model.corpus_count,
    workers=word2vec_model.workers,
    window=word2vec_model.window,
    vector_size=word2vec_model.vector_size,
)

In [91]:
# Render the summary record as a one-row table (pandas is imported in the
# notebook's import cell rather than mid-notebook).
pd.DataFrame(data=[model_info])

Unnamed: 0,epoch,total_corpus,vector_size,window,workers
0,10,255404,150,10,10


## Test The Model

#### Finding the most related words

In [23]:
# Five nearest neighbours of the query word in the embedding space.
word = 'dirty'
similar_words = word2vec_model.wv.most_similar(positive=word, topn=5)
similar_words

[('filthy', 0.8659749031066895),
 ('unclean', 0.7855876088142395),
 ('stained', 0.7721947431564331),
 ('smelly', 0.7647020220756531),
 ('dusty', 0.7624214291572571)]

In [24]:
# Five nearest neighbours of the query word in the embedding space.
word = 'polite'
similar_words = word2vec_model.wv.most_similar(positive=word, topn=5)
similar_words

[('courteous', 0.9253534078598022),
 ('friendly', 0.8423811793327332),
 ('cordial', 0.8059393167495728),
 ('curteous', 0.8040379285812378),
 ('professional', 0.7898988723754883)]

In [25]:
# Five nearest neighbours of the query word in the embedding space.
word = 'france'
similar_words = word2vec_model.wv.most_similar(positive=word, topn=5)
similar_words

[('spain', 0.6542340517044067),
 ('canada', 0.6508133411407471),
 ('germany', 0.6300669312477112),
 ('detroit', 0.598978579044342),
 ('mexico', 0.5975478887557983)]

In [26]:
# Five nearest neighbours of the query word in the embedding space.
word = 'shocked'
similar_words = word2vec_model.wv.most_similar(positive=word, topn=5)
similar_words

[('horrified', 0.8133686780929565),
 ('amazed', 0.7889528870582581),
 ('stunned', 0.7694660425186157),
 ('astonished', 0.7680845260620117),
 ('appalled', 0.7520051598548889)]

In [33]:
# Look for bedding-related terms: the positive words act as context, the
# negative word marks what should not be considered related.
bedding_context = ['bed', 'sheet', 'pillow']
excluded_terms = ['mattress']
word2vec_model.wv.most_similar(
    positive=bedding_context,
    negative=excluded_terms,
    topn=5,
)

[('blanket', 0.6604424118995667),
 ('duvet', 0.6593495011329651),
 ('quilt', 0.6290340423583984),
 ('satin', 0.6129457950592041),
 ('pillowcase', 0.6124125719070435)]

#### Finding similarity between two words in the vocabulary

In [38]:
# Sanity check: a word compared with itself.
word2vec_model.wv.similarity("dirty", "dirty")

1.0000000000000002

In [39]:
# Two words that appeared as close neighbours in the earlier query.
word2vec_model.wv.similarity('dirty', 'smelly')

0.7647020148051664

In [40]:
# A pair with opposite meanings, for comparison with the scores above.
word2vec_model.wv.similarity("dirty", "clean")

0.2601841075222059

#### Finding the odd one out

In [41]:
# Ask the model which word fits least with the others.
candidates = ['cat', 'dog', 'france']
word2vec_model.wv.doesnt_match(candidates)

'france'

In [43]:
# Ask the model which word fits least with the others.
candidates = ['bed', 'pillow', 'duvet', 'shower']
word2vec_model.wv.doesnt_match(candidates)

'shower'

## Great!

#### <hr/>