In [1]:
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [2]:
data_file="reviews_data.txt.gz"

with gzip.open ('reviews_data.txt.gz', 'rb') as f:
    for i,line in enumerate (f):
        print(line)
        break


b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

In [3]:
def read_input(input_file):
    """This method reads the input file which is in gzip format"""
    
    logging.info("reading file {0}...this may take a while".format(input_file))
    
    with gzip.open (input_file, 'rb') as f:
        for i, line in enumerate (f): 

            if (i%10000==0):
                logging.info ("read {0} reviews".format (i))
            # do some pre-processing and return a list of words for each review text
            yield gensim.utils.simple_preprocess (line)

# read the tokenized reviews into a list
# each review item becomes a serries of words
# so this becomes a list of lists
documents = list (read_input (data_file))
logging.info ("Done reading data file")

2018-12-18 23:04:45,742 : INFO : reading file reviews_data.txt.gz...this may take a while
2018-12-18 23:04:45,757 : INFO : read 0 reviews
2018-12-18 23:04:48,567 : INFO : read 10000 reviews
2018-12-18 23:04:51,658 : INFO : read 20000 reviews
2018-12-18 23:04:55,318 : INFO : read 30000 reviews
2018-12-18 23:04:59,522 : INFO : read 40000 reviews
2018-12-18 23:05:02,741 : INFO : read 50000 reviews
2018-12-18 23:05:06,186 : INFO : read 60000 reviews
2018-12-18 23:05:08,668 : INFO : read 70000 reviews
2018-12-18 23:05:12,250 : INFO : read 80000 reviews
2018-12-18 23:05:15,158 : INFO : read 90000 reviews
2018-12-18 23:05:17,800 : INFO : read 100000 reviews
2018-12-18 23:05:20,319 : INFO : read 110000 reviews
2018-12-18 23:05:25,120 : INFO : read 120000 reviews
2018-12-18 23:05:28,510 : INFO : read 130000 reviews
2018-12-18 23:05:32,030 : INFO : read 140000 reviews
2018-12-18 23:05:34,374 : INFO : read 150000 reviews
2018-12-18 23:05:36,911 : INFO : read 160000 reviews
2018-12-18 23:05:43,765

In [4]:
model = gensim.models.Word2Vec (documents, size=152, window=8, min_count=5, workers=10)
model.train(documents,total_examples=len(documents),epochs=10)

2018-12-18 23:06:13,288 : INFO : collecting all words and their counts
2018-12-18 23:06:13,288 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-12-18 23:06:13,686 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2018-12-18 23:06:14,123 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2018-12-18 23:06:14,516 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2018-12-18 23:06:14,891 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2018-12-18 23:06:15,410 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2018-12-18 23:06:16,047 : INFO : PROGRESS: at sentence #60000, processed 11013723 words, keeping 76781 word types
2018-12-18 23:06:16,479 : INFO : PROGRESS: at sentence #70000, processed 12637525 words, keeping 83194 word types
2018-12-18 23:06:16,852 : INFO : PROG

2018-12-18 23:07:04,456 : INFO : EPOCH 1 - PROGRESS: at 85.14% examples, 671596 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:07:05,480 : INFO : EPOCH 1 - PROGRESS: at 87.64% examples, 672172 words/s, in_qsize 20, out_qsize 0
2018-12-18 23:07:06,507 : INFO : EPOCH 1 - PROGRESS: at 90.49% examples, 675047 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:07:07,527 : INFO : EPOCH 1 - PROGRESS: at 92.86% examples, 675006 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:07:08,521 : INFO : EPOCH 1 - PROGRESS: at 95.17% examples, 674871 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:07:09,534 : INFO : EPOCH 1 - PROGRESS: at 98.15% examples, 679115 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:07:10,284 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-12-18 23:07:10,292 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-12-18 23:07:10,300 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-12-18 23:07:10,304 : INFO : worker thr

2018-12-18 23:08:00,865 : INFO : EPOCH 3 - PROGRESS: at 17.01% examples, 696692 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:08:01,914 : INFO : EPOCH 3 - PROGRESS: at 18.55% examples, 678284 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:08:02,935 : INFO : EPOCH 3 - PROGRESS: at 20.37% examples, 680627 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:08:03,954 : INFO : EPOCH 3 - PROGRESS: at 22.68% examples, 687113 words/s, in_qsize 20, out_qsize 2
2018-12-18 23:08:04,958 : INFO : EPOCH 3 - PROGRESS: at 24.43% examples, 686182 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:08:05,974 : INFO : EPOCH 3 - PROGRESS: at 26.82% examples, 684281 words/s, in_qsize 17, out_qsize 2
2018-12-18 23:08:06,968 : INFO : EPOCH 3 - PROGRESS: at 29.05% examples, 677924 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:08:08,004 : INFO : EPOCH 3 - PROGRESS: at 31.34% examples, 673904 words/s, in_qsize 20, out_qsize 0
2018-12-18 23:08:09,003 : INFO : EPOCH 3 - PROGRESS: at 33.40% examples, 667851 words/s,

2018-12-18 23:09:05,373 : INFO : EPOCH 4 - PROGRESS: at 76.05% examples, 790474 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:09:06,399 : INFO : EPOCH 4 - PROGRESS: at 78.96% examples, 794271 words/s, in_qsize 17, out_qsize 2
2018-12-18 23:09:07,410 : INFO : EPOCH 4 - PROGRESS: at 82.00% examples, 797875 words/s, in_qsize 16, out_qsize 3
2018-12-18 23:09:08,420 : INFO : EPOCH 4 - PROGRESS: at 84.84% examples, 800228 words/s, in_qsize 20, out_qsize 1
2018-12-18 23:09:09,415 : INFO : EPOCH 4 - PROGRESS: at 88.17% examples, 804237 words/s, in_qsize 18, out_qsize 1
2018-12-18 23:09:10,426 : INFO : EPOCH 4 - PROGRESS: at 91.33% examples, 806828 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:09:11,428 : INFO : EPOCH 4 - PROGRESS: at 94.52% examples, 810410 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:09:12,440 : INFO : EPOCH 4 - PROGRESS: at 97.53% examples, 811731 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:09:13,141 : INFO : worker thread finished; awaiting finish of 9 more threa

2018-12-18 23:09:58,128 : INFO : EPOCH 1 - PROGRESS: at 14.87% examples, 693354 words/s, in_qsize 17, out_qsize 2
2018-12-18 23:09:59,145 : INFO : EPOCH 1 - PROGRESS: at 17.48% examples, 720333 words/s, in_qsize 20, out_qsize 1
2018-12-18 23:10:00,144 : INFO : EPOCH 1 - PROGRESS: at 19.81% examples, 737420 words/s, in_qsize 19, out_qsize 1
2018-12-18 23:10:01,161 : INFO : EPOCH 1 - PROGRESS: at 22.29% examples, 747064 words/s, in_qsize 20, out_qsize 2
2018-12-18 23:10:02,172 : INFO : EPOCH 1 - PROGRESS: at 24.47% examples, 755274 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:10:03,177 : INFO : EPOCH 1 - PROGRESS: at 26.70% examples, 743018 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:10:04,199 : INFO : EPOCH 1 - PROGRESS: at 29.12% examples, 734897 words/s, in_qsize 17, out_qsize 2
2018-12-18 23:10:05,204 : INFO : EPOCH 1 - PROGRESS: at 32.22% examples, 743342 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:10:06,200 : INFO : EPOCH 1 - PROGRESS: at 35.27% examples, 753716 words/s,

2018-12-18 23:11:02,499 : INFO : EPOCH 2 - PROGRESS: at 95.91% examples, 894321 words/s, in_qsize 20, out_qsize 1
2018-12-18 23:11:03,519 : INFO : EPOCH 2 - PROGRESS: at 99.06% examples, 894119 words/s, in_qsize 18, out_qsize 1
2018-12-18 23:11:03,701 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-12-18 23:11:03,724 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-12-18 23:11:03,739 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-12-18 23:11:03,749 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-12-18 23:11:03,775 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-12-18 23:11:03,785 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-12-18 23:11:03,785 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-12-18 23:11:03,789 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-12-18 23:11:03,789 : INFO : worker thre

2018-12-18 23:11:58,404 : INFO : EPOCH 4 - PROGRESS: at 53.27% examples, 814592 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:11:59,404 : INFO : EPOCH 4 - PROGRESS: at 56.19% examples, 814369 words/s, in_qsize 16, out_qsize 3
2018-12-18 23:12:00,443 : INFO : EPOCH 4 - PROGRESS: at 58.81% examples, 811102 words/s, in_qsize 18, out_qsize 1
2018-12-18 23:12:01,441 : INFO : EPOCH 4 - PROGRESS: at 61.39% examples, 807829 words/s, in_qsize 16, out_qsize 3
2018-12-18 23:12:02,442 : INFO : EPOCH 4 - PROGRESS: at 64.72% examples, 811539 words/s, in_qsize 17, out_qsize 2
2018-12-18 23:12:03,450 : INFO : EPOCH 4 - PROGRESS: at 67.71% examples, 815113 words/s, in_qsize 20, out_qsize 0
2018-12-18 23:12:04,473 : INFO : EPOCH 4 - PROGRESS: at 70.74% examples, 819525 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:12:05,480 : INFO : EPOCH 4 - PROGRESS: at 74.00% examples, 822978 words/s, in_qsize 16, out_qsize 3
2018-12-18 23:12:06,489 : INFO : EPOCH 4 - PROGRESS: at 76.94% examples, 826414 words/s,

2018-12-18 23:12:53,350 : INFO : EPOCH 6 - PROGRESS: at 11.15% examples, 894686 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:12:54,391 : INFO : EPOCH 6 - PROGRESS: at 13.69% examples, 891265 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:12:55,398 : INFO : EPOCH 6 - PROGRESS: at 16.28% examples, 889509 words/s, in_qsize 17, out_qsize 2
2018-12-18 23:12:56,408 : INFO : EPOCH 6 - PROGRESS: at 18.83% examples, 893956 words/s, in_qsize 17, out_qsize 2
2018-12-18 23:12:57,412 : INFO : EPOCH 6 - PROGRESS: at 21.38% examples, 898887 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:12:58,425 : INFO : EPOCH 6 - PROGRESS: at 23.81% examples, 897458 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:12:59,422 : INFO : EPOCH 6 - PROGRESS: at 26.87% examples, 898893 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:13:00,449 : INFO : EPOCH 6 - PROGRESS: at 30.11% examples, 897163 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:13:01,471 : INFO : EPOCH 6 - PROGRESS: at 33.34% examples, 894680 words/s,

2018-12-18 23:13:57,671 : INFO : EPOCH 7 - PROGRESS: at 94.93% examples, 836043 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:13:58,688 : INFO : EPOCH 7 - PROGRESS: at 98.07% examples, 837834 words/s, in_qsize 16, out_qsize 3
2018-12-18 23:13:59,207 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-12-18 23:13:59,219 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-12-18 23:13:59,234 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-12-18 23:13:59,256 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-12-18 23:13:59,264 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-12-18 23:13:59,264 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-12-18 23:13:59,272 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-12-18 23:13:59,289 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-12-18 23:13:59,316 : INFO : worker thre

2018-12-18 23:14:53,540 : INFO : EPOCH 9 - PROGRESS: at 53.73% examples, 823318 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:14:54,544 : INFO : EPOCH 9 - PROGRESS: at 57.00% examples, 827962 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:14:55,544 : INFO : EPOCH 9 - PROGRESS: at 60.03% examples, 829962 words/s, in_qsize 18, out_qsize 1
2018-12-18 23:14:56,550 : INFO : EPOCH 9 - PROGRESS: at 63.12% examples, 831912 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:14:57,589 : INFO : EPOCH 9 - PROGRESS: at 66.42% examples, 834613 words/s, in_qsize 19, out_qsize 2
2018-12-18 23:14:58,624 : INFO : EPOCH 9 - PROGRESS: at 69.54% examples, 837746 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:14:59,652 : INFO : EPOCH 9 - PROGRESS: at 72.62% examples, 840463 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:15:00,667 : INFO : EPOCH 9 - PROGRESS: at 75.63% examples, 841776 words/s, in_qsize 19, out_qsize 0
2018-12-18 23:15:01,685 : INFO : EPOCH 9 - PROGRESS: at 78.52% examples, 843702 words/s,

(302558595, 415193550)

In [5]:
w1 = "dirty"
model.wv.most_similar (positive=w1)


2018-12-18 23:15:43,132 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('filthy', 0.8615992665290833),
 ('unclean', 0.7765707969665527),
 ('stained', 0.7706811428070068),
 ('dusty', 0.767686665058136),
 ('grubby', 0.7505663633346558),
 ('smelly', 0.7297718524932861),
 ('soiled', 0.7291343808174133),
 ('dingy', 0.7252703905105591),
 ('grimy', 0.7180421948432922),
 ('disgusting', 0.704257071018219)]

In [6]:
# look up top 6 words similar to 'polite'
w1 = ["polite"]
model.wv.most_similar (positive=w1,topn=6)

[('courteous', 0.9309547543525696),
 ('friendly', 0.8372588157653809),
 ('professional', 0.8213374614715576),
 ('cordial', 0.820091962814331),
 ('attentive', 0.8005121946334839),
 ('personable', 0.7917860150337219)]

In [7]:
# look up top 6 words similar to 'france'
w1 = ["france"]
model.wv.most_similar (positive=w1,topn=6)

[('germany', 0.7076193690299988),
 ('canada', 0.6501572728157043),
 ('barcelona', 0.6208020448684692),
 ('spain', 0.6119197010993958),
 ('hawaii', 0.6101654767990112),
 ('austria', 0.5945177674293518)]

In [8]:
# look up top 6 words similar to 'shocked'
w1 = ["shocked"]
model.wv.most_similar (positive=w1,topn=6)

[('horrified', 0.8167334794998169),
 ('amazed', 0.8055417537689209),
 ('astonished', 0.7929306030273438),
 ('appalled', 0.7908390760421753),
 ('dismayed', 0.7825530767440796),
 ('stunned', 0.7590216994285583)]

In [9]:
# get everything related to stuff on the bed
w1 = ["bed",'sheet','pillow']
w2 = ['couch']
model.wv.most_similar (positive=w1,negative=w2,topn=10)

[('duvet', 0.7238868474960327),
 ('blanket', 0.715964674949646),
 ('mattress', 0.6989595890045166),
 ('quilt', 0.6865867972373962),
 ('matress', 0.6769784092903137),
 ('pillowcase', 0.6749173998832703),
 ('pillows', 0.6466516852378845),
 ('sheets', 0.6445187330245972),
 ('foam', 0.6302602887153625),
 ('comforter', 0.6183351874351501)]

In [10]:
# similarity between two different words
model.wv.similarity(w1="dirty",w2="smelly")

0.7297719

In [11]:
# similarity between two identical words
model.wv.similarity(w1="dirty",w2="dirty")

1.0

In [12]:
# similarity between two unrelated words
model.wv.similarity(w1="dirty",w2="clean")

0.29900852

In [13]:
# Which one is the odd one out in this list?
model.wv.doesnt_match(["cat","dog","france"])

'france'

In [14]:
# Which one is the odd one out in this list?
model.wv.doesnt_match(["bed","pillow","duvet","shower"])

'shower'