In [11]:
# pip install fasttext
# pip install gensim

In [1]:
import gensim 
import logging

In [2]:
# Peek at the first raw review to see how the file is laid out. The file is
# opened in binary mode, so the line is printed as a bytes literal.
with open("./media/reviews_data.txt", 'rb') as reviews_file:
    first_review = next(reviews_file, None)  # None when the file has no lines
    if first_review is not None:
        print(first_review)

b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

In [3]:
def read_input(input_file):
    """Lazily tokenise a reviews file, one review per line.

    The file is opened with the built-in ``open`` in binary mode, so it must
    be plain (uncompressed) text. Each raw line is passed to
    ``gensim.utils.simple_preprocess``.

    Args:
        input_file: Path of the reviews file to read.

    Yields:
        The value returned by ``gensim.utils.simple_preprocess`` for each
        line of the file, in file order.
    """
    # Arguments are passed to the logger lazily, so the message is only
    # formatted when an INFO record is actually emitted.
    # NOTE(review): no logging configuration is visible in this notebook, so
    # these INFO messages may not be displayed - confirm.
    logging.info("reading file %s...this may take a while", input_file)
    with open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            # Progress marker every 10,000 lines (including line 0).
            if i % 10000 == 0:
                logging.info("read %d reviews", i)
            yield gensim.utils.simple_preprocess(line)
# Drain the read_input generator into a list holding one entry per review.
documents = [review_tokens for review_tokens in read_input("./data/reviews_data.txt")]

In [23]:
# NOTE(review): this displays the whole tokenised corpus; a slice such as
# documents[:1] would keep the cell output short.
documents

[['oct',
  'nice',
  'trendy',
  'hotel',
  'location',
  'not',
  'too',
  'bad',
  'stayed',
  'in',
  'this',
  'hotel',
  'for',
  'one',
  'night',
  'as',
  'this',
  'is',
  'fairly',
  'new',
  'place',
  'some',
  'of',
  'the',
  'taxi',
  'drivers',
  'did',
  'not',
  'know',
  'where',
  'it',
  'was',
  'and',
  'or',
  'did',
  'not',
  'want',
  'to',
  'drive',
  'there',
  'once',
  'have',
  'eventually',
  'arrived',
  'at',
  'the',
  'hotel',
  'was',
  'very',
  'pleasantly',
  'surprised',
  'with',
  'the',
  'decor',
  'of',
  'the',
  'lobby',
  'ground',
  'floor',
  'area',
  'it',
  'was',
  'very',
  'stylish',
  'and',
  'modern',
  'found',
  'the',
  'reception',
  'staff',
  'geeting',
  'me',
  'with',
  'aloha',
  'bit',
  'out',
  'of',
  'place',
  'but',
  'guess',
  'they',
  'are',
  'briefed',
  'to',
  'say',
  'that',
  'to',
  'keep',
  'up',
  'the',
  'coroporate',
  'image',
  'as',
  'have',
  'starwood',
  'preferred',
  'guest',
  'me

In [4]:
# Build a Word2Vec model from the tokenised reviews. Per the gensim docs:
#   window    - maximum distance between the current and the predicted word
#   min_count - words with a total frequency below this are ignored
#   workers   - number of worker threads used for training
# NOTE(review): no seed is passed, so neighbour lists and scores may differ
# between runs - confirm whether reproducibility matters here.
model = gensim.models.Word2Vec(documents, window=10, min_count=2, workers=10)
model

<gensim.models.word2vec.Word2Vec at 0x12824e2b0>

In [9]:
# Six words whose vectors are most similar to "roomy".
w1 = ["roomy"]
model.wv.most_similar(positive=w1, topn=6)

[('spacious', 0.9046103954315186),
 ('large', 0.8283935189247131),
 ('compact', 0.7032818794250488),
 ('spaceous', 0.6754883527755737),
 ('small', 0.6571791172027588),
 ('spacy', 0.6536424160003662)]

### Some issues with the embeddings

The models learn what's in the data:

1. Data may contain errors, biases, and inaccuracies

2. The similarity may not always be intuitive

3. The results will contain inaccuracies

In [10]:
# Six words whose vectors are most similar to "polite".
w1 = ["polite"]
model.wv.most_similar(positive=w1, topn=6)

[('courteous', 0.9283668398857117),
 ('curteous', 0.8775351643562317),
 ('cordial', 0.8566290140151978),
 ('curtious', 0.8535336256027222),
 ('courtious', 0.8521389365196228),
 ('friendly', 0.847969651222229)]

In [11]:
# Six words whose vectors are most similar to "france".
w1 = ["france"]
model.wv.most_similar(positive=w1, topn=6)

[('germany', 0.7804439663887024),
 ('canada', 0.7552111148834229),
 ('england', 0.7433517575263977),
 ('russia', 0.7276957035064697),
 ('greece', 0.7154693007469177),
 ('spain', 0.6951144337654114)]

In [13]:
# Six words whose vectors are most similar to "dirty".
w1 = ["dirty"]
model.wv.most_similar(positive=w1, topn=6)

[('filthy', 0.89163738489151),
 ('stained', 0.8257049322128296),
 ('unclean', 0.8242440223693848),
 ('dusty', 0.8185930252075195),
 ('grubby', 0.8114493489265442),
 ('mouldy', 0.7990272641181946)]

In [25]:
# Six words whose vectors are most similar to "clean".
w1 = ["clean"]
model.wv.most_similar(positive=w1, topn=6)

[('spotless', 0.7940735220909119),
 ('immaculate', 0.7324411273002625),
 ('spatious', 0.5915030837059021),
 ('smallish', 0.5749716758728027),
 ('roomy', 0.5598531365394592),
 ('compact', 0.5552959442138672)]

### Facebook's FastText
> FastText is an open-source, free, lightweight library that allows users to learn text representations and text classifiers. It works on standard, generic hardware. Models can later be reduced in size to even fit on mobile devices.



![](https://www.dropbox.com/s/i74guibnv5mxx2h/fasttext.png?dl=1)

https://fasttext.cc/

In [7]:
# Load the words to keep, one entry per line with trailing whitespace removed.
# The context manager closes the file as soon as the list is built instead of
# leaving the handle open for the garbage collector.
with open("./data/words_to_keep") as words_file:
    keep_words = [word.rstrip() for word in words_file]
keep_words

['Athens',
 'Greece',
 'Bangkok',
 'Thailand',
 'Latvia',
 'lats',
 'Bulgaria',
 'lev',
 'bad',
 'worse',
 'big',
 'bigger',
 'boy',
 'girl',
 'brother',
 'sister']

In [16]:
import numpy as np

# Collect the embedding vector of every word listed in keep_words.
# NOTE(review): absolute, machine-specific path - point it at your own copy of
# the vectors file before running.
words_embeds = {}
keep_lookup = set(keep_words)  # constant-time membership test for each line
with open("/Users/mahdi/Downloads/wiki-news-300d-1M.vec") as vec_file:
    for line in vec_file:
        data = line.split()
        # First field is the token, the remaining fields are parsed as floats.
        # Blank lines are skipped; they would otherwise raise IndexError.
        if data and data[0] in keep_lookup:
            words_embeds[data[0]] = np.array([float(value) for value in data[1:]])



In [27]:
# Entries of keep_words that were found in the vectors file.
words_embeds.keys()

dict_keys(['big', 'bad', 'girl', 'Greece', 'boy', 'brother', 'worse', 'sister', 'bigger', 'Thailand', 'Bulgaria', 'Athens', 'Latvia', 'Bangkok', 'lats', 'lev'])

In [29]:
# Word analogy by vector arithmetic: girl - boy + brother.
girl_minus_boy = words_embeds["girl"] - words_embeds["boy"]
res = girl_minus_boy + words_embeds["brother"]
res

array([ 0.3193, -0.0592, -0.0461, -0.1192, -0.0269,  0.0761, -0.0135,
       -0.0527, -0.0506,  0.073 ,  0.0514, -0.1087,  0.0921, -0.0437,
        0.0175,  0.204 , -0.0187, -0.0671,  0.1144, -0.0361, -0.1047,
        0.1144, -0.2408,  0.0436,  0.0406, -0.0068, -0.1024,  0.1106,
       -0.0419, -0.1826,  0.1547,  0.0084, -0.2653,  0.1108, -0.1934,
        0.152 , -0.1   , -0.0516,  0.0547, -0.0557, -0.0244,  0.1113,
       -0.0535,  0.0241,  0.0024, -0.0016, -0.0264,  0.0544,  0.0263,
       -0.0619,  0.0101, -0.0284, -0.6605,  0.12  ,  0.051 , -0.0162,
       -0.0769,  0.1983,  0.0784,  0.0058,  0.0147, -0.0124, -0.0843,
       -0.076 , -0.0489, -0.0935, -0.0857,  0.113 ,  0.0732, -0.0112,
        0.0516, -0.0555, -0.026 ,  0.0495, -0.0793,  0.1126,  0.0691,
        0.1725, -0.0754,  0.049 ,  0.036 , -0.1203, -0.0533, -0.185 ,
        0.108 , -0.073 , -0.2247, -0.2533,  0.1525,  0.0283, -0.0906,
        0.1891,  0.0952, -0.0831,  0.1297,  0.1307, -0.0464,  0.0717,
        0.0647, -0.0

In [22]:
# Element-wise gap between the analogy vector and the "sister" embedding.
# NOTE(review): a single distance or cosine similarity would be easier to read
# than the raw array.
res - words_embeds["sister"]

array([ 9.37000000e-02,  2.73000000e-02, -2.41000000e-02, -6.40000000e-03,
        5.86000000e-02,  1.10000000e-03, -3.36000000e-02, -9.75000000e-02,
       -1.78900000e-01,  4.58000000e-02,  8.46000000e-02,  9.00000000e-02,
        1.45100000e-01, -4.40000000e-03, -2.45000000e-02,  1.24900000e-01,
       -1.52700000e-01, -1.43600000e-01,  7.58000000e-02,  9.81000000e-02,
       -1.07700000e-01,  8.69000000e-02, -1.15000000e-01, -1.19500000e-01,
        4.32000000e-02,  4.56000000e-02, -1.70000000e-03, -3.00000000e-03,
       -4.25000000e-02, -1.08300000e-01, -2.57000000e-02, -2.72000000e-02,
       -5.54000000e-02,  1.61000000e-02, -1.25300000e-01,  5.08000000e-02,
       -1.30800000e-01,  4.02000000e-02, -1.23000000e-02, -2.97000000e-02,
        1.13600000e-01,  2.98000000e-02,  7.26000000e-02,  7.70000000e-03,
        1.01700000e-01,  2.60000000e-03,  2.34000000e-02,  5.88000000e-02,
        9.53000000e-02, -7.27000000e-02,  1.85000000e-02,  4.74000000e-02,
       -1.63000000e-02,  