# NLP with Python
## Amazon Fine Food Reviews

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import spacy
import itertools as it
import codecs

nlp = spacy.load('en')
%matplotlib inline

In [2]:
base = '/home/boyto/DataSets/FoodReviews/'
reviews = pd.read_csv(base+'train.csv',encoding = 'utf8')
len(reviews)

454760

In [3]:
#reviews = reviews.sample(10000)

In [4]:
texts = reviews['Text']

In [5]:
sample_review = texts[texts.index[1]]
sample_review

u"McCann's Steel Cut Oatmeal is the perfect breakfast for people in a hurry.  All you do is put 1/2 cup of oats in a little crockpot with 2 cups of boiling water before you go to bed, then an hour before I get up the crockpot starts on high on a timer so when I get out of the shower the oatmeal is ready to go.  Just mix in some blueberries, raisins, craisins, dates, nuts or whatever you want and eat."

In [6]:
parsed_review = nlp(sample_review)

In [7]:
print parsed_review

McCann's Steel Cut Oatmeal is the perfect breakfast for people in a hurry.  All you do is put 1/2 cup of oats in a little crockpot with 2 cups of boiling water before you go to bed, then an hour before I get up the crockpot starts on high on a timer so when I get out of the shower the oatmeal is ready to go.  Just mix in some blueberries, raisins, craisins, dates, nuts or whatever you want and eat.


In [8]:
for num, sentence in enumerate(parsed_review.sents):
    print 'Sentence {}:'.format(num + 1)
    print sentence
    print ''

Sentence 1:
McCann's Steel Cut Oatmeal is the perfect breakfast for people in a hurry.  

Sentence 2:
All you do is put 1/2 cup of oats in a little crockpot with 2 cups of boiling water before you go to bed, then an hour before I get up the crockpot starts on high on a timer so when I get out of the shower the oatmeal is ready to go.  

Sentence 3:
Just mix in some blueberries, raisins, craisins, dates, nuts or whatever you want and eat.



In [9]:
for num, entity in enumerate(parsed_review.ents):
    print 'Entity {}:'.format(num + 1), entity, '-', entity.label_
    print ''

Entity 1: McCann's Steel Cut Oatmeal - ORG

Entity 2: 1/2 - CARDINAL

Entity 3: 2 - CARDINAL



In [10]:
token_text = [token.orth_ for token in parsed_review]
token_pos = [token.pos_ for token in parsed_review]

pd.DataFrame(zip(token_text, token_pos),
             columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,McCann,PROPN
1,'s,PART
2,Steel,PROPN
3,Cut,PROPN
4,Oatmeal,PROPN
5,is,VERB
6,the,DET
7,perfect,ADJ
8,breakfast,NOUN
9,for,ADP


In [11]:
token_lemma = [token.lemma_ for token in parsed_review]
token_shape = [token.shape_ for token in parsed_review]

pd.DataFrame(zip(token_text, token_lemma, token_shape),
             columns=['token_text', 'token_lemma', 'token_shape'])

Unnamed: 0,token_text,token_lemma,token_shape
0,McCann,mccann,XxXxxx
1,'s,'s,'x
2,Steel,steel,Xxxxx
3,Cut,cut,Xxx
4,Oatmeal,oatmeal,Xxxxx
5,is,be,xx
6,the,the,xxx
7,perfect,perfect,xxxx
8,breakfast,breakfast,xxxx
9,for,for,xxx


In [12]:
token_entity_type = [token.ent_type_ for token in parsed_review]
token_entity_iob = [token.ent_iob_ for token in parsed_review]

pd.DataFrame(zip(token_text, token_entity_type, token_entity_iob),
             columns=['token_text', 'entity_type', 'inside_outside_begin'])

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,McCann,ORG,B
1,'s,ORG,I
2,Steel,ORG,I
3,Cut,ORG,I
4,Oatmeal,ORG,I
5,is,,O
6,the,,O
7,perfect,,O
8,breakfast,,O
9,for,,O


In [13]:
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_review]

df = pd.DataFrame(token_attributes,
                  columns=['text',
                           'log_probability',
                           'stop?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocab.?'])

df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?']
                                       .applymap(lambda x: u'Yes' if x else u''))
                                               
df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,McCann,-14.704258,,,,,
1,'s,-4.830559,,,,,
2,Steel,-12.109809,,,,,
3,Cut,-11.754498,,,,,
4,Oatmeal,-14.012610,,,,,
5,is,-4.457749,Yes,,,,
6,the,-3.528767,Yes,,,,
7,perfect,-9.035118,,,,,
8,breakfast,-11.033276,,,,,
9,for,-4.880109,Yes,,,,


In [14]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [15]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(series):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    for rev in series:
        yield rev.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(series):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(series),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [16]:
unigram_sentences_filepath = base+'unigram_sentences_all.txt'

In [17]:
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:
    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(texts):
            f.write(sentence + '\n')

CPU times: user 50min 46s, sys: 14.9 s, total: 51min 1s
Wall time: 17min 50s


In [18]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [19]:
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print u' '.join(unigram_sentence)
    print u''

this popcorn also have ton of oil in it

pops up light and tasty

just like i may want in a theater

unfortunately i be look for something that be not all that bad for me as a snack and this just be not it

bring lot of napkin because there be a lot of oil in the mixture

it ' almost too rich to eat

the product from sahales be great

we have order it several time and will order it again

it be one of the good snack food because of the ingredient

well it ' a fave thing for my bf.. but i guess its a lil too small for a big kid like him..its a good buy though



In [20]:
bigram_model_filepath = base+'bigram_model_all'

In [21]:
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 1:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

CPU times: user 1min 52s, sys: 2.66 s, total: 1min 55s
Wall time: 1min 55s


In [22]:
bigram_sentences_filepath = base+'bigram_sentences_all.txt'

In [23]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')



CPU times: user 5min 14s, sys: 7.1 s, total: 5min 21s
Wall time: 5min 22s


In [24]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [25]:
for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print u' '.join(bigram_sentence)
    print u''

this popcorn also have ton of oil in it

pops up light and tasty

just like i may want in a theater

unfortunately i be look for something that be not all that bad for me as a snack and this just be not it

bring lot of napkin because there be a lot of oil in the mixture

it ' almost too rich to eat

the product from sahales be great

we have order it several time and will order it again

it be one of the good snack food because of the ingredient

well it ' a fave thing for my bf.. but i guess its a lil too small for a big kid like him..its a good buy though



In [26]:
trigram_model_filepath = base+'trigram_model_all'

In [27]:
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 1:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

CPU times: user 1min 46s, sys: 3.28 s, total: 1min 49s
Wall time: 1min 49s


In [28]:
trigram_sentences_filepath = base+'trigram_sentences_all.txt'

In [29]:
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for bigram_sentence in bigram_sentences:
            
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            
            f.write(trigram_sentence + '\n')

CPU times: user 4min 57s, sys: 6.4 s, total: 5min 3s
Wall time: 5min 4s


In [30]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [31]:
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print u' '.join(trigram_sentence)
    print u''

this popcorn also have ton of oil in it

pops up light and tasty

just like i may want in a theater

unfortunately i be look for something that be not all that bad for me as a snack and this just be not it

bring lot of napkin because there be a lot of oil in the mixture

it ' almost too rich to eat

the product from sahales be great

we have order it several time and will order it again

it be one of the good snack food because of the ingredient

well it ' a fave thing for my bf.. but i guess its a lil too small for a big kid like him..its a good buy though



In [32]:
trigram_reviews_filepath = base+'trigram_transformed_reviews_all.txt'

In [33]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_review in nlp.pipe(line_review(texts),
                                      batch_size=10000, n_threads=4):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in spacy.en.language_data.STOP_WORDS]
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')

CPU times: user 59min 44s, sys: 7.2 s, total: 59min 51s
Wall time: 27min 34s


In [34]:
print u'Original:' + u'\n'

for review in it.islice(line_review(texts), 11, 12):
    print review

print u'----' + u'\n'
print u'Transformed:' + u'\n'

with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 11, 12):
        print review

Original:

While I love the product, if you read the label carefully, you realize that this is product of Taiwan, not France. For those who try to avoid groceries from China/Taiwan/Hong Kong, thought I should let them know....
----

Transformed:

love product read label carefully realize product taiwan france those_who try avoid grocery china/taiwan/hong kong think let know



In [35]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import cPickle as pickle

In [36]:
trigram_dictionary_filepath = base+'trigram_dict_all.dict'

In [37]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if 0 == 0:

    trigram_reviews = LineSentence(trigram_reviews_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

CPU times: user 37.1 s, sys: 192 ms, total: 37.3 s
Wall time: 37.5 s


In [38]:
trigram_bow_filepath = base+'trigram_bow_corpus_all.mm'

In [39]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)

In [40]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if 0 == 1:

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

CPU times: user 1min 59s, sys: 480 ms, total: 1min 59s
Wall time: 2min


In [41]:
lda_model_filepath = base+'lda_model_all'

In [42]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 0 == 1:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=50,
                           id2word=trigram_dictionary,
                           workers=4)
    
    lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

CPU times: user 4min 23s, sys: 34.2 s, total: 4min 57s
Wall time: 5min 6s


In [43]:
def explore_topic(topic_number, topn=25):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
    print u'{:20} {}'.format(u'term', u'frequency') + u'\n'

    for term, frequency in lda.show_topic(topic_number, topn=25):
        print u'{:20} {:.3f}'.format(term, round(frequency, 3))

In [92]:
explore_topic(topic_number=10)

term                 frequency

oil                  0.051
olive_oil            0.024
licorice             0.024
'                    0.022
good                 0.016
olive                0.016
magnesium            0.014
like                 0.014
taste                0.013
flavor               0.012
grain                0.011
cook                 0.011
quinoa               0.009
need                 0.008
thing                0.008
use                  0.007
healthy              0.007
body                 0.006
know                 0.005
product              0.005
cooking              0.005
ingredient           0.005
great                0.005
rice                 0.005
try                  0.005


In [100]:
topic_names = dict()
for topic_number in range(0,50):
    topic_name = ''
    for term, frequency in lda.show_topic(topic_number, topn=5):
        topic_name = topic_name+' '+term
    topic_names[topic_number] = topic_name
topic_names    

{0: u' add sauce hot heat water',
 1: u' mix use add good recipe',
 2: u' price buy $ good pay',
 3: u' use hair smell product like',
 4: u' buy amazon good available stock',
 5: u' coffee taste like flavor try',
 6: u' product review read help use',
 7: u' like taste try think flavor',
 8: u' tooth good china dog vet',
 9: u' jerky flavor spicy packet color',
 10: u" oil olive_oil licorice ' good",
 11: u' tea drink honey hot taste',
 12: u" cereal good taste ' like",
 13: u' use extract buy good like',
 14: u' jar plastic use container seal',
 15: u' good soup salad eat like',
 16: u" product ingredient food organic 's",
 17: u' pasta sauce good flavor love',
 18: u' pod coffee use machine buy',
 19: u' love kid great son daughter',
 20: u' use day work formula start',
 21: u' tea black good earl_grey stash',
 22: u' good bean flavor try like',
 23: u' flavor gum like good chew',
 24: u' bread cheese good use slice',
 25: u' plant cake grow seed good',
 26: u' fruit apple cherry good

In [101]:
topic_names_filepath = '/home/boyto/DataSets/Yelp/data/topic_names.pkl'

with open(topic_names_filepath, 'w') as f:
    pickle.dump(topic_names, f)

In [47]:
LDAvis_data_filepath = base+'ldavis_prepared'

In [48]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(LDAvis_data_filepath, 'w') as f:
        pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath) as f:
    LDAvis_prepared = pickle.load(f)

CPU times: user 11min 59s, sys: 2min 8s, total: 14min 8s
Wall time: 11min 30s


In [49]:
pyLDAvis.display(LDAvis_prepared)

In [50]:
review_txt_filepath = '/home/boyto/DataSets/Yelp/data/review_text_all.txt'
def get_sample_review(review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(texts),
                          review_number, review_number+1))[0]

In [102]:
def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    
    # parse the review text with spaCy
    parsed_review = nlp(review_text)
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
    # apply the first-order and secord-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    
    # remove any remaining stopwords
    trigram_review = [term for term in trigram_review
                      if not term in spacy.en.language_data.STOP_WORDS]
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    review_lda = sorted(review_lda, key=lambda (topic_number, freq): -freq)
    
    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
            
        # print the most highly related topic names and frequencies
        print '{:25} {}'.format(topic_names[topic_number],
                                round(freq, 3))

In [103]:
sample_review = get_sample_review(50)
print sample_review

I think the jerky would have been good, but...The packages I received contained jerky that had fuzzy white stuff growing on it. None of the packages had a read-able expiration date. The clearest packaging shows Aug 2011 (which means the product still had 6 months left in its life). When I contacted Lumen, however, I did get an email and phone call from them expressing their concern about my complaint. The man I spoke with was genuinely interested in investigating the problem. The company directly sent me three free packages to replace some of the moldy ones.


In [104]:
lda_description(sample_review)

 order amazon product receive item 0.733
 use hair smell product like 0.09
 jerky flavor spicy packet color 0.083


In [105]:
sample_review = get_sample_review(100)
print sample_review

I am a picky coffee-drinker. I LOVE my Starbucks lattes.  I have been trying to cut back my 3-cup-per-day consumption, however, which led me to Dandy Blend.  I didn't expect it to taste like my Starbucks, I was just trying to appreciate it for its unique flavor.  I have to say it's a lot better than I expected.  I still prefer good coffee, but if I had to choose between Dandy Blend and cheap coffee, I would choose Dandy Blend.  It tastes a lot more like coffee than I thought it would.  My kids love it (they are 4 and 8).  I warm up some milk for them and mix it in and they have "lattes" with Mommy.  This way I don't have to worry about giving them too much sugar and Dandy Blend is good for the liver, among other things.  It's also a good substitute for hot chocolate for them, which I don't give them often because of the sugar.  But you have to drink something warm and creamy when you come in from playing in the snow.  It's definitely worth a try, and it's a pretty good value considerin

In [106]:
lda_description(sample_review)

 coffee taste like flavor try 0.436
 ' like taste flavor drink 0.314
 love kid great son daughter 0.139
 add sauce hot heat water 0.051


In [56]:
from gensim.models import Word2Vec

trigram_sentences = LineSentence(trigram_sentences_filepath)
word2vec_filepath = base+'word2vec_model_all'

In [57]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the word2vec model yourself.
if 0 == 1:

    # initiate the model and perform the first epoch of training
    food2vec = Word2Vec(trigram_sentences, size=100, window=5,
                        min_count=20, sg=1, workers=4)
    
    food2vec.save(word2vec_filepath)

    # perform another 11 epochs of training
    for i in range(1,12):

        food2vec.train(trigram_sentences)
        food2vec.save(word2vec_filepath)
        
# load the finished model from disk
food2vec = Word2Vec.load(word2vec_filepath)
food2vec.init_sims()

print u'{} training epochs so far.'.format(food2vec.train_count)

12 training epochs so far.
CPU times: user 4h 53min 26s, sys: 4min 41s, total: 4h 58min 8s
Wall time: 1h 19min 23s


In [58]:
print u'{:,} terms in the food2vec vocabulary.'.format(len(food2vec.vocab))

23,633 terms in the food2vec vocabulary.


In [59]:
# build a list of the terms, integer indices,
# and term counts from the food2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count)
                 for term, voc in food2vec.vocab.iteritems()]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda (term, index, count): -count)

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# create a DataFrame with the food2vec vectors as data,
# and the terms as row labels
word_vectors = pd.DataFrame(food2vec.syn0norm[term_indices, :],
                            index=ordered_terms)

word_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
be,-0.131273,-0.212401,-0.063069,0.053883,-0.055274,0.112567,-0.13471,0.13866,0.013503,-0.0288,...,-0.12037,-0.020983,0.248556,0.036867,-0.115136,-0.114687,-0.086504,-0.078348,-0.027982,0.035967
the,-0.159651,-0.123239,-0.154392,-0.09201,-0.109142,-0.096214,0.012041,-0.002566,-0.043746,-0.020481,...,-0.105381,-0.173727,0.003538,0.187815,-0.019241,-0.145515,0.110137,0.100738,-0.135743,-0.12265
i,0.07623,-0.246134,-0.344308,0.094731,0.056499,-0.030151,-0.000231,-0.057895,-0.080298,-0.031023,...,-0.279124,0.077286,-0.075103,-0.032722,-0.163098,-0.013462,0.016392,0.025974,-0.134767,0.046014
and,-0.221207,-0.177193,-0.199978,-0.09697,0.062335,-0.01335,0.143637,-0.029563,0.10598,0.043706,...,-0.047908,0.084856,0.122433,0.186808,-0.10611,0.023059,0.038985,0.126377,-0.075163,0.0126
a,-0.143496,-0.178113,-0.074862,0.038801,0.066839,-0.164382,-0.096086,0.025945,-0.10781,0.15147,...,-0.005677,0.026158,0.086496,0.238731,0.027907,-0.075602,0.128862,0.144526,-0.109594,-0.110553


In [60]:
def get_related_terms(token, topn=10):
    """
    look up the topn most similar terms to token
    and print them as a formatted list
    """

    for word, similarity in food2vec.most_similar(positive=[token], topn=topn):

        print u'{:20} {}'.format(word, round(similarity, 3))

In [61]:
get_related_terms(u'burger')

hamburger            0.87
steak                0.805
taco                 0.795
tacos                0.779
burrito              0.769
ham                  0.757
meatloaf             0.746
pull_pork            0.738
potato_salad         0.732
grill                0.731


In [62]:
def word_algebra(add=[], subtract=[], topn=1):
    """
    combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s)
    """
    answers = food2vec.most_similar(positive=add, negative=subtract, topn=topn)
    
    for term, similarity in answers:
        print term

In [63]:
word_algebra(add=[u'breakfast', u'lunch'])

late_night_snack


In [64]:
word_algebra(add=[u'lunch', u'night'], subtract=[u'day'])

dinner


In [65]:
word_algebra(add=[u'taco', u'chinese'], subtract=[u'mexican'])

pork


In [66]:
word_algebra(add=[u'bun', u'mexican'], subtract=[u'american'])

burger


In [67]:
word_algebra(add=[u'filet_mignon', u'seafood'], subtract=[u'beef'])

fish


In [68]:
word_algebra(add=[u'coffee', u'snack'], subtract=[u'drink'])

pretzel


In [70]:
word_algebra(add=[u"denny_'s", u'fine_dining'])

restaurant


In [73]:
from sklearn.manifold import TSNE

In [74]:
tsne_input = word_vectors.drop(spacy.en.language_data.STOP_WORDS, errors=u'ignore')
tsne_input = tsne_input.head(5000)

In [75]:
tsne_input.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
good,-0.095766,-0.157914,0.039735,-0.025332,0.029984,-0.045774,0.027775,0.004062,0.014054,-0.102756,...,-0.148413,0.10053,0.022908,-0.016201,-0.074844,-0.114839,0.031754,-0.101051,0.054873,-0.030474
like,-0.18296,-0.000999,-0.126665,-0.069361,-0.063793,-0.024196,0.160745,0.16283,-0.013092,-0.107055,...,-0.073357,-0.005592,0.164221,0.066893,0.01553,-0.027996,0.043446,-0.00373,0.036534,-0.016018
taste,0.047653,-0.06604,-0.06245,-0.088787,-0.159298,-0.011771,0.045012,0.145097,0.050784,0.11378,...,-0.106853,-0.016816,0.123973,0.057458,0.097396,0.018637,0.065581,-0.030473,-0.05221,-0.055617
flavor,-0.119398,-0.074654,-0.144895,0.134072,-0.283441,-0.062454,0.128747,-0.009206,0.019763,0.101279,...,-0.124421,-0.039643,0.071159,0.106141,-0.048743,0.013028,0.187981,-0.022007,0.035431,-0.078193
',-0.08999,-0.052625,-0.090312,-0.069658,0.025092,0.062993,-0.073771,0.21282,0.033777,-0.10015,...,-0.125727,0.05582,0.077107,-0.002358,-0.024304,0.008462,0.014022,-0.017053,-0.075037,-0.096084


In [76]:
tsne_filepath = base+'tsne_model'
tsne_vectors_filepath = base+'tsne_vectors.npy'

In [77]:
%%time

if 0 == 1:
    
    tsne = TSNE()
    tsne_vectors = tsne.fit_transform(tsne_input.values)
    
    with open(tsne_filepath, 'w') as f:
        pickle.dump(tsne, f)

    pd.np.save(tsne_vectors_filepath, tsne_vectors)
    
with open(tsne_filepath) as f:
    tsne = pickle.load(f)
    
tsne_vectors = pd.np.load(tsne_vectors_filepath)

tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord'])

CPU times: user 1min 26s, sys: 9.59 s, total: 1min 36s
Wall time: 1min 35s


In [78]:
tsne_vectors.head()

Unnamed: 0,x_coord,y_coord
good,4.007573,-6.241393
like,5.737544,-1.591911
taste,-0.349782,-8.840345
flavor,-3.200024,-8.918845
',6.261111,-3.804861


In [79]:
tsne_vectors[u'word'] = tsne_vectors.index

In [80]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

output_notebook()

In [81]:
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, resize, reset'),
                   active_scroll=u'wheel_zoom')

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!
show(tsne_plot);