# NLP with Python
## Amazon Fine Food Reviews

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import spacy
import itertools as it
import codecs

nlp = spacy.load('en')
%matplotlib inline

In [2]:
base = '/home/boyto/DataSets/FoodReviews/'
reviews = pd.read_csv(base+'train.csv',encoding = 'utf8')
len(reviews)

454760

In [3]:
#reviews = reviews.sample(10000)

In [4]:
texts = reviews['Text']

In [5]:
sample_review = texts[texts.index[1]]
sample_review

u"McCann's Steel Cut Oatmeal is the perfect breakfast for people in a hurry.  All you do is put 1/2 cup of oats in a little crockpot with 2 cups of boiling water before you go to bed, then an hour before I get up the crockpot starts on high on a timer so when I get out of the shower the oatmeal is ready to go.  Just mix in some blueberries, raisins, craisins, dates, nuts or whatever you want and eat."

In [6]:
parsed_review = nlp(sample_review)

In [7]:
print parsed_review

McCann's Steel Cut Oatmeal is the perfect breakfast for people in a hurry.  All you do is put 1/2 cup of oats in a little crockpot with 2 cups of boiling water before you go to bed, then an hour before I get up the crockpot starts on high on a timer so when I get out of the shower the oatmeal is ready to go.  Just mix in some blueberries, raisins, craisins, dates, nuts or whatever you want and eat.


In [8]:
for num, sentence in enumerate(parsed_review.sents):
    print 'Sentence {}:'.format(num + 1)
    print sentence
    print ''

Sentence 1:
McCann's Steel Cut Oatmeal is the perfect breakfast for people in a hurry.  

Sentence 2:
All you do is put 1/2 cup of oats in a little crockpot with 2 cups of boiling water before you go to bed, then an hour before I get up the crockpot starts on high on a timer so when I get out of the shower the oatmeal is ready to go.  

Sentence 3:
Just mix in some blueberries, raisins, craisins, dates, nuts or whatever you want and eat.



In [9]:
for num, entity in enumerate(parsed_review.ents):
    print 'Entity {}:'.format(num + 1), entity, '-', entity.label_
    print ''

Entity 1: McCann's Steel Cut Oatmeal - ORG

Entity 2: 1/2 - CARDINAL

Entity 3: 2 - CARDINAL



In [10]:
# surface form and part-of-speech tag of every token in the parsed review
token_text = [tok.orth_ for tok in parsed_review]
token_pos = [tok.pos_ for tok in parsed_review]

# one row per token; the frame is the cell's displayed value
pos_rows = zip(token_text, token_pos)
pd.DataFrame(pos_rows, columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,McCann,PROPN
1,'s,PART
2,Steel,PROPN
3,Cut,PROPN
4,Oatmeal,PROPN
5,is,VERB
6,the,DET
7,perfect,ADJ
8,breakfast,NOUN
9,for,ADP


In [11]:
# lemma and orthographic shape of every token in the parsed review
token_lemma = [tok.lemma_ for tok in parsed_review]
token_shape = [tok.shape_ for tok in parsed_review]

# one row per token, reusing token_text from the part-of-speech cell
lemma_rows = zip(token_text, token_lemma, token_shape)
pd.DataFrame(lemma_rows, columns=['token_text', 'token_lemma', 'token_shape'])

Unnamed: 0,token_text,token_lemma,token_shape
0,McCann,mccann,XxXxxx
1,'s,'s,'x
2,Steel,steel,Xxxxx
3,Cut,cut,Xxx
4,Oatmeal,oatmeal,Xxxxx
5,is,be,xx
6,the,the,xxx
7,perfect,perfect,xxxx
8,breakfast,breakfast,xxxx
9,for,for,xxx


In [12]:
# entity type and inside/outside/begin marker of every token
token_entity_type = [tok.ent_type_ for tok in parsed_review]
token_entity_iob = [tok.ent_iob_ for tok in parsed_review]

# one row per token, reusing token_text from the part-of-speech cell
entity_rows = zip(token_text, token_entity_type, token_entity_iob)
pd.DataFrame(entity_rows,
             columns=['token_text', 'entity_type', 'inside_outside_begin'])

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,McCann,ORG,B
1,'s,ORG,I
2,Steel,ORG,I
3,Cut,ORG,I
4,Oatmeal,ORG,I
5,is,,O
6,the,,O
7,perfect,,O
8,breakfast,,O
9,for,,O


In [13]:
# column labels, in the same order as the tuple fields collected below
attribute_columns = ['text',
                     'log_probability',
                     'stop?',
                     'punctuation?',
                     'whitespace?',
                     'number?',
                     'out of vocab.?']

# one tuple of lexical attributes per token of the parsed review
token_attributes = [(tok.orth_, tok.prob, tok.is_stop, tok.is_punct,
                     tok.is_space, tok.like_num, tok.is_oov)
                    for tok in parsed_review]

df = pd.DataFrame(token_attributes, columns=attribute_columns)

# render the boolean flag columns as 'Yes' / blank so the table is easy to scan
flag_columns = slice('stop?', 'out of vocab.?')
df.loc[:, flag_columns] = (df.loc[:, flag_columns]
                           .applymap(lambda flag: u'Yes' if flag else u''))

df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,McCann,-14.704258,,,,,
1,'s,-4.830559,,,,,
2,Steel,-12.109809,,,,,
3,Cut,-11.754498,,,,,
4,Oatmeal,-14.012610,,,,,
5,is,-4.457749,Yes,,,,
6,the,-3.528767,Yes,,,,
7,perfect,-9.035118,,,,,
8,breakfast,-11.033276,,,,,
9,for,-4.880109,Yes,,,,


In [14]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [15]:
def punct_space(token):
    """
    tell whether a token carries no lexical content, i.e. it is
    flagged as punctuation or as whitespace; evaluates to the
    token's is_punct flag when truthy, otherwise to its is_space flag
    """
    punctuation_flag = token.is_punct
    return punctuation_flag if punctuation_flag else token.is_space

def line_review(series):
    """
    generator function that walks an iterable of review strings and
    yields each one with its escaped line breaks (a literal backslash
    followed by 'n') turned back into real newlines
    """
    escaped_break, real_break = '\\n', '\n'
    for review_text in series:
        yield review_text.replace(escaped_break, real_break)
            
def lemmatized_sentence_corpus(series):
    """
    generator function that streams the reviews through spaCy and
    yields one string per sentence: the sentence's lemmas joined by
    single spaces, with punctuation and whitespace tokens left out
    """
    parsed_reviews = nlp.pipe(line_review(series),
                              batch_size=10000, n_threads=4)

    for doc in parsed_reviews:
        for sent in doc.sents:
            lemmas = [tok.lemma_ for tok in sent if not punct_space(tok)]
            yield u' '.join(lemmas)

In [None]:
unigram_sentences_filepath = base+'unigram_sentences_all.txt'

In [None]:
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:
    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(texts):
            f.write(sentence + '\n')

In [None]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [None]:
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print u' '.join(unigram_sentence)
    print u''

In [None]:
bigram_model_filepath = base+'bigram_model_all'

In [None]:
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 0:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

In [None]:
bigram_sentences_filepath = base+'bigram_sentences_all.txt'

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')

In [None]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [None]:
for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print u' '.join(bigram_sentence)
    print u''

In [None]:
trigram_model_filepath = base+'trigram_model_all'

In [None]:
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 0:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

In [None]:
trigram_sentences_filepath = base+'trigram_sentences_all.txt'

In [None]:
%%time
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for bigram_sentence in bigram_sentences:
            
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            
            f.write(trigram_sentence + '\n')

In [None]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [None]:
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print u' '.join(trigram_sentence)
    print u''

In [None]:
trigram_reviews_filepath = base+'trigram_transformed_reviews_all.txt'

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_review in nlp.pipe(line_review(texts),
                                      batch_size=10000, n_threads=4):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in spacy.en.language_data.STOP_WORDS]
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')

In [None]:
print u'Original:' + u'\n'

for review in it.islice(line_review(texts), 11, 12):
    print review

print u'----' + u'\n'
print u'Transformed:' + u'\n'

with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 11, 12):
        print review

In [None]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import cPickle as pickle

In [None]:
trigram_dictionary_filepath = base+'trigram_dict_all.dict'

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if 0 == 0:

    trigram_reviews = LineSentence(trigram_reviews_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

In [None]:
trigram_bow_filepath = base+'trigram_bow_corpus_all.mm'

In [None]:
def trigram_bow_generator(filepath):
    """
    generator function that reads one tokenized review per line from
    filepath and yields its bag-of-words form, built with the
    module-level trigram_dictionary
    """
    tokenized_reviews = LineSentence(filepath)

    for review_tokens in tokenized_reviews:
        bag_of_words = trigram_dictionary.doc2bow(review_tokens)
        yield bag_of_words

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if 0 == 0:

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

In [None]:
lda_model_filepath = base+'lda_model_all'

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 0 == 0:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=50,
                           id2word=trigram_dictionary,
                           workers=4)
    
    lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

In [None]:
def explore_topic(topic_number, topn=25):
    """
    print the top terms of one LDA topic as a two-column table

    topic_number -- id of the topic to look up in the module-level lda model
    topn         -- number of terms to list (default 25)

    returns nothing; the table goes to standard output
    """

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    # pass topn through so the caller controls the length of the table
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [None]:
explore_topic(topic_number=0)

In [None]:
# Hand-assigned, human-readable labels for the 50 LDA topics, keyed by topic id.
# lda_description() looks its printed labels up in this dict by topic number.
# NOTE(review): two labels are repeated (15/36 'arizona', 23/42 'customer
# service') and several ('las vegas', 'aria buffet', 'restrooms') read like
# restaurant-review topics - presumably carried over from a different corpus.
# Confirm each label against explore_topic() output for the model trained here.
topic_names = {0: u'mexican',
               1: u'menu',
               2: u'thai',
               3: u'steak',
               4: u'donuts & appetizers',
               5: u'specials',
               6: u'soup',
               7: u'wings, sports bar',
               8: u'foreign language',
               9: u'las vegas',
               10: u'chicken',
               11: u'aria buffet',
               12: u'noodles',
               13: u'ambience & seating',
               14: u'sushi',
               15: u'arizona',
               16: u'family',
               17: u'price',
               18: u'sweet',
               19: u'waiting',
               20: u'general',
               21: u'tapas',
               22: u'dirty',
               23: u'customer service',
               24: u'restrooms',
               25: u'chinese',
               26: u'gluten free',
               27: u'pizza',
               28: u'seafood',
               29: u'amazing',
               30: u'eat, like, know, want',
               31: u'bars',
               32: u'breakfast',
               33: u'location & time',
               34: u'italian',
               35: u'barbecue',
               36: u'arizona',
               37: u'indian',
               38: u'latin & cajun',
               39: u'burger & fries',
               40: u'vegetarian',
               41: u'lunch buffet',
               42: u'customer service',
               43: u'taco, ice cream',
               44: u'high cuisine',
               45: u'healthy',
               46: u'salad & sandwich',
               47: u'greek',
               48: u'poor experience',
               49: u'wine & dine'}

In [None]:
# keep the topic labels next to the other artifacts under `base`
# (the previous location pointed into an unrelated Yelp data directory)
topic_names_filepath = base + 'topic_names.pkl'

# pickle streams are bytes, so the file is opened in binary mode
with open(topic_names_filepath, 'wb') as f:
    pickle.dump(topic_names, f)

In [None]:
LDAvis_data_filepath = base+'ldavis_prepared'

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 0:

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(LDAvis_data_filepath, 'w') as f:
        pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath) as f:
    LDAvis_prepared = pickle.load(f)

In [None]:
pyLDAvis.display(LDAvis_prepared)

In [None]:
# NOTE(review): this path is not read by get_sample_review below,
# which pulls its reviews from the in-memory `texts` series
review_txt_filepath = '/home/boyto/DataSets/Yelp/data/review_text_all.txt'
def get_sample_review(review_number):
    """
    return the review at zero-based position review_number of the
    `texts` series, with its escaped line breaks restored
    """
    first, past_last = review_number, review_number + 1
    selected_reviews = list(it.islice(line_review(texts), first, past_last))

    return selected_reviews[0]

In [None]:
def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    
    # parse the review text with spaCy
    parsed_review = nlp(review_text)
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
    # apply the first-order and secord-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    
    # remove any remaining stopwords
    trigram_review = [term for term in trigram_review
                      if not term in spacy.en.language_data.STOP_WORDS]
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    review_lda = sorted(review_lda, key=lambda (topic_number, freq): -freq)
    
    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
            
        # print the most highly related topic names and frequencies
        print '{:25} {}'.format(topic_names[topic_number],
                                round(freq, 3))

In [None]:
sample_review = get_sample_review(50)
print sample_review

In [None]:
lda_description(sample_review)

In [None]:
sample_review = get_sample_review(100)
print sample_review

In [None]:
lda_description(sample_review)

In [None]:
from gensim.models import Word2Vec

trigram_sentences = LineSentence(trigram_sentences_filepath)
word2vec_filepath = base+'word2vec_model_all'

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the word2vec model yourself.
if 0 == 0:

    # initiate the model and perform the first epoch of training
    food2vec = Word2Vec(trigram_sentences, size=100, window=5,
                        min_count=20, sg=1, workers=4)
    
    food2vec.save(word2vec_filepath)

    # perform another 11 epochs of training
    for i in range(1,12):

        food2vec.train(trigram_sentences)
        food2vec.save(word2vec_filepath)
        
# load the finished model from disk
food2vec = Word2Vec.load(word2vec_filepath)
food2vec.init_sims()

print u'{} training epochs so far.'.format(food2vec.train_count)

In [None]:
print u'{:,} terms in the food2vec vocabulary.'.format(len(food2vec.vocab))

In [None]:
# (term, integer index, term count) for every entry of the food2vec
# vocabulary, ordered so the most common terms come first
ordered_vocab = sorted(
    [(term, voc.index, voc.count)
     for term, voc in food2vec.vocab.iteritems()],
    key=lambda entry: -entry[2])

# split the triples into three parallel tuples
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# one row of the normalized vectors per term, labelled by the term itself
normalized_vectors = food2vec.syn0norm[term_indices, :]
word_vectors = pd.DataFrame(normalized_vectors, index=ordered_terms)

word_vectors.head()

In [None]:
def get_related_terms(token, topn=10):
    """
    look up the topn most similar terms to token
    and print them as a formatted list
    """

    for word, similarity in food2vec.most_similar(positive=[token], topn=topn):

        print u'{:20} {}'.format(word, round(similarity, 3))

In [None]:
get_related_terms(u'burger')

In [None]:
def word_algebra(add=None, subtract=None, topn=1):
    """
    combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s)

    add      -- words whose vectors are added (default: none)
    subtract -- words whose vectors are subtracted (default: none)
    topn     -- number of result terms to print (default 1)
    """
    # None stands in for "no words": a list literal as a default would be
    # one shared object reused across every call
    positive_terms = [] if add is None else add
    negative_terms = [] if subtract is None else subtract

    answers = food2vec.most_similar(positive=positive_terms,
                                    negative=negative_terms, topn=topn)

    for term, similarity in answers:
        print(term)

In [None]:
word_algebra(add=[u'breakfast', u'lunch'])

In [None]:
word_algebra(add=[u'lunch', u'night'], subtract=[u'day'])

In [None]:
word_algebra(add=[u'taco', u'chinese'], subtract=[u'mexican'])

In [None]:
word_algebra(add=[u'bun', u'mexican'], subtract=[u'american'])

In [None]:
word_algebra(add=[u'filet_mignon', u'seafood'], subtract=[u'beef'])

In [None]:
word_algebra(add=[u'coffee', u'snack'], subtract=[u'drink'])

In [None]:
word_algebra(add=[u'burger_king', u'fine_dining'])

In [None]:
word_algebra(add=[u"denny_'s", u'fine_dining'])

In [None]:
word_algebra(add=[u"applebee_'s", u'italian'])

In [None]:
word_algebra(add=[u"applebee_'s", u'pancakes'])

In [None]:
from sklearn.manifold import TSNE

In [None]:
# leave out the stopwords that have a vector, then keep the first 5000 rows
stop_words = spacy.en.language_data.STOP_WORDS
tsne_input = word_vectors.drop(stop_words, errors=u'ignore').head(5000)

In [None]:
tsne_input.head()

In [None]:
tsne_filepath = base+'tsne_model'
tsne_vectors_filepath = base+'tsne_vectors.npy'

In [None]:
%%time

if 0 == 0:
    
    tsne = TSNE()
    tsne_vectors = tsne.fit_transform(tsne_input.values)
    
    with open(tsne_filepath, 'w') as f:
        pickle.dump(tsne, f)

    pd.np.save(tsne_vectors_filepath, tsne_vectors)
    
with open(tsne_filepath) as f:
    tsne = pickle.load(f)
    
tsne_vectors = pd.np.load(tsne_vectors_filepath)

tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord'])

In [None]:
tsne_vectors.head()

In [None]:
tsne_vectors[u'word'] = tsne_vectors.index

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

output_notebook()

In [None]:
# wrap the t-SNE DataFrame in a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# toolbar contents; the wheel zoom is also the active scroll tool
toolbar = (u'pan, wheel_zoom, box_zoom,'
           u'box_select, resize, reset')

# square figure with title, dimensions and tools configured
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width=800,
                   plot_height=800,
                   tools=toolbar,
                   active_scroll=u'wheel_zoom')

# hovering over a point shows the value of its `word` column
word_hover = HoverTool(tooltips=u'@word')
tsne_plot.add_tools(word_hover)

# one translucent circle per word
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# visual clean-up: hide axes, grid lines and outline, enlarge the title
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None
tsne_plot.title.text_font_size = value(u'16pt')

# render the plot in the notebook
show(tsne_plot);