In [1]:
# Import the libraries to define the data directory and file path for business records
# Display a sample business record

import os
import codecs

data_directory = os.path.join('C:/Users/User/Documents', 'Yelp Data')
                              
businesses_filepath = os.path.join(data_directory, 'business.json')

with codecs.open(businesses_filepath, encoding='utf_8') as f: first_business_record = f.readline() 

print (first_business_record)

{"business_id": "YDf95gJZaq05wvo7hTQbbQ", "name": "Richmond Town Square", "neighborhood": "", "address": "691 Richmond Rd", "city": "Richmond Heights", "state": "OH", "postal_code": "44143", "latitude": 41.5417162, "longitude": -81.4931165, "stars": 2.0, "review_count": 17, "is_open": 1, "attributes": {"RestaurantsPriceRange2": 2, "BusinessParking": {"garage": false, "street": false, "validated": false, "lot": true, "valet": false}, "BikeParking": true, "WheelchairAccessible": true}, "categories": ["Shopping", "Shopping Centers"], "hours": {"Monday": "10:00-21:00", "Tuesday": "10:00-21:00", "Friday": "10:00-21:00", "Wednesday": "10:00-21:00", "Thursday": "10:00-21:00", "Sunday": "11:00-18:00", "Saturday": "10:00-21:00"}}



In [3]:
# Define file path for reviews
# Display a sample review record

review_json_filepath = os.path.join(data_directory, 'review.json')

with codecs.open(review_json_filepath, encoding='utf_8') as f: first_review_record = f.readline()
    
print (first_review_record)

{"review_id":"VfBHSwC5Vz_pbFluy07i9Q","user_id":"cjpdDjZyprfyDG3RlkVG3w","business_id":"uYHaNptLzDLoV_JZ_MuzUA","stars":5,"date":"2016-07-12","text":"My girlfriend and I stayed here for 3 nights and loved it. The location of this hotel and very decent price makes this an amazing deal. When you walk out the front door Scott Monument and Princes street are right in front of you, Edinburgh Castle and the Royal Mile is a 2 minute walk via a close right around the corner, and there are so many hidden gems nearby including Calton Hill and the newly opened Arches that made this location incredible.\n\nThe hotel itself was also very nice with a reasonably priced bar, very considerate staff, and small but comfortable rooms with excellent bathrooms and showers. Only two minor complaints are no telephones in room for room service (not a huge deal for us) and no AC in the room, but they have huge windows which can be fully opened. The staff were incredible though, letting us borrow umbrellas for t

In [4]:
# Read in each business record and convert it to a Python dict
# Filter out business records that aren't about restaurants (i.e., not in the "Restaurant" category)
# Create a frozenset of the business IDs for restaurants, which will be used in the next step

import json

restaurant_ids = set()

# open the businesses file
with codecs.open(businesses_filepath, encoding='utf_8') as f:
    
    # iterate through each line (json record) in the file
    for business_json in f:
        
        # convert the json record to a Python dict
        business = json.loads(business_json)
        
        # if this business is not a restaurant, skip to the next one
        if u'Restaurants' not in business[u'categories']:
            continue
            
        # add the restaurant business id to the restaurant_ids set
        restaurant_ids.add(business[u'business_id'])

# turn restaurant_ids into a frozenset, as we don't need to change it anymore
restaurant_ids = frozenset(restaurant_ids)

# print the number of unique restaurant ids in the dataset
print ('{:,}'.format(len(restaurant_ids)), u'restaurants are there in the dataset.')

51,613 restaurants are there in the dataset.


In [5]:
# Create a directory in the base folder to store the reviews from those 51,613 restaurants

rest_directory = os.path.join('C:/Users/User/Documents', 'Yelp Data','Restaurants')

review_text_filepath = os.path.join(rest_directory, 'review_text.txt')

In [8]:
# Select only those reviews which are for those 51,613 restaurants
# Save them in a text file

if 1 == 1:
    
    review_count = 0

    # create & open a new file in write mode
    with codecs.open(review_text_filepath, 'w', encoding='utf_8') as review_txt_file:

        # open the existing review json file
        with codecs.open(review_json_filepath, encoding='utf_8') as review_json_file:

            # loop through all reviews in the existing file and convert to dict
            for review_json in review_json_file:
                review = json.loads(review_json)

                # if this review is not about a restaurant, skip to the next one
                if review[u'business_id'] not in restaurant_ids:
                    continue

                # write the restaurant review as a line in the new file
                # escape newline characters in the original review text
                review_txt_file.write(review[u'text'].replace('\n', '\\n') + '\n')
                review_count += 1

    print (u'''Text from {:,} restaurant reviews
              written to the new txt file.'''.format(review_count))
    
else:
    
    with codecs.open(review_txt_filepath, encoding='utf_8') as review_txt_file:
        for review_count, line in enumerate(review_txt_file):
            pass
        
    print (u'Text from {:,} restaurant reviews in the txt file.'.format(review_count + 1))

JSONDecodeError: Unterminated string starting at: line 1 column 150 (char 149)

In [9]:
# import required libraries for text procecssing

import spacy
import pandas as pd
import itertools as it

In [10]:
# load English package in spacy

nlp = spacy.load('en')

In [11]:
# print a sample review 

with codecs.open(review_text_filepath, encoding='utf_8') as f:
    sample_review = list(it.islice(f, 8, 9))[0]
    sample_review = sample_review.replace('\\n', '\n')
        
print (sample_review)

The staff here is great and they're nice,  wonderful and quick. People were ranting in raving about pei wei, I had to try it.  Even good yelp reviews.  I'm highly dissatisfied with the flavor of the food. This  should be labeled Asian inspired and not Asian. I've tried a variety of Chinese restaurants, this doesn't taste close to anything I've had at other Asian restaurants. Their Mongolian beef  was 5 pieces of beef and large mushrooms cut into thirds in a thick sauce. You eat the rice to wash off the nasty flavor. My shrimp was thickly coated in an overpowering  sauce as well.  I only ate some of the veggies that take center stage on a meat dish.  The center of my pork egg roll was cold. The hot N sour soup was a much thicker consistency almost like that of a chili instead of being brothy. Worst of all was the price.  This was not worth it to us. Neither me or my husband enjoyed either of  our dishes.  We didn't even eat half of our plates.  We even refused to take it home with us.  

In [16]:
# Parse the sample review

parsed_review = nlp(sample_review)

In [17]:
# Print the parsed review

print(parsed_review)

The staff here is great and they're nice,  wonderful and quick. People were ranting in raving about pei wei, I had to try it.  Even good yelp reviews.  I'm highly dissatisfied with the flavor of the food. This  should be labeled Asian inspired and not Asian. I've tried a variety of Chinese restaurants, this doesn't taste close to anything I've had at other Asian restaurants. Their Mongolian beef  was 5 pieces of beef and large mushrooms cut into thirds in a thick sauce. You eat the rice to wash off the nasty flavor. My shrimp was thickly coated in an overpowering  sauce as well.  I only ate some of the veggies that take center stage on a meat dish.  The center of my pork egg roll was cold. The hot N sour soup was a much thicker consistency almost like that of a chili instead of being brothy. Worst of all was the price.  This was not worth it to us. Neither me or my husband enjoyed either of  our dishes.  We didn't even eat half of our plates.  We even refused to take it home with us.  

In [18]:
# Print sentences in the parsed review

for num, sentence in enumerate(parsed_review.sents):
    print ('Sentence {}:'.format(num + 1))
    print (sentence)
    print ('')

Sentence 1:
The staff here is great and they're nice,  wonderful and quick.

Sentence 2:
People were ranting in raving about pei wei, I had to try it.  

Sentence 3:
Even good yelp reviews.  

Sentence 4:
I'm highly dissatisfied with the flavor of the food.

Sentence 5:
This  should be labeled Asian inspired and not Asian.

Sentence 6:
I've tried a variety of Chinese restaurants, this doesn't taste close to anything I've had at other Asian restaurants.

Sentence 7:
Their Mongolian beef  was 5 pieces of beef and large mushrooms cut into thirds in a thick sauce.

Sentence 8:
You eat the rice to wash off the nasty flavor.

Sentence 9:
My shrimp was thickly coated in an overpowering  sauce as well.  

Sentence 10:
I only ate some of the veggies that take center stage on a meat dish.  

Sentence 11:
The center of my pork egg roll was cold.

Sentence 12:
The hot N sour soup was a much thicker consistency almost like that of a chili instead of being brothy.

Sentence 13:
Worst of all was the 

In [19]:
from spacy.en import English

In [20]:
# Print the entities (nouns) present in the review

for num, entity in enumerate(parsed_review.ents):
    print ('Entity {}:'.format(num + 1), entity, '-', entity.label_)
    print ('')

Entity 1: Asian - NORP

Entity 2: Asian - NORP

Entity 3: Chinese - NORP

Entity 4: Asian - NORP

Entity 5: Mongolian - NORP

Entity 6: 5 - CARDINAL

Entity 7: half - CARDINAL

Entity 8: Asian - NORP



In [21]:
# Create a pandas dataframe with the tokens and their part of speech (pos tagging)

token_text = [token.orth_ for token in parsed_review]
token_pos = [token.pos_ for token in parsed_review]

pd.DataFrame(list(zip(token_text, token_pos)), columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,The,DET
1,staff,NOUN
2,here,ADV
3,is,VERB
4,great,ADJ
5,and,CCONJ
6,they,PRON
7,'re,VERB
8,nice,ADJ
9,",",PUNCT


In [22]:
# Create a pandas dataframe with the tokens, their lemma and shape

token_lemma = [token.lemma_ for token in parsed_review]
token_shape = [token.shape_ for token in parsed_review]

pd.DataFrame(list(zip(token_text, token_lemma, token_shape)),
             columns=['token_text', 'token_lemma', 'token_shape'])

Unnamed: 0,token_text,token_lemma,token_shape
0,The,the,Xxx
1,staff,staff,xxxx
2,here,here,xxxx
3,is,be,xx
4,great,great,xxxx
5,and,and,xxx
6,they,-PRON-,xxxx
7,'re,be,'xx
8,nice,nice,xxxx
9,",",",",","


In [23]:
# Create a pandas dataframe with the tokens, their entity type and IOB

token_entity_type = [token.ent_type_ for token in parsed_review]
token_entity_iob = [token.ent_iob_ for token in parsed_review]

pd.DataFrame(list(zip(token_text, token_entity_type, token_entity_iob)),
             columns=['token_text', 'entity_type', 'inside_outside_begin'])

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,The,,O
1,staff,,O
2,here,,O
3,is,,O
4,great,,O
5,and,,O
6,they,,O
7,'re,,O
8,nice,,O
9,",",,O


In [25]:
# Create a pandas dataframe with the tokens, their different attributes (probability, stop words, punctuation, white space, number, out of vocabulary)

token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_review]

df = pd.DataFrame(token_attributes, 
                  columns=['text',
                           'log_probability',
                           'stop word?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocabulary?'])

df.loc[:, 'stop word?':'out of vocabulary?'] = (df.loc[:, 'stop word?':'out of vocabulary?']
                                       .applymap(lambda x: u'Yes' if x else u''))
                                               
df

Unnamed: 0,text,log_probability,stop word?,punctuation?,whitespace?,number?,out of vocabulary?
0,The,-5.774222,Yes,,,,
1,staff,-10.720455,,,,,
2,here,-7.175437,Yes,,,,
3,is,-4.329765,Yes,,,,
4,great,-7.822114,,,,,
5,and,-4.195279,Yes,,,,
6,they,-5.429816,Yes,,,,
7,'re,-6.377125,,,,,
8,nice,-8.462502,,,,,
9,",",-3.391480,,Yes,,,


In [26]:
# Import gensim models to perform further text processing

from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

Using TensorFlow backend.


In [27]:
# Create functions to perform specific functions

def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [28]:
# define file path for unigram sentences

unigram_sentences_filepath = os.path.join(rest_directory, 'unigram_sentences_all.txt')

In [29]:
# Create an unigram file

%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(review_text_filepath):
            f.write(sentence + '\n')

Wall time: 0 ns


In [30]:
# Create unigram sentences

unigram_sentences = LineSentence(unigram_sentences_filepath)

In [34]:
# Print an unigram review
# itertools.islice(iterable, start, stop[, step])

for unigram_sentence in it.islice(unigram_sentences, 220, 240):
    print (u' '.join(unigram_sentence))
    print (u'')

-PRON- substitute fry rice for white rice .99 cent extra

this be tout as soy citrus glaze red bell pepper onion scallion ginger garlic and chile paste and sesame seed

-PRON- be just alright

the flavor do not really do much for -PRON-

-PRON- be a tad spicy from the chile paste but overall -PRON- be kind of bland

-PRON- have some piece of steak chicken that be a bit funky too

-PRON- would definitely try out a different dish next time here if there be a next time

the fried rice be good

egg roll- eh

-PRON- be an egg roll

nothing fantastic about -PRON- but -PRON- taste fine

-PRON- come with a sweet mustard dip sauce which be pretty tasty

usual filling of pork veggie etc

deep fried pot stickers- -PRON- get 2 of these thing

-PRON- be good

small but good

straight pork filling in the middle of -PRON-

overall -PRON- be just alright

-PRON- will say that -PRON- give -PRON- a grip of food

the 2 entree -PRON- get could have easily feed 1 2 more people



In [35]:
# define file path for bigram model

bigram_model_filepath = os.path.join(rest_directory, 'bigram_model_all')

In [36]:
# Create a bigram model

%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 1:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

Wall time: 14.8 s


In [37]:
# define file path for bigram sentences

bigram_sentences_filepath = os.path.join(rest_directory, 'bigram_sentences_all.txt')

In [38]:
# Create a bigram file

%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')

Wall time: 0 ns


In [39]:
# Create bigram sentences

bigram_sentences = LineSentence(bigram_sentences_filepath)

In [40]:
# Print a bigram review

for bigram_sentence in it.islice(bigram_sentences, 220, 240):
    print (u' '.join(bigram_sentence))
    print (u'')

-PRON- substitute fry rice for white rice .99_cent extra

this be tout as soy citrus_glaze red bell_pepper onion scallion ginger garlic and chile paste and sesame_seed

-PRON- be just alright

the flavor do not really do much for -PRON-

-PRON- be a tad spicy from the chile paste but overall -PRON- be kind of bland

-PRON- have some piece of steak chicken that be a bit funky too

-PRON- would definitely try out a different dish next time here if there be a next time

the fried_rice be good

egg roll- eh

-PRON- be an egg roll

nothing fantastic about -PRON- but -PRON- taste fine

-PRON- come with a sweet mustard dip sauce which be pretty tasty

usual filling of pork veggie etc

deep_fried pot stickers- -PRON- get 2 of these thing

-PRON- be good

small but good

straight pork filling in the middle of -PRON-

overall -PRON- be just alright

-PRON- will say that -PRON- give -PRON- a grip of food

the 2 entree -PRON- get could have easily_feed 1 2 more people



In [41]:
# define file path for trigram model

trigram_model_filepath = os.path.join(rest_directory, 'trigram_model_all')

In [42]:
# Create a trigram model

%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 1:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

Wall time: 13.7 s


In [43]:
# define file path for trigram sentences

trigram_sentences_filepath = os.path.join(rest_directory, 'trigram_sentences_all.txt')

In [44]:
# Create a trigram file

%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for bigram_sentence in bigram_sentences:
            
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            
            f.write(trigram_sentence + '\n')

Wall time: 0 ns


In [45]:
# Create trigram sentences

trigram_sentences = LineSentence(trigram_sentences_filepath)

In [46]:
# display a trigram review

for trigram_sentence in it.islice(trigram_sentences, 220, 240):
    print (u' '.join(trigram_sentence))
    print (u'')

-PRON- substitute fry rice for white rice .99_cent extra

this be tout as soy citrus_glaze red_bell_pepper onion scallion ginger garlic and chile_paste and sesame_seed

-PRON- be just alright

the flavor do not really do much for -PRON-

-PRON- be a tad spicy from the chile_paste but overall -PRON- be kind of bland

-PRON- have some piece of steak chicken that be a bit funky too

-PRON- would definitely try out a different dish next time here if there be a next time

the fried_rice be good

egg roll- eh

-PRON- be an egg_roll

nothing fantastic about -PRON- but -PRON- taste fine

-PRON- come with a sweet mustard dip sauce which be pretty tasty

usual filling of pork veggie etc

deep_fried pot stickers- -PRON- get 2 of these thing

-PRON- be good

small but good

straight pork filling in the middle of -PRON-

overall -PRON- be just alright

-PRON- will say that -PRON- give -PRON- a grip of food

the 2 entree -PRON- get could have easily_feed 1 2 more people



In [47]:
# Define file path for a transformed file of all the reviews with one review per line

trigram_reviews_filepath = os.path.join(rest_directory, 'trigram_transformed_reviews_all.txt')

In [48]:
# Create the completely transformed file

%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_review in nlp.pipe(line_review(review_text_filepath),
                                      batch_size=10000, n_threads=4):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in spacy.en.English.Defaults.stop_words]
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')

Wall time: 0 ns


In [49]:
# Compare an original and transformed review

print (u'Original:' + u'\n')

for review in it.islice(line_review(review_text_filepath), 11, 12):
    print (review)

print (u'----' + u'\n')
print (u'Transformed:' + u'\n')

with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 11, 12):
        print (review)

Original:

I love this place i'd recommend it to anyone ! We always order it togo and it never disappoints! The food always taste fresh and is always ready on time! Definitely our favorite lunch spot !

----

Transformed:

-PRON- love place -PRON- recommend -PRON- -PRON- order -PRON- togo -PRON- disappoint food taste fresh ready time definitely -PRON- favorite lunch spot



In [50]:
# Import libraries for LDA (Latent Dirichlet Allocation)

from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import _pickle as cPickle

In [51]:
# Define a file path for the trigram dictionary 

trigram_dictionary_filepath = os.path.join(rest_directory, 'trigram_dict_all.dict')

In [52]:
# Create the trigram dictionary

%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if 0 == 1:

    trigram_reviews = LineSentence(trigram_reviews_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

Wall time: 226 ms


In [53]:
# Define file path for trigram bag of words (bow)

trigram_bow_filepath = os.path.join(rest_directory,
                                    'trigram_bow_corpus_all.mm')

In [54]:
# Define a function which converts reviews into bow representation

def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)

In [55]:
# Create trigram bag of words corpus

%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if 0 == 1:

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

Wall time: 518 ms


In [56]:
# Define a file path for LDA model

lda_model_filepath = os.path.join(rest_directory, 'lda_model_all')

In [57]:
# Create LDA model
# Set the number of topics to any number depending on the business (I am setting it to be 50)

%%time

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 0 == 1:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=50,
                           id2word=trigram_dictionary,
                           workers=3)
    
    lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

Wall time: 2.76 s


In [58]:
# Define a function to explore a topic for the words belonging to it

def explore_topic(topic_number, topn=25):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
    print (u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in lda.show_topic(topic_number, topn=25):
        print (u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [59]:
# Explore any topic

explore_topic(topic_number=23)

term                 frequency

beer                 0.074
bar                  0.073
drink                0.035
bartender            0.024
game                 0.015
great                0.014
like                 0.013
selection            0.012
happy_hour           0.011
tap                  0.011
watch                0.010
menu                 0.009
tv                   0.009
's                   0.007
beer_selection       0.007
pretty               0.007
try                  0.007
local                0.006
pub                  0.006
sit                  0.006
friend               0.006
cool                 0.005
lot                  0.005
fun                  0.005
wine                 0.005


In [60]:
# Name the topics based on your understanding by exploring each topic

topic_names = {0: u'mexican',
               1: u'menu',
               2: u'thai',
               3: u'steak',
               4: u'donuts & appetizers',
               5: u'specials',
               6: u'soup',
               7: u'wings, sports bar',
               8: u'foreign language',
               9: u'las vegas',
               10: u'chicken',
               11: u'aria buffet',
               12: u'noodles',
               13: u'ambience & seating',
               14: u'sushi',
               15: u'arizona',
               16: u'family',
               17: u'price',
               18: u'sweet',
               19: u'waiting',
               20: u'general',
               21: u'tapas',
               22: u'dirty',
               23: u'customer service',
               24: u'restrooms',
               25: u'chinese',
               26: u'gluten free',
               27: u'pizza',
               28: u'seafood',
               29: u'amazing',
               30: u'eat, like, know, want',
               31: u'bars',
               32: u'breakfast',
               33: u'location & time',
               34: u'italian',
               35: u'barbecue',
               36: u'arizona',
               37: u'indian',
               38: u'latin & cajun',
               39: u'burger & fries',
               40: u'vegetarian',
               41: u'lunch buffet',
               42: u'customer service',
               43: u'taco, ice cream',
               44: u'high cuisine',
               45: u'healthy',
               46: u'salad & sandwich',
               47: u'greek',
               48: u'poor experience',
               49: u'wine & dine'}

In [61]:
# Define a file path for the topic names

topic_names_filepath = os.path.join(rest_directory, 'topic_names.pkl')

with open(topic_names_filepath, 'wb') as f: cPickle.dump(topic_names, f)

In [62]:
# Define a file path for LDA visualization

LDAvis_data_filepath = os.path.join(rest_directory, 'ldavis_prepared')

In [70]:


%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(LDAvis_data_filepath, 'wb') as f:
        cPickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath) as f:
    LDAvis_prepared = cPickle.load(f)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 69: character maps to <undefined>

In [71]:
# Display LDA visualization

pyLDAvis.display(LDAvis_prepared)

In [76]:
# Define a function to get a sample review

def get_sample_review(review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(review_text_filepath),
                          review_number, review_number+1))[0]

In [98]:
# Define a function to apply all the pre-processing, word-phrasing, removing stop words and creating bag of words representation
# which will result in top 3 topics which are highly related to that review

def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    
    # parse the review text with spaCy
    parsed_review = nlp(review_text)
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
    # apply the first-order and secord-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    
    # remove any remaining stopwords
    trigram_review = [term for term in trigram_review
                      if not term in spacy.en.English.Defaults.stop_words]
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    from itertools import starmap
    
    # sort with the most highly related topics first
    review_lda = sorted(review_lda, key=lambda topic_number, freq: - freq)
    
    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
                        
        # print the most highly related topic names and frequencies
        print ('{:25} {}'.format(topic_names[topic_number], round(freq, 3)))

In [83]:
sample_review = get_sample_review(100)
print (sample_review)

If you blink. You will miss this little eatery, squished into the corner of a small strip mall in Mentor, Ohio.

I went with a friend of mine from the area, who was treated like a rock star when he arrived (well, he IS a rock star), and that's indicative of the casual and familial vibe at this restaurant.  I didn't take notes - too busy enjoying the company - but I'm pretty sure I got a burrito and it was delicious.  Margaritas were excellent (this much I do remember) and service was quick and pleasant.  Now that I know that this place exists, it gives me even more reason to travel to the northern tip of Ohio for good company and eats!



In [99]:
lda_description(sample_review)



TypeError: <lambda>() missing 1 required positional argument: 'freq'