## Sentiment Analysis of Menu Items in Yelp Reviews

In [35]:
from __future__ import division # Use Python 3-style division
import ujson
from collections import Counter
from datetime import datetime
import sys
import numpy as np
from operator import itemgetter
import hashlib
from pprint import pprint

from num2words import num2words
import string
import nltk.corpus
from unidecode import unidecode
from nltk import word_tokenize
from nltk.tag.perceptron import PerceptronTagger
import codecs
import json
import nltk
import editdistance
from nltk.sentiment import SentimentIntensityAnalyzer

from pprint import pprint
import os

from ConfigParser import SafeConfigParser
from nltk.stem import WordNetLemmatizer

In [9]:
# VADER sentiment scorer used to score review sentences, loaded from an explicit lexicon file.
# NOTE(review): the lexicon path is absolute and machine-specific -- confirm it exists
# on the machine that runs this notebook.
vader_analyzer = SentimentIntensityAnalyzer("/usr/local/share/nltk_data/sentiment/vader_lexicon/vader_lexicon.txt")

# WordNet lemmatizer; get_lemma() calls wnl.lemmatize() to get the dictionary form of a word
wnl = WordNetLemmatizer()

#### Functions for loading Pittsburgh Yelp data

In [2]:
def loadRestaurantsData():
    """Read pittsburgh_restaurants.json (one JSON object per line) into a dict.

    Returns:
        dict mapping each record's "business_id" to the full decoded record.
        If a business_id appears more than once, the last record wins.
    """
    with open("pittsburgh_restaurants.json", 'r') as restaurantFile:
        decoded = (ujson.loads(rawLine) for rawLine in restaurantFile)
        return {record["business_id"]: record for record in decoded}


def loadReviewsData():
    """Load the Pittsburgh reviews and build per-reviewer statistics.

    Reads pittsburghReviews.json (one JSON object per line).  Every review is
    stored in the returned ``reviews`` dict, but only reviews whose business_id
    is found in the module-level ``restaurants`` dict contribute to the
    reviewer statistics and count towards the review cap.

    Returns:
        (reviews, reviewers) where
        - reviews maps review_id -> decoded review record
        - reviewers maps user_id -> summary dict with the keys "reviewCount",
          "stars", "wkdays", "neighbors", "postalCodes", "useful",
          "categories" and "businessIds"
    """
    reviewCap = sys.maxint  # stop after this many counted reviews
    countedReviews = 0
    reviews = {}
    reviewers = {}

    with open("pittsburghReviews.json", 'r') as reviewFile:
        for rawLine in reviewFile:
            review = ujson.loads(rawLine)
            reviews[review['review_id']] = review

            restaurant = restaurants.get(review["business_id"], None)
            if not restaurant:
                # Review of a business we have no restaurant record for: keep it, but no stats
                continue

            # Collect stats for reviewers
            userId = review["user_id"]
            if userId not in reviewers:
                reviewers[userId] = {"reviewCount": 0,
                                     "stars": Counter(), "wkdays": Counter(),
                                     "neighbors": Counter(), "postalCodes": Counter(),
                                     "useful": 0, "categories": Counter(),
                                     "businessIds": Counter()}
            summary = reviewers[userId]

            summary["reviewCount"] += 1
            summary["stars"][int(review["stars"])] += 1

            # ISO weekday of the review date: 1 = Monday ... 7 = Sunday
            reviewDay = datetime.strptime(review["date"], "%Y-%m-%d")
            summary["wkdays"][reviewDay.isoweekday()] += 1

            summary["useful"] += int(review["useful"])
            summary["businessIds"][review["business_id"]] += 1
            summary["postalCodes"][restaurant["postal_code"]] += 1

            # Restaurants without a neighborhood are counted under the literal key "None"
            summary["neighbors"][restaurant["neighborhood"] or "None"] += 1

            # "Restaurants" and "Food" are skipped when counting categories
            for category in restaurant["categories"]:
                if category not in ("Restaurants", "Food"):
                    summary["categories"][category] += 1

            countedReviews += 1
            if countedReviews == reviewCap:
                break

    return reviews, reviewers

def buildRestaurantReviewIds(reviews):
    """Build an inverted index from business_id to the ids of its reviews.

    Args:
        reviews: dict mapping review_id -> review record (needs a 'business_id' key).

    Returns:
        dict mapping business_id -> set of review_ids written for that business.
    """
    idsByBusiness = {}
    for reviewId, review in reviews.items():
        idsByBusiness.setdefault(review['business_id'], set()).add(reviewId)

    return idsByBusiness

#### Load data

In [3]:
# Order matters: loadReviewsData() looks reviews up in the module-level `restaurants`
# dict, so the restaurants have to be loaded first.
restaurants = loadRestaurantsData()
reviews, reviewers = loadReviewsData()
# business_id -> set of review_ids, for quick lookup of a restaurant's reviews
restaurantReviewIds = buildRestaurantReviewIds(reviews)

#### Helper functions for NLP

In [6]:
# Lemmatize - return the dictionary form of a word: e.g. dogs -> dog
def get_lemma(word):
    """Return the WordNet lemma of word, or word itself if lemmatizing raises UnicodeEncodeError."""
    try:
        lemma = wnl.lemmatize(word)
    except UnicodeEncodeError:
        # Leave words the lemmatizer cannot encode untouched
        lemma = word
    return lemma


# Return all possible n-grams from a list of words
def find_ngrams(input_list, n):
    """Return the n-grams of input_list as tuples, in order of appearance.

    When input_list has fewer than n elements, the window shrinks to the
    length of the list, so a single n-gram covering the whole list is
    produced.  An empty list produces no n-grams.
    """
    width = min(len(input_list), n)
    # One copy of the list per window position, each shifted by one more element;
    # zipping them side by side yields the sliding windows.
    shifted = (input_list[start:] for start in range(width))
    return zip(*shifted)

    
# Stopwords: NLTK's English list, plus some commonly used non-English stopwords
stopwords = set(nltk.corpus.stopwords.words('english')) | set(["en", "et"])


# Do a fuzzy comparison between word and target
def fuzzyMatch(word, target):
    """Return True when word is equal to, or a close misspelling of, target.

    Rules, applied in order:
    - identical strings always match;
    - an empty string matches nothing but another empty string;
    - the first characters must be equal;
    - words of 4 characters or fewer, and the words "sunday" and "plate",
      must match exactly;
    - otherwise the allowed edit distance grows with len(word):
      1 for up to 6 characters, 2 for up to 12, 3 beyond that.

    The length thresholds and the special cases look at ``word`` only, so the
    comparison is not symmetric in its two arguments.
    """
    if word == target:
        return True

    # Guard the first-character check below: indexing an empty string raises IndexError
    if not word or not target:
        return False

    if word[0] != target[0]:
        return False

    n = len(word)

    # Short words require full match
    # Avoid matching Sunday with Sundae
    # Avoid matching plate with pate
    if n <= 4 or word == "sunday" or word == "plate":
        return word == target

    dist = editdistance.eval(word, target)
    if n <= 6:
        return dist <= 1
    elif n <= 12:
        return dist <= 2
    else:
        return dist <= 3


# Return whether "word" fuzzy-matches one of the targets
def fuzzyMatchOneTarget(word, targets):
    """Return True as soon as word fuzzy-matches any element of targets, else False."""
    return any(fuzzyMatch(word, candidate) for candidate in targets)


# Set to True to see why each n-gram was (or was not) matched
isDebug = False
def debugPrint(s):
    """Print s when the module-level isDebug flag is on; otherwise do nothing."""
    if isDebug:
        # The parenthesised single-argument form prints the same thing as the
        # Python 2 print statement and is also valid on Python 3.
        print(s)



#### Helper functions for analyzing review text

In [47]:
# A quicker way to do POS tagging: build the perceptron tagger once here and reuse it
# for every tagger.tag() call in preprocessWords().
# NOTE(review): presumably "quicker" than re-creating the tagger on each call -- confirm.
tagger = PerceptronTagger()

# Unwanted word types: tokens carrying one of these Penn Treebank POS tags are dropped
# during preprocessing.
# See http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html for the definition of these tags
unwantedWordTypes = [
    "POS",  # genitive marker ('s)
    "EX",   # existential "there"
    "DT",   # determiner
    "TO",   # "to" -- Penn Treebank tags are upper-case; the former "to" entry could never match
    "WDT",  # WH-determiner
    "WP",   # WH-pronoun
    "WP$",  # possessive WH-pronoun
    "WRB",  # WH-adverb
]


# Normalize certain dish names: maps a variant spelling/word to the word used for matching
normalizedDishNames = dict(
    hamburger="burger",
    bier="beer",
)

In [72]:
# Preprocess some words
def preprocessWords(s, restaurantName=None):
    """Turn a piece of text into a list of normalized words used for matching.

    Steps, in order: lowercase; strip the restaurant name(s); rewrite common
    short forms and punctuation; transliterate accented characters; POS-tag
    and drop unwanted tags, stopwords and punctuation; spell out numbers;
    lemmatize; normalize certain dish names.

    Args:
        s: the text to preprocess; None or an empty string gives [].
        restaurantName: a name, or a list of names (falsy entries are
            skipped), removed from the lowercased text so that mentioning
            the restaurant does not count as mentioning a dish.

    Returns:
        list of preprocessed words, in their original order.
    """
    if not s:
        return []

    text = s.lower()  # Lowercase it

    if restaurantName:
        # Remove restaurant name in a string, otherwise we will always get a match whenever someone
        # mentioned the restaurant name in the review
        names = restaurantName if isinstance(restaurantName, list) else [restaurantName]
        for name in names:
            if name:
                text = text.replace(name, "")

    # Convert certain common word variations.  The order is significant: the
    # specific patterns must run before the generic "'" and "-" rules.
    substitutions = (
        ("-a-", " a "),
        ('"', ''),
        ("'s ", ' '),
        ("-o-", " of "),
        ("/", " / "),
        ("&", " and "),
        ("'n ", " and "),
        (" n'", " and "),
        ("'n'", " and "),
        ("-n-", " and "),
        ("'", " "),
        ("*", " "),
        ("-", " - "),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    text = unidecode(text)  # Convert accented chars to plain English chars

    # POS-tag the tokens, then keep only those that are not of an unwanted
    # type, not a stopword and not punctuation
    keptWords = [token for token, pos in tagger.tag(word_tokenize(text))
                 if pos not in unwantedWordTypes
                 and token not in stopwords
                 and token not in string.punctuation]

    words = []
    for word in keptWords:
        if word.isdigit():
            word = num2words(int(word))  # Convert numbers to words
        word = get_lemma(word)  # Convert each word to its dictionary version
        words.append(normalizedDishNames.get(word, word))  # Normalize certain dish name

    return words

In [49]:
# Before we decide whether a review text n-gram matches a menu item, we first check the menu section which
# contains the menu item.  If the section contains one of the keywords found in the sectionKeywords below, then
# n-gram has to contain that keyword as well.
#
# E.g. let's say a menu contains a section called "Sandwich" and a section called "Pie", and under each section
# there is a menu item called "Beef".
#
# In the review, an n-gram "ate a beef sandwich" will match both menu item names.  We will use the section of each
# menu item to determine whether we match a beef sandwich or a beef pie.
# Dish-type keywords looked for in menu section names.  Order matters to the code that
# scans this list: the first keyword found in a section name is the one that is used.
sectionKeywords = [
    "bbq", "grill", "stew", "skillet", "casserole",

    "pasta", "pizza", "flatbread", "rice", "bean", "noodle",

    "salad",

    "soup",  # was misspelled "soap", so "Soup" sections were never recognised

    # "casserole" used to be repeated here; the duplicate had no effect and was dropped
    "burger", "sandwich", "pie", "tart", "wrap",

    "lamb", "beef", "pork", "chicken", "schnitzel",
    "mussel", "clam", "oyster",

    # Drinks
    "beer", "coffee",

    # Mexican
    "nacho", "Quesadilla", "taco", "Enchilada", "Burrito", "fajita", "Chimichanga",

    # Japanese
    "sushi", "nigiri", "sashimi"
]

# Make sure all words are lower case and lemmatized, i.e. in the same form as the words
# produced by preprocessWords(), which lowercases and lemmatizes its input too.
sectionKeywords = [get_lemma(w.lower()) for w in sectionKeywords]

# Generic section-name words.  We don't expect a reviewer to repeat these in the review
# text when mentioning a menu item, so sections named only with such words are not
# required to appear in a matching n-gram.
genericSectionNames = [
    "specialty", "dessert", "appetizer", "favorite",
    "drink", "entree", "main", "course",
]

In [75]:
# Process a json file (fn) which contains the sections and menu items of a restaurant, and
# return a list of data structures used in review analysis.
def parseMenuItems(fn, restaurants):
    """Load one restaurant's menu and preprocess its sections and items.

    Args:
        fn: path of a UTF-8 JSON file with the keys "id", "name" and
            "sections"; each section has a "name" and a list of "items",
            each item a "name" and optionally a "desc".
        restaurants: dict mapping business id -> restaurant record (with a
            "name" key), used to look up an alternative restaurant name.

    Returns:
        (restaurantName, altName, resId, menuItems, menuSections) where
        - altName is the lowercased name from ``restaurants`` when it differs
          from the menu's name, else None
        - menuSections is a list of dicts with "name", "preprocessedName",
          "sectionKeyword" and "onlyGenericTerms"
        - menuItems is a list of dicts with "name", "desc",
          "preprocessedName" (set), "preprocessedDesc" (set) and "section"
          (the section dict the item belongs to)
    """
    with codecs.open(fn, "r", "utf-8") as menuFile:
        menuInfo = json.load(menuFile)

    resId = menuInfo["id"]
    restaurantName = menuInfo["name"]

    # Does this restaurant have an alt. name?
    altName = None
    knownRestaurant = restaurants.get(resId, None)
    if knownRestaurant and knownRestaurant["name"] != restaurantName:
        altName = knownRestaurant["name"].lower()

    # Names to strip from section/item text before matching
    restaurantNames = [unidecode(restaurantName.lower()),
                       unidecode(altName) if altName else altName]

    menuItems = []
    menuSections = []

    for section in menuInfo["sections"]:
        sectionWords = preprocessWords(section["name"], restaurantNames)

        sectionEntry = {
            "name": section["name"],
            "preprocessedName": sectionWords,
            # First dish-type keyword (in sectionKeywords order) found in the section name
            "sectionKeyword": next((keyword for keyword in sectionKeywords
                                    if keyword in sectionWords), None),
            # True when the section name contains nothing but generic terms
            "onlyGenericTerms": all(word in genericSectionNames for word in sectionWords),
        }
        menuSections.append(sectionEntry)

        for item in section["items"]:
            itemName = item["name"]
            itemDesc = item.get("desc", "")
            menuItems.append({
                "name": itemName,
                "desc": itemDesc,
                "preprocessedName": set(preprocessWords(itemName, restaurantNames)),
                "preprocessedDesc": set(preprocessWords(itemDesc, restaurantNames)),
                "section": sectionEntry,
            })

    return restaurantName, altName, resId, menuItems, menuSections


# Try to match menu items in a 'preprocessed sentence'
def match_menu_items_in_sent(menuItems, menuSections, preprocessedSent, ngramLen):
    """Find the menu items and menu sections mentioned in one preprocessed sentence.

    Every n-gram of the sentence is treated as an unordered set of words and
    compared (fuzzily) with the preprocessed name of each menu item.

    Args:
        menuItems: list of item dicts with the keys "name", "preprocessedName",
            "preprocessedDesc" and "section" (the shape parseMenuItems builds).
        menuSections: list of section dicts with the keys "name",
            "preprocessedName", "sectionKeyword" and "onlyGenericTerms".
        preprocessedSent: list of preprocessed words of one sentence.
        ngramLen: n-gram length; a shorter sentence yields a single n-gram
            covering the whole sentence.

    Returns:
        (matched_items, section_matched_items) where
        - matched_items maps u"<item name> (<section name>)" to
          {"score": ..., "item": <item dict>}, keeping the highest score seen
        - section_matched_items is the set of section names that either own a
          matched item, or were matched directly by an n-gram that matched no item
    """
    matched_items = {}
    section_matched_items = set()

    # Loop thru all n-grams in the sentence
    for ngram in find_ngrams(preprocessedSent, ngramLen):
        ngram = set(ngram)  # Remove duplicate terms; word order is ignored from here on
        item_match = False

        # See if it matches a menu item
        for item in menuItems:
            itemWords = item["preprocessedName"]
            sectionWords = item["section"]["preprocessedName"]

            # An item whose name preprocessed down to nothing (e.g. only stopwords) would
            # pass the "n_matched >= itemLen" test below with 0 >= 0 and could then match
            # every n-gram.  Such an item can never be matched meaningfully, so skip it.
            if not itemWords:
                continue

            match = False
            sectionKeywordMatch = False

            # See if our n-gram matches certain number of terms in the menu item name
            # If menu item has:
            # - <= 2 terms, the ngram needs to match all the terms
            # - 3 or 4 terms, the ngram needs to match at least two terms
            # - >= 5 terms, the ngram needs to match at least three terms
            n_matched = sum(1 for w in ngram if fuzzyMatchOneTarget(w, itemWords))
            #debugPrint("ngram:{}, itemWords:{}, n_matched:{}".format(ngram, itemWords, n_matched))

            itemLen = len(itemWords)

            if itemLen <= 2:
                matchItemName = n_matched >= itemLen
            elif itemLen <= 4:
                matchItemName = n_matched >= 2
            else:
                matchItemName = n_matched >= 3

            if matchItemName:
                if itemLen >= 3 and n_matched == itemLen:
                    # For item with long name and we have a full match, consider it a good match
                    match = True
                else:
                    # For dishes which belong to certain type of dish (e.g. sandwich), we need to check if the n-gram
                    # contains that dish name
                    sectionKeyword = item["section"]["sectionKeyword"]

                    if sectionKeyword:
                        # See if the ngram contains the sectionKeyword we identified in the section
                        # to which the dish belongs
                        if sectionKeyword in ngram:
                            debugPrint("! sectionKeyword matched !")
                            sectionKeywordMatch = True
                            match = True
                    else:
                        # We don't have a sectionKeyword related to the section.

                        # In this case, first see if the ngram matches at least a word in section name

                        if item["section"]["onlyGenericTerms"]:
                            # Its section contains only generic terms (e.g. "desserts").
                            # In this case we don't match section and we assume we have a match
                            debugPrint("! No need for section match !")
                            match = True
                        else:
                            n_match_section = sum(1 for w in ngram if w in sectionWords)
                            if n_match_section >= 1:
                                debugPrint("! section match !")
                                match = True

                #if not match:
                    # Last try: see if the n-gram matches at least two terms in the item description
                #    if len(ngram & item["preprocessedDesc"]) >= 2:
                #        debugPrint("! desc match !")
                #        match = True

            elif n_matched > 0:
                # if we match at least one word, but not enough words of the item name,
                # see if all the words in the ngram are found in the description
                if item["preprocessedDesc"] and len(ngram & set(item["preprocessedDesc"])) == len(ngram):
                    debugPrint("! desc full match only !")
                    match = True

            if match:
                debugPrint(u"[DEBUG: n_matched={} ngram={}; Menu: {}-{}; {} - {}]".format(n_matched, ngram,
                                                          sectionWords, itemWords,
                                                           item["section"], item["name"]))

                itemName = u"{} ({})".format(item["name"], item["section"]["name"])
                # Number of matched words, plus a bonus of 2 when the section's dish-type
                # keyword appears in the n-gram
                score = n_matched + 2*sectionKeywordMatch

                if itemName not in matched_items or score > matched_items[itemName]["score"]:
                    matched_items[itemName] = {"score": score, "item": item}

                # Add the item's section to the matched-section set
                section_matched_items.add(item["section"]["name"])

                item_match = True

        if not item_match:
            # There is no item match for this ngram.  See if it matches a section
            for section in menuSections:
                sectionWords = section["preprocessedName"]
                if not section["onlyGenericTerms"] and sectionWords:
                    n_matched = sum(1 for w in ngram if fuzzyMatchOneTarget(w, sectionWords))
                    if n_matched > 0:
                        section_matched_items.add(section["name"])

    return matched_items, section_matched_items




In [51]:
# Try to match menu items in a review
def match_menu_items_in_review(matches, matchedSents, review, menuItems, menuSections, ngramLen):
    """Match menu items sentence by sentence in one review and record their sentiment.

    Every sentence of the review is printed; a sentence that mentions a menu
    item or a menu section is printed together with its VADER scores and the
    names that matched.

    Args:
        matches: dict updated in place.  Keyed by u"<item name> (<section name>)";
            each value holds "shortName" plus the parallel lists "scores"
            (VADER compound score of the matching sentence), "dates",
            "reviewIds" and "matchedSentMd5".
        matchedSents: dict updated in place, mapping review_id to the set of
            sentences in which at least one menu item matched.
        review: review record with the keys "text", "review_id" and "date"
            (date formatted as YYYY-MM-DD).
        menuItems, menuSections, ngramLen: passed on to match_menu_items_in_sent.

    Returns:
        The total number of (sentence, menu item) matches found in the review.
    """
    total_matches = 0

    reviewText = review["text"]
    reviewId = review["review_id"]
    reviewDate = datetime.strptime(review["date"], "%Y-%m-%d")

    sents = nltk.sent_tokenize(reviewText) # break our review into sentences
    preprocessedSents = [preprocessWords(s) for s in sents] # preprocess each sentence

    # The loop variable must be the name passed to match_menu_items_in_sent below
    # (it used to be bound as `processedSent`, leaving `preprocessedSent` undefined).
    for sent, preprocessedSent in zip(sents, preprocessedSents):
        matched_items, section_matched_items = match_menu_items_in_sent(menuItems, menuSections,
                                                                        preprocessedSent, ngramLen)

        hasMatch = len(matched_items) > 0 or len(section_matched_items) > 0

        if hasMatch:
            # TODO: need to handle the case where the item name contains a strong sentiment word
            # e.g. "angry tiki dog"

            # TODO: sometimes we need to consider the next sentence(s) as well.

            sentiment = vader_analyzer.polarity_scores(sent)
            # print("") emits a blank line on both Python 2 and Python 3
            print("")
            print(u"{}".format(sent))
            print("")
            print("---- Sentiment:")
            print(sentiment)

        else:
            print(u"{}".format(sent))

        # TODO: if an n-gram matches multiple items, we should pick only one

        if len(matched_items) > 0:
            # Save up the matched sentence
            if reviewId not in matchedSents:
                matchedSents[reviewId] = set()

            matchedSents[reviewId].add(sent)
            sentMd5 = hashlib.md5(sent.encode('utf-8')).hexdigest()

            total_matches += len(matched_items)
            print("---- Matched Items:")
            for key, item in matched_items.items():
                print(key)
                info = matches.setdefault(key, {"shortName": item["item"]["name"],
                                                "scores": [], "dates": [], "reviewIds": [],
                                                "matchedSentMd5": []})
                # `sentiment` is always set here: a matched item implies hasMatch above
                info["scores"].append(sentiment['compound'])
                info["dates"].append(reviewDate)
                info["reviewIds"].append(reviewId)
                info["matchedSentMd5"].append('sent-' + sentMd5)

        if len(section_matched_items) > 0:
            print("---- Matched Sections:")
            for sectionName in sorted(section_matched_items):
                print(sectionName)

        if hasMatch:
            print("----\n")

    return total_matches


### Documentation
---

## Overview of the Menu Item Matching Process

In order to analyze the sentiment of a menu item in a review, we first have to identify which menu item(s) are mentioned in the review body.  Below is a high-level summary of this process.

---
### Step 1: Break down the review body into sentences

In [52]:
# Sample review used to illustrate the steps below
reviewText = (
    "Great beer, great atmosphere, great food, & a good portion size at a decent price. "
    "The evening started out with a glass of their smooth tasting Gulden Draak Ale. "
    "The Mediterranean nachos tasted awesome, while the shrimp and langostino pizza was "
    "loaded w plenty of shrimp, lobster and flavor."
)

pprint(reviewText)

'Great beer, great atmosphere, great food, & a good portion size at a decent price. The evening started out with a glass of their smooth tasting Gulden Draak Ale. The Mediterranean nachos tasted awesome, while the shrimp and langostino pizza was loaded w plenty of shrimp, lobster and flavor.'


In [53]:
# Split the sample review into sentences with NLTK's sentence tokenizer
sentences = nltk.sent_tokenize(reviewText)
pprint(sentences)

['Great beer, great atmosphere, great food, & a good portion size at a decent price.',
 'The evening started out with a glass of their smooth tasting Gulden Draak Ale.',
 'The Mediterranean nachos tasted awesome, while the shrimp and langostino pizza was loaded w plenty of shrimp, lobster and flavor.']


---
### Step 2: Preprocess each sentence
In an ideal world, when a reviewer mentions a dish, it will have a perfect match with what is printed in the restaurant menu.  However, in reality there are many ways to break this assumption, with *some* listed below:


| Menu Item        | What the reviewer wrote instead  | Problem |
| :------------- |:-------------| :-------------| 
| Nachos      | Nacho      | Singular vs Plural |
| Pâté | Pate      | Accented characters |
| Spicy Hamburger | Spicy Burger | A similar word is used instead |
| Mac-n-Cheese | Mac and Cheese | Short forms |
| Lindeman's Framboise Cheesecake | Lindeman Framboise Cheesecake | The apostrophe+s is missing |



In order to make it easier to match a menu item mentioned in a review, we introduce a *preprocessing* step, which will:
- Lowercase the sentence
- Remove stopwords and punctuations
- Convert numbers to words (e.g. 20 -> twenty)
- Convert each word to its dictionary form (i.e. lemmatize) (e.g. nachos -> nacho)
- Convert accented characters to English characters (e.g. Pâté -> Pate)
- Normalize certain dish names (e.g. hamburger -> burger)
- Normalize the short-form of certain words (e.g. Mac & Cheese -> Mac and Cheese)
- Identify the Part-of-Speech (POS) tag of each word and remove the word if its tag is one of certain types (e.g. all determiners, such as the, all, both, each and every, are removed)

For example, after preprocessing, the sentence *"The Mediterranean nachos tasted awesome, while the mac-n-cheese was just so so."* becomes `['mediterranean', 'nacho', 'tasted', 'awesome', 'mac', 'cheese']`.  

Please note that we will *preprocess* all **reviews** as well as **restaurant menu items**. 

---
### Additional problems faced during matching
Before we cover the next step, we need to first look at some *additional* problems faced during matching:

| Menu Item        | What the reviewer wrote instead  | Problem |
| :------------- |:-------------| :-------------| 
| Mediterranean Nachos      | Mediteranean Nachos | Spelling mistake |
| Bacon Ranch Chicken Sandwich | Ranch Chicken Bacon Sandwich      | The order of words is different |
| Black Forest Guinness Brownie      | Black Forest Brownie      | Some word(s) are missing |


How to tackle these problems:
- Spelling mistake
    - When comparing two words, we measure the edit-distance between them and allow for certain degree of difference.
- The order of words is different
    - When looking for an item match in a sentence, we iterate all the 4-grams of the sentence, and see how many words from a menu item are included in the 4-gram *regardless* of ordering.
- Some word(s) are missing
    - In the comparison using 4-grams, *sometimes* we call it a match when only some of the words in the menu item are found.  E.g. the 4-gram `['order', 'Black', 'Forest', 'Brownie']` will match the menu item *Black Forest Guinness Brownie* even though only 3 words are found.

With these problems explained, let us introduce the next step.

---
### Step 3: Try to match menu item(s) in each sentence:
- Iterate through all the 4-grams in each *preprocessed* sentence, and use the aforementioned techniques to compare each one with every *preprocessed* menu item from that restaurant.
- Sometimes we will also make use of the description of a menu item to check if a 4-gram is a match.

Moreover, on a menu multiple items could share the same name.  For example, on one menu the name *Shrimp and Avacado* is listed under both the *Sandwich section* and the *Salad section*.  In order to decide which item the review is talking about, under *some* conditions we require a 4-gram to contain the *section words* as well before we can call it a match.

---
Lastly, the above gives an overview of the process of menu item matching.  Please bear in mind that all these steps and tricks cannot *guarantee* a perfect match, but they will help to improve the accuracy of the algorithm.