# Yelp Data Challenge - NLP

Meina Wang

Mar 2018

In [60]:
import pandas as pd

In [61]:
df = pd.read_csv('dataset/last_1_years_restaurant_reviews.csv')

In [4]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Cajun/Creole, Steakhouses, Restaurants]",4.0,0,2017-02-14,0,Xp3ppynEvVu1KxDHQ3ae8w,5,Delmonico Steakhouse is a steakhouse owned by ...,0,KC8H7qTZVPIEnanw9fG43g
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Cajun/Creole, Steakhouses, Restaurants]",4.0,1,2017-05-28,0,LEzphAnz0vKE32PUCbjLgQ,4,One of the top steak places I've had in Vegas ...,2,3RTesI_MAwct13LWm4rhLw
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Cajun/Creole, Steakhouses, Restaurants]",4.0,0,2017-08-25,0,4e-cxYVdlIu2ZDxVJqUfOQ,5,This place is superb from the customer service...,0,EAOt1UQhJD0GG3l_jv7rWA
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Cajun/Creole, Steakhouses, Restaurants]",4.0,1,2017-02-12,1,heZd0W3HuPJxZBrCYD3wDw,2,"Lousy steak. \n\nThe service was great - Todd,...",3,OtKA03ALQQ1CBhtaJod_Jw
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Cajun/Creole, Steakhouses, Restaurants]",4.0,0,2017-12-10,0,exzXjy7Y2ICX_BEVTDWpJA,5,I got the filet mignon with seared foigras and...,0,Ymtd4cQypep_QZJ-qJsHuA


### Define feature variables, here is the text of the review

In [5]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents = df['text'].values

In [6]:
# inspect your documents, e.g. check the size, take a peek at elements of the numpy array
documents.dtype, documents.shape
documents[100]

"This place had the best appetizer (fried oysters and bone marrow spread) and one of the most perfectly cooked steaks I've had. Went here for New Year's with my family and we definitely enjoyed ourselves. It was a bit dark and formal for my liking but overall was a great culinary experience."

### Define target variable, here is the rating of the review

#### For example, I am interested in > 4 stars rating

In [7]:
# Make a column and take the values, save to a variable named "target"
df['favorable'] = df['stars'] > 4
target = df['favorable'].values
target[:10]

array([ True, False,  True, False,  True,  True,  True, False,  True,
        True])

In [8]:
documents.shape, target.shape

((169917,), (169917,))

## Create training dataset and test dataset

In [9]:
from sklearn.cross_validation import train_test_split



In [16]:
# Split to documents_train, documents_test, target_train, target_test
documents_train, documents_test, target_train, target_test = train_test_split(
    documents,
    target,
    test_size = 0.3,
    random_state = 42
)

## Let's get NLP representation of the documents

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [19]:
# Train the model with your training data
vectors_train = vectorizer.fit_transform(documents_train).toarray()

In [20]:
# Get the vocab of your tfidf
words = vectorizer.get_feature_names()

In [21]:
# Use the trained model to transform your test data
vectors_test = vectorizer.transform(documents_test).toarray()

## Similar review search engine

In [22]:
import numpy as np

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    return [labels[i] for i in np.argsort(lst)[:n]]


In [23]:
# Use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
# Draw an arbitrary review from test (unseen in training) documents
some_random_number = 42
search_query = documents_test[some_random_number]
search_queries = [search_query]

In [63]:
# Transform the drawn review(s) to vector(s)
vector_search_queries = vectorizer.transform(search_queries).toarray()

In [64]:
# Calculate the similarity score(s) between vector(s) and training vectors
similarity_scores = cosine_similarity(vector_search_queries, vectors_train)

In [65]:
# Let's find top 5 similar reviews
n = 5
returned_reviews = get_top_values(similarity_scores[0], n, documents_train)

In [66]:
print('Our search query:')
print(search_queries[0]) 

Our search query:
Wow!! The Drag Queen Brunch was the highlight of my Vegas trip!

My daughter is a HUGE fan Rupaul's Drag Race & some of the cast perform at this show. So she chose this as one of the activities to celebrate her 21st birthday in Las Vegas. 

A group of us gals bought tickets via groupon. You need to redeem both the ticket & groupon voucher there at the venue so make sure you have it (or on your phone). VIP's do not need to wait in the very long line.  So I highly recommend the VIP tix --- no waiting in line, you receive priority seating, unlimited alcohol at the bar in addition to unlimited mimosas, and their buffet. The buffet had many choices, such as waffles, eggs, bacon, cheese omelets, yogurt, tamales, taquitos, juices, coffee, etc. 

All of the performances were amazing & the show had great energy.  The host (dressed in red here in this pic) was my FAVE--very funny & so entertaining!!! 

The only thing we missed out on was the photo op that was included in our pa

In [70]:
print('Most %s similar reviews:' % n)
print(returned_reviews[0]) 

Most 5 similar reviews:
This review is for the Drag Show Brunch at Señor Frogs only.

This show was amazing!!!

We arrived at 11:00 am for the 11:30 showing.  We started being checked in immediately, and were are able to get our food before the show.  While the Brunch Buffet was okay, the Drinks and entertainment far beyond make up for it.

The Brunch Buffett has a wide variety of breakfast.....cereal, waffles, egg omelets, bacon, sausage, pancakes and also a Mexican lunch option which I wasn't very impressed with.  The lunch option was taquitos, tamales, rice, chimichangas.  They also had additional coffee and grapefruit juice plus fruit and yogurt.  So there is a wide range of food to choose from.  For me though I thought the breakfast was better than the Mexican lunch.

The Drinks......So we bought our tickets and Groupon which you have to redeem both the Groupon and show tickets at the same time.  Also make sure you book the show tickets separately in advance.  Our Groupon was for 

It makes sense for the above reviews to be similar, since they both mentioned the show is amazing, and the reviews are positive.

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [37]:
from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB()
model_nb.fit(vectors_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
# Get score for training set
model_nb.score(vectors_train, target_train)

0.817186672383787

In [39]:
# Get score for test set
model_nb.score(vectors_test, target_test)

0.8085765850596359

#### Logistic Regression Classifier

In [40]:
from sklearn.linear_model import LogisticRegression

model_lrc = LogisticRegression(random_state=42)
model_lrc.fit(vectors_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
# Get score for training set
model_lrc.score(vectors_train, target_train)

0.8500012611294675

In [42]:
# Get score for test set
model_lrc.score(vectors_test, target_test)

0.832274011299435

#### Key features(words) that make the positive prediction

In [43]:
# Find it out by ranking
n = 20
get_top_values(model_lrc.coef_[0], n, words)

[u'amazing',
 u'best',
 u'fantastic',
 u'awesome',
 u'delicious',
 u'thank',
 u'highly',
 u'perfect',
 u'incredible',
 u'perfection',
 u'excellent',
 u'phenomenal',
 u'love',
 u'great',
 u'outstanding',
 u'favorite',
 u'heaven',
 u'bomb',
 u'regret',
 u'wonderful']

Postive words make the positive prediction.

#### Key features(words) that make the negative prediction

In [44]:
n = 20
get_bottom_values(model_lrc.coef_[0], n, words)

[u'worst',
 u'ok',
 u'horrible',
 u'mediocre',
 u'rude',
 u'okay',
 u'disappointing',
 u'terrible',
 u'slow',
 u'bland',
 u'average',
 u'awful',
 u'dry',
 u'overpriced',
 u'poor',
 u'reason',
 u'lacking',
 u'meh',
 u'lacked',
 u'disgusting']

Negative or not postive words make the negative prediction.

## Use cross validation to evaluate the classifiers

In [72]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model_lrc,
                           vectors_train,
                           target_train,
                           cv = 5,
                           scoring = 'accuracy')
cv_scores

array([0.83446131, 0.83349447, 0.8363109 , 0.83747425, 0.83339639])

## Use grid search to find best predictable classifier

In [57]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

param_grid = [{'penalty':['l1'], 'C':[0.01, 0.1, 1, 5, 10, 100]},
             {'penalty':['l2'], 'C':[0.01, 0.1, 1, 5, 10, 100]}]
scores = ['accuracy']

for score in scores:
    clf = GridSearchCV(LogisticRegression(),
                      param_grid,
                      cv=5,
                      scoring=score)
    clf.fit(vectors_train[:500,:], target_train[:500])
    print("Best parameters are: ", clf.best_params_)
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print(mean, std * 2, params)
        

('Best parameters are: ', {'penalty': 'l2', 'C': 1})
(0.506, 0.0040009999750162395, {'penalty': 'l1', 'C': 0.01})
(0.506, 0.0040009999750162395, {'penalty': 'l1', 'C': 0.1})
(0.676, 0.07936762767043122, {'penalty': 'l1', 'C': 1})
(0.726, 0.058965838389780374, {'penalty': 'l1', 'C': 5})
(0.724, 0.036226090500677814, {'penalty': 'l1', 'C': 10})
(0.736, 0.038933250311244026, {'penalty': 'l1', 'C': 100})
(0.548, 0.0247688302003738, {'penalty': 'l2', 'C': 0.01})
(0.756, 0.08472858014406548, {'penalty': 'l2', 'C': 0.1})
(0.78, 0.09214120142531253, {'penalty': 'l2', 'C': 1})
(0.778, 0.10446326080151688, {'penalty': 'l2', 'C': 5})
(0.778, 0.11194058544364931, {'penalty': 'l2', 'C': 10})
(0.762, 0.11083482320915015, {'penalty': 'l2', 'C': 100})


In [55]:
y_true, y_pred = target_test, clf.predict(vectors_test)
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

      False       0.78      0.76      0.77     25810
       True       0.76      0.79      0.77     25166

avg / total       0.77      0.77      0.77     50976

