# Yelp Data Challenge - NLP

BitTiger DS501

Jun 2017

In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('dataset/last_2_years_restaurant_reviews.csv')

In [4]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Cajun/Creole, Steakhouses, Restaurants]",4.0,0,2016-03-31,0,6SgvNWJltnZhW7duJgZ42w,5,This is mine and my fiancé's favorite steakhou...,0,oFyOUOeGTRZhFPF9uTqrTQ
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Cajun/Creole, Steakhouses, Restaurants]",4.0,0,2016-02-10,0,UxFpgng8dPMWOj99653k5Q,5,Truly Fantastic! Best Steak ever. Service was...,0,aVOGlN9fZ-BXcbtj6dbf0g
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Cajun/Creole, Steakhouses, Restaurants]",4.0,0,2017-02-14,0,Xp3ppynEvVu1KxDHQ3ae8w,5,Delmonico Steakhouse is a steakhouse owned by ...,0,KC8H7qTZVPIEnanw9fG43g
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Cajun/Creole, Steakhouses, Restaurants]",4.0,1,2017-05-28,0,LEzphAnz0vKE32PUCbjLgQ,4,One of the top steak places I've had in Vegas ...,2,3RTesI_MAwct13LWm4rhLw
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Cajun/Creole, Steakhouses, Restaurants]",4.0,0,2017-08-25,0,4e-cxYVdlIu2ZDxVJqUfOQ,5,This place is superb from the customer service...,0,EAOt1UQhJD0GG3l_jv7rWA


### Define your feature variables, here is the text of the review

In [5]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents = df['text'].values

In [7]:
# inspect your documents, e.g. check the size, take a peek at elements of the numpy array
documents.dtype, documents.shape
documents[100]

'Still my favorite steakhouse so far!  Ribeye amazing, spinach and au gratin well done.  Great service and they welcomed us back.  Still deserves a 5 star on all points from food to service to ambiance.'

### Define your target variable (any categorical variable that may be meaningful)

#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating

In [8]:
# Make a column and take the values, save to a variable named "target"
df['favorable'] = df['stars'] > 4
target = df['favorable'].values
target[:10]

array([ True,  True,  True, False,  True,  True,  True, False, False,
       False])

#### You may want to look at the statistic of the target variable

In [9]:
# To be implemented
target.mean(), target.std

(0.4782396579185261, <function std>)

In [10]:
documents.shape, target.shape

((348455,), (348455,))

## Let's create training dataset and test dataset

In [13]:
from sklearn.cross_validation import train_test_split



In [None]:
# Documents is your X, target is your y
# Now split the data to training set and test set

In [14]:
# Split to documents_train, documents_test, target_train, target_test
documents_train, documents_test, target_train, target_test = train_test_split(
    documents,
    target,
    test_size = 0.3,
    random_state = 7
)

## Let's get NLP representation of the documents

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [24]:
# Train the model with your training data
vectors_train = vectorizer.fit_transform(documents_train).toarray()

In [25]:
# Get the vocab of your tfidf
words = vectorizer.get_feature_names()

In [26]:
# Use the trained model to transform your test data
vectors_test = vectorizer.transform(documents_test).toarray()

## Similar review search engine

In [27]:
import numpy as np

# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    pass  # To be implemented


In [28]:
# Let's use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# Draw an arbitrary review from test (unseen in training) documents
some_random_number = 7
search_query = documents_test[some_random_number]
search_queries = [search_query]

In [30]:
# Transform the drawn review(s) to vector(s)
vector_search_queries = vectorizer.transform(search_queries).toarray()

In [31]:
# Calculate the similarity score(s) between vector(s) and training vectors
similarity_scores = cosine_similarity(vector_search_queries, vectors_train)

In [32]:
# Let's find top 5 similar reviews
n = 5
returned_reviews = get_top_values(similarity_scores[0], n, documents_train)

In [33]:
print('Our search query:')
print(search_queries[0]) # To be added

Our search query:
Seriously the best Japanese Steakhouse this fat boy has ever been to. If you're lucky enough to sit at the table when the Owner is cooking you're in for a real treat. All the chefs make custom sauces for your meal and each are a highlight. Must stop destination in Las Vegas.


In [34]:
print('Most %s similar reviews:' % n)
print(returned_reviews[0])  # To be added

Most 5 similar reviews:
Best local destination hands down in Las Vegas. Great food, great atmosphere, great service, and the best great drinks! From the staple lobster pho to the dinosaur bone barrow soup dish this place has it all. Don't think this is your regular pho destination! This place has so many more to offer! Chef Khai's cooking and presentations are one of the most creative I've seen here in Las Vegas. Also great place to watch your favorite NBA, Football, baseball, or basketball games! Stop on in...You won't be disappointed!


#### Q: Does the result make sense to you?

A: Yes, it makes sense.

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [35]:
# Build a Naive-Bayes Classifier

from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB()
model_nb.fit(vectors_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [36]:
# Get score for training set
model_nb.score(vectors_train, target_train)

0.8117318115104256

In [37]:
# Get score for test set
model_nb.score(vectors_test, target_test)

0.8108899241416915

#### Logistic Regression Classifier

In [38]:
# Build a Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression

model_lrc = LogisticRegression()
model_lrc.fit(vectors_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [39]:
# Get score for training set
model_lrc.score(vectors_train, target_train)

0.8421887683565789

In [40]:
# Get score for test set
model_lrc.score(vectors_test, target_test)

0.8358667266135436

#### Q: What are the key features(words) that make the positive prediction?

In [47]:
# Let's find it out by ranking
n = 20
get_top_values(model_lrc.coef_[0], n, words)

[u'amazing',
 u'best',
 u'awesome',
 u'perfection',
 u'thank',
 u'perfect',
 u'incredible',
 u'delicious',
 u'phenomenal',
 u'fantastic',
 u'heaven',
 u'highly',
 u'excellent',
 u'great',
 u'favorite',
 u'gem',
 u'impeccable',
 u'love',
 u'perfectly',
 u'outstanding']

A: Listed as above.

#### Q: What are the key features(words) that make the negative prediction?

In [51]:
# Let's find it out by ranking
n = 20
get_bottom_values(model_lrc.coef_[0], n, words)

A: (insert your comments here)

#### Random Forest Classifier

In [54]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

model_rfc = RandomForestClassifier(max_depth=20,
                                  n_estimators = 50,
                                  min_samples_leaf = 10,
                                  n_jobs = -1)
model_rfc.fit(vectors_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [55]:
# Get score for training set
model_rfc.score(vectors_train, target_train)

0.7973704277667085

In [56]:
# Get score for test set
model_rfc.score(vectors_test, target_test)

0.7893664444168094

#### Q: What do you see from the training score and the test score?

A: The model performances on training and test data are comparable, with test accuracy slightly lower than training accuracy.

#### Q: Can you tell what features (words) are important by inspecting the RFC model?

In [58]:
n = 20
get_top_values(model_rfc.feature_importances_, n, words)

[u'amazing',
 u'best',
 u'great',
 u'delicious',
 u'ok',
 u'wasn',
 u'didn',
 u'awesome',
 u'vegas',
 u'bad',
 u'rude',
 u'love',
 u'highly',
 u'minutes',
 u'worst',
 u'pretty',
 u'friendly',
 u'excellent',
 u'good',
 u'asked']

## Extra Credit #1: Use cross validation to evaluate your classifiers

[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)

In [59]:
# To be implemented
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model_lrc,
                           vectors_train,
                           target_train,
                           cv = 5,
                           scoring = 'accuracy')
cv_scores

array([0.83257149, 0.83314201, 0.83367157, 0.83678741, 0.83311809])

## Extra Credit #2: Use grid search to find best predictable classifier


[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)

In [63]:
# To be implemented
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

param_grid = [{'penalty':['l1'], 'C':[0.01, 0.1, 1, 5, 10, 100]},
             {'penalty':['l2'], 'C':[0.01, 0.1, 1, 5, 10, 100]}]
scores = ['accuracy']

for score in scores:
    clf = GridSearchCV(LogisticRegression(),
                      param_grid,
                      cv=5,
                      scoring=score)
    clf.fit(vectors_train[:500,:], target_train[:500])
    print(clf.best_params_)
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print(mean, std * 2, params)
        
    y_true, y_pred = target_test, clf.predict(vectors_test)

{'penalty': 'l2', 'C': 10}
(0.546, 0.004080204015301296, {'penalty': 'l1', 'C': 0.01})
(0.546, 0.004080204015301296, {'penalty': 'l1', 'C': 0.1})
(0.688, 0.0781342184680282, {'penalty': 'l1', 'C': 1})
(0.75, 0.08530134601495568, {'penalty': 'l1', 'C': 5})
(0.76, 0.03603556376949889, {'penalty': 'l1', 'C': 10})
(0.738, 0.042431564238952835, {'penalty': 'l1', 'C': 100})
(0.546, 0.004080204015301296, {'penalty': 'l2', 'C': 0.01})
(0.554, 0.015725874548432304, {'penalty': 'l2', 'C': 0.1})
(0.758, 0.07043329954622668, {'penalty': 'l2', 'C': 1})
(0.762, 0.07403246369376118, {'penalty': 'l2', 'C': 5})
(0.764, 0.07976788007380635, {'penalty': 'l2', 'C': 10})
(0.764, 0.08092493611074689, {'penalty': 'l2', 'C': 100})
