In [31]:
import pandas as pd
import json
import string
import os
import re

import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn import metrics
import numpy as np

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

pd.options.mode.chained_assignment = None

DO NOT RUN THE CELL BELOW (it will take forever to compile). Details on functionality included in the cell.

In [3]:
####################################### DONT RUN THIS CELL #################################################################

### This cell loads in the two JSON files, pulls out all the mexican restaurants from the business JSON and gathers all the
## reviews that mention that restaurant. This takes a long time to compile, so I pickled the two DFs to avoid having to
# reacess the massive JSON file.

business=pd.read_json('business.json',lines='True')
mex = business.categories.str.contains('Mexican', na=False)
tex_mex = business.categories.str.contains('Tex-Mex', na=False)
taqs = business.loc[mex | tex_mex]
taqs_dict = {}
taqs_id = []
for index,data in taqs.iterrows():
    taqs_dict[data['business_id']] = data['name']
    taqs_id.append(data['business_id'])
reviews = []
with open('review.json') as fp:
    for line in fp:
        comment = json.loads(line) 
        reviews.append(comment)
    fp.close()
mexican_reviews = [rev for rev in reviews if rev["business_id"] in taqs]  
reviews_df = pd.DataFrame(mexican_reviews)
reviews_df['Restaurant Name']=reviews_df['business_id'].map(taq_dict)
reviews_df['reviews_length'] = reviews_df['text'].apply(len)

reviews_df.to_pickle('./mexican_reviews.pkl')   
business.to_pickle('./business_info.pkl')

## Here on will load the pickled versions of the dataframe created above, perform manipulations and repickle...
reviews_path = 'C:/Users/nhcam/Desktop/Springboard/Yelp Burrito Reviews Project/Yelp_Project_Data/mexican_reviews.pkl'
business_path = 'C:/Users/nhcam/Desktop/Springboard/Yelp Burrito Reviews Project/Yelp_Project_Data/business_info.pkl'

reviews_df = pd.read_pickle(reviews_path)
business_df = pd.read_pickle(business_path)


#Adding city and state data to the reviews
city = {}
state = {}
for index,data in tqdm(business_df.iterrows()):
    city[data['business_id']] = data['city']
    state[data['business_id']] = data['state']
reviews_df['city'] = reviews_df['business_id'].map(city)
reviews_df['state'] = reviews_df['business_id'].map(state)


#Adding region to reviews, dropping any row without a US State (there are british and canadian cities included)
US_states = ['AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL',
             'IN','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT',
             'NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI',
             'SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY']
cali = ['CA']
west = ['WA','OR','NV','ID','AK','HI','MT','UT','CO','WY','NM','AZ']
midwest = ['ND','SD','NE','KS','OK','MN','IA','MO','WI','MI','IL','IN']
south = ['LA','MS','AL','TN','NC','SC','GA','FL','TX','AR']
noreast = ['KY','OH','WV','PA','MD','DE','NJ','NY','CT','RI','MA','VA','NH','ME','VT']
states = [cali,west,midwest,south,noreast]
regions = ['California','West','Midwest','South','Northeast']
regions_dict={}
i = 0
for each in tqdm(states):
    for indiv in each:
        regions_dict[indiv] = regions[i]
    i += 1
reviews_df['region'] = reviews_df['state'].map(regions_dict)
reviews_df = reviews_df.dropna(axis=0)

#Mapping labels to stars
stars_dict = {5:'Good',4:'Good',3:'Neutral/Bad',2:'Neutral/Bad',1:'Neutral/Bad'}
reviews_df['Good/Neutral/Bad'] = reviews_df['stars'].map(stars_dict)

#Word tokenize review text and remove punctuation
reviews_df = tokenize_and_clean(reviews_df)

#Pull out reviews that mention burritos into new df and drop those rows from the reviews df
burrito_mention = reviews_df.loc[reviews_df['text'].str.contains('burrito',case=False, regex = False)]
reviews_df = reviews_df.loc[~reviews_df['text'].str.contains('burrito',case=False, regex = False)]

#Find sentiment of burrito mentions df
burrito_mention = find_burrito_sentences_get_sentiment(burrito_mention)

#Set and sort index
reviews_df.set_index(['business_id','Restaurant Name'],inplace=True)
reviews_df.sort_index(inplace=True)
burrito_mention.set_index(['business_id', 'Restaurant Name'],inplace=True)
burrito_mention.sort_index(inplace=True)

#Pickle dataframes
reviews_df.to_pickle('./reviews_df.pkl')
burrito_mention.to_pickle('./burrito_mentions.pkl')

In [2]:
## Functions needed for throughout analysis

def remove_stopwords_punc(i1):
    """
    Using nltk library's stop words and string.punctuation, this removes them from a tokenized list. Use with df.apply()
    """
    
    stop_words = list(stopwords.words('english'))+list(string.punctuation)
    minus_stops = [w for w in i1 if w not in stop_words]
    return minus_stops

def tokenize_and_clean(df):
    """
    Using nltk word tokenize in df.apply() and cleaning using function above.
    """
    
    df['text'].str.lower()
    df['tokenized_text'] = df['text'].apply(word_tokenize)
    df['tokenized_text_cleaned'] = df['tokenized_text'].apply(remove_stopwords_punc)
    return df

def dummy(doc):
    """
    Used as a placeholder when instantiating a vectorizer as preprocessing and tokenizing already done. Does nothing.
    """
    return doc

def split_sentences_return_burrito(st):
    """Checks for occurence of punctuation that allows splitting of sentences, if found it splits the text into sentences
    and then pulls out the sentence that has the word 'burrito' in the sentence.
    """
    if '.' in st:
        sentences = re.split(r'[.?!]\s*', st)
        sentences_lower = [sentence.lower() for sentence in sentences]
        burrito_sentence = [sentence for sentence in sentences_lower if 'burrito' in sentence]
    else:
        burrito_sentence = 'Punctuation lacking'
    return burrito_sentence 

def apply_sentiment_intensity_analysis(sentence):
    """Applies the polarity scores function to a sentence. Used with df.apply(), returns dictionary. 
    """
    analyzer = SentimentIntensityAnalyzer()
    polarity_dict = analyzer.polarity_scores(sentence)
    return polarity_dict

def find_burrito_sentences_get_sentiment(df):
    """
    Takes the previous two functions and applies them to a dataframe. Finds burrito sentence then gets its polarity scores.
    Makes a dataframe with all the individual polarity dictionaries.
    """
    df.reset_index(inplace=True)
    df['burrito_sentences'] = df['text'].apply(split_sentences_return_burrito)
    polarities_list = []
    for indexes, data in df.iterrows():
        polarity_dict = apply_sentiment_intensity_analysis(str(data['burrito_sentences']))
        polarities_list.append(polarity_dict)
    polarities_df = pd.DataFrame(polarities_list)
    df = pd.concat([df, polarities_df],axis=1)
    return df

def n_grams(text):
    """
    Applies ngram function returns bigram. Used with df.apply().
    """
    two_gram = ngrams(text,2)
    return list(two_gram)

Load in the two preprocessed dataframes from their pickled files and apply the n_grams function to return bigrams.

In [3]:
all_reviews_path = 'C:/Users/nhcam/Desktop/Springboard/Yelp Burrito Reviews Project/Yelp_Project_Data/reviews_df.pkl'
burritos_reviews_path = 'C:/Users/nhcam/Desktop/Springboard/Yelp Burrito Reviews Project/Yelp_Project_Data/burrito_mentions.pkl'
reviews_df = pd.read_pickle(all_reviews_path)
burritos_df = pd.read_pickle(burritos_reviews_path)

burritos_df['bigrams'] = burritos_df['tokenized_text_cleaned'].apply(n_grams)
reviews_df['bigrams'] = reviews_df['tokenized_text_cleaned'].apply(n_grams)

Looking at a very simple model first, a simple word tokenized, stop word removed, count vecotrized matrix to predict the binary review target. First split the data into a train and test set. Then instantiate a count vectorizer to convert the tokenized training reviews into a corpus. Finally fit the vectorizer to this corpus. This returns a vocabulary of about 163,000 words (with this split). Wow!

In [96]:
X = list(reviews_df['tokenized_text_cleaned'])
y = reviews_df['Good/Neutral/Bad']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
vectorizer = CountVectorizer(tokenizer=dummy,preprocessor=dummy)

vectorizer.fit(X_train)
len(vectorizer.vocabulary_)

163455

Next we'll transform the training and testing set into a sparse matrix of word counts using the vectorizer's transform method.

In [5]:
transformed_X_train = vectorizer.transform(X_train)
transformed_X_test = vectorizer.transform(X_test)

Next we can train a model on our training set and see how this simple model performs!

In [6]:
mnb = MultinomialNB()
mnb.fit(transformed_X_train,y_train)
y_pred = mnb.predict(transformed_X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        Good       0.90      0.93      0.91     53448
 Neutral/Bad       0.86      0.80      0.83     28785

   micro avg       0.88      0.88      0.88     82233
   macro avg       0.88      0.87      0.87     82233
weighted avg       0.88      0.88      0.88     82233



This simple baseline model generalizes pretty well to the testing set! However, it is probably possible to do better with slightly more advanced methods. Let's look at the same process as above with a TFIDF Vectorizer. This will help us weight features that appear in every document lower and may improve the model's performance. 

In [7]:
tfidf = TfidfVectorizer(tokenizer=dummy,preprocessor=dummy)
tfidf.fit(X_train)
len(tfidf.vocabulary_)

163717

In [8]:
tf_transformed_X_train = vectorizer.transform(X_train)
tf_transformed_X_test = vectorizer.transform(X_test)

In [9]:
tf_mnb = MultinomialNB()
tf_mnb.fit(tf_transformed_X_train,y_train)
tf_y_pred = tf_mnb.predict(tf_transformed_X_test)
print(classification_report(y_test,tf_y_pred))

              precision    recall  f1-score   support

        Good       0.90      0.93      0.91     53448
 Neutral/Bad       0.86      0.80      0.83     28785

   micro avg       0.88      0.88      0.88     82233
   macro avg       0.88      0.87      0.87     82233
weighted avg       0.88      0.88      0.88     82233



This model performs exactly the same as the CountVectorizer. This is likely because there are so many unique words in the corpus, especially with the stop words being removed in preprocessing, that the term frequency weighting doesn't add any useful information. The next thing I would like to try is using n-grams instead of just words. With just a word representation we are definitely losing information. Let's look at a CountVectorizer representation starting with bigrams.

In [97]:
bigrams_X = list(reviews_df['bigrams'])
y = reviews_df['Good/Neutral/Bad']
bigramsX_train,bigramsX_test,bigramsy_train,bigramsy_test = train_test_split(bigrams_X,y,test_size=0.25)

bigrams_vectorizer = CountVectorizer(tokenizer=dummy,preprocessor=dummy)
bigrams_vectorizer.fit(bigramsX_train)
len(bigrams_vectorizer.vocabulary_)

3411688

In [13]:
bigramsXtrain_transformed = bigrams_vectorizer.transform(bigramsX_train)
bigramsXtest_transformed = bigrams_vectorizer.transform(bigramsX_test)

In [14]:
mnb_bigrams = MultinomialNB()
mnb_bigrams.fit(bigramsXtrain_transformed,bigramsy_train)
bigrams_pred = mnb_bigrams.predict(bigramsXtest_transformed)
print(classification_report(bigramsy_test,bigrams_pred))

              precision    recall  f1-score   support

        Good       0.90      0.96      0.93     53424
 Neutral/Bad       0.92      0.79      0.85     28809

   micro avg       0.90      0.90      0.90     82233
   macro avg       0.91      0.88      0.89     82233
weighted avg       0.90      0.90      0.90     82233



The vocabulary with the bigrams model has ballooned in size to 3.5 million features. That's a whole heck of a lot. However, using bigrams instead of just words increases the performance of the model by a fair amount. The next piece we'll have to look at is feature reduction because we certainly don't need all 3.5 million bigrams. First we'll run a chi square test between the transformed vectors and the target variable. Then I'll pull out the actual bigrams by name and associate them with their pvalues in a dataframe. The statistically significant pvalues at the 0.05 level will be filtered and pulled out.

In [16]:
chi2,pval = chi2(bigramsXtrain_transformed,bigramsy_train)
bigrams = list(bigrams_vectorizer.vocabulary_.keys())
order = list(bigrams_vectorizer.vocabulary_.values())

bigram_pvalues = pd.DataFrame({'bigram':bigrams},index=order).sort_index()
bigram_pvalues['chi2'] = chi2
bigram_pvalues['pval'] = pval

In [92]:
significant = bigram_pvalues[bigram_pvalues.pval <= 0.05]
sig_bigrams = significant['bigram']
len(sig_bigrams)

150709

Of the 3.4 million bigrams, only 150,709 are significant at a p-value of 0.05. Below I've tested different levels of significance and feature exclusion to see which level has the best performance. It is 0.05.

In [106]:
pvalues = [0.05, 0.0005, 0.000005, 0.00000005, 0.0000000005]
for pval in pvalues:
    significant = bigram_pvalues[bigram_pvalues.pval <= pval]
    sig_bigrams = significant['bigram']
    sig_vect = CountVectorizer(tokenizer=dummy,preprocessor=dummy,vocabulary=sig_bigrams)
    sig_Xtrain_transformed = sig_vect.transform(bigramsX_train)
    sig_Xtest_transformed = sig_vect.transform(bigramsX_test)
    mnb_sig = MultinomialNB(alpha=0.1)
    mnb_sig.fit(sig_Xtrain_transformed,bigramsy_train)
    sig_pred = mnb_sig.predict(sig_Xtest_transformed)
    print('Classification Report for a p-value of:', pval)
    print(classification_report(bigramsy_test,sig_pred))

Classification Report for a p-value of: 0.05
              precision    recall  f1-score   support

        Good       0.93      0.94      0.94     53172
 Neutral/Bad       0.89      0.87      0.88     29061

   micro avg       0.92      0.92      0.92     82233
   macro avg       0.91      0.91      0.91     82233
weighted avg       0.92      0.92      0.92     82233

Classification Report for a p-value of: 0.0005
              precision    recall  f1-score   support

        Good       0.92      0.93      0.93     53172
 Neutral/Bad       0.88      0.85      0.86     29061

   micro avg       0.90      0.90      0.90     82233
   macro avg       0.90      0.89      0.89     82233
weighted avg       0.90      0.90      0.90     82233

Classification Report for a p-value of: 5e-06
              precision    recall  f1-score   support

        Good       0.91      0.93      0.92     53172
 Neutral/Bad       0.87      0.84      0.85     29061

   micro avg       0.90      0.90      0.90 

So this method of only including the 150,709 most significant bigrams, a reduction of 96%, doesn't change the predictive strength of the model at all. It does change the relationship between precision and recall for both classes, but actually increases the f1-score for predicting neutral and bad reviews. Overall this is a better model because it is far simpler, faster and more predictive.

Now I'll do a grid search for the best value of the the hyperparameter alpha (0.1) and then try out my model on the burrito set with the vocabulary of bigrams with p <= 0.05.

In [None]:
grid_mnb = MultinomialNB()
params = {'alpha': [0.01,0.1,1,10]}
mnb_grid = GridSearchCV(grid_mnb, param_grid=params,scoring='accuracy')
mnb_grid.fit(sig_Xtrain_transformed,bigramsy_train)
print(mnb_grid.best_params_)

In [105]:
significant = bigram_pvalues[bigram_pvalues.pval <= 0.05]
sig_bigrams = significant['bigram']
sig_vect = CountVectorizer(tokenizer=dummy,preprocessor=dummy,vocabulary=sig_bigrams)
sig_Xtrain_transformed = sig_vect.transform(bigramsX_train)
mnb_sig = MultinomialNB(alpha=0.1)
mnb_sig.fit(sig_Xtrain_transformed,bigramsy_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [107]:
# First remove all rows that I wasn't able to parse the burrito sentence out.
burritos_df = burritos_df[burritos_df['burrito_sentences'] != 'Punctuation lacking']

# Then transform the tokenized text into vectors so I can feed it into the model and get predictions.
burrito_bigrams = list(burritos_df['bigrams'])
burrito_X = sig_vect.transform(burrito_bigrams)
y = burritos_df['Good/Neutral/Bad']
burrito_pred = mnb_sig.predict(burrito_X)
burritos_df['Prediction'] = burrito_pred
print(classification_report(y,burrito_pred))

              precision    recall  f1-score   support

        Good       0.89      0.90      0.90     31926
 Neutral/Bad       0.83      0.81      0.82     18455

   micro avg       0.87      0.87      0.87     50381
   macro avg       0.86      0.86      0.86     50381
weighted avg       0.87      0.87      0.87     50381



Performs pretty similarly on this burrito set. Not surprising but good to see it genralizes well.

In [307]:
# Get all unique burrito restaurants and intialize a dictionary

unique_restaurants = burritos_df['Restaurant Name'].unique()
name_sentiment_dict = {}

# iterate over the unique restaurants, take the ones with more than 10 reviews and put their burrito score in a dictionary
for restaurant in unique_restaurants:
    given_res = burritos_df.loc[burritos_df['Restaurant Name'] == restaurant]
    if len(given_res) > 10:
        mean = given_res['compound'].mean()
        name_sentiment_dict[restaurant] = mean
    else:
        pass

# Push dictionary to dataframe, sort by sentiment score and then rename columns for joining operation    
undervalued_burritos = pd.DataFrame.from_dict(name_sentiment_dict,orient = 'index', columns = ['Mean Sentiment Score'])
undervalued_burritos.sort_values(by=['Mean Sentiment Score'],ascending = False,inplace=True)
undervalued_burritos = undervalued_burritos.reset_index()
undervalued_burritos.columns = ['Restaurant Name','Mean Sentiment Score']

# Take restaurant name and stars, group by restaurant name and aggregate to get average star rating and total reviews > 10
burrito_stars = burritos_df[['Restaurant Name','stars']].groupby(['Restaurant Name']).agg(['mean','count'])
burrito_stars = burrito_stars.loc[burrito_stars['stars']['count'] > 10 ]
#burrito_stars = burrito_stars.sort_values(('stars','mean'), ascending=False)

# Then join on restaurant name to see how burrito sentiment maps to average star rating
undervalued_burritos = undervalued_burritos.join(burrito_stars, on='Restaurant Name')
undervalued_burritos.columns = ['Restaurant Name','Mean Sentiment Score','Mean Stars','Review Count']
undervalued_burritos.head(10)



Unnamed: 0,Restaurant Name,Mean Sentiment Score,Mean Stars,Review Count
0,Rosarita's Beach,0.508306,3.375,16
1,Mexquite Mexican Eatery,0.456758,4.0,12
2,Betitos Mexican Food,0.455457,4.285714,14
3,Super B Burrito,0.454376,3.644444,135
4,Burrito's Juarez,0.449913,3.6,15
5,Los Picos Parrilla Restaurant,0.446238,3.769231,13
6,Mexico City,0.433089,3.388889,18
7,Chronic Cantina,0.427884,3.684211,19
8,Jose' and Tony's Mexican Restaurant,0.423275,3.25,12
9,Amigos Tacos,0.417572,3.916667,36


In [256]:
# Find highest starred restaurants with burrito reviews, same process as above

#burritos_df.reset_index(inplace=True)
unique_restaurants = burritos_df['Restaurant Name'].unique()
name_sentiment_dict = {}

# iterate over the unique restaurants, take the ones with more than 10 reviews and put their burrito score in a dictionary
for restaurant in unique_restaurants:
    given_res = burritos_df.loc[burritos_df['Restaurant Name'] == restaurant]
    if len(given_res) > 10:
        mean = given_res['stars'].mean()
        name_sentiment_dict[restaurant] = mean
    else:
        pass
starred_burritos = pd.DataFrame.from_dict(name_sentiment_dict,orient = 'index', columns = ['Stars'])
starred_burritos.head()
starred_burritos.sort_values(by=['Stars'],ascending = False,inplace=True)
starred_burritos = starred_burritos.reset_index()
starred_burritos.columns = ['Restaurant Name','Mean Stars']
starred_burritos.head(10)

Unnamed: 0,Restaurant Name,Mean Stars
0,Pollos LaChuya,4.916667
1,Cocina Madrigal,4.875
2,Garden Grill,4.846154
3,El Frescos Cocina Mexicana,4.8125
4,Kiss Pollos Estilo Sinaloa,4.769231
5,La Purisima Bakery,4.75
6,El Cordobes,4.75
7,Del Yaqui,4.727273
8,Humberto's Mexican Food,4.727273
9,Tacos Kissi,4.714286


In [239]:
# Same process as above but with the whole reviews dataframe

stars_by_place = reviews_df[['Restaurant Name','stars']].groupby(['Restaurant Name']).agg(['mean','count'])
stars_by_place = stars_by_place.loc[stars_by_place['stars']['count'] > 10 ]
stars_by_place = stars_by_place.sort_values(('stars','mean'), ascending=False)
stars_by_place.head(10)

Unnamed: 0_level_0,stars,stars
Unnamed: 0_level_1,mean,count
Restaurant Name,Unnamed: 1_level_2,Unnamed: 2_level_2
Geo's tacos,5.0,24
Taqueria La Herradura,5.0,13
Chicali Tacos,5.0,16
Casa De Falafel,4.955556,90
La Bamba Mexican Grill Restaurant,4.95122,41
El Pollito Charro,4.935484,31
Fernando's Food,4.933333,30
Bajamar Seafood & Tacos,4.921986,282
Cocina Madrigal,4.916667,120
Taco San Francisco,4.894737,19
