In [24]:
import pandas as pd
import json
import string
import os
import re

import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

pd.options.mode.chained_assignment = None

DO NOT RUN THE CELL BELOW (it takes a very long time to run). Details on what it does are included in the comments inside the cell.

In [3]:
####################################### DONT RUN THIS CELL #################################################################

### This cell loads the two JSON files, pulls all the Mexican / Tex-Mex restaurants out of the business JSON and gathers
## every review written for one of those restaurants. It takes a long time to run, so the two DataFrames are pickled
# at the end to avoid having to re-access the massive JSON files.

# `lines` expects a boolean; the string 'True' only worked because any non-empty string is truthy.
business = pd.read_json('business.json', lines=True)
mex = business.categories.str.contains('Mexican', na=False)
tex_mex = business.categories.str.contains('Tex-Mex', na=False)
taqs = business.loc[mex | tex_mex]

# Ids of the selected restaurants and a business_id -> restaurant name lookup.
taqs_id = taqs['business_id'].tolist()
taqs_dict = dict(zip(taqs_id, taqs['name']))

# Keep only the reviews of the selected restaurants.
# Membership has to be tested against the ids: `x in taqs` on a DataFrame looks at the column labels,
# so the previous filter never matched a review. A set makes each lookup O(1), and filtering while
# streaming avoids holding the full review file in memory.
taqs_id_set = set(taqs_id)
mexican_reviews = []
with open('review.json', encoding='utf-8') as fp:  # the `with` block closes the file; no explicit close() needed
    for line in fp:
        comment = json.loads(line)
        if comment["business_id"] in taqs_id_set:
            mexican_reviews.append(comment)

reviews_df = pd.DataFrame(mexican_reviews)
# Was `taq_dict`, an undefined name that raised NameError.
reviews_df['Restaurant Name'] = reviews_df['business_id'].map(taqs_dict)
reviews_df['reviews_length'] = reviews_df['text'].apply(len)

reviews_df.to_pickle('./mexican_reviews.pkl')
business.to_pickle('./business_info.pkl')

## From here on: load the pickled DataFrames created above, enrich and clean them, then re-pickle the results.
## NOTE: tokenize_and_clean and find_burrito_sentences_get_sentiment are defined in a later cell of this notebook;
## that cell has to be executed before this code can run.
reviews_path = 'C:/Users/nhcam/Desktop/Springboard/Yelp Burrito Reviews Project/Yelp_Project_Data/mexican_reviews.pkl'
business_path = 'C:/Users/nhcam/Desktop/Springboard/Yelp Burrito Reviews Project/Yelp_Project_Data/business_info.pkl'

reviews_df = pd.read_pickle(reviews_path)
business_df = pd.read_pickle(business_path)


#Adding city and state data to the reviews
# Lookup tables are built column-wise; iterating the whole business table with iterrows() was needlessly slow.
city = dict(zip(business_df['business_id'], business_df['city']))
state = dict(zip(business_df['business_id'], business_df['state']))
reviews_df['city'] = reviews_df['business_id'].map(city)
reviews_df['state'] = reviews_df['business_id'].map(state)


#Adding region to reviews, dropping any row without a US State (there are british and canadian cities included)
US_states = ['AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL',
             'IN','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT',
             'NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI',
             'SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY']  # not referenced in this cell; kept for later use
cali = ['CA']
west = ['WA','OR','NV','ID','AK','HI','MT','UT','CO','WY','NM','AZ']
midwest = ['ND','SD','NE','KS','OK','MN','IA','MO','WI','MI','IL','IN']
south = ['LA','MS','AL','TN','NC','SC','GA','FL','TX','AR']
noreast = ['KY','OH','WV','PA','MD','DE','NJ','NY','CT','RI','MA','VA','NH','ME','VT']
states = [cali,west,midwest,south,noreast]
regions = ['California','West','Midwest','South','Northeast']
# state abbreviation -> region name; `regions` and `states` are parallel lists.
regions_dict = {abbrev: region for region, members in zip(regions, states) for abbrev in members}
reviews_df['region'] = reviews_df['state'].map(regions_dict)
# Non-US states map to NaN in 'region' and are removed here.
# NOTE(review): dropna() without `subset` also removes rows with a missing value in ANY other column.
reviews_df = reviews_df.dropna(axis=0)

#Mapping labels to stars
stars_dict = {5:'Good',4:'Good',3:'Neutral/Bad',2:'Neutral/Bad',1:'Neutral/Bad'}
reviews_df['Good/Neutral/Bad'] = reviews_df['stars'].map(stars_dict)

#Word tokenize review text and remove punctuation
reviews_df = tokenize_and_clean(reviews_df)

#Pull out reviews that mention burritos into new df and drop those rows from the reviews df
# The mask is computed once and reused for both halves of the split.
mentions_burrito = reviews_df['text'].str.contains('burrito', case=False, regex=False)
burrito_mention = reviews_df.loc[mentions_burrito]
reviews_df = reviews_df.loc[~mentions_burrito]

#Find sentiment of burrito mentions df
burrito_mention = find_burrito_sentences_get_sentiment(burrito_mention)

#Set and sort index
reviews_df = reviews_df.set_index(['business_id', 'Restaurant Name']).sort_index()
burrito_mention = burrito_mention.set_index(['business_id', 'Restaurant Name']).sort_index()

#Pickle dataframes
reviews_df.to_pickle('./reviews_df.pkl')
burrito_mention.to_pickle('./burrito_mentions.pkl')

In [6]:
## Helper functions used throughout the analysis

# Lazily built set of stop words + punctuation marks, shared by every call of remove_stopwords_punc.
_STOP_TOKENS = None


def remove_stopwords_punc(i1):
    """Drop English stop words and punctuation marks from a list of tokens.

    Intended for use with ``df.apply()`` on a column of tokenized reviews.

    Parameters
    ----------
    i1 : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The tokens of ``i1`` that are neither in nltk's English stop-word list
        nor a character of ``string.punctuation``, in their original order.

    Notes
    -----
    The comparison is exact (case-sensitive), as before.
    """
    global _STOP_TOKENS
    if _STOP_TOKENS is None:
        # Built once instead of on every call: this function runs once per review, and
        # re-reading the stop-word corpus plus scanning a list per token dominated the runtime.
        _STOP_TOKENS = frozenset(stopwords.words('english')) | frozenset(string.punctuation)
    return [w for w in i1 if w not in _STOP_TOKENS]

def tokenize_and_clean(df):
    """Add tokenized and cleaned-token columns to ``df`` and return it.

    Two columns are written onto the frame that was passed in:

    * ``tokenized_text`` - ``word_tokenize`` applied to every entry of ``text``
    * ``tokenized_text_cleaned`` - ``remove_stopwords_punc`` applied to those tokens

    The same (mutated) DataFrame object is returned.
    """
    tokens = df['text'].apply(word_tokenize)
    df['tokenized_text'] = tokens
    df['tokenized_text_cleaned'] = tokens.apply(remove_stopwords_punc)
    return df

def dummy(doc):
    """Identity function: return ``doc`` exactly as received.

    Handed to CountVectorizer as both tokenizer and preprocessor so that
    documents which are already lists of tokens pass through unchanged.
    """
    return doc

def split_sentences_return_burrito(st):
    """Return the lower-cased sentences of ``st`` that mention 'burrito'.

    The text is split on '.', '?' and '!' (plus any whitespace that follows).

    Parameters
    ----------
    st : str
        Raw review text.

    Returns
    -------
    list of str or str
        A list of the lower-cased sentences containing 'burrito' (possibly empty).
        If ``st`` contains none of the sentence terminators '.', '?' or '!', the
        sentinel string 'Punctuation lacking' is returned instead.
    """
    # The guard now looks for the same terminators the split uses. It used to test for '.' only,
    # so reviews punctuated solely with '!' or '?' were wrongly reported as lacking punctuation.
    if re.search(r'[.?!]', st) is None:
        return 'Punctuation lacking'
    sentences_lower = [sentence.lower() for sentence in re.split(r'[.?!]\s*', st)]
    return [sentence for sentence in sentences_lower if 'burrito' in sentence]

# Shared SentimentIntensityAnalyzer instance, created on first use.
_SENTIMENT_ANALYZER = None


def apply_sentiment_intensity_analysis(sentence):
    """Score ``sentence`` with VADER's SentimentIntensityAnalyzer.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    dict
        Whatever ``SentimentIntensityAnalyzer.polarity_scores(sentence)`` returns.
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        # One analyzer is reused for all calls: this function runs once per review, and
        # constructing a fresh analyzer each time repeated its set-up work for every row.
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER.polarity_scores(sentence)

def find_burrito_sentences_get_sentiment(df):
    """Extract the burrito sentences of every review and attach their sentiment scores.

    Steps:

    1. Reset the index of ``df`` in place (the old index becomes a column), giving
       a 0..n-1 index.
    2. Store the result of ``split_sentences_return_burrito`` for each ``text``
       entry in a new ``burrito_sentences`` column (also written onto ``df``).
    3. Score ``str(burrito_sentences)`` of each row with
       ``apply_sentiment_intensity_analysis`` and collect the returned dicts
       into a DataFrame (one column per dict key).

    Returns a new DataFrame made of ``df`` and the score columns side by side.
    """
    df.reset_index(inplace=True)
    df['burrito_sentences'] = df['text'].apply(split_sentences_return_burrito)
    scores = pd.DataFrame(
        [apply_sentiment_intensity_analysis(str(found)) for found in df['burrito_sentences']]
    )
    # Both frames carry the same 0..n-1 index, so the column-wise concat lines up row by row.
    return pd.concat([df, scores], axis=1)

Load in the two preprocessed dataframes from their pickled files.

In [3]:
# Locations of the two preprocessed, pickled DataFrames: all non-burrito reviews and the burrito mentions.
all_reviews_path = 'C:/Users/nhcam/Desktop/Springboard/Yelp Burrito Reviews Project/Yelp_Project_Data/reviews_df.pkl'
burritos_reviews_path = 'C:/Users/nhcam/Desktop/Springboard/Yelp Burrito Reviews Project/Yelp_Project_Data/burrito_mentions.pkl'
# Read both pickles, in the same order as before (reviews first, burrito mentions second).
reviews_df, burritos_df = (
    pd.read_pickle(pickle_path) for pickle_path in (all_reviews_path, burritos_reviews_path)
)

In [7]:
# The reviews are already tokenized and cleaned, so the vectorizer gets identity functions for both
# its preprocessing and its tokenizing step and only has to count the tokens it is given.
text = reviews_df['tokenized_text_cleaned'].tolist()
vectorizer = CountVectorizer(preprocessor=dummy, tokenizer=dummy)
# fit() learns the vocabulary; its return value is kept under the name the following cells use.
bag_of_words = vectorizer.fit(text)

In [101]:
# Turn every tokenized review into a row of token counts; the target is the Good vs Neutral/Bad label.
X = bag_of_words.transform(text)
y = reviews_df['Good/Neutral/Bad']
# Hold out 25% of the reviews for evaluation. The fixed random_state makes the split, and therefore
# the scores reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [102]:
# Train a multinomial Naive Bayes classifier on the training counts, predict the labels of the
# held-out reviews and print per-class precision / recall / F1.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred = mnb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        Good       0.90      0.93      0.91     53263
 Neutral/Bad       0.86      0.80      0.83     28970

   micro avg       0.88      0.88      0.88     82233
   macro avg       0.88      0.86      0.87     82233
weighted avg       0.88      0.88      0.88     82233

