In [1]:
from platform import python_version
print(python_version())

3.7.3


In [59]:
import pandas as pd
import json
import string
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
### This cell loads in the two JSON files, pulls out all the mexican restaurants from the business JSON and gathers all the
## reviews that mention that restaurant. This takes a long time to compile, so I pickled the two DFs to avoid having to
# reacess the massive JSON file.

business=pd.read_json('business.json',lines='True')
mex = business.categories.str.contains('Mexican', na=False)
tex_mex = business.categories.str.contains('Tex-Mex', na=False)
taqs = business.loc[mex | tex_mex]
taqs_dict = {}
taqs_id = []
for index,data in taqs.iterrows():
    taqs_dict[data['business_id']] = data['name']
    taqs_id.append(data['business_id'])
reviews = []
with open('review.json') as fp:
    for line in fp:
        comment = json.loads(line) 
        reviews.append(comment)
    fp.close()
mexican_reviews = [rev for rev in reviews if rev["business_id"] in taqs]  
reviews_df = pd.DataFrame(mexican_reviews)
reviews_df['Restaurant Name']=reviews_df['business_id'].map(taq_dict)
reviews_df['reviews_length'] = reviews_df['text'].apply(len)
reviews_df.to_pickle('./mexican_reviews.pkl')   
business.to_pickle('./business_info.pkl')

In [5]:
## Functions needed for throughout analysis

def remove_stopwords_punc(i1):
    ## Using nltk library's stop words and string.punctuation, this removes them from a tokenized list. Use with df.apply()
    
    stop_words = list(stopwords.words('english'))+list(string.punctuation)
    minus_stops = [w for w in i1 if w not in stop_words]
    return minus_stops

def tokenize_and_clean(df):
    ## Using nltk word tokenize in df.apply() and cleaning using function above.
    
    df['tokenized_text'] = df['text'].apply(word_tokenize)
    df['tokenized_text_cleaned'] = df['tokenized_text'].apply(remove_stopwords_punc)
    return df

def dummy(doc):
    return doc

def google_sentiment_analysis_magnitude(text):
    client = language.LanguageServiceClient(credentials=cred)
    text = text
    document = types.Document(content=text,type=enums.Document.Type.PLAIN_TEXT)
    sentiment = client.analyze_sentiment(document=document).document_sentiment
    magnitude = sentiment.magnitude
    return magnitude

def google_sentiment_analysis_score(text):
    client = language.LanguageServiceClient(credentials=cred)
    text = text
    document = types.Document(content=text,type=enums.Document.Type.PLAIN_TEXT)
    sentiment = client.analyze_sentiment(document=document).document_sentiment
    score = sentiment.score
    return score

def apply_sentiment_analysis(df):
    tqdm.pandas()
    df['Sentiment_Score'] = df['text'].progress_apply(google_sentiment_analysis_score)
    df['Sentiment_Magnitude'] = df['text'].progress_apply(google_sentiment_analysis_magnitude)
    return df

In [8]:
## This cell will load the pickled versions of the dataframe created above and perform manipulations...

rest_path = 'C:/Users/nhcam/Desktop/Springboard/Yelp Burrito Reviews Project/Yelp_Project_Data/mexican_reviews.pkl'
busi_path = 'C:/Users/nhcam/Desktop/Springboard/Yelp Burrito Reviews Project/Yelp_Project_Data/business_info.pkl'

reviews_df = pd.read_pickle(rest_path)
business_df = pd.read_pickle(busi_path)


#Adding city and state data to the reviews
city = {}
state = {}
for index,data in tqdm(business_df.iterrows()):
    city[data['business_id']] = data['city']
    state[data['business_id']] = data['state']
reviews_df['city'] = reviews_df['business_id'].map(city)
reviews_df['state'] = reviews_df['business_id'].map(state)


#Adding region to reviews, dropping any row without a US State (there are british and canadian cities included)
US_states = ['AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL',
             'IN','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT',
             'NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI',
             'SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY']
cali = ['CA']
west = ['WA','OR','NV','ID','AK','HI','MT','UT','CO','WY','NM','AZ']
midwest = ['ND','SD','NE','KS','OK','MN','IA','MO','WI','MI','IL','IN']
south = ['LA','MS','AL','TN','NC','SC','GA','FL','TX','AR']
noreast = ['KY','OH','WV','PA','MD','DE','NJ','NY','CT','RI','MA','VA','NH','ME','VT']
states = [cali,west,midwest,south,noreast]
regions = ['California','West','Midwest','South','Northeast']
regions_dict={}
i = 0
for each in tqdm(states):
    for indiv in each:
        regions_dict[indiv] = regions[i]
    i += 1
reviews_df['region'] = reviews_df['state'].map(regions_dict)
reviews_df = reviews_df.dropna(axis=0)

#Mapping labels to stars
stars_dict = {5:'Good',4:'Good',3:'Neutral/Bad',2:'Neutral/Bad',1:'Neutral/Bad'}
reviews_df['Good/Neutral/Bad'] = reviews_df['stars'].map(stars_dict)


#Set and sort index
reviews_df.set_index(['business_id','Restaurant Name'],inplace=True)
reviews_df.sort_index(inplace=True)

192609it [00:19, 10074.09it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 5101.32it/s]


In [11]:
burrito_mention = reviews_df.loc[reviews_df['text'].str.contains('burrito',case=False)]
#burrito_mention = tokenize_and_clean(burrito_mention)
reviews_df = tokenize_and_clean(reviews_df)

In [62]:
def find_burrito_sentence (df):
    burrito_sentence_list = []
    for indexes, data in df.iterrows():
        review = data['text']
        burrito_sentence = [sentence + '.' for sentence in re.split('. | ! ',review) if 'burrito' in sentence]
        burrito_sentence_list.append(burrito_sentence)
    df['burrito_sentence'] = burrito_sentence_list
    return df

In [65]:
small_set = find_burrito_sentence(small_set)
print(small_set['burrito_sentence'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


business_id             Restaurant Name                    
-1VaIJza42Hjev6ukacCNg  Rio Mirage Cafe Y Cantina                                                             []
                        Rio Mirage Cafe Y Cantina                                                             []
                        Rio Mirage Cafe Y Cantina                                                             []
                        Rio Mirage Cafe Y Cantina                                                             []
                        Rio Mirage Cafe Y Cantina                                                     [burrito.]
                        Rio Mirage Cafe Y Cantina                                                             []
                        Rio Mirage Cafe Y Cantina                                                     [burrito.]
                        Rio Mirage Cafe Y Cantina                                                             []
                        Rio Mirage C

In [10]:
first_review = reviews_df.iloc[1]['text']
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(first_review)

{'neg': 0.096, 'neu': 0.525, 'pos': 0.379, 'compound': 0.8164}

In [12]:
vectorizer = CountVectorizer(tokenizer=dummy,preprocessor=dummy)
text = list(reviews_df['tokenized_text_cleaned'])
bag_of_words = vectorizer.fit(text)
X = bag_of_words.transform(text)

In [13]:
X = bag_of_words.transform(text)
y = reviews_df['Good/Neutral/Bad']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [14]:
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
y_pred = mnb.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        Good       0.89      0.93      0.91     61554
 Neutral/Bad       0.86      0.80      0.83     33620

   micro avg       0.88      0.88      0.88     95174
   macro avg       0.88      0.86      0.87     95174
weighted avg       0.88      0.88      0.88     95174

