In [None]:
pip install --upgrade ray

In [1]:
# Notebook setup: imports, warning suppression, environment loading and
# reloading of the project-local helper modules.
import warnings, os
# Silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
# Loads environment variables from the file named 'env'.
load_dotenv('env')
# NOTE(review): pandas is bound to the alias `md` here; a later cell rebinds
# `md` to modin.pandas, so which library `md` refers to depends on which cells ran.
import pandas as md
from collections import Counter
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
import spacy
from importlib import reload
from modules import DataCollection, Support
# Reload the project modules so edits made on disk are picked up without
# restarting the kernel; the names below are imported after the reload.
reload(Support)
reload(DataCollection)
from modules.Support import Corpus, Business, TokenCleaner

In [3]:
# Start a local Ray runtime and switch the `md` alias to modin's pandas API.
import ray, os
# ignore_reinit_error=True makes this cell safe to re-run in a live kernel:
# without it a second ray.init() call raises instead of reusing the runtime.
ray.init(num_cpus=os.cpu_count(), ignore_reinit_error=True)
# The engine is selected through the environment before modin.pandas is imported.
os.environ["MODIN_ENGINE"] = "ray"
# Rebinds `md` (previously plain pandas) to modin.pandas for all later cells.
import modin.pandas as md

In [4]:
# Fetch the input files from the 's3fld' bucket, skipping anything that is
# already present locally so the cell is cheap to re-run.
s3_client = DataCollection.s3_client()

# exist_ok=True replaces the separate exists()/mkdir() pair and is safe on re-run.
os.makedirs("restaurants", exist_ok=True)

for file_name in ['review.csv', 'business.csv']:
    local_path = "restaurants/%s" % file_name
    if not os.path.exists(local_path):
        s3_client.download_file('s3fld', '599Team12/restaurants/%s' % file_name, local_path)

# The lexicon is cached the same way as the CSVs instead of being
# re-downloaded on every execution of this cell.
if not os.path.exists('negative-words.txt'):
    s3_client.download_file('s3fld', '599Team12/negative-words.txt', 'negative-words.txt')

In [8]:
# Load the negative-word lexicon, one entry per line.
with open('negative-words.txt','r') as f:
    lexicon_lines = f.read().split('\n')

# Remove header information: the first 35 lines of the file are skipped.
# Blank lines (e.g. the empty string produced by a trailing newline) are
# dropped as well, so no empty "word" ends up in the lexicon.
negative_words = [line.strip() for line in lexicon_lines[35:] if line.strip()]
md.DataFrame(negative_words, columns = ['negative_words']).sample(5)

Unnamed: 0,negative_words
1934,hamper
1571,fainthearted
2280,incompliant
3789,shit
2962,moronic


In [3]:
# Both CSVs are read with explicit zip compression.
csv_options = dict(compression = 'zip')
restaurants_review = md.read_csv('restaurants/review.csv', **csv_options)
restaurants_info = md.read_csv('restaurants/business.csv', **csv_options)

# Keep only the rows whose `sentiment` column equals "negative".
negative_filter = 'sentiment == "negative"'
negative_reviews = restaurants_review.query(negative_filter)

In [4]:

# Reduce `date` to a calendar date (no time-of-day component).
negative_reviews['date'] = md.to_datetime(negative_reviews['date']).dt.date

# Drop the stray 'Unnamed: 0' column if present. A `name` column left by an
# earlier run of this cell is dropped too, so that re-running the cell does
# not make the merge below produce `name_x` / `name_y` columns.
stale_columns = [col for col in ('Unnamed: 0', 'name') if col in negative_reviews.columns]
negative_reviews = negative_reviews.drop(columns=stale_columns)

# Use restaurants_info to attach the business name to every review; the left
# join keeps reviews whose business_id has no match in restaurants_info.
negative_reviews = negative_reviews.merge(
    restaurants_info[['business_id', 'name']],
    on='business_id',
    how='left')

negative_reviews = negative_reviews.reset_index(drop=True)
negative_reviews.sample(5)

Unnamed: 0,review_id,business_id,stars,text,date,sentiment,name
207279,byIAYlPnBb5tgj8LvA3bpA,VQcCL9PiNL_wkGf-uF3fjg,1.0,Where do I even begin about this disastrous ex...,2018-03-18,negative,Royal House
510475,FUlN4fCfMIWvbKf5YXAKJQ,NieQEbxG4Aqkhqj0h2TRtg,2.0,If employees are unable to have online orders ...,2016-09-27,negative,Port of Subs
66876,-pzlOJ-Brj1i_C9NXIKWzA,31s1x27DnN2V-ptUWEdfTQ,1.0,Oh my goodness! Walking to get sandwiches for ...,2020-07-05,negative,Subway
740935,C_CJk1CXTgSW-ZCXdHpazw,t5Xc0u9yvOH0-DgmrCS60Q,1.0,Discriminate against the lactose intolerant! T...,2012-01-07,negative,Pirrone's Pizzeria
860501,5Ys1fJwlQaVRO9l_0yfecw,lUiY3W4JOdpWBXq32KvDGw,2.0,We have a friend who LOVES Waffles on Maple. W...,2017-04-03,negative,Waffles on Maple


In [5]:
# Tally how many negative reviews each business name has, largest count first.
reviews_per_name = negative_reviews['name'].value_counts()
business_name_counts = reviews_per_name.sort_values(ascending=False)
business_name_counts.head(10)

McDonald's                14173
Taco Bell                  5994
Chipotle Mexican Grill     5628
Wendy's                    4680
Buffalo Wild Wings         4472
Steak ’n Shake             4458
Domino's Pizza             4448
Burger King                4004
Panera Bread               3779
Chili's                    3673
Name: name, dtype: int64

In [6]:
# Corpus comes from modules.Support; a later cell stores Business objects in
# `corpus.businesses`, keyed by business name.
corpus = Corpus()
# Text cleaner shared by the cells below. Presumably return_as_string=False makes
# CleanText return a list of tokens (later cells iterate over its result) —
# confirm against modules.Support.TokenCleaner.
tc = TokenCleaner(return_as_string=False)

In [15]:
# Run the lexicon through the same cleaner that is applied to the review text,
# so that membership tests against cleaned review tokens can match.
# This definition must be live: with it commented out, the set() call raised
# NameError on a fresh kernel.
negative_words_roots = set(tc.CleanText(' '.join(negative_words)))

# show 10 root words (a set has arbitrary order; this is not a random sample)
list(negative_words_roots)[:10]

['prejudic',
 'vocifer',
 'insoci',
 'hothead',
 'recessionari',
 'scourg',
 'maladjust',
 'discredit',
 'collaps',
 'clash']

In [21]:
# get the top negative words used in the whole negative reviews
negative_reviews['tokenized'] = negative_reviews['text'].apply(lambda x: tc.CleanText(x))
negative_reviews['negative_words'] = \
    negative_reviews['tokenized'].apply(lambda x: [word for word in x if word in negative_words_roots])

# create a dic that has count of each negative words used in the whole negative reviews
negative_words_count = Counter(chain.from_iterable(negative_reviews['negative_words']))
negative_words_count

KeyboardInterrupt: 

In [17]:
# Candidate list: the first five business names in business_name_counts.
restaurants_to_do = business_name_counts.index[:5]
# NOTE(review): this assignment immediately replaces the one above, so only
# 'Chipotle Mexican Grill' is processed. Remove one of the two lines once the
# intended scope is decided.
restaurants_to_do = ['Chipotle Mexican Grill']

In [19]:
# For each selected business: collect its negative reviews, tokenize them,
# count the lexicon words they contain, and build a Business object.
for business_name in restaurants_to_do:
    reviews = (
        negative_reviews
        .query('name == @business_name')
        .reset_index(drop=True)
        .assign(tokenized=lambda frame: frame['text'].apply(tc.CleanText))
    )

    # Token frequencies over all of this business's reviews, restricted to
    # tokens that are in the negative-word lexicon.
    token_counts = Counter(chain.from_iterable(reviews['tokenized']))
    words = {
        token: count
        for token, count in token_counts.items()
        if token in negative_words_roots
    }

    info = restaurants_info.query('name == @business_name')
    # NOTE(review): `words` is not passed on and `business` is overwritten on
    # each iteration without being stored anywhere in this cell.
    business = Business(info, reviews)

In [None]:
# Build one Business per name and register it in the corpus.
business_names = ['Chipotle Mexican Grill']

for business_name in business_names:
    # Indexing the list with itself (business_names[business_names]) raised
    # TypeError; look the business up in restaurants_info by name instead.
    info = restaurants_info.query('name == @business_name')
    # NOTE(review): another cell constructs Business(info, reviews); confirm
    # which signature modules.Support.Business actually expects.
    business = Business(business_name)
    # The merged review frame carries the business name in the `name` column
    # (there is no `business_name` column on negative_reviews).
    business.negative_reviews = negative_reviews[negative_reviews['name'] == business_name]
    # NOTE(review): `word_counts` is not defined in any cell visible here; this
    # assumes it is a frame with a `business_name` column — verify.
    business.word_counts = word_counts[word_counts.business_name == business_name]
    corpus.businesses[business_name] = business

In [None]:
# NOTE(review): `word_counts` and the `text_cleaned` column are not created in
# any cell visible here — confirm where they come from before running this.
for index, row in word_counts.iterrows():
    # count how many times each root word was used in reviews with negative sentiment
    word_to_look_for = row.word
    # Sum of the per-review `.count(word)` results over the whole column; this
    # is one full pass over negative_reviews for every row of word_counts.
    occurance = negative_reviews['text_cleaned'].apply(lambda x: x.count(word_to_look_for)).sum()
    # Store the total on this word's row.
    word_counts.loc[index, 'negative_use'] = occurance
    print('{:0>3} | "{}" was used {:,} times in the negative reviews.'.format(index, word_to_look_for, occurance))

In [None]:
def _export_and_upload(frame, local_path, s3_key):
    """Write `frame` to `local_path` as a zip-compressed CSV, then upload that
    file to the 's3fld' bucket under `s3_key`.

    NOTE(review): the object is uploaded with a 'public-read' ACL — confirm the
    data is meant to be publicly readable.
    """
    frame.to_csv(local_path, compression = 'zip')
    s3_client.upload_file(local_path, 's3fld', s3_key, ExtraArgs={'ACL': 'public-read'})


s3_client = DataCollection.s3_client()

# Create the local output directory on first use.
analysis_dir_missing = not os.path.exists("analysis")
if analysis_dir_missing:
    os.mkdir("analysis")

# Exports run one after the other, reviews first.
_export_and_upload(negative_reviews, 'analysis/negative_reviews.csv', '599Team12/analysis/negative_reviews.csv')
_export_and_upload(word_counts, 'analysis/word_counts.csv', '599Team12/analysis/word_counts.csv')