In [98]:
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from heapq import nlargest
from heapq import nsmallest

In [99]:
# Shared module-level objects used by the helper functions below.

# English stop-word list from the NLTK corpus, stored as a set.
# NOTE(review): no active code in the visible cells reads stop_words.
stop_words = set(stopwords.words('english'))
#print(stop_words)
# WordNet lemmatizer instance; sent_preprocess calls lemmatizer.lemmatize on it.
lemmatizer = WordNetLemmatizer()
# VADER sentiment analyzer; select_reviews_vader calls sid.polarity_scores on it.
sid = SentimentIntensityAnalyzer()
# Number of leading rows taken with .head() in get_names and preprocess.
num_businesses = 5

def read_from_file(filename):
    """Load a CSV into a DataFrame, using the first column as the index.

    Parameters
    ----------
    filename : str, path object or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table, indexed by the file's first column.
    """
    frame = pd.read_csv(filename, index_col=0)
    return frame

# Data exploration
def data_exploration(dataset):
    """Compute quick sanity-check statistics for the reviews table.

    Previously every statistic was computed and then thrown away, so the
    function had no observable effect. The results are now returned so the
    caller can display or assert on them; callers that ignored the old
    ``None`` return value are unaffected.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``business_id`` and ``name`` columns.

    Returns
    -------
    dict
        ``null_counts``      - per-column count of null values (Series)
        ``n_business_ids``   - number of distinct ``business_id`` values
        ``n_names``          - number of distinct ``name`` values
        ``reviews_per_name`` - row count per name, largest first (Series)
        ``top_business_ids`` - row count per business_id, largest first,
                               limited to the first five entries (Series)
    """
    # Values noted by the original author for their dataset:
    # null counts all zero, 284 distinct business ids, 271 distinct names.
    rows_per_name = dataset.groupby('name').size().sort_values(ascending=False)
    rows_per_business = dataset.groupby('business_id').size().sort_values(ascending=False)

    return {
        'null_counts': dataset.isnull().sum(),
        'n_business_ids': dataset['business_id'].nunique(),
        'n_names': dataset['name'].nunique(),
        'reviews_per_name': rows_per_name,
        'top_business_ids': rows_per_business.head(),
    }

def clean_dataset(dataset):
    """Merge all review texts of each (business_id, name) pair into one string.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``business_id``, ``name`` and ``text`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (business_id, name) group with columns ``business_id``,
        ``name`` and ``text``; ``text`` holds the group's texts joined by a
        single space.
    """
    text_per_business = dataset.groupby(['business_id', 'name'])['text']
    merged_text = text_per_business.apply(' '.join)
    # reset_index turns the group keys back into ordinary columns.
    return merged_text.reset_index()
    
def sent_preprocess(sent):
    """Normalise one sentence into a space-separated string of lemmas.

    The sentence is split with ``word_tokenize``. Tokens consisting only of
    ASCII letters and hyphens are kept, lower-cased and lemmatized with
    part-of-speech tag ``'v'``; every other token is dropped. Stop words are
    not removed here.

    Parameters
    ----------
    sent : str
        A single sentence.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (possibly empty).
    """
    lemmas = []
    for raw_token in word_tokenize(sent):
        # The pattern is tested against the token before lower-casing.
        if not re.match("^[A-Za-z-]*$", raw_token):
            continue
        lemmas.append(lemmatizer.lemmatize(raw_token.lower(), 'v'))
    return ' '.join(lemmas)

def get_names(reviews_by_businessid):
    """Return the ``name`` column of the first ``num_businesses`` rows.

    Parameters
    ----------
    reviews_by_businessid : pandas.DataFrame
        Table with a ``name`` column.

    Returns
    -------
    pandas.Series
        The names of the leading rows, keeping the frame's index labels.
    """
    leading_rows = reviews_by_businessid.head(num_businesses)
    return leading_rows.name
    
def preprocess(reviews_by_businessid):
    """Split and normalise the review text of the first ``num_businesses`` rows.

    Parameters
    ----------
    reviews_by_businessid : pandas.DataFrame
        Table with ``business_id`` and ``text`` columns.

    Returns
    -------
    tuple of (dict, dict)
        ``(processed, original)``, both keyed by business id. ``original``
        maps each id to the list returned by ``sent_tokenize``; ``processed``
        maps it to the same sentences, in the same order, after
        ``sent_preprocess``.
    """
    leading_rows = reviews_by_businessid.head(num_businesses)
    text_by_business = dict(zip(leading_rows.business_id, leading_rows.text))

    # Raw sentences per business.
    sentences_dict = {
        business: sent_tokenize(review_text)
        for business, review_text in text_by_business.items()
    }

    # Normalised counterpart; positions line up with sentences_dict.
    processed_sent_dict = {
        business: [sent_preprocess(sentence) for sentence in raw_sentences]
        for business, raw_sentences in sentences_dict.items()
    }

    return processed_sent_dict, sentences_dict

def select_reviews_vader(processed_reviews, original_reviews, top_n=10):
    """Pick the highest- and lowest-scoring sentences for every business.

    Each preprocessed sentence is scored with the ``'compound'`` value from
    ``sid.polarity_scores``. The indices of the ``top_n`` largest and
    ``top_n`` smallest scores are then used to pull the matching sentences
    from ``original_reviews``, which must be index-aligned with
    ``processed_reviews``.

    Selection is by rank only, not by sign: with fewer than ``2 * top_n``
    sentences the two lists overlap, and the "negative" list may contain
    sentences whose score is not negative.

    Parameters
    ----------
    processed_reviews : dict
        business id -> list of preprocessed sentences (scored).
    original_reviews : dict
        business id -> list of original sentences (returned).
    top_n : int, optional
        How many sentences to keep on each side. Defaults to 10, the value
        that was previously hard-coded.

    Returns
    -------
    tuple of (dict, dict)
        ``(positive, negative)``; each maps business id to a list of cleaned
        original sentences, best-first and worst-first respectively.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    whitespace_run = re.compile(r'\s+')

    def _tidy(sentence):
        # Collapse whitespace runs to one space and strip backslashes.
        return whitespace_run.sub(' ', sentence).replace("\\", "")

    pos_reviews_dict, neg_reviews_dict = {}, {}

    for bi, sent_list in processed_reviews.items():
        sent_scores = [sid.polarity_scores(sentence)['compound'] for sentence in sent_list]

        positions = range(len(sent_scores))
        pos_indices = nlargest(top_n, positions, key=sent_scores.__getitem__)
        neg_indices = nsmallest(top_n, positions, key=sent_scores.__getitem__)

        originals = original_reviews[bi]
        pos_reviews_dict[bi] = [_tidy(originals[i]) for i in pos_indices]
        neg_reviews_dict[bi] = [_tidy(originals[i]) for i in neg_indices]

    return pos_reviews_dict, neg_reviews_dict


In [100]:
#%%time
# Read the reviews CSV.
fn = "csv_reviews_mesa.csv"
mesa = read_from_file(fn)

# change assignment later
selected_businesses = mesa

# One row per business with all of its review text merged.
reviews_by_businessid = clean_dataset(selected_businesses)
id_names = get_names(reviews_by_businessid)

# Score the normalised sentences, but report the original wording.
processed_reviews, original_reviews = preprocess(reviews_by_businessid)
pos_reviews_dict, neg_reviews_dict = select_reviews_vader(processed_reviews, original_reviews)

# NOTE(review): the 'postive_reviews' key is misspelled; it is kept as-is
# because code outside this cell may refer to the column by that name.
d = {
    'business_id': list(pos_reviews_dict),
    'name': id_names,
    'postive_reviews': list(pos_reviews_dict.values()),
    'negative_reviews': list(neg_reviews_dict.values()),
}

selected_reviews = pd.DataFrame(d)


Unnamed: 0,business_id,name,postive_reviews,negative_reviews
0,-3oxnPPPU3YoxO9M1I2idg,Eklectic Pie - Mesa,"[Love love love love love., The atmosphere was...",[The only thing that lost points is the staff ...
1,-oSII3bw90cvyLmgsHgmpg,Mokis Hawaiian Grill,"[Really good, really authentic Hawaiian faire ...","[I leave full, but unfulfilled, and again I am..."
2,-sNi7U9seVfCr8T8nkWd_w,Rumbi Island Grill,[This restaurant had a beautiful interior desi...,"[Cheap cuts of meat on the steak and chicken, ..."
3,01xTdrNUuTOAyH7NaRWcUA,Mellow Mushroom,"[Great vibes super laid back, Robert the serve...","[So, since everything about this dining ""Exper..."
4,09psTuUYhUMA2ZRzQlm30Q,Five Guys,"[Now to be honest, I don't eat burgers that of...",[Let me just make a list of WTF items: Wicked ...
