In [21]:
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

In [26]:
# English stop-word list, materialised as a set. In the visible code it is only
# referenced from a commented-out filter in sent_preprocess, so it does not
# currently affect the pipeline.
stop_words = set(stopwords.words('english'))
#print(stop_words)
# Single shared lemmatizer instance; sent_preprocess reads this global.
lemmatizer = WordNetLemmatizer()

def read_from_file(filename):
    """Load a CSV file into a DataFrame.

    Args:
        filename: path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        pandas.DataFrame whose index is taken from the first CSV column.
    """
    return pd.read_csv(filename, index_col=0)

# Data exploration
def data_exploration(dataset):
    """Compute quick descriptive summaries of the review dataset.

    The summaries are returned instead of being computed and thrown away, so
    calling this as the last expression of a cell (or inspecting the returned
    dict) actually shows them.

    Args:
        dataset: DataFrame with at least ``business_id`` and ``name`` columns.

    Returns:
        dict with the keys
            ``null_counts``      - Series: number of nulls per column
                                   (author's note: all zeros),
            ``n_business_ids``   - int: distinct business ids (author's note: 284),
            ``n_names``          - int: distinct business names (author's note: 271),
            ``reviews_per_name`` - Series: row count per name, descending,
            ``top_business_ids`` - Series: the five business ids with most rows.
    """
    # Bracket access is used for columns: attribute access such as
    # ``dataset.name`` only works as long as no DataFrame attribute shadows it.
    per_name = dataset.groupby('name').size().sort_values(ascending=False)
    per_business = dataset.groupby('business_id').size().sort_values(ascending=False)

    return {
        'null_counts': dataset.isnull().sum(),
        'n_business_ids': dataset['business_id'].nunique(),
        'n_names': dataset['name'].nunique(),
        'reviews_per_name': per_name,
        'top_business_ids': per_business.head(),
    }

def clean_dataset(dataset):
    """Collapse the dataset to one row per business with all its review text.

    For every ``business_id`` the distinct review texts are concatenated, in
    order of first appearance, into a single space-separated string.

    Missing texts are skipped and non-string values are converted with
    ``str`` before joining, so a stray NaN no longer makes ``' '.join`` raise
    ``TypeError``. A business whose texts are all missing gets an empty string.

    Args:
        dataset: DataFrame with ``business_id`` and ``text`` columns.

    Returns:
        DataFrame with a fresh integer index and two columns:
        ``business_id`` and ``review_list`` (the joined review text).
    """
    def join_unique(texts):
        # unique() keeps first-appearance order, so the joined text follows
        # the order of the reviews in the input.
        return ' '.join(map(str, texts.dropna().unique()))

    # Aggregating the selected column avoids running apply over whole group
    # frames (grouping column included) just to read one column.
    joined = dataset.groupby('business_id')['text'].agg(join_unique)
    return joined.rename('review_list').reset_index()

def sent_preprocess(sent):
    """Normalise one sentence before it is sentiment-scored.

    The sentence is split with ``word_tokenize``. Tokens made only of ASCII
    letters and hyphens are kept, lower-cased and lemmatised as verbs, then
    re-joined with single spaces. Numbers, punctuation and other tokens are
    dropped. Stop words are NOT removed (that filter is disabled).

    The negation token ``n't`` is kept as well. The letters/hyphen filter
    used to discard it, which turned e.g. "didn't like" into "do like" and
    flipped the meaning of the sentence handed to the sentiment scorer.
    NOTE(review): this relies on the tokenizer emitting ``n't`` as its own
    token for contractions (Treebank convention) - confirm against the NLTK
    version in use.

    Args:
        sent: raw sentence text.

    Returns:
        str: the surviving tokens joined by single spaces (possibly empty).
    """
    kept_tokens = []
    for token in word_tokenize(sent):
        lowered = token.lower()
        # fullmatch with '+' is the same test as the former "^[A-Za-z-]*$"
        # for the non-empty, newline-free tokens the tokenizer produces.
        if lowered == "n't" or re.fullmatch(r"[A-Za-z-]+", token):
            kept_tokens.append(lemmatizer.lemmatize(lowered, 'v'))
    return ' '.join(kept_tokens)

def preprocess(reviews_by_businessid, limit=1):
    """Split each business's review text into sentences and normalise them.

    Args:
        reviews_by_businessid: DataFrame with ``business_id`` and
            ``review_list`` columns (one row per business).
        limit: number of leading rows to process. Defaults to 1, which is
            the previously hard-coded ``head(1)`` behaviour; pass ``None``
            to process every row.

    Returns:
        tuple ``(processed_sent_dict, sentences_dict)``. Both map a business
        id to a list of sentences: ``sentences_dict`` holds the sentences as
        produced by ``sent_tokenize``, ``processed_sent_dict`` holds the
        ``sent_preprocess`` result for each of them. The two lists for a
        business have the same length and order, so an index into one
        identifies the matching sentence in the other.
    """
    if limit is None:
        selected = reviews_by_businessid
    else:
        selected = reviews_by_businessid.head(limit)

    sentences_dict = {
        bi: sent_tokenize(text)
        for bi, text in zip(selected.business_id, selected.review_list)
    }
    processed_sent_dict = {
        bi: [sent_preprocess(sent) for sent in sent_list]
        for bi, sent_list in sentences_dict.items()
    }

    return processed_sent_dict, sentences_dict

def select_pos_reviews(processed_reviews, original_reviews):
    """Pick, per business, the original sentences whose processed form scores positive.

    Each processed sentence is scored with ``SentimentIntensityAnalyzer``;
    when its ``compound`` score is at least 0.05, the original sentence at
    the same position is selected.

    Args:
        processed_reviews: dict mapping business id -> list of processed
            sentences (the text that is scored).
        original_reviews: dict mapping business id -> list of original
            sentences, indexed by the same positions as ``processed_reviews``.

    Returns:
        dict mapping business id -> set of selected original sentences
        (a set, so duplicates collapse and order is not kept).
    """
    analyzer = SentimentIntensityAnalyzer()

    def scores_positive(sentence):
        # Only the aggregate 'compound' value is used for the decision.
        return analyzer.polarity_scores(sentence)['compound'] >= 0.05

    return {
        business: {
            original_reviews[business][position]
            for position, sentence in enumerate(processed)
            if scores_positive(sentence)
        }
        for business, processed in processed_reviews.items()
    }


In [27]:
# Driver cell: load the reviews CSV and run the whole pipeline on it.
# The path is relative, so the file is looked up in the working directory.
fn = "csv_reviews_mesa.csv"
mesa = read_from_file(fn)

# TODO: change assignment later - for now every loaded row is used and no
# subset of businesses is selected.
selected_businesses = mesa
# One row per business_id, with that business's review text joined together.
reviews_by_businessid = clean_dataset(selected_businesses)

# Two dicts keyed by business id: normalised sentences and the original ones.
# NOTE: only the first business is processed at the moment (head(1) in preprocess).
processed_reviews, original_reviews = preprocess(reviews_by_businessid)
# business id -> set of original sentences whose processed form scored positive.
res = select_pos_reviews(processed_reviews, original_reviews)



{'It was served quick, hot, and delicious!', 'Highly recommend this place.', "Everything I've had has been amazing!", 'Good pizza made quickly, without skimping on quality.', 'Fresh ingredients, great price, crust is good--a little thin for my taste but next time I will get the double crust for just a buck more.', 'Maybe the ingredients were fresh, but why would it matter.', 'The owner was holding the door to greet customers which was very nice customer service he explained the concept him and his wife decided to implement into the restaurant and thanked us for coming.', 'All the employees were very nice and the inside was decorated perfectly.', 'The service is is GREAT!', 'They have a fantastic selection of local brews and wine to choose from as well, and the pizza combinations on the wall of fame are very creative.', 'I had gone here the other day to use my Birthday loyalty reward.', 'The atmosphere and drink selection is the real winner here.', 'What makes it awesome is the caliber 