In [50]:
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize 
import pandas as pd
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

In [51]:
# English stop words from the NLTK corpus, stored as a set; sent_preprocess
# uses it for membership tests when discarding tokens.
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance shared by every sent_preprocess call.
lemmatizer = WordNetLemmatizer()

def read_from_file(filename):
    """Load a CSV into a DataFrame, using its first column as the index.

    Args:
        filename: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed table.
    """
    return pd.read_csv(filename, index_col=0)

# Data exploration
def data_exploration(dataset):
    """Compute quick summary statistics for the reviews table.

    The summaries are returned rather than discarded: a bare expression
    inside a function is not displayed by Jupyter, so without a return
    value the caller would see nothing.

    Args:
        dataset: DataFrame with at least ``business_id`` and ``name`` columns.

    Returns:
        dict: Maps a summary label to its result:
            ``null_counts``      - per-column count of missing values,
            ``n_business_ids``   - number of distinct ``business_id`` values,
            ``n_names``          - number of distinct ``name`` values,
            ``reviews_per_name`` - row count per ``name``, largest first,
            ``top_business_ids`` - the first five rows of the row count per
                                   ``business_id``, largest first.
    """
    return {
        # original author's note: all zeros
        'null_counts': dataset.isnull().sum(),
        # original author's note: 284
        'n_business_ids': dataset.business_id.nunique(),
        # original author's note: 271
        'n_names': dataset.name.nunique(),
        'reviews_per_name': dataset.groupby('name').size().sort_values(ascending=False),
        'top_business_ids': dataset.groupby('business_id').size().sort_values(ascending=False).head(),
    }

def clean_dataset(dataset):
    """Collapse the reviews table to one row per business.

    For every ``business_id`` the distinct ``text`` values are joined, in
    order of first appearance, into a single space-separated string.

    Args:
        dataset: DataFrame with ``business_id`` and ``text`` columns.

    Returns:
        pandas.DataFrame: Columns ``business_id`` and ``review_list``, one row
        per business, where ``review_list`` is the concatenated review text.
    """
    # Select the column before aggregating: this avoids running a Python
    # lambda over each whole group frame through DataFrameGroupBy.apply.
    unique_reviews = dataset.groupby('business_id')['text'].unique()

    # Skip missing reviews so that str.join does not raise on NaN/None.
    joined_reviews = unique_reviews.apply(
        lambda texts: ' '.join(str(t) for t in texts if pd.notna(t))
    )
    return joined_reviews.rename('review_list').reset_index()

# expensive
def initial_sent_clean(sent):
    """Strip URLs, @-mentions and non-letter characters, then lowercase.

    Args:
        sent: Raw sentence text.

    Returns:
        str: ``sent`` with every whitespace-delimited token containing
        ``http``, ``www`` or ``@`` replaced by a single space, every character
        other than ASCII letters and spaces removed, and the remainder
        lowercased. Runs of spaces are not collapsed.
    """
    # Each step must feed the next one; operate on `sent` throughout.
    # Raw strings keep the regex escapes (\S, \@) out of Python's own
    # string-escape processing.
    sent = re.sub(r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", sent)
    sent = re.sub(r"[^a-zA-Z ]", "", sent)
    return sent.lower()

def sent_preprocess(sent):
    """Normalise one sentence into a space-joined string of lemmas.

    Tokens produced by ``word_tokenize`` are kept only when they consist
    solely of ASCII letters and hyphens. Each kept token is lowercased and
    lemmatized with part-of-speech tag ``'v'``; lemmas present in the
    module-level ``stop_words`` set are then dropped.

    Args:
        sent: Sentence text.

    Returns:
        str: The surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for raw_token in word_tokenize(sent):
        if not re.match("^[A-Za-z-]*$", raw_token):
            continue
        lemma = lemmatizer.lemmatize(raw_token.lower(), 'v')
        # Stop words are filtered after lemmatization, on the lemma itself.
        if lemma not in stop_words:
            kept_lemmas.append(lemma)
    return ' '.join(kept_lemmas)

def preprocess(reviews_by_businessid, limit=1):
    """Split each business's review text into preprocessed sentences.

    Args:
        reviews_by_businessid: DataFrame with ``business_id`` and
            ``review_list`` columns (one concatenated review string per row).
        limit: Number of leading rows to process. The default of 1 matches
            the previously hard-coded ``head(1)`` used while developing;
            pass ``None`` to process every business.

    Returns:
        dict: Maps each ``business_id`` to a list with one
        ``sent_preprocess`` result per sentence found by ``sent_tokenize``.
    """
    if limit is None:
        selected = reviews_by_businessid
    else:
        selected = reviews_by_businessid.head(limit)

    processed_sent_dict = {}
    for bi, text in zip(selected.business_id, selected.review_list):
        processed_sent_dict[bi] = [sent_preprocess(sent) for sent in sent_tokenize(text)]

    return processed_sent_dict


In [52]:
# Load the Mesa reviews and build one concatenated review string per business.
fn = "csv_reviews_mesa.csv"
mesa = read_from_file(fn)

# change assignment later
selected_businesses = mesa
reviews_by_businessid = clean_dataset(selected_businesses)

# {business_id: [preprocessed sentence, ...]}
reviews = preprocess(reviews_by_businessid)
# select_pos_reviews is defined in a later cell, which also performs the
# `res = select_pos_reviews(reviews)` call; invoking it here would raise
# NameError when the notebook is run top to bottom on a fresh kernel.



893

In [22]:
def select_pos_reviews(reviews):
    """Keep only the sentences whose VADER compound score is at least 0.05.

    Args:
        reviews: Mapping of business id to a list of sentence strings.

    Returns:
        dict: Same keys as ``reviews``; each value is the sub-list of
        sentences, in their original order, that met the score threshold.
    """
    analyzer = SentimentIntensityAnalyzer()

    def scores_positive(sentence):
        # Only the 'compound' entry of the polarity scores is consulted.
        return analyzer.polarity_scores(sentence)['compound'] >= 0.05

    return {
        business_id: [s for s in sentences if scores_positive(s)]
        for business_id, sentences in reviews.items()
    }

# Filter the preprocessed sentences down to the positively scored ones and
# show the resulting {business_id: [sentence, ...]} mapping as the cell output.
res = select_pos_reviews(reviews)
res

{'-3oxnPPPU3YoxO9M1I2idg': ['try place first time pizza really good',
  'however service good',
  'could definitely use customer service train',
  'definitely best make type pizza place beat mod pizza mile',
  'ingredients fresh desserts delicious',
  'pizza bomb love thin crust',
  'italian cowboy bqq really good',
  'pizza flavor good customize way want',
  'inch perfect person',
  'love everything place',
  'toppings fun pile fountain drink great love child tour every month',
  'love place',
  'like subway pizzas pay one price heap toppings hearts delight nine buck',
  'sure guy stay business guilt always end buy drink side wing skin fantastic',
  'tip get mail list use reward program get kinds deal send way',
  'apologetic friendly',
  'success rate',
  'quality food good complaints see frequent place first impression',
  'impress',
  'things pretty quiet mean line faster pizza mouth',
  'logo outside super bright yellow eye-catching even',
  'inside things cool industrial unusual'