In [29]:
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from heapq import nlargest

In [30]:
# Notebook-wide NLTK helpers, built once and shared by the functions below.
# WordNet lemmatizer used by sent_preprocess.
lemmatizer = WordNetLemmatizer()
# English stop words as a set for fast membership tests.
stop_words = {*stopwords.words('english')}

def read_from_file(filename):
    """Load a CSV into a DataFrame, using its first column as the index.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, indexed by the first CSV column.
    """
    return pd.read_csv(filename, index_col=0)

# Data exploration
def data_exploration(dataset):
    """Compute a few summary statistics about the review dataset.

    The previous version evaluated each statistic as a bare expression
    inside the function, so every result was thrown away and the function
    returned ``None``. The statistics are now returned so they can be
    inspected or displayed by the caller.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``business_id`` and ``name``.

    Returns
    -------
    dict
        ``null_counts``      - per-column count of missing values
        ``n_business_ids``   - number of distinct ``business_id`` values
        ``n_names``          - number of distinct ``name`` values
        ``reviews_per_name`` - row count per ``name``, largest first
        ``top_business_ids`` - the five ``business_id`` values with most rows
    """
    # Values noted by the original author for their dataset: no nulls,
    # 284 distinct business ids, 271 distinct names.
    return {
        'null_counts': dataset.isnull().sum(),
        'n_business_ids': dataset['business_id'].nunique(),
        # Bracket access avoids relying on attribute lookup for a column
        # called "name".
        'n_names': dataset['name'].nunique(),
        'reviews_per_name': dataset.groupby('name').size().sort_values(ascending=False),
        'top_business_ids': (
            dataset.groupby('business_id').size().sort_values(ascending=False).head()
        ),
    }

def clean_dataset(dataset):
    """Merge all review texts belonging to the same business into one string.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``business_id`` and ``text``.

    Returns
    -------
    pandas.DataFrame
        One row per ``business_id`` with a ``text`` column holding that
        business's reviews joined by single spaces.
    """
    text_by_business = dataset.groupby(['business_id'])['text']
    joined = text_by_business.apply(lambda reviews: ' '.join(reviews))
    return joined.reset_index()
    
def sent_preprocess(sent):
    """Normalise one sentence into a space-separated string of lemmas.

    Steps: tokenise with ``word_tokenize``, keep only tokens made up
    entirely of ASCII letters and hyphens, lower-case them, and lemmatise
    each one as a verb. Stop words are kept.

    Parameters
    ----------
    sent : str
        A single sentence.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    lemmas = []
    for raw_token in word_tokenize(sent):
        # Drop numbers, punctuation and anything else outside letters/hyphens.
        if not re.match("^[A-Za-z-]*$", raw_token):
            continue
        lemmas.append(lemmatizer.lemmatize(raw_token.lower(), 'v'))
    return ' '.join(lemmas)

def preprocess(reviews_by_businessid, limit=1):
    """Split each business's reviews into sentences and normalise them.

    Parameters
    ----------
    reviews_by_businessid : pandas.DataFrame
        Must expose ``business_id`` and ``text`` columns.
    limit : int or None, optional
        Number of leading rows to process. Defaults to 1, which matches the
        previously hard-coded ``head(1)``; pass ``None`` to process every row.

    Returns
    -------
    (dict, dict)
        ``processed_sent_dict`` maps business id -> list of pre-processed
        sentences; ``sentences_dict`` maps business id -> list of raw
        sentences. For a given business the two lists have the same length
        and the same order, so position ``i`` in one corresponds to
        position ``i`` in the other.
    """
    if limit is None:
        selected = reviews_by_businessid
    else:
        selected = reviews_by_businessid.head(limit)

    sentences_dict = {
        bi: sent_tokenize(text)
        for bi, text in zip(selected.business_id, selected.text)
    }

    # Keep processed sentences in a list, not a set: callers look up the raw
    # sentence by position, and a set would lose both order and duplicates,
    # pairing sentiment scores with the wrong original sentence.
    processed_sent_dict = {
        bi: [sent_preprocess(sent) for sent in sent_list]
        for bi, sent_list in sentences_dict.items()
    }

    return processed_sent_dict, sentences_dict

def select_pos_reviews_vader(processed_reviews, original_reviews, threshold=0.05):
    """Return the original sentences whose processed form VADER rates positive.

    Parameters
    ----------
    processed_reviews : dict
        Maps business id -> iterable of pre-processed sentences.
    original_reviews : dict
        Maps business id -> list of raw sentences, looked up by position.
    threshold : float, optional
        Minimum VADER ``compound`` score for a sentence to be kept.
        Defaults to 0.05, the value that used to be hard-coded.

    Returns
    -------
    dict
        Maps business id -> set of raw sentences with runs of whitespace
        collapsed to one space and backslash characters removed.

    NOTE(review): the raw sentence is fetched with the enumeration index of
    the processed sentence, so the result is only correct when each
    ``processed_reviews[bi]`` is an ordered sequence aligned one-to-one with
    ``original_reviews[bi]``. An unordered container such as a set will
    attach scores to the wrong sentences.
    """
    sid = SentimentIntensityAnalyzer()
    pos_reviews_dict = {}

    for bi, sent_list in processed_reviews.items():
        pos_sentences = set()

        for idx, sentence in enumerate(sent_list):
            compound_score = sid.polarity_scores(sentence)['compound']
            if compound_score < threshold:
                continue

            original_sentence = original_reviews[bi][idx]
            # Raw string: a plain '\s' is an invalid escape sequence and
            # triggers a warning on current Python versions.
            collapsed = re.sub(r'\s+', ' ', original_sentence)
            pos_sentences.add(collapsed.replace("\\", ""))

        pos_reviews_dict[bi] = pos_sentences

    return pos_reviews_dict


In [32]:
%%time
# The cell magic above has to remain the first line of the cell; it times the whole cell.
# Load the review CSV; the path is relative to the notebook's working directory.
fn = "csv_reviews_mesa.csv"
mesa = read_from_file(fn)

# Placeholder: every loaded row is used for now; replace with a real subset selection later.
selected_businesses = mesa
# Concatenate the review texts of each business into one string per business_id.
reviews_by_businessid = clean_dataset(selected_businesses)

# Split into sentences and normalise them, then keep the sentences VADER scores as positive.
processed_reviews, original_reviews = preprocess(reviews_by_businessid)
res = select_pos_reviews_vader(processed_reviews, original_reviews)
# Last expression of the cell: displays the dict of positive sentences per business id.
res

CPU times: user 987 ms, sys: 79.3 ms, total: 1.07 s
Wall time: 1.07 s
