In [8]:
import re
import nltk
# Fetch every NLTK resource this notebook relies on so that it also runs on a
# fresh environment, not only where the data happens to be installed already.
nltk.download('vader_lexicon')  # lexicon behind SentimentIntensityAnalyzer
nltk.download('wordnet')        # dictionary behind WordNetLemmatizer
nltk.download('punkt')          # models behind sent_tokenize / word_tokenize
nltk.download('stopwords')      # corpus behind stopwords.words('english')
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# the tokenizers -- confirm against the installed NLTK version.
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from heapq import nlargest, nsmallest

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
# Shared NLP resources, built once and reused by the functions defined below.
# `stop_words` is not used by any active code visible in this notebook.
stop_words = set(stopwords.words('english'))
lemmatizer, sid = WordNetLemmatizer(), SentimentIntensityAnalyzer()

# Data exploration
def data_exploration(dataset):
    """Compute quick sanity-check summaries of a review dataset.

    The summaries are returned (previously they were computed and thrown
    away, so calling the function had no observable effect).

    Args:
        dataset: DataFrame with at least `business_id` and `name` columns.

    Returns:
        dict with the following entries:
            'null_counts'       -- Series of missing-value counts per column.
            'n_business_ids'    -- number of distinct business ids.
            'n_names'           -- number of distinct business names.
            'reviews_per_name'  -- row count per name, largest first.
            'top_business_ids'  -- row count per business id, largest first,
                                   limited to the first five entries.
    """
    return {
        'null_counts': dataset.isnull().sum(),
        'n_business_ids': dataset['business_id'].nunique(),
        'n_names': dataset['name'].nunique(),
        'reviews_per_name': dataset.groupby('name').size().sort_values(ascending=False),
        'top_business_ids': (
            dataset.groupby('business_id').size().sort_values(ascending=False).head()
        ),
    }

def clean_dataset(dataset):
    """Merge all review texts of each business into one string.

    Rows whose `text` is missing are discarded *before* the texts are joined:
    `' '.join` raises TypeError on a NaN entry, and dropping NaNs after the
    join can never remove anything because the joined result is always a str.

    Args:
        dataset: DataFrame with `business_id`, `name` and `text` columns.
            It is not modified.

    Returns:
        DataFrame with columns `business_id`, `name`, `text`, one row per
        (business_id, name) pair that has at least one non-missing text; the
        texts of a pair are concatenated with single spaces.
    """
    reviews_with_text = dataset.dropna(axis=0, subset=['text'])
    reviews_by_businessid = (
        reviews_with_text
        .groupby(['business_id', 'name'])['text']
        .apply(' '.join)
        .reset_index()
    )
    return reviews_by_businessid
    
def sent_preprocess(sent):
    """Normalise one sentence before it is scored.

    The sentence is split with `word_tokenize`; only tokens made up entirely
    of ASCII letters and hyphens are kept. Each kept token is lower-cased and
    lemmatised as a verb, and the results are joined with single spaces.
    Stop words are not removed.

    NOTE(review): tokens containing an apostrophe do not pass the filter, so
    contraction pieces such as "n't" are presumably dropped, which may hide
    negations from the sentiment scorer -- confirm whether this is intended.

    Args:
        sent: the raw sentence string.

    Returns:
        The normalised sentence as a single string.
    """
    kept_lemmas = []
    for raw_token in word_tokenize(sent):
        if not re.match("^[A-Za-z-]*$", raw_token):
            continue
        kept_lemmas.append(lemmatizer.lemmatize(raw_token.lower(), 'v'))
    return ' '.join(kept_lemmas)

def preprocess(reviews_by_businessid):
    """Split each business's review text into sentences and normalise them.

    Args:
        reviews_by_businessid: DataFrame exposing `business_id` and `text`
            columns. If a business id occurs more than once, the last text
            wins because the pairs are collected into a dict.

    Returns:
        A pair `(processed_sent_dict, sentences_dict)`, both keyed by
        business id. `sentences_dict` maps to the list of sentences returned
        by `sent_tokenize`; `processed_sent_dict` maps to the same sentences,
        in the same order, after `sent_preprocess`.
    """
    text_by_business = dict(zip(reviews_by_businessid.business_id,
                                reviews_by_businessid.text))

    # Raw sentences first; the normalised copies keep the same positions so
    # that an index into one list addresses the matching entry in the other.
    sentences_dict = {
        business: sent_tokenize(body)
        for business, body in text_by_business.items()
    }
    processed_sent_dict = {
        business: [sent_preprocess(raw_sentence) for raw_sentence in raw_sentences]
        for business, raw_sentences in sentences_dict.items()
    }

    return processed_sent_dict, sentences_dict

# Module-level accumulators filled by select_reviews_vader with the original
# sentences it classifies as positive / negative.
pred_pos_sent = []
pred_neg_sent = []

def select_reviews_vader(processed_reviews, original_reviews,
                         pos_threshold=0.05, neg_threshold=-0.05, top_n=5):
    """Classify every sentence with VADER and pick the most polarised ones.

    Each preprocessed sentence is scored with `sid.polarity_scores`; its
    `compound` value decides the class: positive when it is at least
    `pos_threshold`, negative when it is at most `neg_threshold`, otherwise
    neither. The *original* sentence (same business, same position) of every
    positive / negative hit is appended, with whitespace runs collapsed, to
    the module-level lists `pred_pos_sent` / `pred_neg_sent`.

    Args:
        processed_reviews: dict mapping business id to a list of preprocessed
            sentences.
        original_reviews: dict mapping business id to the list of original
            sentences, aligned index-by-index with `processed_reviews`.
        pos_threshold: minimum compound score of a positive sentence.
        neg_threshold: maximum compound score of a negative sentence.
        top_n: maximum number of sentences selected per class and business.

    Returns:
        A pair `(reviews_dict, stats_dict)`, both keyed by business id.
        `reviews_dict` maps to `(pos_sentences, neg_sentences)`: up to `top_n`
        original sentences with the highest / lowest compound scores that
        also satisfy the class threshold, whitespace-collapsed and with
        backslashes removed. `stats_dict` maps to
        `(pos_count, neg_count, pos_ratio, neg_ratio)`; the ratios are taken
        over the polarised sentences only and are both 0.0 when a business
        has none.
    """
    reviews_dict, stats_dict = {}, {}

    for bi, sent_list in processed_reviews.items():
        sent_scores = []
        pos_count, neg_count = 0, 0

        for idx, sentence in enumerate(sent_list):
            compound_score = sid.polarity_scores(sentence)['compound']
            sent_scores.append(compound_score)

            if compound_score >= pos_threshold:
                pos_count += 1
                pred_pos_sent.append(re.sub(r'\s+', ' ', original_reviews[bi][idx]))
            elif compound_score <= neg_threshold:
                neg_count += 1
                pred_neg_sent.append(re.sub(r'\s+', ' ', original_reviews[bi][idx]))

        # A business whose sentences are all neutral (or that has no
        # sentences) has no polarised total to divide by.
        polarised_total = pos_count + neg_count
        if polarised_total:
            pos_ratio = pos_count / polarised_total
            neg_ratio = neg_count / polarised_total
        else:
            pos_ratio, neg_ratio = 0.0, 0.0
        stats_dict[bi] = (pos_count, neg_count, pos_ratio, neg_ratio)

        # Rank sentence positions by score so the original text can be looked up.
        positions = range(len(sent_scores))
        pos_indices = nlargest(top_n, positions, key=lambda idx: sent_scores[idx])
        neg_indices = nsmallest(top_n, positions, key=lambda idx: sent_scores[idx])

        pos_sentences = [re.sub(r'\s+', ' ', original_reviews[bi][i]).replace("\\", "")
                         for i in pos_indices if sent_scores[i] >= pos_threshold]
        # Only genuinely negative sentences qualify: the cut-off is the
        # negative threshold, the same one used for counting above.
        neg_sentences = [re.sub(r'\s+', ' ', original_reviews[bi][j]).replace("\\", "")
                         for j in neg_indices if sent_scores[j] <= neg_threshold]

        reviews_dict[bi] = (pos_sentences, neg_sentences)

    return reviews_dict, stats_dict

def get_sentiments(fn):
    """Run the sentiment pipeline on one CSV file and tabulate the result.

    The module-level `pred_pos_sent` / `pred_neg_sent` lists are emptied
    first, then the file is read (first column as index), preprocessed and
    classified. A progress message is printed after each of the two phases.

    Args:
        fn: path of the CSV file; it is handed to `preprocess`, which reads
            its `business_id` and `text` columns.

    Returns:
        DataFrame with one row per business and the columns `business_id`,
        `postive_reviews`, `negative_reviews`, `num_pos`, `num_neg`,
        `pos_ratio`, `neg_ratio`.
    """
    for accumulator in (pred_pos_sent, pred_neg_sent):
        accumulator.clear()

    source_frame = pd.read_csv(fn, index_col=0)
    processed_reviews, original_reviews = preprocess(source_frame)
    print('preprocess phase completed')
    reviews_dict, stats_dict = select_reviews_vader(processed_reviews, original_reviews)
    print('sentiment classification completed')

    # Three frames sharing the same positional index, glued side by side.
    id_part = pd.DataFrame({'business_id': list(reviews_dict.keys())})
    sentence_part = pd.DataFrame(
        list(reviews_dict.values()),
        columns=['postive_reviews', 'negative_reviews'],
    )
    stats_part = pd.DataFrame(
        list(stats_dict.values()),
        columns=['num_pos', 'num_neg', 'pos_ratio', 'neg_ratio'],
    )
    return pd.concat([id_part, sentence_part, stats_part], axis=1)

# Sample sample_num_pos positive reviews and sample_num_neg negative reviews for evaluation
def sample_reviews(fn, sample_num_pos, sample_num_neg, random_state=None):
    """Draw a random evaluation sample of classified sentences.

    The module-level `pred_pos_sent` / `pred_neg_sent` lists are emptied,
    the file is run through `preprocess` and `select_reviews_vader` (which
    refills those lists), and then sentences are sampled from each list.

    Args:
        fn: path of the CSV file to classify (first column is the index).
        sample_num_pos: number of predicted-positive sentences to draw.
        sample_num_neg: number of predicted-negative sentences to draw.
        random_state: optional seed forwarded to `DataFrame.sample` so the
            same sample can be drawn again; None keeps the draw unseeded.

    Returns:
        DataFrame with columns `reviews`, `actual_sentiment` and
        `predicted_sentiment`: the sampled positive rows followed by the
        sampled negative rows, with a fresh 0..n-1 index.
        `predicted_sentiment` is True for predicted-positive sentences and
        False for predicted-negative ones. `actual_sentiment` is pre-filled
        with the same value -- presumably a placeholder to be corrected by
        hand during evaluation; confirm with the evaluation workflow.

    Raises:
        ValueError: from `DataFrame.sample` when more rows are requested than
            sentences of that class exist.
    """
    pred_pos_sent.clear()
    pred_neg_sent.clear()
    df = pd.read_csv(fn, index_col=0)
    processed_reviews, original_reviews = preprocess(df)
    print('preprocess phase completed')
    select_reviews_vader(processed_reviews, original_reviews)
    print('sentiment classification completed')

    num_pos, num_neg = len(pred_pos_sent), len(pred_neg_sent)
    pos_reviews_df = pd.DataFrame({'reviews': pred_pos_sent,
                                   'actual_sentiment': [True] * num_pos,
                                   'predicted_sentiment': [True] * num_pos})
    # Sentences in pred_neg_sent were predicted negative, so the predicted
    # label has to be False here.
    neg_reviews_df = pd.DataFrame({'reviews': pred_neg_sent,
                                   'actual_sentiment': [False] * num_neg,
                                   'predicted_sentiment': [False] * num_neg})

    sampled_pos_reviews = pos_reviews_df.sample(n=sample_num_pos, random_state=random_state)
    sampled_neg_reviews = neg_reviews_df.sample(n=sample_num_neg, random_state=random_state)

    reviews = pd.concat([sampled_pos_reviews, sampled_neg_reviews]).reset_index(drop=True)
    return reviews


In [16]:
# Draw 150 predicted-positive and 50 predicted-negative sentences from the
# Mesa file and save them for evaluation.
sampled_reviews = sample_reviews(
    'mesa_5000.csv',
    sample_num_pos=150,
    sample_num_neg=50,
)
sampled_reviews.to_csv('eval_reviews.csv')

preprocess phase completed
sentiment classification completed


In [7]:
# Base names (without extension) of the per-city review files to process.
csv_names = [
    "pittsburgh_reviews",
    "mesa_reviews",
    "charlotte_reviews",
]

In [8]:
# For every city: classify the cleaned reviews, attach the per-business
# sentiment columns to the original table, and write the combined file.
for csv in csv_names:
    sentiments = get_sentiments(csv + "_cleaned.csv")
    original_df = pd.read_csv(csv + ".csv")
    # Left join keeps every row of the original table, matched on business id.
    final = pd.merge(
        original_df,
        sentiments,
        how="left",
        on="business_id",
    )
    final.to_csv(csv + "_final.csv")

preprocess phase completed
sentiment classification completed
preprocess phase completed
sentiment classification completed
preprocess phase completed
sentiment classification completed
