# Sentiment Analysis of Amazon Reviews

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

# Global plot styling: 'bmh' theme with a small monospace font.
plt.style.use('bmh')
plt.rcParams['font.size'] = 7
plt.rcParams['font.family'] = 'monospace'

In [None]:
# Load the reviews and keep only the first 500 rows.
# reset_index() is called without drop=True on purpose: it exposes the row
# position as an 'index' column, which the VADER cell merges on.
df = pd.read_csv('../data/amazon_reviews.csv').head(500).reset_index()
df.head()

In [None]:
# Exploratory analysis: how many reviews fall under each star rating?
# Sample five evenly spaced RGBA colours from the coolwarm colormap. `cmap`
# ends up holding that colour array and is reused by later plotting cells.
coolwarm = sns.color_palette("coolwarm", as_cmap=True)
cmap = coolwarm(np.linspace(0, 1, 5))

star_counts = df['Score'].value_counts().sort_index()
ax = star_counts.plot.bar(
    title='Reviews by Stars',
    figsize=(6, 4),
    color=cmap,
    width=0.8,
    rot=0,  # keep the star labels horizontal
)
ax.set(xlabel='Stars', ylabel='Number of Reviews')
plt.show()

df.info()

## VADER Sentiment Scoring

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon- and rule-based sentiment analysis tool rather than a trained model. It scores text using a predefined list of words and their associated sentiment scores.
- PROS: easy to use and handles informal text.
- CONS: limited contextual understanding and domain dependence.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Build the VADER analyzer. It depends on the 'vader_lexicon' NLTK resource,
# which is downloaded separately from the nltk package. If the resource is
# missing the constructor raises LookupError, so fetch it once and retry; this
# keeps "Restart Kernel -> Run All" working on a fresh environment.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()

In [None]:
# Sanity-check the analyzer on a few hand-written sentences
sample_sentences = (
    'I love this product',
    'I hate this product',
    'I love this product so much',
    'I hate this product so much',
)
for sentence in sample_sentences:
    print(sia.polarity_scores(sentence))

In [None]:
# Score every review with VADER, keyed by the review's row position in df
res_vader = {
    position: sia.polarity_scores(review_text)
    for position, review_text in enumerate(df['Text'])
}

# One row per review (neg / neu / pos / compound), then attach the review
# columns by joining the row position against df's 'index' column.
vader = (
    pd.DataFrame(res_vader)
    .transpose()
    .reset_index()
    .merge(df, on='index', how='left')
    .drop(columns=['index', 'Id'])
)

vader.head()

In [None]:
# VADER compound score per star rating. `hue` repeats the x variable only so
# that each bar takes its own palette colour, hence the legend is removed.
ax = sns.barplot(
    x='Score',
    y='compound',
    hue='Score',
    palette=list(cmap),
    data=vader,
)
ax.get_legend().remove()
ax.set_title('Compound Score by Amazon Review')
plt.show()

In [None]:
# One panel per VADER component, all sharing the 0-1 y-range so the panels
# can be compared directly.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for ax, (component, panel_title) in zip(axs, panels):
    sns.barplot(data=vader, x='Score', y=component, ax=ax, palette=list(cmap), hue='Score')
    ax.set_title(panel_title)
    ax.set_ylim(0, 1)
    ax.get_legend().remove()
plt.tight_layout()
plt.show()

## RoBERTa Pretrained Model
This is a RoBERTa-base model trained on ~58M tweets and fine-tuned for sentiment analysis with the TweetEval benchmark.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Transfer learning: load the tokenizer and the sequence classifier with their
# published pretrained weights. MODEL is a plain string; the original f-prefix
# had no placeholders to interpolate.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Run RoBERTa

def run_roberta(text):
    """Score one piece of text with the RoBERTa sentiment classifier.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The string to classify.

    Returns:
        dict with keys 'roberta_neg', 'roberta_neu' and 'roberta_pos' holding
        the softmax of the model's first three output logits, in that order.

    Note:
        The input is not truncated. The caller in this notebook catches
        RuntimeError around this function -- presumably for reviews longer
        than the model accepts; TODO confirm.
    """
    # Local import: the notebook's import cells never load torch directly.
    import torch

    encoded_text = tokenizer(text, return_tensors='pt')
    # Inference only: skip autograd bookkeeping so no gradient graph is built
    # (and kept alive) for every review.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }
    
# Review columns copied unchanged into each combined record.
REVIEW_COLUMNS = [
    'Score', 'Text', 'Summary', 'Time', 'ProfileName', 'ProductId',
    'HelpfulnessNumerator', 'HelpfulnessDenominator',
]

# Combined RoBERTa + VADER scores and review metadata, keyed by review Id.
sentiment_analysis = {}
n_reviews = len(df)

for i in range(n_reviews):
    print(f'Processed {i+1} out of {n_reviews} reviews', end='\r')
    row = df.iloc[i]
    text = row['Text']
    myid = row['Id']
    try:
        record = run_roberta(text)
    except RuntimeError as err:
        # Best effort: skip reviews the model cannot process, but report which
        # one and why so the gap in the results is traceable.
        print(f'\nError with {i+1} (Id {myid}): {err}\n')
        continue
    record.update(sia.polarity_scores(text))
    record.update({column: row[column] for column in REVIEW_COLUMNS})
    # Store only complete records, so a failure never leaves a partial entry.
    sentiment_analysis[myid] = record

In [None]:
# One row per review Id. from_dict(orient='index') builds the frame column by
# column, so the score columns keep numeric dtypes; constructing the frame
# with Ids as columns and transposing would turn every column into object
# dtype because each record mixes floats, ints and strings.
sentiment_analysis = pd.DataFrame.from_dict(sentiment_analysis, orient='index')
# Prefix the VADER columns so they are distinguishable from the roberta_* ones.
sentiment_analysis = sentiment_analysis.rename(columns={'neg': 'vader_neg', 'neu': 'vader_neu', 'pos': 'vader_pos', 'compound': 'vader_compound'})
sentiment_analysis.head()

## Compare results

In [None]:
# Compare VADER and RoBERTa component scores with a pairplot, coloured by the
# review's star rating.
sns.pairplot(
    data=sentiment_analysis,
    markers='o',
    vars=['vader_neg', 'vader_neu', 'vader_pos', 'roberta_neg', 'roberta_neu', 'roberta_pos'],
    hue='Score', palette=['red', 'orange', 'goldenrod', 'green', 'blue'],
    plot_kws={'alpha': 0.5}
    )
# No plt.legend() here: pairplot is figure-level and, when `hue` is set,
# already draws a legend titled 'Score' to the right of the grid. plt.legend()
# would act on the last subplot only and add a stray or duplicate legend.
plt.show()
