### Sentiment Analysis: [Amazon Fine Food Reviews](https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews)

Import Libraries

In [1]:
import sys
import os
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Import Custom Modules

In [2]:
sys.path.append("Custom Modules")
import custom_module as cm

Data Collection

In [3]:
folder = "Data\\Amazon Fine Food Reviews"
folder_path = os.getcwd() + "\\" + folder
file_name = "Reviews.csv"

In [4]:
dfs = []
for file in os.listdir(folder_path):
    if file_name in file:
        df = pd.read_csv(folder_path + "\\" + file)
        dfs.append(df)

df = pd.concat(dfs, axis=0)

Data Review

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


* Id: Row ID
* ProductId: Unique identifier for the product
* UserId: Unqiue identifier for the user
* ProfileName: Profile name of the user
* HelpfulnessNumerator: Number of users who found the review helpful
* HelpfulnessDenominator: Number of users who indicated whether they found the review helpful or not
* Score: Rating between 1 and 5
* Time: Timestamp for the review
* Summary: Brief summary of the review
* Text: Text of the review

In [9]:
df.shape

(568454, 11)

In [6]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284226.5,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,0.0,1.0,0.0,0.0,1.0,939340800.0
25%,142113.25,142114.25,0.0,0.0,4.0,1271290000.0
50%,284226.5,284227.5,0.0,1.0,5.0,1311120000.0
75%,426339.75,426340.75,2.0,2.0,5.0,1332720000.0
max,568453.0,568454.0,866.0,923.0,5.0,1351210000.0


Data Cleaning

In [10]:
data = df.copy()

In [11]:
# Sample the data
data = data.sample(n=10000)

In [12]:
# Remove ID column
data = data.drop(columns=["Unnamed: 0", "Id"])

In [13]:
# Remove duplicate rows (i.e. users who have reviewed the same product with the same review details)
initial_len = len(data)
data = data.drop_duplicates()
post_len = len(data)
print("Before:", initial_len)
print("After:", post_len)

Before: 10000
After: 10000


In [14]:
# Remove remaining identifier and time columns
data = data.drop(columns=["ProductId", "UserId", "ProfileName", "Time"])
data.head()

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,Text
77498,2,2,4,Pretty Darn Good,I ordered PB2 because I absolutely love peanut...
49633,4,4,5,Wonderful Drink,I really love this beverage. I have been drink...
60104,0,0,5,Excellent healthy way to start your day!,If you like bran and blueberries and are tired...
95182,0,0,5,Raspberry Leaf Tea,"I bought this for my wife who is pregnant, and..."
120493,1,1,5,Yummy,These were exactly like the ones we had in Ita...


In [15]:
# Check for null values
data.isna().sum()

HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Summary                   0
Text                      0
dtype: int64

In [16]:
# # Percentage of nulls for Summary
# (data["Summary"].isna().sum() / post_len)*100

In [17]:
# # Percentage of nulls is miniscule, therefore remove these rows
# data = data[~data["Summary"].isna()]

In [18]:
# calculate percentage of helpfulness
data["HelpfulnessPerc"] = data["HelpfulnessNumerator"] / data["HelpfulnessDenominator"]
data["HelpfulnessCount"] = data["HelpfulnessDenominator"]

In [19]:
# drop all except Text and Score for now
data = data[["Text", "Score"]]

In [20]:
# check for all possible classification values
print(data["Score"].value_counts())

Score
5    6377
4    1447
1     896
3     759
2     521
Name: count, dtype: int64


Import and Download the Natural Language Toolkit

In [21]:
import nltk 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [22]:
# download nltk corpus (first time only)
# nltk.download('all')

Import Other Modules

In [23]:
import re
from autocorrect import Speller

Preprocessing

In [24]:
data_pre = data.copy()

In [139]:
def preprocess_text(text):

    # processing steps to consider
    # - reduce repeated letters e.g. yeeeeeees -> yes
    # - remove links
    # - remove frequent words
    # - remove rare words
    # - apply autocorrect - need GPU to apply this, processing time is too long
    #       spell = Speller()
    #       text = spell(text)

    # tokenisation
    tokens = word_tokenize(text.lower())

    # remove stopwords from tokens
    tokens = [t for t in tokens if t not in stopwords.words("english")]

    # lemmatisation of tokens
    lemmatiser = WordNetLemmatizer()
    tokens = [lemmatiser.lemmatize(t) for t in tokens]

    # remove non-alphanumeric characters from tokens
    # remove empty tokens
    tokens_alphanum = []
    for token in tokens:
        token_alphanum = re.sub(r'[^0-9a-zA-Z\s]+', '', token)
        if len(token_alphanum) > 0:
            tokens_alphanum.append(token_alphanum)

    # consolidate all tokens into string
    processed_text = ' '.join(tokens_alphanum)

    return processed_text

In [140]:
data_pre["Text"] = data_pre["Text"].apply(preprocess_text)
data_pre.head() 

Sentiment Analysis

In [112]:
data_sent = data_pre.copy()

In [113]:
# Divide scores into positive (3,4,5) and negative (1,2)
data_sent["ScoreBinary"] = np.where(data_sent["Score"]>=3, 1, 0)

In [114]:
analyser = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = analyser.polarity_scores(text) 
    if scores['pos'] > 0:
        sentiment = 1
    else:
        sentiment = 0
    return sentiment

In [115]:
data_sent["Sentiment"] = data_sent["Text"].apply(get_sentiment)
data_sent.head() 

Unnamed: 0,Text,Score,ScoreBinary,Sentiment
118218,fan brand extremely disappointed tried bunny g...,1,0,1
97744,sardine absolutely delicious skinless boneless...,5,1,1
56206,taste close fresh lemonade ll probably get mix...,4,1,1
67645,favorite clam dip ever take time make herbsspi...,5,1,1
115111,way compare body wash another dove product exf...,5,1,1


Confusion Matrix

In [123]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


In [124]:
conf_mat = confusion_matrix(data_sent['ScoreBinary'], data_sent['Sentiment'])
class_report = classification_report(data_sent['ScoreBinary'], data_sent['Sentiment'])
accuracy = accuracy_score(data_sent['ScoreBinary'], data_sent['Sentiment'])

In [133]:
print(
    "Confusion Matrix:\n", conf_mat,
    "\n\nClassification Report:\n", class_report,
    "\nAccuracy =", f"{accuracy*100}%"
)

Confusion Matrix:
 [[ 127 1359]
 [ 146 8368]] 

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.09      0.14      1486
           1       0.86      0.98      0.92      8514

    accuracy                           0.85     10000
   macro avg       0.66      0.53      0.53     10000
weighted avg       0.80      0.85      0.80     10000
 
Accuracy = 84.95%
