### Sentiment Analysis: [Amazon Fine Food Reviews](https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews)

Import Libraries

In [1]:
import sys
import os
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Import Custom Modules

In [2]:
sys.path.append("Custom Modules")
import custom_module as cm

Data Collection

In [58]:
# Location of the raw dataset, relative to the notebook's working directory.
# os.path.join picks the correct separator for the host OS, so the notebook is
# no longer tied to Windows-style "\\" paths.
folder = os.path.join("Data", "Amazon Fine Food Reviews")
folder_path = os.path.join(os.getcwd(), folder)
file_name = "Reviews.csv"

In [59]:
# Collect every file in the data folder whose name contains `file_name`.
# Sorting makes the concatenation order deterministic (os.listdir order is
# unspecified).
matching_files = sorted(f for f in os.listdir(folder_path) if file_name in f)
if not matching_files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        f"No file matching '{file_name}' found in '{folder_path}'"
    )

# Read each file once; the per-file frames no longer reuse the name `df`,
# so `df` only ever refers to the combined frame.
dfs = [pd.read_csv(os.path.join(folder_path, f)) for f in matching_files]

df = pd.concat(dfs, axis=0)

Data Review

In [60]:
df.head()

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


* Id: Row ID
* ProductId: Unique identifier for the product
* UserId: Unique identifier for the user
* ProfileName: Profile name of the user
* HelpfulnessNumerator: Number of users who found the review helpful
* HelpfulnessDenominator: Number of users who indicated whether they found the review helpful or not
* Score: Rating between 1 and 5
* Time: Timestamp for the review
* Summary: Brief summary of the review
* Text: Text of the review

In [61]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284226.5,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,0.0,1.0,0.0,0.0,1.0,939340800.0
25%,142113.25,142114.25,0.0,0.0,4.0,1271290000.0
50%,284226.5,284227.5,0.0,1.0,5.0,1311120000.0
75%,426339.75,426340.75,2.0,2.0,5.0,1332720000.0
max,568453.0,568454.0,866.0,923.0,5.0,1351210000.0


Data Cleaning

In [62]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched
data = df.copy(deep=True)

In [63]:
# Sample the data.
# A fixed random_state makes the sample, and every result derived from it,
# reproducible across kernel restarts and re-runs.
data = data.sample(n=50000, random_state=42)

In [64]:
# Drop the two row-number columns ("Unnamed: 0" and "Id")
id_columns = ["Unnamed: 0", "Id"]
data = data.drop(columns=id_columns)

In [65]:
# Drop rows that are exact duplicates across all remaining columns
# (i.e. the same user reviewing the same product with identical review details),
# and report the row count before and after.
initial_len = data.shape[0]
data = data.drop_duplicates(keep="first")
post_len = data.shape[0]
print("Before:", initial_len)
print("After:", post_len)

Before: 50000
After: 49998


In [66]:
# The remaining identifier columns and the timestamp are not used further on,
# so drop them and preview the result
unused_columns = ["ProductId", "UserId", "ProfileName", "Time"]
data = data.drop(columns=unused_columns)
data.head()

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,Text
129141,0,1,1,Does not taste good or actually like anything...,This is probably the most bland snack I have e...
28321,0,0,1,Pure trash,"This order was pure trash. The assorted ""Mexic..."
2966,2,2,5,Natural Balance Dry Dog Food is the Best,My 2 dogs were on pro plan when they became ad...
99415,0,0,5,Convenient for home and office,While this coffee is not as good as fresh brew...
61158,1,1,5,purchase candy @ Amazon,I was very satisfied with this purchase. The ...


In [67]:
# Count missing values per column
data.isna().sum(axis=0)

HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Summary                   2
Text                      0
dtype: int64

In [68]:
# Percentage of nulls for Summary.
# Computed from the frame itself (mean of the boolean mask) instead of dividing
# by `post_len`, a variable left over from the de-duplication cell that goes
# stale if cells are re-run or reordered.
data["Summary"].isna().mean() * 100

0.004000160006400256

In [69]:
# Only a minuscule share of rows has no Summary, so simply drop those rows
data = data.dropna(subset=["Summary"])

In [70]:
# Share of voters who found the review helpful (a 0-1 ratio, not a percentage).
# Reviews nobody voted on have a zero denominator; mask those so the ratio is
# an explicit NaN rather than an implicit 0/0 (NaN) or x/0 (inf).
helpfulness_votes = data["HelpfulnessDenominator"]
data["HelpfulnessPerc"] = data["HelpfulnessNumerator"] / helpfulness_votes.where(helpfulness_votes > 0)
data["HelpfulnessCount"] = helpfulness_votes

In [71]:
# Keep only the review text fields and the rating; helpfulness is set aside for now
data = data.loc[:, ["Summary", "Text", "Score"]]

In [72]:
# Class balance: how many reviews carry each rating value
score_counts = data["Score"].value_counts()
print(score_counts)

Score
5    31802
4     7134
1     4642
3     3822
2     2596
Name: count, dtype: int64


Natural Language Toolkit

In [73]:
import nltk 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [74]:
# download nltk corpus (first time only)
# nltk.download('all')

Preprocessing

In [75]:
# Independent (deep) copy for the preprocessing stage; `data` keeps the cleaned raw text
data_pre = data.copy(deep=True)

In [76]:
# Lazily-built cache of the English stopword list (filled on first use).
_STOPWORDS_EN = None

# One shared lemmatiser instead of a new instance per call.
_LEMMATISER = WordNetLemmatizer()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _STOPWORDS_EN
    if _STOPWORDS_EN is None:
        _STOPWORDS_EN = frozenset(stopwords.words("english"))
    return _STOPWORDS_EN


def preprocess_text(text):
    """Normalise a piece of text for downstream analysis.

    Steps: lower-case and tokenise the text, drop English stopwords,
    lemmatise the remaining tokens, and join them back with single spaces.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    str
        Space-separated, lemmatised tokens with stopwords removed.
    """
    # The original evaluated stopwords.words("english") inside the
    # comprehension, i.e. once per token, re-reading the corpus and scanning a
    # list each time. A cached set gives the same filtering with O(1) lookups.
    stop_set = _english_stopwords()

    # tokenisation
    tokens = word_tokenize(text.lower())

    # remove stopwords from tokens
    tokens_filtered = [t for t in tokens if t not in stop_set]

    # lemmatisation of tokens
    tokens_lemmatised = [_LEMMATISER.lemmatize(t) for t in tokens_filtered]

    # rejoin tokens into string
    return ' '.join(tokens_lemmatised)

In [77]:
# Run the preprocessing pipeline over every review body, replacing the raw text
data_pre["Text"] = data_pre["Text"].map(preprocess_text)
data_pre.head()

NLTK Sentiment Analysis

In [None]:
# Independent (deep) copy for the sentiment stage so `data_pre` is left as-is
data_sent = data_pre.copy(deep=True)

In [None]:
analyser = SentimentIntensityAnalyzer()

def get_sentiment(text, threshold=0.05):
    """Classify `text` as positive (1) or not positive (0) using VADER.

    The decision is based on VADER's `compound` score, which combines the
    positive and negative evidence in the text. The earlier rule
    (`scores['pos'] > 0`) returned 1 as soon as the text contained any
    positive token, regardless of how negative the rest of it was.

    Parameters
    ----------
    text : str
        Text to score.
    threshold : float, optional
        Minimum compound score for the text to count as positive.
        Defaults to 0.05, the cut-off conventionally used with VADER.

    Returns
    -------
    int
        1 if the compound score is at least `threshold`, otherwise 0.
    """
    # NOTE(review): in this notebook the function is applied to the
    # preprocessed Text column (lower-cased, stopwords removed). VADER is
    # presumably more accurate on raw text, since stopword removal can strip
    # negations such as "not" -- confirm and consider scoring the raw text.
    scores = analyser.polarity_scores(text)
    return 1 if scores['compound'] >= threshold else 0

In [None]:
# Label every (preprocessed) review with its binary sentiment
data_sent["Sentiment"] = data_sent["Text"].map(get_sentiment)