In [4]:
import warnings
from datetime import datetime

import nltk
import pandas as pd
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

warnings.filterwarnings("ignore")

In [5]:
# Load the customer-review CSV; a comma is already read_csv's default delimiter.
df_data = pd.read_csv("amzn_customer_reviews.csv")

In [6]:
# Preview the first three rows of the freshly loaded data.
df_data.head(3)

Unnamed: 0,Pseudo,Title,Review,Rating,Verified Purchase,Date
0,R.A.O,"\nA Small, But Very Powerful Device\n",\nReview is based of having the computer as a ...,4.4 out of 5 stars,Verified Purchase,"Reviewed in the United States on November 27, ..."
1,Bee Lor,\nExcellent portable gaming laptop\n,\nI'm writing this review for anyone who's on ...,5.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on January 6, 2022"
2,R.A.O,\nThe Golden Standard\n,\nI was weighing my options between getting a ...,3.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on November 27, ..."


## Review cleaning
The titles and reviews carry stray leading/trailing '\n' characters. Let's remove them.

In [7]:
# Strip the newline padding around each title. The vectorised ``.str`` accessor
# passes missing values (NaN) through untouched, whereas calling ``x.strip`` on
# every element raises AttributeError as soon as one title is missing.
df_data["Title"] = df_data["Title"].str.strip("\n")
df_data.head(3)

Unnamed: 0,Pseudo,Title,Review,Rating,Verified Purchase,Date
0,R.A.O,"A Small, But Very Powerful Device",\nReview is based of having the computer as a ...,4.4 out of 5 stars,Verified Purchase,"Reviewed in the United States on November 27, ..."
1,Bee Lor,Excellent portable gaming laptop,\nI'm writing this review for anyone who's on ...,5.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on January 6, 2022"
2,R.A.O,The Golden Standard,\nI was weighing my options between getting a ...,3.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on November 27, ..."


In [8]:
def _strip_newlines(value):
    """Trim leading/trailing newlines from strings; return anything else unchanged."""
    if not isinstance(value, str):
        return value
    return value.strip("\n")


df_data["Review"] = df_data["Review"].apply(_strip_newlines)
df_data.head(3)


Unnamed: 0,Pseudo,Title,Review,Rating,Verified Purchase,Date
0,R.A.O,"A Small, But Very Powerful Device",Review is based of having the computer as a da...,4.4 out of 5 stars,Verified Purchase,"Reviewed in the United States on November 27, ..."
1,Bee Lor,Excellent portable gaming laptop,I'm writing this review for anyone who's on th...,5.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on January 6, 2022"
2,R.A.O,The Golden Standard,I was weighing my options between getting a fu...,3.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on November 27, ..."


In [9]:
# Create the VADER analyzer instance used to score the reviews in a later cell.
vader = SentimentIntensityAnalyzer()

In [10]:
# Drop every row that has a missing value in any column, then force the review
# text to ``str``. Chaining through ``assign`` builds the result in a single
# expression instead of writing into the frame returned by ``dropna``.
# NOTE(review): ``dropna()`` without ``subset`` also discards rows whose only
# gap is in a column other than "Review" — confirm that is intended.
df_data = df_data.dropna().assign(
    Review=lambda frame: frame["Review"].astype(str)
)

In [11]:
def _label_sentiment(compound):
    """Map a compound score to "positive" (> 0), "neutral" (== 0) or "negative"."""
    if compound > 0:
        return "positive"
    if compound == 0:
        return "neutral"
    return "negative"


# Scoring runs on the review text as it stands at this point, i.e. before the
# lower-casing and punctuation stripping done in later cells.
df_data["Score"] = df_data["Review"].apply(vader.polarity_scores)

# Keep the aggregate "compound" value from each score dictionary.
df_data["Compound"] = df_data["Score"].apply(lambda scores: scores["compound"])

# NOTE(review): the cut-off here is exactly 0; the VADER README suggests
# +/-0.05 thresholds — confirm which convention is wanted.
df_data["Sentiment"] = df_data["Compound"].apply(_label_sentiment)

In [12]:
# Count how many reviews fall under each sentiment label.
df_data["Sentiment"].value_counts()

Sentiment
positive    183
negative     37
neutral       9
Name: count, dtype: int64

In [13]:
# Inspect the newly added Score, Compound and Sentiment columns.
df_data.head(3)


Unnamed: 0,Pseudo,Title,Review,Rating,Verified Purchase,Date,Score,Compound,Sentiment
0,R.A.O,"A Small, But Very Powerful Device",Review is based of having the computer as a da...,4.4 out of 5 stars,Verified Purchase,"Reviewed in the United States on November 27, ...","{'neg': 0.015, 'neu': 0.82, 'pos': 0.165, 'com...",0.9984,positive
1,Bee Lor,Excellent portable gaming laptop,I'm writing this review for anyone who's on th...,5.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on January 6, 2022","{'neg': 0.016, 'neu': 0.88, 'pos': 0.104, 'com...",0.9921,positive
2,R.A.O,The Golden Standard,I was weighing my options between getting a fu...,3.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on November 27, ...","{'neg': 0.026, 'neu': 0.864, 'pos': 0.109, 'co...",0.9745,positive


In [14]:
# Lower-case both free-text columns.
for column in ("Title", "Review"):
    df_data[column] = df_data[column].str.lower()

In [15]:
import string

# Deletion table built once: it maps every character in ``string.punctuation``
# to "removed", so it does not have to be rebuilt for each row.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text: str) -> str:
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    all other characters (letters, digits, whitespace, non-ASCII symbols)
    are kept as they are.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every value of both free-text columns
# (each value is lower-cased before being passed to remove_punctuation).
for column in ("Review", "Title"):
    df_data[column] = df_data[column].apply(
        lambda value: remove_punctuation(value.lower())
    )

In [16]:
# Fetch the stop-word corpus (requires ``import nltk`` in the imports cell).
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))


def remove_stop_words(text):
    """Lower-case ``text``, split it on whitespace and drop tokens in ``stop_words``.

    The surviving tokens are joined back together with single spaces.
    """
    return " ".join(
        token for token in text.lower().split() if token not in stop_words
    )


# One shared helper for both free-text columns instead of two copies of the
# same lambda.
# NOTE(review): punctuation was stripped in an earlier cell, so tokens such as
# "im" or "whos" (visible in the preview output) are not matched here — confirm
# whether contractions should be filtered as well.
for column in ("Review", "Title"):
    df_data[column] = df_data[column].apply(remove_stop_words)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/namnguyen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Check the text columns after lower-casing, punctuation and stop-word removal.
df_data.head(5)

Unnamed: 0,Pseudo,Title,Review,Rating,Verified Purchase,Date,Score,Compound,Sentiment
0,R.A.O,small powerful device,review based computer daily driver week meant ...,4.4 out of 5 stars,Verified Purchase,"Reviewed in the United States on November 27, ...","{'neg': 0.015, 'neu': 0.82, 'pos': 0.165, 'com...",0.9984,positive
1,Bee Lor,excellent portable gaming laptop,im writing review anyone whos fence purchasing...,5.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on January 6, 2022","{'neg': 0.016, 'neu': 0.88, 'pos': 0.104, 'com...",0.9921,positive
2,R.A.O,golden standard,weighing options getting full desktop pc getti...,3.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on November 27, ...","{'neg': 0.026, 'neu': 0.864, 'pos': 0.109, 'co...",0.9745,positive
3,Tanmay,powerful size,great laptop things small negative background ...,5.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on July 7, 2021","{'neg': 0.097, 'neu': 0.701, 'pos': 0.201, 'co...",0.9891,positive
4,Brandon Relaz,brand new razer blade 14 broken mouse pad,received gem turned realized track pad super j...,5.0 out of 5 stars,Verified Purchase,"Reviewed in the United States on July 20, 2021","{'neg': 0.084, 'neu': 0.731, 'pos': 0.185, 'co...",0.9859,positive


In [19]:
# List the column names currently present in the frame.
df_data.columns

Index(['Pseudo', 'Title', 'Review', 'Rating', 'Verified Purchase', 'Date',
       'Score', 'Compound', 'Sentiment'],
      dtype='object')

In [21]:
# Relabel the flag text as a plain "Yes". The result is assigned back to the
# column: calling ``replace(..., inplace=True)`` on ``df_data["Verified Purchase"]``
# acts on an intermediate object, which pandas warns about (chained assignment)
# and which leaves ``df_data`` unchanged under copy-on-write.
df_data["Verified Purchase"] = df_data["Verified Purchase"].replace(
    to_replace="Verified Purchase",
    value="Yes",
)

In [22]:
# Confirm that the "Verified Purchase" column now reads "Yes".
df_data.head(3)

Unnamed: 0,Pseudo,Title,Review,Rating,Verified Purchase,Date,Score,Compound,Sentiment
0,R.A.O,small powerful device,review based computer daily driver week meant ...,4.4 out of 5 stars,Yes,"Reviewed in the United States on November 27, ...","{'neg': 0.015, 'neu': 0.82, 'pos': 0.165, 'com...",0.9984,positive
1,Bee Lor,excellent portable gaming laptop,im writing review anyone whos fence purchasing...,5.0 out of 5 stars,Yes,"Reviewed in the United States on January 6, 2022","{'neg': 0.016, 'neu': 0.88, 'pos': 0.104, 'com...",0.9921,positive
2,R.A.O,golden standard,weighing options getting full desktop pc getti...,3.0 out of 5 stars,Yes,"Reviewed in the United States on November 27, ...","{'neg': 0.026, 'neu': 0.864, 'pos': 0.109, 'co...",0.9745,positive


In [23]:
# Persist the processed frame without the index (a comma is the default separator).
# NOTE(review): the output name ("amz_...") differs from the input file
# ("amzn_...") — confirm that is deliberate.
df_data.to_csv("amz_customer_reviews.csv", index=False)