In [1]:
# Import necessary libraries
import spacy
import pandas as pd
from spacytextblob.spacytextblob import SpacyTextBlob
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import textwrap

# Load the spaCy model
nlp = spacy.load('en_core_web_sm') 
nlp.add_pipe('spacytextblob')


  from .autonotebook import tqdm as notebook_tqdm


<spacytextblob.spacytextblob.SpacyTextBlob at 0x1791328c830>

In [2]:
# Load in the dataset
df = pd.read_csv('amazon_product_reviews.csv')

# Understanding the dataset
print('df.info: ', df.info())
print('df.shape: ', df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34660 non-null  object 
 1   name                  27900 non-null  object 
 2   asins                 34658 non-null  object 
 3   brand                 34660 non-null  object 
 4   categories            34660 non-null  object 
 5   keys                  34660 non-null  object 
 6   manufacturer          34660 non-null  object 
 7   reviews.date          34621 non-null  object 
 8   reviews.dateAdded     24039 non-null  object 
 9   reviews.dateSeen      34660 non-null  object 
 10  reviews.didPurchase   1 non-null      object 
 11  reviews.doRecommend   34066 non-null  object 
 12  reviews.id            1 non-null      float64
 13  reviews.numHelpful    34131 non-null  float64
 14  reviews.rating        34627 non-null  float64
 15  reviews.sourceURLs 

  df = pd.read_csv('amazon_product_reviews.csv')


In [3]:
# function for Text Preprocessing
def preprocess_text(text, filter_stopwords=True):
    # Create a spaCy document
    doc = nlp(text)
    
    # Remove stopwords (default option) and punctuation
    filtered_tokens = [token for token in doc if (not filter_stopwords or 
                                                  not token.is_stop) and not token.is_punct]
    
    # Lemmatize the tokens
    lemmatized_tokens = [token.lemma_ for token in filtered_tokens]
        
    # NER regconition
    print('NER          : ', [(i, i.label_, i.label) for i in doc.ents])

    # Join the lemmatized tokens back into a string
    preprocessed_text = ' '.join(lemmatized_tokens)
    print('Preprocessed : ', preprocessed_text)
    
    return preprocessed_text

In [11]:
# Preprocess the reviews.text column

# Remove missing values
# There is 1 missing value in reviews.text
df = df.dropna(subset=['reviews.text'])

# Convert the value to string to avoid error in analysis
df['preprocessed_text'] = df['reviews.text'].apply(str)

# Strip to remove leading space characters that reduces NER
df['preprocessed_text'] = df['preprocessed_text'].str.strip()

# Test the model on sample product reviews (first 60 reviews in the dataset). 
# reviews_data = df.iloc[0:60,[11,14,16,17,20,21]]  # Use columns related to review only
reviews_data = df.loc[0:60,['reviews.rating','reviews.text','preprocessed_text']]

# Apply SpaCy preprocess to review text 
reviews_data['preprocessed_text'] = reviews_data['preprocessed_text'].apply(
    preprocess_text)

reviews_data

NER          :  []
Preprocessed :  product far disappoint child love use like ability monitor control content ease
NER          :  []
Preprocessed :  great beginner experienced person buy gift love
NER          :  [(Skype, 'ORG', 383)]
Preprocessed :  inexpensive tablet use learn step NABI thrilled learn Skype
NER          :  [(8 two weeks, 'DATE', 391), (SHINES, 'ORG', 383), (1280/800, 'CARDINAL', 397), (900, 'MONEY', 394), (INSANELY, 'ORG', 383), (only 7.7mm, 'QUANTITY', 395)]
Preprocessed :  Fire HD 8 week love tablet great value Prime Members tablet SHINES love able easily access Prime content movie download watch laterthis 1280/800 screen nice look nice crisp bright infact bright ipad pro cost $ 900 base model build fire INSANELY AWESOME run 7.7 mm thick smooth glossy feel amazing hold like futuristic tab ur hand
NER          :  [(Amazon, 'ORG', 383), (64gig, 'CARDINAL', 397), (hundreds of dollars, 'MONEY', 394)]
Preprocessed :  buy grand daughter come visit set user enter age Ama

Unnamed: 0,reviews.doRecommend,reviews.rating,reviews.text,reviews.title,reviews.username,preprocessed_text
0,True,5.0,This product so far has not disappointed. My c...,Kindle,Adapter,product far disappoint child love use like abi...
1,True,5.0,great for beginner or experienced person. Boug...,very fast,truman,great beginner experienced person buy gift love
2,True,5.0,Inexpensive tablet for him to use and learn on...,Beginner tablet for our 9 year old son.,DaveZ,inexpensive tablet use learn step NABI thrille...
3,True,4.0,I've had my Fire HD 8 two weeks now and I love...,Good!!!,Shacks,Fire HD 8 week love tablet great value Prime M...
4,True,5.0,I bought this for my grand daughter when she c...,Fantastic Tablet for kids,explore42,buy grand daughter come visit set user enter a...
5,True,5.0,This amazon fire 8 inch tablet is the perfect ...,Just what we expected,tklit,amazon fire 8 inch tablet perfect size purchas...
6,True,4.0,"Great for e-reading on the go, nice and light ...",great e-reader tablet,Droi,great e reading nice light weight price point ...
7,True,5.0,"I gave this as a Christmas gift to my inlaws, ...",Great for gifts,Kacy,give Christmas gift inlaw husband uncle love e...
8,True,5.0,Great as a device to read books. I like that i...,Great for reading,Weebee,great device read book like link borrow librar...
9,True,5.0,I love ordering books and reading them with th...,Great and lightweight reader,RoboBob,love order book read reader


In [5]:
# function for sentiment analysis
def analyze_sentiment(text):
    # Create a spaCy document
    doc = nlp(text)
    
    # Get the sentiment polarity and subjectivity
    
    # sentiment = doc.sentiment     # always 0
    polarity = doc._.blob.polarity
    subjectivity = doc._.blob.subjectivity
    sent = doc._.blob.sentiment

    # Determine the sentiment label
    if polarity > 0:
        sentiment = 'Positive'
    elif polarity < 0:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'

    return sentiment, polarity


In [10]:
# Save sentiment and polarity in the dataset for further review
reviews_data['sentiment'], reviews_data['polarity'] = zip(
    *reviews_data['preprocessed_text'].apply(analyze_sentiment))

# View the analysis result
for index, row in reviews_data.sort_values('polarity').iterrows():
    print(f"index: {index}")
    print(textwrap.fill(f"Review: {row['reviews.text']}", 180))  # Wrap for long text
    print(f"Preprocessed Text: {row['preprocessed_text']}")
    print(f"Review Rating: {row['reviews.rating']}")   # For verifying sentiment result
    print(f"Sentiment: {row['sentiment']}")  # Result from the model
    print(f"Polarity: {row['polarity']}")  # Result from the model
    print()

index: 17
Review: I really like this tablet. I would have given 5 stars but sometimes you have to push start several times after you unlock the screen and it is a little annoying.
Preprocessed Text: like tablet give 5 star push start time unlock screen little annoying
Review Rating: 4.0
Sentiment: Negative
Polarity: -0.49375

index: 58
Review: My daughter likes this tablet to play her online games!
Preprocessed Text: daughter like tablet play online game
Review Rating: 5.0
Sentiment: Negative
Polarity: -0.4

index: 34
Review: Just the right size for reading books and playing some games.
Preprocessed Text: right size read book play game
Review Rating: 5.0
Sentiment: Negative
Polarity: -0.05714285714285716

index: 13
Review: Simply does everything I need. Thank youAnd silk works wonders
Preprocessed Text: simply need thank youAnd silk work wonder
Review Rating: 5.0
Sentiment: Neutral
Polarity: 0.0

index: 38
Review: I bought 3 tablets and my family was not disappointed.
Preprocessed Text

In [9]:
# Further experiments on the model 
 
# Strip the text can help on NER
text = '        Google is  a good searching engine and fast to get result.'
print('text: ', text)
print('Result with strip():', analyze_sentiment(preprocess_text(text.strip())))
print('Result w/o  strip():', analyze_sentiment(preprocess_text(text)))  # Can't recongnise Google as ORG

# The lower/upper cases of name may affect NER
text = 'This travel book introduces lots of great places in USA, and they are not well known by people.'
print('\ntext: ', text)
print('Result with lower():', analyze_sentiment(preprocess_text(text.lower())))  # Can't recongnise USA
print('Result with upper():', analyze_sentiment(preprocess_text(text.upper())))
print('Result w/o changing case:', analyze_sentiment(preprocess_text(text)))

# 'Not' is removed causes wrong analysis result
text = 'Not easy for elderly users cease of ads that pop up.'
# text = 'I am very happy.'
print('\ntext: ', text)
print('Result with removing stopwords:', analyze_sentiment(preprocess_text(
    text, filter_stopwords=True)))  # Wronly interpreted as positive sentiment
print('Result w/o  removing stopwords:', analyze_sentiment(preprocess_text(
    text, filter_stopwords=False)))

text:          Google is  a good searching engine and fast to get result.
NER          :  [(Google, 'ORG', 383)]
Preprocessed :  Google   good searching engine fast result
Result with strip(): ('Positive', 0.44999999999999996)
NER          :  []
Preprocessed :           Google   good searching engine fast result
Result w/o  strip(): ('Positive', 0.44999999999999996)

text:  This travel book introduces lots of great places in USA, and they are not well known by people.
NER          :  []
Preprocessed :  travel book introduce lot great place usa know people
Result with lower(): ('Positive', 0.8)
NER          :  [(USA, 'GPE', 384)]
Preprocessed :  travel BOOK introduce LOTS great place USA know PEOPLE
Result with upper(): ('Positive', 0.8)
NER          :  [(USA, 'GPE', 384)]
Preprocessed :  travel book introduce lot great place USA know people
Result w/o changing case: ('Positive', 0.8)

text:  Not easy for elderly users cease of ads that pop up.
NER          :  []
Preprocessed :  easy el