In [1]:
#import any necessary libraries
import numpy as np
import pandas as pd
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob



In [2]:
#load the spacy english model (medium) and add pipeline
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x120d1bb80>

In [3]:
#load the dataset into pandas dataframe
amazon_df = pd.read_csv('amazon_product_reviews.csv', low_memory=False)
amazon_df

In [15]:
#display dataframe
print(amazon_df["reviews.text"].head())
print(amazon_df["reviews.text"].shape)
amazon_df["reviews.text"].isnull().sum()

0    I thought it would be as big as small paper bu...
1    This kindle is light and easy to use especiall...
2    Didnt know how much i'd use a kindle so went f...
3    I am 100 happy with my purchase. I caught it o...
4    Solid entry level Kindle. Great for kids. Gift...
Name: reviews.text, dtype: object
(5000,)


0

In [5]:
#drop any rows with missing values and prints
clean_data = amazon_df[["reviews.text", "reviews.title"]]
clean_data.head()

(5000,)
0    I thought it would be as big as small paper bu...
1    This kindle is light and easy to use especiall...
2    Didnt know how much i'd use a kindle so went f...
3    I am 100 happy with my purchase. I caught it o...
4    Solid entry level Kindle. Great for kids. Gift...
Name: reviews.text, dtype: object


In [6]:
# Function to preprocess and extract individual words
def preprocess_text(text):
    doc = nlp(text)
    preprocessed_tokens = []
    
    for token in doc:
        if not token.is_stop and not token.is_punct:
            clean_token = token.lemma_.lower()
            preprocessed_tokens.append(clean_token)
    
    return preprocessed_tokens

In [7]:
clean_data['processed_reviews'] = clean_data['reviews.text'].apply(preprocess_text)
clean_data.head()

0    think big small paper turn like palm think sma...
1               kindle light easy use especially beach
2    not know use kindle go low end m happy little ...
3    100 happy purchase catch sale good price norma...
4    solid entry level kindle great kid gifted kid ...
Name: reviews.text, dtype: object


In [9]:
# Sentiment analysis function
def sentiment_analysis(review):
    polarity_value = review._.blob.polarity
    subjectivity_value = review._.blob.subjectivity
    
    return polarity_value, subjectivity_value


In [12]:
#load the spacy english model (medium)
nlp = spacy.load('en_core_web_md')

In [13]:
#create function to compare the similarity between two reviews
def similarity(first, second):
    similarity_results = nlp(first).similarity(nlp(second))
    return(similarity_results)

In [14]:
first_review = clean_data['processed_reviews'][203]
second_review = clean_data['processed_reviews'][659]
print(f"Review one: {first_review}")
print(f"Review two: {second_review}")
print(f"Similarity: {similarity(first_review, second_review)}")


Review one: This Echo Show comes in handy. I use it almost as much as I thought I would. I am still learning all that it can do, but my favorite part is the screen. It‚Äôs not a huge deal, but if I could change one thing it would be a detachable power cable instead of hardwired.
Review two: Great item to upgrade your house. Works very well.
Similarity: ('This Echo Show comes in handy. I use it almost as much as I thought I would. I am still learning all that it can do, but my favorite part is the screen. It‚Äôs not a huge deal, but if I could change one thing it would be a detachable power cable instead of hardwired.', 'Great item to upgrade your house. Works very well.')


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
from collections import defaultdict

In [None]:
positive_words = defaultdict(int)
negative_words = defaultdict(int)

# Process each review, extract individual words, and perform sentiment analysis
for review_text in clean_data['reviews.text'].values:
    tokens = preprocess_text(review_text)
    review_doc = nlp(" ".join(tokens))
    polarity_value, _ = sentiment_analysis(review_doc)
    
    if polarity_value > 0:
        for word in tokens:
            positive_words[word] += 1
    elif polarity_value < 0:
        for word in tokens:
            negative_words[word] += 1

# Check the content of positive words and negative words
print("Positive words:", positive_words)
print("Negative words:", negative_words)

In [None]:
# Check the content of positive_words and negative_words
print("Positive words:", positive_words)
print("Negative words:", negative_words)

print("Positive words count:", len(positive_words))
print("Negative words count:", len(negative_words))

In [None]:
# Generate word clouds from positive and negative word frequencies
pos_wordcloud = WordCloud(width=400, height=200, background_color='white').generate_from_frequencies(positive_words)
neg_wordcloud = WordCloud(width=400, height=200, background_color='white').generate_from_frequencies(negative_words)

# Display the word clouds
fig, ax = plt.subplots(1, 2, figsize=(15, 7))

ax[0].imshow(pos_wordcloud, interpolation='bilinear')
ax[0].set_title('Positive Words')
ax[0].axis('off')

ax[1].imshow(neg_wordcloud, interpolation='bilinear')
ax[1].set_title('Negative Words')
ax[1].axis('off')