In [28]:
import pandas as pd              # For loading and handling the dataset
import spacy                     # For natural language processing
import matplotlib.pyplot as plt  # For visualizing sentiment distribution
from textblob import TextBlob

In [29]:
# Read the reviews CSV into a DataFrame, then load the `en_core_web_md`
# spaCy pipeline that preprocess_text calls on every review.
df = pd.read_csv(
    filepath_or_buffer="Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv"
)
nlp = spacy.load(name="en_core_web_md")

In [30]:
# Pull the raw review-text column out of the full dataset as a Series
reviews_data = df.loc[:, 'reviews.text']

In [31]:
# Remove rows whose review text is missing (NaN).
# The explicit .copy() makes clean_data an independent DataFrame, so adding
# the 'cleaned_review' column to it later does not trigger pandas'
# SettingWithCopyWarning.
clean_data = df.dropna(subset=['reviews.text']).copy()

In [32]:
def preprocess_text(text):
    """
    Cleans and preprocesses a single review.
    - Removes stop words
    - Removes punctuation, digits and any other non-alphabetic tokens
    - Lemmatizes each remaining token
    - Converts to lowercase

    Parameters
    ----------
    text : str
        Raw review text. Non-string values (for example None, or a NaN left
        over from a missing review) and blank strings are treated as empty.

    Returns
    -------
    str
        The surviving lowercase lemmas joined by single spaces, or "" when
        there is nothing to process or no token survives the filters.
    """
    # Missing or blank reviews have nothing to clean; returning early keeps
    # the function safe to use on a column that still contains NaN values
    # and avoids handing a non-string object to the spaCy pipeline.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Process the text using spaCy
    doc = nlp(text)

    # Keep only meaningful words: alphabetic tokens that are not stop words,
    # reduced to their lemma and lowercased
    tokens = [
        token.lemma_.lower()
        for token in doc
        if token.is_alpha and not token.is_stop
    ]

    # Join tokens back into a cleaned sentence
    return " ".join(tokens)


In [33]:
# Run every review through preprocess_text and store the result in a new column
clean_data['cleaned_review'] = clean_data['reviews.text'].map(preprocess_text)

In [34]:
def get_sentiment(review_text, threshold=0.1):
    """
    Takes a product review as input and predicts its sentiment.
    Uses TextBlob to calculate polarity (from -1 to +1):
        >  threshold: Positive
        < -threshold: Negative
        Otherwise: Neutral

    Parameters
    ----------
    review_text : str
        The review text to score.
    threshold : float, optional
        Half-width of the neutral band around zero polarity. Defaults to
        0.1, which reproduces the original fixed cut-offs of +0.1 / -0.1.

    Returns
    -------
    str
        One of 'positive', 'negative' or 'neutral'.

    Raises
    ------
    ValueError
        If threshold is negative (the positive and negative bands would
        overlap and the labels would be ambiguous).
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    # Compute polarity using TextBlob
    polarity = TextBlob(review_text).sentiment.polarity

    # Categorize sentiment; values exactly on a boundary count as neutral
    if polarity > threshold:
        return 'positive'
    if polarity < -threshold:
        return 'negative'
    return 'neutral'

In [35]:
# Draw a reproducible random sample of four reviews and label each one by
# running get_sentiment on its cleaned text
sample_real_reviews = (
    clean_data[['reviews.text', 'cleaned_review']]
    .sample(n=4, random_state=42)
    .assign(
        predicted_sentiment=lambda frame: frame['cleaned_review'].apply(get_sentiment)
    )
)

# Show each original review followed by its predicted label
for original_text, sentiment in zip(
    sample_real_reviews['reviews.text'],
    sample_real_reviews['predicted_sentiment'],
):
    print("Original Review:")
    print(original_text)
    print(f"Predicted Sentiment: {sentiment}")
    print("-" * 90)

Original Review:
Awesome tablet. I was amazed how fast it is. And the software is very user friendly
Predicted Sentiment: positive
------------------------------------------------------------------------------------------
Original Review:
They don't last. USed in electronics (like computer mice, computer keyboards) Energizer or Duracell last easily 3x longer. Not worth the savings. If its going in some kids game or infrequent use (where the battery would be dead later anyways) ok.
Predicted Sentiment: neutral
------------------------------------------------------------------------------------------
Original Review:
Thx.
Predicted Sentiment: neutral
------------------------------------------------------------------------------------------
Original Review:
kids love it EZ to use great Quality bought this for the grand kids had super reviews also free Amazon for a year 2 year warranty its the best buy out there
Predicted Sentiment: positive
------------------------------------------------

### References:

- Stack Overflow: How to filter stopwords for spaCy tokenized text contained in a Pandas dataframe
https://stackoverflow.com/questions/62266678/how-to-filter-stopwords-for-spacy-tokenized-text-contained-in-a-pandas-dataframe

- TextBlob: Tutorial: Quickstart — TextBlob 0.19.0 documentation
https://textblob.readthedocs.io/en/dev/quickstart.html