# Import Libraries

In [75]:
import kagglehub
import pandas as pd

# Download the dataset

In [None]:
# Download latest version
# https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews
# `path` is whatever dataset_download returns; the loading cell below treats it
# as the directory that contains Reviews.csv.
path = kagglehub.dataset_download("snap/amazon-fine-food-reviews")
print("Path to dataset files:", path)

In [None]:
# Load the dataset
# The CSV location is built with pathlib rather than by gluing "/" onto the
# download directory, so the join follows the platform's path rules.
from pathlib import Path

df = pd.read_csv(Path(path) / "Reviews.csv")
print(df.head())

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

# Extract Features and Target

In [None]:
# Score is the rating of the product. This will be our target variable.
df_score = df["Score"]
# Trailing bare expression: the notebook renders the Series as the cell output.
df_score

In [None]:
# List the distinct rating values present in the target column.
df_score.unique()

In [None]:
# Descriptive statistics for the ratings, as produced by Series.describe().
score_statistics = df_score.describe()
# Trailing bare expression: displayed as the cell output.
score_statistics

In [None]:
import matplotlib.pyplot as plt

# Count how many reviews carry each score and draw the counts as bars.
score_counts = df_score.value_counts()
plt.figure(figsize=(10, 2))
score_axes = score_counts.plot.bar()

# Label the axes object returned by the plot call.
score_axes.set_title('Distribution of Scores')
score_axes.set_xlabel('Score')
score_axes.set_ylabel('Count')
plt.show()

In [None]:
# Summary is the title of the review. This will be our feature variable.
df_summary = df["Summary"]
# Trailing bare expression: the notebook renders the Series as the cell output.
df_summary

In [None]:
import numpy as np

# Spot-check ten randomly drawn row positions (randint draws each position
# independently, so the same row can show up more than once).
rand_idxs = np.random.randint(0, len(df), size=10)
for idx in rand_idxs:
    score, summary = df_score.iloc[idx], df_summary.iloc[idx]
    print(f"Score: {score} - Summary: {summary}")

In [85]:
# We zero out the data to free up memory
# Rebinding the name drops this reference to the full DataFrame; the cells
# shown below only work with the extracted df_score and df_summary columns.
df = 0

# Preprocessing

The preprocessing steps we will apply, in order, are:
1. Lower Casing
2. Replacing URLs
3. Replacing Emojis
4. Replacing Usernames
5. Removing Non-Alphabets
6. Removing Consecutive letters
7. Removing Short Words
8. Removing Stopwords
9. Lemmatizing

## Lowercase

In [86]:
def lowercase_text(text):
    """
    Convert a review summary to lowercase.

    Missing values (None, or the float NaN that appears for empty cells in a
    text column) are returned as an empty string. Passing them through str()
    would otherwise inject the literal words "none" / "nan" into the corpus.
    Any other non-string value is converted with str() before lowercasing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return str(text).lower()

In [87]:
# Apply lowercase function to all summaries
# (the result is bound back to df_summary so later steps build on it)
df_summary = df_summary.apply(lowercase_text)

In [None]:
# Sanity check: print ten randomly chosen (score, summary) pairs
print("After lowercase transformation:")
rand_idxs = np.random.randint(0, len(df_summary), size=10)
for idx in rand_idxs:
    score, summary = df_score.iloc[idx], df_summary.iloc[idx]
    print(f"Score: {score} - Summary: {summary}")

## Replace URLs

In [None]:
import re

def replace_urls(text):
    """
    Swap every http:// or https:// link in the text for the token 'URL'.

    Links seldom carry sentiment, so collapsing them into one placeholder
    removes noise and keeps the text consistent.

    The input is converted with str() first, so non-string values are
    accepted. The rewritten string is returned.
    """
    # Scheme, then one or more of: a letter, a digit, a character from the
    # class [$-_@.&+] (inside the class `$-_` is a range, not three literals),
    # one of ! * \ ( ) , or a %XX percent-escape.
    link_regex = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    as_string = str(text)
    return link_regex.sub('URL', as_string)

# Run the URL replacement over every summary
df_summary = df_summary.apply(replace_urls)

# Sanity check: print ten randomly chosen (score, summary) pairs
print("After URL replacement:")
rand_idxs = np.random.randint(0, len(df_summary), size=10)
for idx in rand_idxs:
    score, summary = df_score.iloc[idx], df_summary.iloc[idx]
    print(f"Score: {score} - Summary: {summary}")


## Replacing Emojis

In [90]:
import re

# Code-point ranges treated as emoji. Each entry is one "start-end" range; all
# of them are placed inside a single regex character class below.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",  # Emoticons
    "\U0001F300-\U0001F5FF",  # Symbols & pictographs
    "\U0001F680-\U0001F6FF",  # Transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # Flags (pairs of regional indicators)
    "\u2700-\u27BF",          # Dingbats
    "\U0001F900-\U0001F9FF",  # Supplemental Symbols & Pictographs
    "\U0001FA70-\U0001FAFF",  # Symbols & Pictographs Extended-A
    "\u2600-\u26FF",          # Miscellaneous symbols
)

# Compiled once so the same pattern object can be reused for every summary.
# The trailing "+" makes a run of consecutive emoji a single match.
emoji_pattern = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", flags=re.UNICODE)


In [91]:
# Text emoticons built from plain ASCII characters, which the emoji Unicode
# ranges do not cover. Examples include:
# :) - happy face
# :( - sad face
# :D - laughing face
# ;) - winking face
#
# Matching is case-insensitive because the summaries are lowercased before
# this pattern is applied, so ":D" / ":P" arrive as ":d" / ":p" and would
# otherwise never match. The letter variants carry a (?![a-z]) guard so that a
# colon followed by an ordinary word (e.g. "warning:do not buy") is not
# mistaken for an emoticon.
emoticon_pattern = re.compile(
    r"(:\)|:\(|:D(?![a-z])|:P(?![a-z])|;\)|:-\)|:-D(?![a-z])|:-P(?![a-z])|:'\(|:\||:\*)",
    flags=re.IGNORECASE,
)

In [92]:
def remove_and_print(text):
    """
    Strip emojis and text emoticons from a string, echoing any change.

    When neither emoji_pattern nor emoticon_pattern finds a match, the text
    is returned untouched and nothing is printed. Otherwise the original text
    is printed, all matches of both patterns are deleted, and the cleaned text
    is printed and returned.
    """
    has_symbol = bool(emoji_pattern.search(text) or emoticon_pattern.search(text))
    if not has_symbol:
        return text

    print(text)
    # Emojis are removed first, then the ASCII emoticons.
    cleaned = emoticon_pattern.sub('', emoji_pattern.sub('', text))
    print(cleaned)
    return cleaned

In [None]:
# Strip emojis/emoticons from every summary. remove_and_print echoes each
# affected summary before and after cleaning, so this cell can print a lot.
df_summary = df_summary.apply(remove_and_print)

## Replacing Usernames

In [94]:
import re

def replace_usernames(text):
    """
    Substitute every @mention in the text with the token '[USER]'.

    A mention is an '@' followed by one or more word characters. Usernames
    identify people and say nothing about sentiment, so they are folded into
    one generic token to cut noise and protect privacy.

    The input is converted with str() first; the rewritten string is returned.
    """
    mention_regex = re.compile(r'@\w+')
    return mention_regex.sub('[USER]', str(text))

# Apply username replacement to all summaries
# (the result is bound back to df_summary so later steps build on it)
df_summary = df_summary.apply(replace_usernames)

## Removing Non-Alphabets

In [None]:
def remove_non_alphabets(text):
    """
    Delete every character that is neither an ASCII letter nor whitespace.

    Digits and symbols such as #, $ or punctuation mostly add noise to the
    analysis; dropping them leaves only the words themselves, which is what
    matters for sentiment.

    The input is converted with str() first; the filtered string is returned.
    Whitespace is kept as-is, so removed characters can leave double spaces.
    """
    unwanted_chars = re.compile(r'[^a-zA-Z\s]')
    as_string = str(text)
    return unwanted_chars.sub('', as_string)

# Apply non-alphabet removal to all summaries
# (the result is bound back to df_summary so later steps build on it)
df_summary = df_summary.apply(remove_non_alphabets)

# Display a sample of the cleaned summaries
print(df_summary.head())

## Removing Consecutive letters

In [None]:
def remove_consecutive_letters(text):
    """
    Collapse runs of three or more identical characters down to two.

    In informal text like reviews, people often express emotions by elongating
    words (e.g., 'sooo good', 'loooove it'). Normalizing every such run to
    exactly two occurrences standardizes the text while preserving a hint of
    the emphasis. Runs of one or two characters (as in 'good') are untouched.

    The input is converted with str() first; the normalized string is returned.
    """
    # (\w) captures one character and \1{2,} requires at least two further
    # copies of it, i.e. a run of three or more in total. The earlier {3,}
    # only fired on runs of four or more, so 'sooo' slipped through.
    pattern = r'(\w)\1{2,}'

    # Replace the whole run with just two occurrences of the character
    return re.sub(pattern, r'\1\1', str(text))

# Apply consecutive letter removal to all summaries
# (the result is bound back to df_summary so later steps build on it)
df_summary = df_summary.apply(remove_consecutive_letters)

# Display a sample of the cleaned summaries
print(df_summary.head())

## Removing Short Words

In [97]:
def remove_short_words(text):
    """
    Drop every whitespace-separated word shorter than three characters.

    Tiny words such as 'a', 'an' or 'to' rarely carry sentiment, so filtering
    them out reduces noise and leaves the more significant words.

    The input is converted with str() first. The surviving words are returned
    joined by single spaces, so the original spacing is not preserved.
    """
    minimum_length = 3
    long_enough = filter(
        lambda token: len(token) >= minimum_length,
        str(text).split(),
    )
    return ' '.join(long_enough)

# Apply short word removal to all summaries
# (the result is bound back to df_summary so later steps build on it)
df_summary = df_summary.apply(remove_short_words)

## Removing Stopwords

In [None]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
# nltk.data.find raising LookupError is used as the "not available locally"
# signal that triggers the download.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Peek at the first ten entries of the English stopword list.
print("Sample stopwords:", list(stopwords.words('english'))[:10])

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the English stopword list as a frozenset, built on first use."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """
    Remove common stopwords from text.

    Stopwords are common words like 'the', 'and', 'is', etc. that don't carry
    much meaning for sentiment analysis. Removing them helps focus on the more
    meaningful content words.

    The input is converted with str() and split on whitespace; each word is
    compared case-insensitively against the English stopword list. The
    remaining words are returned joined by single spaces.
    """
    # The stopword set is cached: this function is applied to every summary,
    # and the set was previously rebuilt from stopwords.words('english') on
    # each call.
    stop_words = _english_stop_words()

    kept_words = [
        word for word in str(text).split() if word.lower() not in stop_words
    ]
    return ' '.join(kept_words)

# Apply stopword removal to all summaries
# (the result is bound back to df_summary so later steps build on it)
df_summary = df_summary.apply(remove_stopwords)

# Display a sample of the cleaned summaries
print(df_summary.head())

## Stemming and Lemmatizing

In [None]:
# Stemming and Lemmatization
# --------------------------
# Both stemming and lemmatization are text normalization techniques used to reduce words to their base forms.
#
# Stemming:
# - A rule-based process that removes word endings to get the root form (stem)
# - Fast but often produces non-words (e.g., "running" → "run", but "trouble" → "troubl")
# - Less accurate but computationally efficient
# - Useful for search engines where exact word forms are less important
#
# Lemmatization:
# - Converts words to their dictionary base form (lemma) using vocabulary and morphological analysis
# - More accurate as it produces actual words (e.g., "better" → "good")
# - Considers the context and part of speech
# - Computationally more intensive but produces better results for NLP tasks
# - Preferred for sentiment analysis and other advanced NLP applications

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download WordNet if not already downloaded
# (nltk.data.find raising LookupError is the "not available locally" signal)
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Initialize stemmer and lemmatizer
# (`lemmatizer` is a module-level object that lemmatize_text below also uses)
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Example text for demonstration
example_text = "The cats are running and jumping in the fields"
words = example_text.split()

print("Original words:", words)

# Stemming example
stemmed_words = [stemmer.stem(word) for word in words]
print("Stemmed words:", stemmed_words)

# Lemmatizing example
# NOTE(review): lemmatize() is called without a part-of-speech argument here;
# NLTK documents the default as noun, so verb forms such as "running" are
# presumably left unchanged in this demo — confirm against the printed output.
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
print("Lemmatized words:", lemmatized_words)

In [None]:
# Lemmatizing with part of speech tagging
# Download necessary resources for POS tagging. Depending on the installed
# NLTK release the perceptron tagger model is looked up under the plain name
# or under the "_eng"-suffixed name, so make sure both are available.
for _tagger_resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    try:
        nltk.data.find(f'taggers/{_tagger_resource}')
    except LookupError:
        nltk.download(_tagger_resource)

from nltk.tag import PerceptronTagger

# Load the tagger model once; lemmatize_text is applied to every summary and
# must not reload the model per call.
_pos_tagger = PerceptronTagger()

# First letter of a Penn Treebank tag -> WordNet part-of-speech code.
_PENN_TO_WORDNET = {'J': 'a', 'V': 'v', 'R': 'r', 'N': 'n'}


# Function to lemmatize text with POS tagging
def lemmatize_text(text):
    """
    Lemmatize text using WordNet lemmatizer with POS tagging.

    Lemmatization reduces words to their base or dictionary form (lemma). Each
    word is first tagged with its part of speech, and that tag is handed to
    the lemmatizer; without it every word would be treated as a noun and verb
    or adjective forms would be left unchanged.

    Example (given the tagger assigns the expected tag):
    - "running" -> "run" (tagged as verb)
    - "better" -> "good" (tagged as adjective)

    The input is converted with str() and split on whitespace. The lemmas are
    returned joined by single spaces; empty input yields an empty string.
    """
    # Tokenize the text
    words = str(text).split()
    if not words:
        return ''

    # Tag each word, map the tag to a WordNet POS (noun when the tag has no
    # WordNet counterpart), then lemmatize with that POS.
    lemmatized_words = [
        lemmatizer.lemmatize(word, _PENN_TO_WORDNET.get(tag[:1], 'n'))
        for word, tag in _pos_tagger.tag(words)
    ]

    # Join the lemmatized words back into a string
    return ' '.join(lemmatized_words)

# Apply lemmatization to all summaries
# (the result is bound back to df_summary; the word clouds below read it)
df_summary = df_summary.apply(lemmatize_text)

# Display a sample of the lemmatized summaries
print("\nLemmatized summaries:")
print(df_summary.head())

# Visualize

## Word Cloud for positive sentiments

In [None]:
from wordcloud import WordCloud

# Keep only the summaries whose score is 4 or higher
filtered_summaries = df_summary[df_score >= 4]

# The word cloud is generated from one string, so glue the summaries together
all_summaries = " ".join(map(str, filtered_summaries))

# Build the word cloud image from the combined text
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_summaries)

# Drop the reference to the large joined string
all_summaries = 0

# Render the word cloud without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

## Word Cloud for negative sentiments

In [None]:
from wordcloud import WordCloud

# Keep only the summaries whose score is exactly 1
filtered_summaries = df_summary[df_score == 1]

# The word cloud is generated from one string, so glue the summaries together
all_summaries = " ".join(map(str, filtered_summaries))

# Build the word cloud image from the combined text
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(all_summaries)

# Drop the reference to the large joined string
all_summaries = 0

# Render the word cloud without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()