In [1]:
import pandas as pd
import numpy as np
import re
import os
import string
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Download necessary resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /Users/mayi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mayi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mayi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load dataset
file_path = os.path.expanduser("~/Desktop/School/6200/Project/Reviews.csv")  
df = pd.read_csv(file_path)

print(df.head())

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [4]:
# Display column names
print("Columns in the dataset:\n", df.columns)

Columns in the dataset:
 Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')


In [5]:
# Get general info
print(df.info())

# Descriptive statistics
print(df.describe())

# Check for missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB
None
                  Id  HelpfulnessNumerator  HelpfulnessDenominator  \
count  568454.000000         568454.000000            568454.00000   
mean   284227.500000              1.743817               

In [6]:
# Add a new column for text length
df["Text_Length"] = df["Text"].astype(str).apply(len)

# Display descriptive statistics for text length
print(df["Text_Length"].describe())

count    568454.000000
mean        436.222083
std         445.339741
min          12.000000
25%         179.000000
50%         302.000000
75%         527.000000
max       21409.000000
Name: Text_Length, dtype: float64


In [7]:
# Read a few random review samples
print(df["Text"].sample(10, random_state=45))

165256    Having tried a couple of other brands of glute...
231465    My cat loves these treats. If ever I can't fin...
427827    A little less than I expected.  It tends to ha...
433954    First there was Frosted Mini-Wheats, in origin...
70260     and I want to congratulate the graphic artist ...
49866     Please add more Pineapple flavor to your packa...
551047    I absolutely love Yorkshire tea and am so glad...
18983     I have such a hard time finding loose tea loca...
138968    Previously, I've attempted a recipe with white...
36352     I make pancakes or waffles every Saturday morn...
Name: Text, dtype: object


In [8]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [9]:
# Text cleaning function
def clean_text(text):
    if not isinstance(text, str):
        return ""  # Handle missing values
    
    # 1. Remove HTML tags using a regex pattern
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags

    # 2. Replace URLs with a placeholder "URL"
    text = re.sub(r'http[s]?://\S+', 'URL', text)

    # 3. Convert to lowercase
    text = text.lower()

    # 4. Remove special characters, numbers, and punctuation
    text = re.sub(r"[^a-z\s]", "", text)

    # 5. Tokenize the text into words
    words = word_tokenize(text)

    # 6. Remove stopwords and lemmatize
    cleaned_text = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # 7. Replace multiple spaces with a single space
    cleaned_text = " ".join(cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text

In [10]:
# Apply the updated cleaning function to the Text column
df["Cleaned_Text"] = df["Text"].apply(clean_text)

In [11]:
# Spot-checking
spotcheck_sample = df[["Text", "Cleaned_Text"]].sample(5, random_state=45)
print(spotcheck_sample)

                                                     Text  \
300665  I have used this product before and found it t...   
167485  The product arrived a bit later than expected ...   
313460  I never eat this stuff, usually but Walmart ha...   
7589    It's all natural, no artificial color, no weir...   
48968   I've not actually used any other filter paper ...   

                                             Cleaned_Text  
300665  used product found best product jerk chicken c...  
167485  product arrived bit later expected arrived goo...  
313460  never eat stuff usually walmart sitting right ...  
7589    natural artificial color weird ingredient also...  
48968   ive actually used filter paper cant make compa...  


In [14]:
# Initialize VADER Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

In [19]:
# Tokenize the Cleaned_Text once and store it in a new column 'Tokens'
df['Tokens'] = df['Cleaned_Text'].apply(word_tokenize)

# Apply Feature Engineering on the pre-tokenized text
def extract_features_from_tokens(tokens):
    word_count = len(tokens)
    unique_word_count = len(set(tokens))
    avg_word_length = sum(len(word) for word in tokens) / word_count if word_count else 0

    # Sentiment analysis with VADER
    sentiment = analyzer.polarity_scores(" ".join(tokens))  # VADER expects a string, not tokens

    return [word_count, unique_word_count, avg_word_length, sentiment['compound']]

# Now apply the feature extraction on the 'Tokens' column (pre-tokenized)
df[['Word_Count', 'Unique_Word_Count', 'Avg_Word_Length', 'Sentiment_Score']] = df['Tokens'].apply(
    lambda x: pd.Series(extract_features_from_tokens(x)))

# Spot-check the results
print(df[['Text', 'Cleaned_Text', 'Tokens', 'Word_Count', 'Unique_Word_Count', 'Avg_Word_Length', 'Sentiment_Score']].head())


                                                Text  \
0  I have bought several of the Vitality canned d...   
1  Product arrived labeled as Jumbo Salted Peanut...   
2  This is a confection that has been around a fe...   
3  If you are looking for the secret ingredient i...   
4  Great taffy at a great price.  There was a wid...   

                                        Cleaned_Text  \
0  bought several vitality canned dog food produc...   
1  product arrived labeled jumbo salted peanutsth...   
2  confection around century light pillowy citrus...   
3  looking secret ingredient robitussin believe f...   
4  great taffy great price wide assortment yummy ...   

                                              Tokens  Word_Count  \
0  [bought, several, vitality, canned, dog, food,...        23.0   
1  [product, arrived, labeled, jumbo, salted, pea...        18.0   
2  [confection, around, century, light, pillowy, ...        40.0   
3  [looking, secret, ingredient, robitussin, beli...  

In [20]:
def classify_sentiment(sentiment_score):
    if sentiment_score > 0.1:
        return "Positive"
    elif sentiment_score < -0.1:
        return "Negative"
    else:
        return "Neutral"

# Apply sentiment classification based on VADER sentiment score
df['Sentiment_Label'] = df['Sentiment_Score'].apply(classify_sentiment)

# Show some of the classified sentiments
print(df[['Cleaned_Text', 'Sentiment_Score', 'Sentiment_Label']].head())

                                        Cleaned_Text  Sentiment_Score  \
0  bought several vitality canned dog food produc...           0.9413   
1  product arrived labeled jumbo salted peanutsth...          -0.1027   
2  confection around century light pillowy citrus...           0.8532   
3  looking secret ingredient robitussin believe f...           0.4404   
4  great taffy great price wide assortment yummy ...           0.9468   

  Sentiment_Label  
0        Positive  
1        Negative  
2        Positive  
3        Positive  
4        Positive  


In [23]:
# Select 100 random rows
random_sample = df.sample(100, random_state=45)

# Extract sentiment predictions from VADER
random_sample['VADER_Sentiment'] = random_sample['Sentiment_Score'].apply(
    lambda score: 'positive' if score > 0 else 'negative' if score < 0 else 'neutral'
)

In [28]:
# Spot-check several examples
spotcheck_sample = random_sample[['Text', 'VADER_Sentiment']].sample(5, random_state=45)

print(spotcheck_sample)

                                                     Text VADER_Sentiment
300665  I have used this product before and found it t...        negative
167485  The product arrived a bit later than expected ...        positive
76005   The flavor in these is intense and wonderful. ...        positive
146696  My first cup, I said "wow" -- Henry's Blend is...        positive
203398  This is a highly unusual tea -- real licorice ...        positive


In [27]:
# To manually review the sentiment, you can check the first few examples
for idx, row in spotcheck_sample.iterrows():
    print(f"Review Text: {row['Text']}")
    print(f"VADER Sentiment: {row['VADER_Sentiment']}")
    print("-" * 50)

Review Text: My son had a little trouble on occasion with constipation and when he did, this would be our go to meal. The prunes always seemed to help, and he loves the taste.
VADER Sentiment: positive
--------------------------------------------------
Review Text: This baby food has very simple ingredients: no sugar added, no preservatives (lemon juice only), no coloring. It is mostly all pear however. There is very little iron (2%), that I though it should have had from spinach, and virtually no protein - again, I though broccoli would add protein to this mix. It does have 70% of vitamin C however. Taste good: not over the top sweet from the pear and not over the top tart from the lemon juice.  I really wish it would have more greens in it though.
VADER Sentiment: positive
--------------------------------------------------
Review Text: As a vet tech interested in pet nutrition Royal Canin is one of the best. My cat really likes this food and it seems to be agreeing with her. Concentr