From: Maya Johnson, Marketing Director
Challenge: Our marketing team struggles to quickly understand sentiment across thousands of
customer reviews. We need to identify products with sentiment that doesn’t match their star ratings and understand the language patterns that indicate customer satisfaction or dissatisfaction.
Request: Build a model that can predict review ratings based on the review text. This would
help us:
• Identify products with mismatched ratings and sentiment
• Extract key positive and negative product attributes
• Develop more accurate messaging for different product types
Success metrics: Rating prediction accuracy within 0.5 stars of the actual rating for at least
75% of reviews, and identification of key phrases that predict high or low ratings.

In [None]:
# Install the third-party packages this notebook imports (textblob, lightgbm); --quiet keeps the cell output short
!pip install textblob lightgbm --quiet
import nltk
nltk.download('punkt') # NLTK tokenizer data; used by textblob to split text into sentences



In [None]:
# Mount Google Drive if needed.
# The preprocessed review data loaded later in this notebook is read from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from textblob import TextBlob
from lightgbm import LGBMRegressor

from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Load preprocessing functions
from de3_preprocessing import load_preprocessed, compute_tfidf, normalize_features


In [None]:
# Read the preprocessed review table together with its already-computed embeddings
PREPROCESSED_PREFIX = "/content/drive/MyDrive/DEAssignment3/review"
df, embeddings = load_preprocessed(PREPROCESSED_PREFIX)

# Sanity check: list the available columns, then preview the first rows
print(df.columns)
df.head()


In [None]:
# Plot rating distribution.
# Bar order and tick labels are both derived from the rating levels actually
# present in the data. A hard-coded tick/label list (positions 0-4 labelled
# 1-5) would silently put the wrong label under a bar whenever one star level
# is missing from the dataset.
rating_levels = sorted(df["rating"].dropna().unique())

plt.figure(figsize=(8, 5))
sns.countplot(x="rating", data=df, order=rating_levels, palette="viridis")
plt.title("Distribution of Review Ratings")
plt.xlabel("Star Rating")
plt.ylabel("Number of Reviews")
plt.xticks(
    ticks=list(range(len(rating_levels))),
    labels=[f"{float(level):g}" for level in rating_levels],  # 5.0 -> "5"
)
plt.tight_layout()
plt.show()

# Print exact counts and proportions (optional for reference)
rating_counts = df["rating"].value_counts().sort_index()
rating_proportions = df["rating"].value_counts(normalize=True).sort_index()
print("Rating counts:\n", rating_counts)
print("\nRating proportions:\n", rating_proportions)

Sentiment Detection Model: cardiffnlp/twitter-roberta-base-sentiment from HuggingFace.

This model is a version of RoBERTa that classifies text into positive, neutral, or negative sentiment classes. Each review is truncated to its first 500 characters because the model can only handle a limited input sequence length.

Note: the code cell that follows does not call this model; it computes sentiment polarity and subjectivity with TextBlob.

In [None]:
# Takes a review and uses TextBlob to compute its polarity and subjectivity; outputs a pair of numbers
# (this describes compute_sentiment_smart, defined below).

from textblob import TextBlob
# Notebook shell command: runs textblob's download_corpora module
!python -m textblob.download_corpora


def compute_sentiment_smart(text):
    """Return a ``(polarity, subjectivity)`` pair for one review using TextBlob.

    Polarity is scored per sentence. The final polarity is the mean of
    (a) the average sentence polarity and (b) the most extreme sentence
    polarity (the maximum if its absolute value is strictly larger than the
    minimum's, otherwise the minimum), so a single strongly worded sentence
    is not washed out by neutral ones. Subjectivity is TextBlob's score for
    the whole text.

    Args:
        text: Review text. Missing values (``None``, ``NaN``), any other
            non-string value, and blank strings are treated as "no text".

    Returns:
        tuple: ``(final_polarity, subjectivity)``. ``(0.0, 0.0)`` is returned
        when there is no text or TextBlob finds no sentences.
    """
    # TextBlob rejects non-string input, and missing review text typically
    # arrives from pandas as None/NaN. Treat those (and blank strings) as
    # neutral instead of aborting the whole DataFrame.apply.
    if not isinstance(text, str) or not text.strip():
        return 0.0, 0.0

    blob = TextBlob(text)
    sentence_polarities = [sentence.sentiment.polarity for sentence in blob.sentences]

    if not sentence_polarities:
        return 0.0, 0.0

    max_polarity = max(sentence_polarities)
    min_polarity = min(sentence_polarities)
    avg_polarity = np.mean(sentence_polarities)

    # Whichever end of the range is further from zero is the "strongest" sentence.
    strongest_polarity = max_polarity if abs(max_polarity) > abs(min_polarity) else min_polarity
    final_polarity = (avg_polarity + strongest_polarity) / 2
    subjectivity = blob.sentiment.subjectivity

    return final_polarity, subjectivity

# Apply it: score every cleaned review once, then split the resulting
# (polarity, subjectivity) pairs into two columns. Building each column from
# the pairs (rather than unpacking zip(*...)) also works when the DataFrame
# has no rows, where the unpacking form raises ValueError.
sentiment_pairs = df["clean_text"].apply(compute_sentiment_smart)
df["polarity"] = [polarity for polarity, _ in sentiment_pairs]
df["subjectivity"] = [subjectivity for _, subjectivity in sentiment_pairs]



Similar to Challenge 1: the numeric features `review_length` and `helpful_vote` (the helpfulness score) are added and normalized. `MinMaxScaler` rescales each feature to the range 0–1, and `csr_matrix` stores the numeric features in sparse format so they can be stacked with the TF-IDF matrix.

In [None]:
# 1. TF-IDF features (based on cleaned text): converts text into a matrix of word importance
# NOTE(review): the vectorizer (and the scaler below) are fit on the full
# dataset, before the train/test split done later in the notebook, so test
# rows influence the fitted vocabulary/IDF weights and min/max values.
# Consider fitting on the training rows only - TODO confirm whether this matters for the reported metrics.
tfidf = TfidfVectorizer(max_features=300)
tfidf_matrix = tfidf.fit_transform(df["clean_text"])

# 2. Embeddings are already loaded (from earlier preprocessing);
#    wrap them as a sparse matrix so they can be stacked with the TF-IDF block
embedding_sparse = csr_matrix(embeddings)

# 3. Numeric features: review length and helpfulness votes, rescaled by MinMaxScaler
scaler = MinMaxScaler()
numeric_features = scaler.fit_transform(df[["review_length", "helpful_vote"]])
numeric_sparse = csr_matrix(numeric_features)

# Combine all features into one matrix, column-wise.
# CSR is requested explicitly because later cells select rows of X_combined
# with boolean masks, which needs a row-indexable sparse format.
X_combined = hstack([tfidf_matrix, embedding_sparse, numeric_sparse], format="csr")

# Embeddings are 384 dimensional vectors that capture context and meaning.



In [None]:
# Prediction target: the star rating of each review
y = df["rating"]

# Hold out 20% of the reviews for testing; stratifying on the rating keeps the
# star distribution the same in both parts, and the fixed seed makes the split repeatable
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, **split_options)

Random forest was skipped to save time

In [None]:
# Baseline: linear regression on the combined feature matrix
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

# Random Forest Regressor (left disabled)
# rf_reg = RandomForestRegressor(n_estimators=50, random_state=42)
# rf_reg.fit(X_train, y_train)
# y_pred_rf = rf_reg.predict(X_test)

# LightGBM regressor with 100 estimators and a fixed seed
lgb_reg = LGBMRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_lgb = lgb_reg.predict(X_test)


In [None]:
def within_half_star(y_true, y_pred, tolerance=0.5):
    """Return the fraction of predictions within ``tolerance`` stars of the truth.

    Both inputs are converted to NumPy arrays first, so they are compared
    element by element *by position*. Without that conversion, subtracting
    two pandas Series aligns them on their index labels, which silently
    scores the wrong pairs when the indexes differ (e.g. after a shuffled
    train/test split).

    Args:
        y_true: Actual ratings (array-like).
        y_pred: Predicted ratings (array-like, same length as ``y_true``).
        tolerance: Largest absolute error that still counts as a hit.
            Defaults to 0.5 stars.

    Returns:
        float: Share of predictions with ``|y_true - y_pred| <= tolerance``,
        between 0 and 1 (the higher, the better). ``nan`` for empty input.
    """
    abs_errors = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    if abs_errors.size == 0:
        # Same value np.mean would give for an empty array, minus the RuntimeWarning.
        return float("nan")
    return float(np.mean(abs_errors <= tolerance))

# Headline metric: share of test reviews predicted within half a star of the actual rating
lin_half_star_acc = within_half_star(y_test, y_pred_lin)
lgb_half_star_acc = within_half_star(y_test, y_pred_lgb)

print("Linear Regression accuracy (within 0.5 stars):", lin_half_star_acc)
# print("Random Forest Regression accuracy (within 0.5 stars):", within_half_star(y_test, y_pred_rf))
print("LightGBM Regression accuracy (within 0.5 stars):", lgb_half_star_acc)

# Supporting metrics:
#   MAE - on average, how far predictions are from the true ratings
#   R^2 - how much of the rating variance the model explains (1 = perfect, 0 = bad)
lin_mae = mean_absolute_error(y_test, y_pred_lin)
lin_r2 = r2_score(y_test, y_pred_lin)
lgb_mae = mean_absolute_error(y_test, y_pred_lgb)
lgb_r2 = r2_score(y_test, y_pred_lgb)

print("\n=== Regression Performance Metrics ===")
print("Linear Regression MAE:", lin_mae)
print("Linear Regression R^2:", lin_r2)

print("LightGBM Regression MAE:", lgb_mae)
print("LightGBM Regression R^2:", lgb_r2)


Comparing Regression with Classification: how would the results differ if we treated this as a classification problem?

In [None]:
from sklearn.metrics import classification_report

# Regression outputs are continuous and unbounded, so plain rounding can yield
# labels outside the rating scale (e.g. 0 or 6). Those would appear in the
# report as extra classes with zero support. Round to whole stars, clip to the
# rating range observed in the test labels, and cast to int so the predicted
# labels match the star classes.
min_star, max_star = y_test.min(), y_test.max()
y_pred_lgb_stars = np.clip(np.rint(y_pred_lgb), min_star, max_star).astype(int)
y_pred_lin_stars = np.clip(np.rint(y_pred_lin), min_star, max_star).astype(int)

print("\nClassification report for LightGBM (rounded predictions):")
print(classification_report(y_test, y_pred_lgb_stars))

print("\nClassification report for LinReg(rounded predictions):")
print(classification_report(y_test, y_pred_lin_stars))



New Section: Classification modeling - predicting star ratings based on review features

Models Used:


*  Logistic Regression - trained using TF-IDF and other engineered features
*   SVC - trained to find the optimal hyperplane to separate the different rating classes



In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix


def _print_classifier_results(header, predictions):
    """Print accuracy, confusion matrix and classification report for `predictions` vs y_test."""
    print(header)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))


# Class labels: the star rating column is reused as-is, so the existing
# X_train / y_train / X_test / y_test split serves the classifiers too.

# Logistic regression classifier
log_clf = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred_log = log_clf.predict(X_test)  # logreg's predictions
_print_classifier_results("\n=== Logistic Regression Classifier Results ===", y_pred_log)

# Support vector classifier: finds the best decision boundary to separate the
# rating classes. A linear kernel is used, assuming the classes can be
# separated with a straight line.
svc_clf = SVC(kernel="linear", random_state=42).fit(X_train, y_train)
y_pred_svc = svc_clf.predict(X_test)
_print_classifier_results("\n=== Support Vector Classifier Results ===", y_pred_svc)


The next section is used to identify the most influential words that drive a classification as positive or negative


In [None]:
from sklearn.linear_model import LogisticRegression

# Lexicon extraction: read the most influential words off logistic-regression coefficients

# Binary sentiment flag: 1 = positive (4-5 stars), 0 otherwise.
# 3-star reviews also get 0 in this column, but the mask below keeps them out
# of training, so the classifier only sees positive (4-5) vs negative (1-2) reviews.
df["positive_sentiment"] = (df["rating"] >= 4).astype(int)

mask = df["rating"] != 3
X_text = tfidf_matrix[mask]
y_sentiment = df.loc[mask, "positive_sentiment"]

# Fit the classifier on the TF-IDF features only
lexicon_clf = LogisticRegression(max_iter=1000, random_state=42)
lexicon_clf.fit(X_text, y_sentiment)

# Pair every TF-IDF term with its learned coefficient
feature_names = np.array(tfidf.get_feature_names_out())
coefs = lexicon_clf.coef_[0]

# A single ascending sort serves both ends of the ranking:
# the first 15 entries are the most negative terms, the last 15 the most positive
coef_order = np.argsort(coefs)
top_negative_indices = coef_order[:15]
top_positive_indices = coef_order[-15:]

positive_terms = pd.DataFrame(
    {
        "term": feature_names[top_positive_indices],
        "coef": coefs[top_positive_indices],
    }
).sort_values(by="coef", ascending=False)

negative_terms = pd.DataFrame(
    {
        "term": feature_names[top_negative_indices],
        "coef": coefs[top_negative_indices],
    }
).sort_values(by="coef")

# Show both rankings as tables
print("\n=== Top Positive Sentiment Terms ===")
print(positive_terms)

print("\n=== Top Negative Sentiment Terms ===")
print(negative_terms)

# Show both rankings in one horizontal bar chart
top_terms = pd.concat([positive_terms, negative_terms])
plt.figure(figsize=(10, 6))
sns.barplot(data=top_terms, x="coef", y="term", palette="coolwarm")
plt.title("Top Words Predicting Positive vs Negative Sentiment")
plt.xlabel("Logistic Regression Coefficient")
plt.ylabel("Term")
plt.tight_layout()
plt.show()


Sentiment vs Rating Mismatch Detection- This code checks for mismatches between a review's text sentiment and its star rating.
Specifically, it flags reviews that:

Sound positive (high polarity) but have a low star rating (1–2 stars)

Sound negative (low polarity) but have a high star rating (4–5 stars)

In [None]:
# Define mismatch based on TextBlob polarity
# - Positive polarity (> 0.4) but low rating (1 or 2)
# - Negative polarity (< -0.4) but high rating (4 or 5)
df["mismatch"] = ((df["polarity"] > 0.4) & (df["rating"] <= 2)) | \
                 ((df["polarity"] < -0.4) & (df["rating"] >= 4))

# Show how many mismatches
print(f"Total mismatches found: {df['mismatch'].sum()} / {len(df)} reviews")

# Display mismatched examples with the full review text shown
print("\nExamples of mismatched reviews:")
mismatched_reviews = df[df["mismatch"]][["text", "rating", "polarity"]]

# Disable column-width truncation so the whole review text is visible
# (this changes the pandas display option for the rest of the session)
pd.set_option("display.max_colwidth", None)

# Sample up to 10 mismatches. DataFrame.sample raises ValueError when asked
# for more rows than exist, so cap the sample size at the number available.
n_examples = min(10, len(mismatched_reviews))  # adjust 10 if you want
display(mismatched_reviews.sample(n=n_examples, random_state=42))


In [None]:
# Count flagged reviews per product (parent_asin), most-flagged products first
product_mismatch_counts = (
    df[df["mismatch"]]
    .groupby("parent_asin")
    .size()
    .sort_values(ascending=False)
)

# Turn the counts into a two-column table: parent_asin, mismatch_count
product_mismatch_df = product_mismatch_counts.rename("mismatch_count").reset_index()

# One category per product (first occurrence wins); no other product columns are carried along
product_info = df.drop_duplicates(subset="parent_asin")[["parent_asin", "category"]]

# Attach the category to each product's mismatch count
product_mismatch_df = product_mismatch_df.merge(product_info, on="parent_asin", how="left")

# Top 20 products with the most mismatches
print("\n=== Top 20 Products with Most Sentiment/Rating Mismatches (with category) ===")
print(product_mismatch_df.head(20))


Category-Level Predictability - for each category, how does rating predictability change?

Similar analysis to above, just breaking it out by category

In [None]:
# Collect one result row (a dict) per category; converted to a DataFrame after the loop
category_results = []

# Categories with fewer reviews than this are skipped: the 80/20 split leaves
# too few test reviews for a meaningful score. Same minimum group size as the
# review-length analysis in this notebook.
MIN_CATEGORY_REVIEWS = 100
skipped_categories = []

# Looping through each category separately (missing category values are ignored)
for category in df["category"].dropna().unique():
    # Select reviews from this category; a plain NumPy boolean mask indexes
    # both the sparse feature matrix and the DataFrame positionally
    cat_mask = (df["category"] == category).to_numpy()
    X_cat = X_combined[cat_mask]
    y_cat = df.loc[cat_mask, "rating"]

    if len(y_cat) < MIN_CATEGORY_REVIEWS:
        skipped_categories.append(category)
        continue

    # Stratify on the rating when possible. train_test_split raises ValueError
    # if any rating level has fewer than 2 reviews, so fall back to a plain
    # random split for such categories instead of aborting the whole loop.
    stratify_labels = y_cat if y_cat.value_counts().min() >= 2 else None

    # Split into train/test for this category
    X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
        X_cat, y_cat, test_size=0.2, random_state=42, stratify=stratify_labels
    )

    # Train a simple model (LightGBM, like before)
    model_cat = LGBMRegressor(n_estimators=50, random_state=42)
    model_cat.fit(X_train_cat, y_train_cat)
    y_pred_cat = model_cat.predict(X_test_cat)

    # Evaluate accuracy within 0.5 stars
    acc = within_half_star(y_test_cat, y_pred_cat)

    # Save results
    category_results.append({
        "category": category,
        "accuracy_within_0.5_star": acc
    })

if skipped_categories:
    print(f"Skipped categories with fewer than {MIN_CATEGORY_REVIEWS} reviews: {skipped_categories}")

# Convert to DataFrame and show. Naming the columns explicitly keeps
# sort_values working even when no category produced a result.
category_results_df = pd.DataFrame(
    category_results, columns=["category", "accuracy_within_0.5_star"]
).sort_values(by="accuracy_within_0.5_star", ascending=False)
print("\n=== Predictability by Product Category ===")
print(category_results_df)

# Plot
plt.figure(figsize=(10, 5))
sns.barplot(data=category_results_df, x="accuracy_within_0.5_star", y="category")
plt.title("Rating Predictability by Product Category")
plt.xlabel("Accuracy within 0.5 stars")
plt.ylabel("Category")
plt.tight_layout()
plt.show()

Review length and complexity analysis

Model Used


*   LightGBM Regressor - a boosted decision tree model. It builds a series of decision trees, where each new tree tries to correct the mistakes of the previous ones.



In [None]:
# Making bins for review length.
# include_lowest=True puts a review_length of exactly 0 into the first bin;
# with pd.cut's default right-closed intervals the first bin would be (0, 25]
# and such reviews would get no bin (NaN) despite the "0-25 words" label.
df["length_bin"] = pd.cut(df["review_length"], bins=[0, 25, 50, 100, 200, np.inf],
                          labels=["0-25 words", "26-50 words", "51-100 words", "101-200 words", "200+ words"],
                          include_lowest=True)

# For each bin, train and evaluate a model on that subset of the reviews.
length_results = []

# Reviews without a bin (NaN) are ignored
for bin_label in df["length_bin"].dropna().unique():
    # A plain NumPy boolean mask indexes both the sparse feature matrix and
    # the DataFrame positionally
    bin_mask = (df["length_bin"] == bin_label).to_numpy()
    X_bin = X_combined[bin_mask]
    y_bin = df.loc[bin_mask, "rating"]

    if len(y_bin) < 100:  # Skip the smallest groups
        continue

    # Stratify on the rating when possible. train_test_split raises ValueError
    # if any rating level has fewer than 2 reviews, so fall back to a plain
    # random split for such bins instead of aborting the whole loop.
    stratify_labels = y_bin if y_bin.value_counts().min() >= 2 else None

    X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
        X_bin, y_bin, test_size=0.2, random_state=42, stratify=stratify_labels
    )

    model_bin = LGBMRegressor(n_estimators=50, random_state=42)  # LightGBM regressor model used
    model_bin.fit(X_train_bin, y_train_bin)
    y_pred_bin = model_bin.predict(X_test_bin)

    acc = within_half_star(y_test_bin, y_pred_bin)

    length_results.append({
        "length_bin": bin_label,
        "accuracy_within_0.5_star": acc
    })

# Naming the columns explicitly keeps sort_values working even when every bin was skipped
length_results_df = pd.DataFrame(
    length_results, columns=["length_bin", "accuracy_within_0.5_star"]
).sort_values(by="accuracy_within_0.5_star", ascending=False)
print("\n=== Predictability by Review Length ===")
print(length_results_df)

# Plot
plt.figure(figsize=(10, 5))
sns.barplot(data=length_results_df, x="accuracy_within_0.5_star", y="length_bin")
plt.title("Rating Predictability by Review Length")
plt.xlabel("Accuracy within 0.5 stars")
plt.ylabel("Review Length Bin")
plt.tight_layout()
plt.show()
