In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Apply seaborn's "whitegrid" theme once, up front, so every figure below shares it.
sns.set_theme(style="whitegrid")

In [None]:
# Input locations, relative to the directory the notebook is run from.
text_path = Path('../data/great_expectations.txt')
reviews_path = Path('../data/synthetic_product_reviews.csv')

# The reviews CSV is produced by a generator script; fail early with a hint
# on how to create it rather than letting read_csv raise a bare error.
if not reviews_path.exists():
    raise FileNotFoundError(f'Missing {reviews_path}. Run: python ../scripts/generate_synthetic_datasets.py')

# Whole book as one UTF-8 string; the reviews as a DataFrame.
book = text_path.read_text(encoding='utf-8')
reviews = pd.read_csv(reviews_path)

# Preview the first rows of the reviews table.
reviews.head()

## Sentiment on reviews (VADER)

In [None]:
# A single analyzer instance is created here and reused for every review.
analyzer = SentimentIntensityAnalyzer()

# Score each review: coerce the cell to str first (so non-string values such as
# NaN do not break the analyzer) and keep only VADER's 'compound' value.
scores = reviews['review_text'].apply(
    lambda text: analyzer.polarity_scores(str(text))['compound']
)

# assign() returns a new frame, so re-running this cell simply overwrites
# the `vader_compound` column instead of failing or duplicating it.
reviews = reviews.assign(vader_compound=scores)

# Preview the score next to the fields it will be compared against.
reviews.loc[:, ['category', 'stars', 'vader_compound', 'review_text']].head()

In [None]:
# Distribution of the VADER compound score within each star rating.
fig, ax = plt.subplots(figsize=(7, 4))
sns.boxplot(data=reviews, x='stars', y='vader_compound', ax=ax)
ax.set_title('VADER compound vs star rating')
plt.show()

In [None]:
# One density curve per product category; common_norm=False normalises each
# curve independently so categories of different sizes remain comparable.
fig, ax = plt.subplots(figsize=(8, 4))
sns.kdeplot(data=reviews, x='vader_compound', hue='category', common_norm=False, ax=ax)
ax.set_title('Sentiment distribution by category')
plt.show()

## Topic modeling (NMF) on review text

In [None]:
# Number of NMF components (topics) to extract.
n_topics = 6

# One document per review, as plain Python strings.
corpus = reviews['review_text'].astype(str).tolist()

# TF-IDF over unigrams and bigrams with English stop words removed.
# min_df=10 drops terms seen in fewer than 10 reviews; max_df=0.95 drops
# terms present in more than 95% of them.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_df=0.95,
    min_df=10,
)
X = vectorizer.fit_transform(corpus)
feature_names = np.array(vectorizer.get_feature_names_out())

# Factorise the TF-IDF matrix. The seed is fixed so topics are reproducible
# across runs.
nmf = NMF(n_components=n_topics, init='nndsvda', random_state=42)
W = nmf.fit_transform(X)  # per-document topic weights
H = nmf.components_       # per-topic term weights, indexed H[topic][term]

def top_terms(topic_idx: int, n: int = 10):
    """Return the `n` highest-weighted terms of one topic, strongest first.

    Reads the notebook-level `H` (topic-by-term weight matrix) and
    `feature_names` (array of vocabulary terms aligned with H's columns).

    Args:
        topic_idx: Row of `H` to inspect.
        n: Maximum number of terms to return; fewer are returned when the
            vocabulary is smaller than `n`.

    Returns:
        The selected entries of `feature_names`, ordered by descending weight.
    """
    weights = H[topic_idx]
    # argsort is ascending, so reverse it to rank the heaviest terms first.
    ranked = np.argsort(weights)[::-1]
    best = ranked[:n]
    return feature_names[best]

# Print one line per topic listing its ten strongest terms.
for topic_idx in range(n_topics):
    terms = top_terms(topic_idx, 10)
    print(f'Topic {topic_idx}:', ', '.join(terms))

## Bonus: quick sentiment on the book (sampled)
We sample short snippets from the book and score them with VADER, simply to show that the same technique carries over to a different kind of text.

In [None]:
# Sample N_SNIPPETS fixed-length word windows from the book.
N_SNIPPETS = 200     # how many snippets to draw
SNIPPET_WORDS = 60   # words per snippet

rng = np.random.default_rng(42)  # fixed seed so the sample is reproducible
words = book.split()

# Number of valid window starts. rng.integers() excludes its upper bound, so
# the "+ 1" is what makes the final full window (start == len(words) -
# SNIPPET_WORDS) reachable. The floor of 1 keeps the call valid when the text
# is shorter than one window; every snippet then starts at word 0 and is
# simply shorter (or empty).
n_starts = max(len(words) - SNIPPET_WORDS + 1, 1)

snippets = []
for _ in range(N_SNIPPETS):
    start = int(rng.integers(0, n_starts))
    snippet = ' '.join(words[start:start + SNIPPET_WORDS])
    snippets.append(snippet)

# Compound VADER score for every sampled snippet, in sampling order.
snippet_scores = [
    analyzer.polarity_scores(text)['compound']
    for text in snippets
]

# Histogram of those scores.
fig, ax = plt.subplots(figsize=(7, 4))
sns.histplot(snippet_scores, bins=30, ax=ax)
ax.set_title('VADER compound score distribution (book snippets)')
plt.show()