In [4]:
from google.colab import drive
drive.mount('/content/drive')

import json
import re
from collections import defaultdict, Counter

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from openai import OpenAI


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
REVIEWS_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Data/Movies_and_TV_sentiment_v1.jsonl"

CLUSTERS_OUTPUT_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Data/product_clusters.jsonl"


In [6]:
MIN_TEXT_LENGTH = 30

def is_valid_review(text):
    if not text or len(text.strip()) < MIN_TEXT_LENGTH:
        return False

    text = text.lower()
    noise_patterns = [
        "no comment", "no comments",
        "item was canceled", "order was canceled",
        "did not watch", "never watched",
        "was a gift"
    ]
    return not any(p in text for p in noise_patterns)


In [7]:
reviews = []

with open(REVIEWS_PATH, "r") as f:
    for line in f:
        r = json.loads(line)
        if "asin" in r and "text" in r:
            if is_valid_review(r["text"]):
                reviews.append(r)

print("Valid reviews loaded:", len(reviews))


Valid reviews loaded: 39499


In [8]:
product_reviews = defaultdict(list)

for r in reviews:
    product_reviews[r["asin"]].append(r["text"])


In [9]:
MAX_REVIEWS_PER_PRODUCT = 20
MIN_REVIEWS_PER_PRODUCT = 3
MAX_PRODUCTS = 20000

product_ids = []
product_texts = []

for asin, texts in product_reviews.items():
    if len(texts) < MIN_REVIEWS_PER_PRODUCT:
        continue

    product_ids.append(asin)
    product_texts.append(" ".join(texts[:MAX_REVIEWS_PER_PRODUCT]))

product_ids = product_ids[:MAX_PRODUCTS]
product_texts = product_texts[:MAX_PRODUCTS]

print("Products for clustering:", len(product_ids))


Products for clustering: 2009


In [10]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = embedding_model.encode(
    product_texts,
    batch_size=32,
    show_progress_bar=True
)

kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(embeddings)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

In [11]:
with open(CLUSTERS_OUTPUT_PATH, "w") as f:
    for asin, cid in zip(product_ids, cluster_labels):
        f.write(json.dumps({
            "asin": asin,
            "cluster_id": int(cid)
        }) + "\n")

print("Clusters saved to:")
print(CLUSTERS_OUTPUT_PATH)


Clusters saved to:
/content/drive/MyDrive/amazon_reviews_2025/Data/product_clusters.jsonl


In [12]:
product_clusters = {}

with open(CLUSTERS_OUTPUT_PATH, "r") as f:
    for line in f:
        r = json.loads(line)
        product_clusters[r["asin"]] = r["cluster_id"]

print("Products with clusters:", len(product_clusters))


Products with clusters: 2009


In [13]:
final_reviews = []

for r in reviews:
    asin = r["asin"]
    if asin not in product_clusters:
        continue

    final_reviews.append({
        "asin": asin,
        "text": r["text"],
        "sentiment": r["sentiment"],
        "cluster_id": product_clusters[asin]
    })

print("Final reviews:", len(final_reviews))


Final reviews: 10092


In [14]:
final_reviews = []

for r in reviews:
    asin = r["asin"]
    if asin not in product_clusters:
        continue

    final_reviews.append({
        "asin": asin,
        "text": r["text"],
        "sentiment": r["sentiment"],
        "cluster_id": product_clusters[asin]
    })

print("Final reviews:", len(final_reviews))


Final reviews: 10092


In [15]:
cluster_product_reviews = defaultdict(lambda: defaultdict(list))

for r in final_reviews:
    cluster_product_reviews[r["cluster_id"]][r["asin"]].append(r)


In [16]:
cluster_product_stats = {}

for cluster_id, products in cluster_product_reviews.items():
    stats = []

    for asin, revs in products.items():
        sentiments = [r["sentiment"] for r in revs]
        counts = Counter(sentiments)

        total = len(sentiments)
        neg = counts.get("negative", 0)

        if total < 5:
            continue

        stats.append({
            "asin": asin,
            "total_reviews": total,
            "negative": neg,
            "neg_ratio": neg / total
        })

    cluster_product_stats[cluster_id] = {
        "top_products": sorted(
            stats,
            key=lambda x: (-x["total_reviews"], x["neg_ratio"])
        )[:3],
        "worst_product": sorted(
            stats,
            key=lambda x: (-x["neg_ratio"], -x["total_reviews"])
        )[:1]
    }

print("Ranking computed")


Ranking computed


In [17]:
def split_into_ideas(text):
    return [s.strip() for s in re.split(r"[.!?]", text) if len(s.strip()) >= 20]

def select_canonical_idea(ideas):
    ideas = [i for i in ideas if len(i) >= 35]
    return ideas[len(ideas)//2] if ideas else None


In [18]:
cluster_texts = defaultdict(list)

for r in final_reviews:
    cluster_texts[r["cluster_id"]].append(r["text"])

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

cluster_canonical_ideas = {}

for cid, texts in cluster_texts.items():
    ideas = []
    for t in texts:
        ideas.extend(split_into_ideas(t))

    ideas = ideas[:1000]
    if len(ideas) < 8:
        continue

    emb = embedding_model.encode(ideas, show_progress_bar=False)
    labels = KMeans(n_clusters=8, random_state=42).fit_predict(emb)

    grouped = defaultdict(list)
    for idea, lab in zip(ideas, labels):
        grouped[lab].append(idea)

    cluster_canonical_ideas[cid] = [
        select_canonical_idea(g) for g in grouped.values() if select_canonical_idea(g)
    ]

print("Canonical ideas extracted")


Canonical ideas extracted


In [20]:
from google.colab import userdata
import os

os.environ["OPENAI_API_KEY"] = userdata.get("secretName")


In [21]:
client = OpenAI()

def build_prompt(cid):
    ideas = cluster_canonical_ideas.get(cid, [])
    stats = cluster_product_stats.get(cid, {})

    ideas_text = "\n".join(f"- {i}" for i in ideas[:8])

    top = stats.get("top_products", [])
    worst = stats.get("worst_product")

    top_text = "\n".join(
        f"- Product {i+1}: many reviews ({p['total_reviews']}), low negatives ({p['negative']})"
        for i, p in enumerate(top)
    )

    worst_text = (
        f"- Highest negative ratio ({worst[0]['neg_ratio']:.2f})"
        if worst else "- No clear worst product."
    )

    return f"""
You are writing a short, neutral recommendation article.

Overview ideas:
{ideas_text}

Top products:
{top_text}

Worst product:
{worst_text}
""".strip()


In [22]:
for cid in sorted(cluster_product_stats.keys()):
    prompt = build_prompt(cid)

    response = client.responses.create(
        model="gpt-4.1-mini",
        input=prompt,
        max_output_tokens=220,
        temperature=0.3
    )

    print(f"\n===== CLUSTER {cid} =====\n")
    print(response.output_text)



===== CLUSTER 0 =====

**A Candid Look at the Latest Film: Nostalgia Meets Complexity**


While the movie features a strong cast of talented actors, the overall execution leaves much to be desired. The storyline ventures into somewhat strange territory, exploring awkward relationships between characters and delving into themes like the emergence of primogeniture. These elements add complexity but also contribute to a somewhat disjointed narrative.

Interestingly, despite being marketed with a holiday backdrop, the film doesn't quite fit the mold of a traditional Christmas movie. This may disappoint viewers expecting a festive experience. Additionally, some peculiar plot points, such as why certain characters (like the gators) behave in unexpected ways, remain unexplained, adding to the film

===== CLUSTER 1 =====

**A Balanced Look at Some Popular Films**

Certain movies have a timeless quality that keeps viewers coming back, even after multiple viewings over many years. One such film