In [None]:
# --- Block 1: Environment setup ---
# This block mounts Google Drive so we can access the dataset.
# It must be executed at the start of every new session.

# `google.colab` is supplied by the Colab runtime; this cell is not expected to run elsewhere.
from google.colab import drive
# Mount Drive at /content/drive (DATASET_PATH and the output path below live under this tree).
# Re-running in a session where Drive is already mounted only prints a notice.
drive.mount('/content/drive')


# --- Base imports ---
# Common libraries used across the notebook.

import json
import random
import numpy as np
import pandas as pd

# --- Reproducibility ---
# Fix random seeds for consistent behavior.

SEED = 42

# Seed both random number generators imported above (stdlib `random` and NumPy's
# legacy global RNG) with the same value so stochastic steps repeat across runs.
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --- Block 2 (config): Dataset location on Drive ---
# Path to the Amazon Reviews "Movies_and_TV" file (.jsonl) under the mounted
# Drive tree. The loading cell reads it line by line, one JSON record per line.
# Update this path only if the file location changes.

DATASET_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Data/Movies_and_TV.jsonl"


In [None]:
# --- Block 2: Load dataset sample from Drive ---
# This block loads a fixed-size sample of the dataset for inspection and development.

from itertools import islice

MAX_REVIEWS_SAMPLE = 100_000  # adjust if needed, keep small for now

# Parse at most the first MAX_REVIEWS_SAMPLE lines of the file, one JSON record per line.
# - The encoding is pinned to UTF-8 so decoding does not depend on the runtime's locale default.
# - islice stops reading as soon as the line cap is reached.
# - Whitespace-only lines (e.g. a trailing blank line) are skipped instead of crashing json.loads.
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    raw_reviews = [
        json.loads(line)
        for line in islice(f, MAX_REVIEWS_SAMPLE)
        if line.strip()
    ]

print("Raw reviews loaded (sample):", len(raw_reviews))


Raw reviews loaded (sample): 100000


In [None]:
# --- Block 2.1: Group reviews by product (asin) ---
# This block builds a basic asin -> reviews mapping.
# No filtering or cleaning is applied yet.

from collections import defaultdict

# asin -> list of stripped, non-empty review texts, in the order they appear in raw_reviews.
product_reviews = defaultdict(list)

for review in raw_reviews:
    # Only records that carry both keys can be grouped.
    if "asin" in review and "text" in review:
        text = review["text"].strip()
        if text:
            product_reviews[review["asin"]].append(text)

print("Total products:", len(product_reviews))

# Quick sanity check (structure only): look at the first product that was inserted.
sample_asin = next(iter(product_reviews))
sample_texts = product_reviews[sample_asin]
print("Sample ASIN:", sample_asin)
print("Number of reviews:", len(sample_texts))
print("First review (truncated):")
print(sample_texts[0][:300])


Total products: 56306
Sample ASIN: B013488XFS
Number of reviews: 24
First review (truncated):
Amazon, please buy the show! I'm hooked!


In [None]:
# --- Block 2.2: Minimal filtering for clustering ---
# Applies basic noise reduction and ensures minimum signal per product.

MAX_PRODUCTS = 20000            # cap on the number of products kept after filtering
MAX_REVIEWS_PER_PRODUCT = 20    # cap on valid reviews kept per product
MIN_VALID_REVIEWS = 3           # products with fewer valid reviews are dropped
MIN_TEXT_LENGTH = 30            # minimum stripped length for a review to count

def is_valid_review(text):
    """Tell whether a review text is usable as clustering signal.

    Args:
        text: Review text; falsy values (None, "") are rejected.

    Returns:
        False when the text is empty, shorter than MIN_TEXT_LENGTH once
        stripped, or contains one of the known noise phrases
        (case-insensitive substring match); True otherwise.
    """
    if not text or len(text.strip()) < MIN_TEXT_LENGTH:
        return False

    lowered = text.lower()
    for phrase in (
        "no comment",
        "no comments",
        "item was canceled",
        "order was canceled",
        "did not watch",
        "never watched",
        "was a gift",
    ):
        if phrase in lowered:
            return False

    return True

from collections import defaultdict

# Per product: keep the first MAX_REVIEWS_PER_PRODUCT valid reviews, and keep the
# product only if at least MIN_VALID_REVIEWS of them remain.
product_reviews_filtered = {}

for asin, reviews in product_reviews.items():
    valid_texts = [text for text in reviews if is_valid_review(text)]
    valid_texts = valid_texts[:MAX_REVIEWS_PER_PRODUCT]
    if valid_texts and len(valid_texts) >= MIN_VALID_REVIEWS:
        product_reviews_filtered[asin] = valid_texts

# optional size cap for experimentation (first MAX_PRODUCTS products in insertion order)
product_reviews_filtered = {
    kept_asin: product_reviews_filtered[kept_asin]
    for kept_asin in list(product_reviews_filtered)[:MAX_PRODUCTS]
}

print("Products after filtering:", len(product_reviews_filtered))


Products after filtering: 5328


In [None]:
# --- Block 2.3: Build product-level texts ---
# Concatenate reviews per product to create a single text per asin.

# Parallel lists: product_texts[i] is the space-joined review text of product_ids[i].
product_ids = list(product_reviews_filtered)
product_texts = [" ".join(product_reviews_filtered[asin]) for asin in product_ids]

print("Products for embedding:", len(product_texts))


Products for embedding: 5328


In [None]:
# --- Block 2.4: Compute embeddings ---
# Generate embeddings for each product text.

from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model. No device is forced: with `device` omitted,
# SentenceTransformer selects a GPU when one is usable and otherwise falls back to
# CPU, so the cell no longer fails on a runtime without CUDA.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# One embedding row per entry of product_texts (same order as product_ids).
# NOTE(review): each product text joins up to MAX_REVIEWS_PER_PRODUCT reviews; the model
# presumably truncates inputs beyond its maximum sequence length, so later reviews
# may not influence the embedding. Confirm against the model card.
embeddings = embedding_model.encode(
    product_texts,
    batch_size=32,
    show_progress_bar=True
)

print("Embeddings shape:", embeddings.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/167 [00:00<?, ?it/s]

Embeddings shape: (5328, 384)


In [None]:
# --- Block 2.5: KMeans clustering ---
# Cluster products into semantic groups.

from sklearn.cluster import KMeans

N_CLUSTERS = 5

# Seeded KMeans so the product -> cluster assignment repeats across runs.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=SEED, n_init=10)
cluster_labels = kmeans.fit_predict(embeddings)

# map cluster -> products (labels are converted to plain ints before use as keys)
clustered_products = {}

for asin, label in zip(product_ids, cluster_labels):
    cluster_key = int(label)
    if cluster_key not in clustered_products:
        clustered_products[cluster_key] = []
    clustered_products[cluster_key].append(asin)

cluster_sizes = {c: len(v) for c, v in clustered_products.items()}
print(cluster_sizes)


{2: 1325, 3: 1749, 4: 931, 0: 827, 1: 496}


In [None]:
# --- Cluster inspection utility ---

def inspect_cluster(cluster_id, n_products=3, n_reviews=2):
    """Print a small sample of reviews from one product cluster.

    Reads the notebook-level `clustered_products` and `product_reviews_filtered`.

    Args:
        cluster_id: Key into `clustered_products`.
        n_products: How many products (from the start of the cluster list) to show.
        n_reviews: How many reviews to show per product.

    Each review is cut to 400 characters and followed by a dashed separator line.
    """
    print(f"\n===== CLUSTER {cluster_id} =====\n")

    sample_asins = clustered_products[cluster_id][:n_products]
    for asin in sample_asins:
        print(f"ASIN: {asin}\n")
        sample_reviews = product_reviews_filtered[asin][:n_reviews]
        for review_text in sample_reviews:
            print(review_text[:400], "-" * 80, sep="\n")


In [None]:
# Print the sample view for every cluster id in 0 .. N_CLUSTERS-1.
# inspect_cluster indexes clustered_products directly, so each id must be a key there.
for c in range(N_CLUSTERS):
    inspect_cluster(c)



===== CLUSTER 0 =====

ASIN: B00271DNP4

Great movie! My kids are obsessed with HP!
--------------------------------------------------------------------------------
While visiting, my grandsons wanted to watch this Harry Potter movie, having seen the previous ones.  I have never been too involve with these books or movies, but I was pleasantly surprised as I watched movie with them.  Exciting and fun.
--------------------------------------------------------------------------------
ASIN: B002DQDV6Y

Great movie! My kids are obsessed with HP!
--------------------------------------------------------------------------------
Love this movie every time! I recommend getting some Bertie Botts Jelly beans to snack on while watching!
--------------------------------------------------------------------------------
ASIN: B0779KMWG1

We loved Coco! My husband (Mexican) and I (Caucasian) saw this movie on our date night and were moved to tears. I asked him if the movie was accurate in his opinion o

In [None]:
# --- Block 3A.1: Collect cluster texts ---
# Gather all review texts per cluster.

# cluster_id -> flat list of every filtered review text of every product in that cluster.
cluster_texts = {
    cluster_id: [
        review_text
        for asin in asins
        for review_text in product_reviews_filtered[asin]
    ]
    for cluster_id, asins in clustered_products.items()
}

# Report the number of review texts gathered per cluster.
for cluster_id, texts in cluster_texts.items():
    print(cluster_id, len(texts))


2 8205
3 8594
4 4143
0 3712
1 1988


In [None]:
# --- Block 3A.2: Split reviews into idea candidates ---
# Split reviews into short idea-level sentences.

import re

def split_into_ideas(text):
    """Split a review into sentence-like fragments worth keeping as "ideas".

    The text is cut at every '.', '!' or '?'; each piece is stripped and kept
    only when it is at least 20 characters long.

    Args:
        text: Review text to split.

    Returns:
        List of kept fragments, in their original order.
    """
    fragments = (piece.strip() for piece in re.split(r'[.!?]', text))
    return [fragment for fragment in fragments if len(fragment) >= 20]

# cluster_id -> every idea fragment extracted from that cluster's review texts, in order.
cluster_ideas = {
    cluster_id: [idea for t in texts for idea in split_into_ideas(t)]
    for cluster_id, texts in cluster_texts.items()
}

# Report the number of idea candidates per cluster.
for cluster_id, ideas in cluster_ideas.items():
    print(cluster_id, len(ideas))


2 26076
3 26718
4 22031
0 10261
1 7957


In [None]:
# --- Block 3A.3: Cluster ideas within each cluster ---
# Group similar idea candidates to reduce redundancy.

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Embedding model for idea sentences. No device is forced: with `device` omitted,
# SentenceTransformer uses a GPU when available and falls back to CPU otherwise.
idea_embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

IDEAS_PER_CLUSTER = 1000   # cap for speed and stability
IDEA_CLUSTERS = 8          # small number, adjustable

# cluster_id -> {idea_group_label: [idea, ...]}
cluster_idea_groups = {}

for cluster_id, ideas in cluster_ideas.items():
    # limit size to keep things manageable
    # NOTE(review): this keeps the FIRST ideas in list order, i.e. ideas from the
    # earliest products of the cluster, not a random sample. Confirm this is intended.
    ideas_subset = ideas[:IDEAS_PER_CLUSTER]

    # KMeans cannot form more clusters than it has samples; skip tiny clusters.
    if len(ideas_subset) < IDEA_CLUSTERS:
        continue

    idea_embeddings = idea_embedding_model.encode(
        ideas_subset,
        batch_size=32,
        show_progress_bar=False
    )

    # Dedicated names: the product-level `kmeans` fitted in Block 2.5 must not be
    # overwritten by these throwaway per-cluster models.
    idea_kmeans = KMeans(
        n_clusters=IDEA_CLUSTERS,
        random_state=SEED,
        n_init=10
    )

    idea_labels = idea_kmeans.fit_predict(idea_embeddings)

    grouped = {}
    for idea, label in zip(ideas_subset, idea_labels):
        grouped.setdefault(int(label), []).append(idea)

    cluster_idea_groups[cluster_id] = grouped

# Report group sizes per product cluster.
for c in cluster_idea_groups:
    print(c, {k: len(v) for k, v in cluster_idea_groups[c].items()})


2 {1: 176, 2: 134, 5: 80, 7: 143, 4: 110, 3: 123, 0: 136, 6: 98}
3 {7: 99, 5: 178, 3: 152, 4: 53, 1: 111, 2: 188, 0: 165, 6: 54}
4 {6: 192, 3: 89, 4: 172, 0: 159, 7: 68, 5: 72, 1: 115, 2: 133}
0 {1: 122, 5: 151, 3: 92, 2: 171, 0: 222, 6: 96, 7: 72, 4: 74}
1 {5: 124, 0: 148, 1: 68, 4: 111, 2: 155, 3: 138, 6: 153, 7: 103}


In [None]:
# --- Block 3A.4: Canonical idea extraction ---
# Build clean, representative ideas per cluster.

import re

# First-person pronouns, matched as whole words. Plain substring checks such as
# "our " also hit "your "/"hour " and miss forms like "I'm", so word boundaries are used.
_PERSONAL_PRONOUN_RE = re.compile(r"\b(?:i|my|we|our)\b")

# Transactional / title-specific terms, anchored at the START of a word so that
# inflections ("ordered", "seasons") still match but mid-word hits ("border",
# "forgotten") do not.
# NOTE(review): short stems such as "blu" and "got" still match "blue" and "gotten".
_OFF_TOPIC_RE = re.compile(
    r"\b(?:amazon|prime|shipping|dvd|blu|disc|bought|order|money back|"
    r"harry potter|marvel|got|twd|sally field|james garner|season|episode)"
)

# An idea must mention at least one of these to count as a general opinion.
_OPINION_KEYWORDS = ("story", "acting", "characters", "plot", "series", "movie", "film")


def select_canonical_idea(ideas):
    """Pick one representative, general-sounding idea from a group of candidates.

    A candidate is kept only if it is a string, contains no HTML markup
    ("<br" or "</"), is at least 35 characters long after stripping, contains
    no first-person pronoun and no transactional/title-specific term, and
    mentions at least one opinion keyword (story, acting, characters, plot,
    series, movie, film).

    Args:
        ideas: Iterable of candidate idea strings (non-strings are ignored).

    Returns:
        The median-length surviving candidate (stripped), or None when no
        candidate survives the filters.
    """
    clean = []

    for s in ideas:
        if not isinstance(s, str):
            continue

        s = s.strip()

        # remove markup
        if "<br" in s or "</" in s:
            continue

        # length filter
        if len(s) < 35:
            continue

        lower = s.lower()

        # discard personal, specific, transactional content
        if _PERSONAL_PRONOUN_RE.search(lower) or _OFF_TOPIC_RE.search(lower):
            continue

        # must sound like a general opinion
        if not any(k in lower for k in _OPINION_KEYWORDS):
            continue

        clean.append(s)

    if not clean:
        return None

    # choose most general (mid-length): stable sort by length, take the median
    clean.sort(key=len)
    return clean[len(clean) // 2]



# rebuild canonical ideas from scratch
# cluster_id -> canonical ideas, one per idea group that yields a usable candidate.
cluster_canonical_ideas = {}

for cluster_id, groups in cluster_idea_groups.items():
    candidates = (select_canonical_idea(group_ideas) for group_ideas in groups.values())
    # Groups with no usable candidate return None and are dropped here.
    cluster_canonical_ideas[cluster_id] = [idea for idea in candidates if idea]


# display results clearly
for cluster_id, ideas in cluster_canonical_ideas.items():
    print(f"\n===== CLUSTER {cluster_id} =====")
    for idea in ideas:
        print("-", idea)



===== CLUSTER 2 =====
- Obviously this movie was made as a gift to the loyal fans of the series
- Really enjoy this series, the cast has great chemistry
- Great acting, great story line and character development
- Simply elegant, simply great script, simply wonderful characters
- Can't wait until the next series is available
- Characters are clever, funny, and just the right amount of odd
- Somehow these films produced by the enigmatic Man in the High Castle threaten the reality of  Japanese/German hegemony
- The story is futuristic—the protagonist has a robot for a son, mind you

===== CLUSTER 3 =====
- Rather an oldie - but a cute, satisfying story and Robert DeNero
- If you enjoy the show “This is us”, you’ll enjoy this movie
- The positive of this film is that it has realistic orbital mechanics (Physics)
- Some sad some funny some keeping you guessing , love Eddie Murphy in this movie
- Love to see how these women overcame so much, stood up for themselves and enjoyed a little hist

In [None]:
# --- Block 3A.X.1: asin -> cluster_id ---

# Invert cluster -> [asin, ...] into a flat asin -> cluster_id lookup.
product_clusters = {
    asin: cluster_id
    for cluster_id, asins in clustered_products.items()
    for asin in asins
}

print("Products with cluster:", len(product_clusters))


Products with cluster: 5328


In [None]:
# --- Block 3A.X.2: build final_reviews ---

# One record per valid review of a clustered product:
# {"asin", "text", "sentiment", "cluster_id"}.
final_reviews = []

for r in raw_reviews:
    if "asin" not in r or "text" not in r or "rating" not in r:
        continue

    text = r["text"].strip()
    if not is_valid_review(text):
        continue

    asin = r["asin"]
    if asin not in product_clusters:
        continue

    # Contiguous thresholds: <= 2 negative, below 4 neutral, 4 and above positive.
    # With `== 3` as the neutral test, a fractional rating such as 2.5 or 3.5 fell
    # through to "positive"; whole-number ratings are labelled exactly as before.
    rating = r["rating"]
    if rating <= 2:
        sentiment = "negative"
    elif rating < 4:
        sentiment = "neutral"
    else:
        sentiment = "positive"

    final_reviews.append({
        "asin": asin,
        "text": text,
        "sentiment": sentiment,
        "cluster_id": product_clusters[asin]
    })

print("Final reviews:", len(final_reviews))
# Preview the first record; guarded so an empty result does not raise IndexError.
if final_reviews:
    print(final_reviews[0])


Final reviews: 28685
{'asin': 'B013488XFS', 'text': "Amazon, please buy the show! I'm hooked!", 'sentiment': 'positive', 'cluster_id': 2}


In [None]:
# --- Block 3A.X.3: cluster -> product -> reviews ---

from collections import defaultdict

# cluster_id -> asin -> list of review records (two-level defaultdict).
cluster_product_reviews = defaultdict(lambda: defaultdict(list))

for r in final_reviews:
    reviews_by_asin = cluster_product_reviews[r["cluster_id"]]
    reviews_by_asin[r["asin"]].append(r)

print("Clusters:", len(cluster_product_reviews))

# Spot-check only the first cluster (insertion order); prints nothing when there is none.
first_cluster = next(iter(cluster_product_reviews), None)
if first_cluster is not None:
    print("Cluster", first_cluster, "products:", len(cluster_product_reviews[first_cluster]))


Clusters: 5
Cluster 2 products: 1325


In [None]:
# --- Block 3A.X.4: Rank products per cluster ---

from collections import Counter

# cluster_id -> {"top_products": [up to 3 stat dicts], "worst_product": stat dict or None}
cluster_product_stats = {}

for cluster_id, products in cluster_product_reviews.items():
    stats = []

    for asin, reviews in products.items():
        total = len(reviews)
        if total < 5:
            continue  # avoid low-signal products

        neg = sum(1 for r in reviews if r["sentiment"] == "negative")
        stats.append({
            "asin": asin,
            "total_reviews": total,
            "negative": neg,
            "neg_ratio": neg / total
        })

    # Top: most-reviewed first; ties broken by the lower negative ratio.
    by_volume = sorted(
        stats,
        key=lambda x: (-x["total_reviews"], x["neg_ratio"])
    )

    # Worst: highest negative ratio; ties broken by the larger review count.
    # max() returns the first maximal item, which is the same element a stable
    # sort on the negated keys would put first.
    worst_product = max(
        stats,
        key=lambda x: (x["neg_ratio"], x["total_reviews"]),
        default=None
    )

    cluster_product_stats[cluster_id] = {
        "top_products": by_volume[:3],
        "worst_product": worst_product
    }

# quick check: show the first cluster only
first_entry = next(iter(cluster_product_stats.items()), None)
if first_entry is not None:
    c, v = first_entry
    print("Cluster", c)
    print("Top:", [p["asin"] for p in v["top_products"]])
    print("Worst:", v["worst_product"]["asin"] if v["worst_product"] else None)


Cluster 2
Top: ['B00I3MQNWG', 'B00RSGIVVO', 'B01J4SRJFW']
Worst: B07N6M11C9


In [None]:
# --- Block 3B.1: OpenAI API key (Colab secrets) ---
# Copies a value from Colab's secret store into the OPENAI_API_KEY environment
# variable, so the key never appears in the notebook source.
# NOTE(review): "secretName" is the literal secret name being looked up. Confirm the
# key is really stored under that name in this Colab account.
# NOTE(review): the OpenAI() clients created further down take no arguments, so they
# presumably read OPENAI_API_KEY from the environment. Confirm.

from google.colab import userdata
import os

os.environ["OPENAI_API_KEY"] = userdata.get("secretName")


In [None]:
# --- Block 3B.1: Build generation prompt per cluster ---

def build_cluster_prompt(cluster_id, canonical_ideas=None, product_stats=None):
    """Build the text prompt used to generate one cluster's recommendation article.

    Args:
        cluster_id: Cluster whose ideas and product statistics go into the prompt.
        canonical_ideas: Optional mapping cluster_id -> list of idea strings.
            Defaults to the notebook-level `cluster_canonical_ideas`.
        product_stats: Optional mapping cluster_id -> {"top_products": [...],
            "worst_product": {...} or None}. Defaults to the notebook-level
            `cluster_product_stats`.

    Returns:
        The prompt string: rules, up to 8 overview ideas, the top products and
        the worst product. Each product line reports the raw counts only, so
        no "low negatives" claim is made that the numbers may contradict.
        A section with no data gets an explicit placeholder line.
    """
    # Resolve the defaults at call time so the function always reads the
    # current notebook state when no explicit mappings are passed.
    if canonical_ideas is None:
        canonical_ideas = cluster_canonical_ideas
    if product_stats is None:
        product_stats = cluster_product_stats

    ideas = canonical_ideas.get(cluster_id, [])
    stats = product_stats.get(cluster_id, {})

    top = stats.get("top_products", [])
    worst = stats.get("worst_product")

    ideas_text = (
        "\n".join(f"- {i}" for i in ideas[:8])
        or "- No overview ideas available."
    )

    # Top products are ranked by review volume, so only the measured counts are
    # stated; the model is told not to invent facts and should not be handed one.
    top_text = (
        "\n".join(
            f"- Product {idx+1}: {p['total_reviews']} reviews, {p['negative']} negative"
            for idx, p in enumerate(top)
        )
        or "- No products with enough reviews."
    )

    worst_text = (
        f"- Highest negative ratio ({worst['neg_ratio']:.2f}) with {worst['negative']} negatives out of {worst['total_reviews']} reviews."
        if worst else "- No clear worst product."
    )

    prompt = f"""
You are writing a short, neutral recommendation article based only on the information below.

Rules:
- Do not mention brand names or product codes.
- Do not invent facts.
- Use general language.
- Write 2 short paragraphs (6–8 sentences total).

Overview ideas:
{ideas_text}

Top products:
{top_text}

Worst product:
{worst_text}
""".strip()

    return prompt


In [None]:
# --- Block 3B.2: Generate article with OpenAI ---

from openai import OpenAI

# Shared API client, constructed with default settings.
client = OpenAI()

def generate_cluster_article(prompt):
    """Send a prompt to the model through the Responses API and return the article.

    Args:
        prompt: Full prompt text (see build_cluster_prompt).

    Returns:
        The response's `output_text`, stripped of surrounding whitespace.
    """
    # NOTE(review): 220 output tokens may be tight for the 6–8 sentences the prompt
    # asks for. Check whether articles end mid-sentence.
    request_options = {
        "model": "gpt-4.1-mini",
        "input": prompt,
        "max_output_tokens": 220,
        "temperature": 0.3,
    }
    response = client.responses.create(**request_options)
    article_text = response.output_text
    return article_text.strip()


In [None]:
# --- Block 3B.3: Test generation for one cluster ---

# Smoke test: build the prompt for a single, hard-coded cluster and print the
# generated article before running all clusters.
test_cluster_id = 2

# `prompt` and `article` stay at notebook level so they can be inspected afterwards.
prompt = build_cluster_prompt(test_cluster_id)
article = generate_cluster_article(prompt)

print(article)


This movie clearly serves as a heartfelt gift to the loyal fans of the series, continuing to showcase the great chemistry among the cast. The story is futuristic and engaging, featuring clever, funny, and just the right amount of odd characters, including a protagonist with a robot son. The film offers elegant storytelling, strong character development, and a wonderful script that keeps viewers invested. The blend of humor and thoughtful narrative makes it a standout addition to the series.

Among the top products related to this series, several have garnered a large number of positive reviews with very few negatives, indicating consistent quality and audience satisfaction. These entries maintain the high standards set by previous installments, ensuring fans remain eager for the next release. In contrast, one product shows a notably higher negative ratio, suggesting it may not meet the expectations set by the others. Overall, the series continues to impress with its unique premise and 

In [None]:
# --- Block 3B.4: Generate articles for all clusters ---

# cluster_id -> generated article text, produced in ascending cluster-id order.
cluster_articles = {}

for cluster_id in sorted(cluster_product_stats):
    prompt = build_cluster_prompt(cluster_id)
    article = generate_cluster_article(prompt)
    cluster_articles[cluster_id] = article

    # Header, blank line, then the article.
    print(f"\n===== CLUSTER {cluster_id} ARTICLE =====\n", article, sep="\n")



===== CLUSTER 0 ARTICLE =====

This animated family film stands out as one of the best in its genre, offering a heartfelt and engaging story that resonates with viewers of all ages. The acting is notably impressive, bringing to life a real-life story with authenticity and charm. The plot features well-crafted twists and turns, culminating in a satisfying and hopeful conclusion. With a blend of humor and warmth, this movie is especially enjoyable during the holiday season, providing plenty of laughs and memorable moments for the whole family.

Among similar offerings, several products have garnered many positive reviews with very few negatives, indicating strong overall satisfaction. These top-rated options are well-received for their quality and entertainment value. In contrast, one product shows a higher ratio of negative feedback, suggesting it may not meet expectations as consistently. For those seeking a reliable and enjoyable family movie experience, the highly rated selections a

In [None]:
# --- Block 3B.1: OpenAI API key (Colab secrets) ---
# NOTE(review): this cell repeats the earlier "Block 3B.1: OpenAI API key" cell
# verbatim; re-running it only sets the same environment variable again.
# Consider removing one of the two copies.

from google.colab import userdata
import os

os.environ["OPENAI_API_KEY"] = userdata.get("secretName")


In [None]:
# --- Block 3B.2: Generate article with OpenAI ---

from openai import OpenAI

# NOTE(review): this cell re-creates `client` and redefines generate_cluster_article
# with the same behavior as the earlier "Block 3B.2" cell; the later definition
# shadows the earlier one. Consider removing one of the two copies.
client = OpenAI()

def generate_cluster_article(prompt):
    """Send a prompt to the model through the Responses API and return the article.

    Args:
        prompt: Full prompt text (see build_cluster_prompt).

    Returns:
        The response's `output_text`, stripped of surrounding whitespace.
    """
    request_options = {
        "model": "gpt-4.1-mini",
        "input": prompt,
        "max_output_tokens": 220,
        "temperature": 0.3,
    }
    response = client.responses.create(**request_options)
    article_text = response.output_text
    return article_text.strip()


In [None]:
# --- Block 3B.2: OpenAI client ---
# NOTE(review): `client` is already created by the preceding "Block 3B.2" cells;
# this cell rebinds it to a fresh default-configured client. generate_cluster_summary
# below uses whichever `client` was bound last.

from openai import OpenAI
client = OpenAI()


In [None]:
# --- Block 3B.x: Sanitize generation input (final) ---

import re

# Straight- and curly-quoted spans (typically titles).
_QUOTED_SPAN_RE = re.compile(r'".*?"|“.*?”')
# Runs of two or more capitalized words (typically names of people or titles).
_MULTIWORD_ENTITY_RE = re.compile(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b')
# A single capitalized word of three or more letters (names, places).
_CAPITALIZED_WORD_RE = re.compile(r'\b[A-Z][a-z]{2,}\b')
# Optional bullet/punctuation prefix followed by the first word of a line.
_LEADING_WORD_RE = re.compile(r'\W*\w+')
# Country / ideology references removed explicitly, whatever their casing.
_BANNED_TERMS_RE = re.compile(
    r'\b(?:japanese|german|france|british|american|'
    r'hegemony|dominance|empire|political)\b',
    flags=re.IGNORECASE,
)


def _sanitize_line(line):
    """Sanitize a single line (one bullet) of generation input."""
    leading = _LEADING_WORD_RE.match(line)

    cleaned = _QUOTED_SPAN_RE.sub('', line)
    cleaned = _MULTIWORD_ENTITY_RE.sub('', cleaned)

    # The first word of a sentence is capitalized by grammar, not because it is a
    # name, so it is exempt from the single-capitalized-word rule. The exemption
    # applies only when that word survived the removals above unchanged.
    protected_end = 0
    if leading and cleaned.startswith(leading.group(0)):
        protected_end = leading.end()

    cleaned = _CAPITALIZED_WORD_RE.sub(
        lambda m: m.group(0) if m.end() <= protected_end else '',
        cleaned,
    )
    cleaned = _BANNED_TERMS_RE.sub('', cleaned)

    # normalize whitespace inside the line
    return re.sub(r'\s+', ' ', cleaned).strip()


def sanitize_generation_text(text):
    """Strip names, quoted titles and banned terms from bullet-list text.

    Each line is cleaned on its own, in this order: quoted spans are removed
    (straight or curly quotes), then runs of capitalized words, then single
    capitalized words other than the sentence-initial one, then the banned
    country/ideology terms; finally whitespace inside the line is collapsed.

    Args:
        text: Newline-separated bullet text.

    Returns:
        The cleaned lines joined by newlines, so the bullet structure is kept.
        Lines left without any word character are dropped.
    """
    cleaned_lines = (_sanitize_line(line) for line in text.splitlines())
    return "\n".join(line for line in cleaned_lines if re.search(r'\w', line))


In [None]:
# --- Block 3B.3: Build generation inputs from canonical ideas ---

# cluster_id -> newline-separated "- idea" bullet list (falsy ideas are skipped).
cluster_generation_inputs = {
    cluster_id: "\n".join(f"- {idea}" for idea in ideas if idea)
    for cluster_id, ideas in cluster_canonical_ideas.items()
}


In [None]:
# Display the first (cluster_id, bullet text) pair to eyeball the generation input.
list(cluster_generation_inputs.items())[:1]


In [None]:
# --- Block 3B.4: OpenAI generation function ---

def generate_cluster_summary(ideas_text):
    """Ask the chat model to rewrite bullet points into one neutral summary.

    Uses the notebook-level `client` and the Chat Completions API.

    Args:
        ideas_text: Bullet text to summarise, sent as the user message.

    Returns:
        The content of the first returned choice, stripped of surrounding whitespace.
    """
    system_instruction = (
        "You rewrite given bullet points into a single neutral summary. "
        "Do not add information. Do not analyze. Do not mention titles or names."
    )
    conversation = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": ideas_text},
    ]

    completion = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=conversation,
        temperature=0.2,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content.strip()


In [None]:
# --- Block 3B.5: Generate summary for one cluster (test) ---

# Take the first cluster (insertion order) together with its bullet text.
test_cluster_id, ideas_text = list(cluster_generation_inputs.items())[0]

# Sanitize first, then summarise; intermediate values stay inspectable.
clean_text = sanitize_generation_text(ideas_text)
summary = generate_cluster_summary(clean_text)

print(summary)


In [None]:
# --- Block 3B.6: Generate summaries for multiple clusters (test) ---

TEST_CLUSTERS = list(cluster_generation_inputs)[:3]  # try with 3 clusters

for cluster_id in TEST_CLUSTERS:
    ideas_text = cluster_generation_inputs[cluster_id]
    clean_text = sanitize_generation_text(ideas_text)
    summary = generate_cluster_summary(clean_text)

    # Header line followed by the summary on the next line.
    print(f"\n===== CLUSTER {cluster_id} =====", summary, sep="\n")


In [None]:
# Debug helper: list every notebook-level name that contains "review" (case-insensitive).
print([name for name in globals().keys() if "review" in name.lower()])


In [None]:
# ============================================================
# SAVE PRODUCT CLUSTERS (REQUIRED FOR BLOCK 3)
# ============================================================

OUTPUT_CLUSTERS_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Data/product_clusters.jsonl"

# Write one JSON object per line: {"asin": ..., "cluster_id": ...}, cluster by cluster.
with open(OUTPUT_CLUSTERS_PATH, "w") as f:
    f.writelines(
        json.dumps({"asin": asin, "cluster_id": cluster_id}) + "\n"
        for cluster_id, asins in clustered_products.items()
        for asin in asins
    )

print("Clusters saved to:")
print(OUTPUT_CLUSTERS_PATH)
