In [4]:
!pip uninstall -y numpy
!pip install numpy==1.23.5 --no-deps --force-reinstall
!pip install torch==2.5.1 --no-deps
!pip install -U bitsandbytes --no-deps

Found existing installation: numpy 1.23.5
Uninstalling numpy-1.23.5:
  Successfully uninstalled numpy-1.23.5
[0mCollecting numpy==1.23.5
  Using cached numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Using cached numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Installing collected packages: numpy
Successfully installed numpy-1.23.5




In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm.auto import tqdm
import gc
import os
from google.colab import drive, userdata
import requests
from bs4 import BeautifulSoup
import time

In [2]:
# Mount Google Drive (safely)
try:
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive')

    FILE_PATH = "/content/drive/MyDrive/amazon_reviews_backup/sampled_data_3percent"
    print("\n" + "="*50)
    print("✓ Google Drive mounted successfully")
    print("✓ File directory:", FILE_PATH)
    print("="*50 + "\n")

    # Load the split datasets with clusters
    print("\nLoading clustered datasets...")
    train_df = pd.read_parquet(f"{FILE_PATH}/train_clustered_run_5.parquet")
    val_df = pd.read_parquet(f"{FILE_PATH}/val_clustered_run_5.parquet")
    test_df = pd.read_parquet(f"{FILE_PATH}/test_clustered_run_5.parquet")

    print(f"✓ Loaded train set: {len(train_df):,} reviews")
    print(f"✓ Loaded validation set: {len(val_df):,} reviews")
    print(f"✓ Loaded test set: {len(test_df):,} reviews")

    # Extract meta-category names from training data
    if 'meta_category' in train_df.columns:
        meta_category_names = train_df.groupby('meta_category')['meta_category'].first().to_dict()
        print("\n✓ Meta-categories loaded from training data")
        print(f"✓ Found {len(meta_category_names)} meta-categories")
    else:
        raise KeyError("The 'meta_category' column is not found in the DataFrame. Please check your data.")
    print("="*50 + "\n")

    # Check GPU and initialize model
    if torch.cuda.is_available():
        print(f"✓ Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"✓ Initial Memory Allocated: {torch.cuda.memory_allocated(0)/1024**2:.2f} MB")
    else:
        print("! No GPU available, using CPU")

    # Clear GPU memory
    torch.cuda.empty_cache()
    gc.collect()

except Exception as e:
    print("✗ Error:", str(e))
    raise

Mounted at /content/drive

✓ Google Drive mounted successfully
✓ File directory: /content/drive/MyDrive/amazon_reviews_backup/sampled_data_3percent


Loading clustered datasets...
✓ Loaded train set: 11,896,753 reviews
✓ Loaded validation set: 1,573,506 reviews
✓ Loaded test set: 1,508,019 reviews

✓ Meta-categories loaded from training data
✓ Found 5 meta-categories

✓ Using GPU: NVIDIA A100-SXM4-40GB
✓ Initial Memory Allocated: 0.00 MB


In [5]:
# Get the Hugging Face token from Colab secrets
try:
    hf_token = userdata.get('HF_TOKEN')
    os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
    print("✓ Hugging Face token loaded from secrets.")
except KeyError:
    print("✗ Hugging Face token not found in secrets. Please store it using:")
    print("   from google.colab import userdata")
    print("   userdata.set('hf_token', 'YOUR_ACTUAL_TOKEN')")
    raise

# Load Mistral model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
print(f"\nInitializing {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_4bit=True,
)

# Meta-category mapping
meta_category_names = {
    0: "Entertainment & General Retail",
    1: "Technology & Automotive",
    2: "Industrial & DIY",
    3: "Health & Beauty",
    4: "Home & Garden"
}

def get_product_name(asin):
    """Enhanced product name lookup"""
    try:
        url = f"https://www.amazon.com/dp/{asin}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find('span', {'id': 'productTitle'}) or soup.find('h1', {'id': 'title'})
        time.sleep(2)
        return title.text.strip() if title else f"Product {asin}"
    except Exception as e:
        print(f"Couldn't fetch name for {asin}: {str(e)}")
        return f"Product {asin}"

def prepare_review_data(meta_category):
    """Prepare review data with product type filtering"""
    # Get actual category name
    category_name = meta_category_names[meta_category]

    # Filter for the meta-category
    category_reviews = train_df[train_df['meta_category'] == meta_category]

    # Group by product category (subcategory) and get top ones
    subcategories = category_reviews.groupby('category').size().sort_values(ascending=False)
    main_subcategory = subcategories.index[0]  # Get most common subcategory

    # Handle [UNKNOWN] category
    if main_subcategory == '[UNKNOWN]':
        print("Warning: Unknown category detected. Getting next most common category...")
        if len(subcategories) > 1:
            main_subcategory = subcategories.index[1]  # Try next category
        else:
            raise ValueError(f"No valid subcategories found for {category_name}")

    # Filter for main subcategory
    subcategory_reviews = category_reviews[category_reviews['category'] == main_subcategory]

    # Helper function to get product category
    def get_product_category(asin, reviews):
        return reviews[reviews['asin'] == asin]['category'].iloc[0]

    # Get top products with category information
    product_stats = (subcategory_reviews.groupby('asin')
                    .agg({
                        'rating': ['count', 'mean'],
                        'helpful_vote': 'sum',
                        'category': 'first'  # Get category
                    })
                    .sort_values(('rating', 'count'), ascending=False))

    # Get top 5 most reviewed products
    top_products = product_stats.head(5)

    selected_reviews = []
    print(f"\nFetching product names for {category_name} - {main_subcategory}...")

    for asin in top_products.index:
        product_reviews = subcategory_reviews[subcategory_reviews['asin'] == asin]
        product_name = get_product_name(asin)

        selected_reviews.append(
            f"\nProduct: {product_name}\n"
            f"Category: {main_subcategory}\n"
            f"Total Reviews: {len(product_reviews):,}\n"
            f"Average Rating: {product_reviews['rating'].mean():.1f}/5\n"
            f"Sample Reviews:\n" +
            "\n".join(product_reviews.sort_values('helpful_vote', ascending=False)
                     .head(3)['text'].tolist())
        )

    return {
        'review_data': "\n".join(selected_reviews),
        'category': main_subcategory
    }

# Prompt template
prompt = """<s>[INST] Based on customer reviews, write a short article, like a blogpost reviewer would write, about {meta_category} products to help customers choose the best one.

Focus on:
1. The top 3 most recommended products and their key differences
2. Top complaints for each of those products
3. What is the worst product in the category {category} and why you should never buy it

Use this review data:
{review_data}
[/INST]"""

# Generation parameters
generation_config = {
    "max_new_tokens": 1024,
    "temperature": 0.7,
    "top_p": 0.9,
    "repetition_penalty": 1.2,
    "do_sample": True,
    "num_return_sequences": 1,
    "pad_token_id": tokenizer.eos_token_id
}

print("✓ Model initialized successfully")
if torch.cuda.is_available():
    print(f"✓ GPU Memory After Loading: {torch.cuda.memory_allocated(0)/1024**2:.2f} MB")
print("="*50 + "\n")

# Example usage:
"""
meta_category = 1  # Technology & Automotive
print(f"Analyzing {meta_category_names[meta_category]}...")
review_data = prepare_review_data(meta_category_id)
formatted_prompt = prompt.format(
    meta_category=meta_category_names[meta_category],
    review_data=review_data
)
"""

# Example usage:
"""
meta_category = "Electronics"
review_data = prepare_review_data(train_df, meta_category)
formatted_prompt = prompt.format(
    meta_category=meta_category,
    review_data=review_data
)

# Generate response
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, **generation_config)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
"""

✓ Hugging Face token loaded from secrets.

Initializing mistralai/Mistral-7B-Instruct-v0.2...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✓ Model initialized successfully
✓ GPU Memory After Loading: 8549.27 MB



'\nmeta_category = "Electronics"\nreview_data = prepare_review_data(train_df, meta_category)\nformatted_prompt = prompt.format(\n    meta_category=meta_category,\n    review_data=review_data\n)\n\n# Generate response\ninputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)\noutputs = model.generate(**inputs, **generation_config)\nresponse = tokenizer.decode(outputs[0], skip_special_tokens=True)\n'

In [6]:
# Get unique meta-categories and their counts
category_counts = train_df['meta_category'].value_counts()

print("\nAvailable meta-categories and their review counts:")
print("="*50)
for category_id, count in category_counts.items():
    category_name = meta_category_names[category_id]
    print(f"{category_name}: {count:,} reviews")
print("="*50)
print(f"Total categories: {len(category_counts)}")

# Function to generate summary for a category
def generate_category_summary(meta_category):
    # Verify category exists
    if meta_category not in train_df['meta_category'].unique():
        available_categories = [f"{id}: {meta_category_names[id]}"
                              for id in sorted(train_df['meta_category'].unique())]
        raise ValueError(f"Category '{meta_category}' not found. Available categories: {', '.join(available_categories)}")

    result = prepare_review_data(meta_category)
    formatted_prompt = prompt.format(
        meta_category=meta_category_names[meta_category],
        category=result['category'],
        review_data=result['review_data']
    )

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, **generation_config)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Let's try with the category that has the most reviews
top_category = category_counts.index[0]
print(f"\nGenerating summary for top category: {meta_category_names[top_category]}")
summary = generate_category_summary(top_category)
print("\nGenerated Summary:")
print("="*50)
print(summary)


Available meta-categories and their review counts:
Entertainment & General Retail: 5,772,508 reviews
Home & Garden: 2,325,856 reviews
Technology & Automotive: 1,648,158 reviews
Health & Beauty: 1,183,118 reviews
Industrial & DIY: 967,113 reviews
Total categories: 5

Generating summary for top category: Entertainment & General Retail

Fetching product names for Entertainment & General Retail - Books...





Generated Summary:
[INST] Based on customer reviews, write a short article, like a blogpost reviewer would write, about Entertainment & General Retail products to help customers choose the best one.

Focus on:
1. The top 3 most recommended products and their key differences
2. Top complaints for each of those products
3. What is the worst product in the category Books and why you should never buy it

Use this review data:

Product: Product B00L9B7IKE
Category: Books
Total Reviews: 973
Average Rating: 4.0/5
Sample Reviews:
I read the negative reviews and chose not to believe them. I made the wrong decision. For starters, you will dislike every single character in the book. There is no one to root for. Well, I take that back- you will root for Rachel for awhile but it is almost pointless. She is an unreliable narrator, which becomes tiresome after awhile. The plot twists are weak. The ending is absurd, rushed (after a painfully slow start), and disappointing. I have had this book on my 

In [11]:
# Select a meta-category
meta_category = 1

# Prepare review data using the function
review_data = prepare_review_data(meta_category)

# Format the prompt
formatted_prompt = prompt.format(
    meta_category=meta_category_names[meta_category],
    category=review_data['category'],
    review_data=review_data['review_data']
)

# Generate the article using the LLM
inputs = tokenizer(
    formatted_prompt,
    return_tensors="pt",
    truncation=True,
    max_length=1024
).to(model.device)

outputs = model.generate(
    **inputs,
    max_length=2048,
    temperature=0.7,
    do_sample=True
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Fetching product names for Technology & Automotive - Electronics...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[INST] Based on customer reviews, write a short article, like a blogpost reviewer would write, about Technology & Automotive products to help customers choose the best one.

Focus on:
1. The top 3 most recommended products and their key differences
2. Top complaints for each of those products
3. What is the worst product in the category Electronics and why you should never buy it

Use this review data:

Product: Product B00ZV9RDKK
Category: Electronics
Total Reviews: 3,710
Average Rating: 4.3/5
Sample Reviews:
I may have missed it buried in all the promotional hype somewhere but the seemingly endless volume of content and services they tell you you can access with this device all costs additional $$. Granted there is some minuscule amount of free content, not worth the cost of the device in my opinion. I tried it with Sling T.V., the Sling T.V. content would continually stop and send me back to the device's default screen, whereupon I would have to search for the Sling T.V. app manuall

In [14]:
# Select a meta-category
meta_category = 0

# Prepare review data using the function
review_data = prepare_review_data(meta_category)

# Format the prompt exactly as in generate_category_summary
formatted_prompt = prompt.format(
    meta_category=meta_category_names[meta_category],
    category=review_data['category'],
    review_data=review_data['review_data']
)

# Match the tokenization and generation exactly
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, **generation_config)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


Fetching product names for Entertainment & General Retail - Books...
[INST] Based on customer reviews, write a short article, like a blogpost reviewer would write, about Entertainment & General Retail products to help customers choose the best one.

Focus on:
1. The top 3 most recommended products and their key differences
2. Top complaints for each of those products
3. What is the worst product in the category Books and why you should never buy it

Use this review data:

Product: Product B00L9B7IKE
Category: Books
Total Reviews: 973
Average Rating: 4.0/5
Sample Reviews:
I read the negative reviews and chose not to believe them. I made the wrong decision. For starters, you will dislike every single character in the book. There is no one to root for. Well, I take that back- you will root for Rachel for awhile but it is almost pointless. She is an unreliable narrator, which becomes tiresome after awhile. The plot twists are weak. The ending is absurd, rushed (after a painfully slow star

In [15]:
# Select a meta-category
meta_category = 1

# Prepare review data using the function
review_data = prepare_review_data(meta_category)

# Format the prompt exactly as in generate_category_summary
formatted_prompt = prompt.format(
    meta_category=meta_category_names[meta_category],
    category=review_data['category'],
    review_data=review_data['review_data']
)

# Match the tokenization and generation exactly
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, **generation_config)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


Fetching product names for Technology & Automotive - Electronics...
[INST] Based on customer reviews, write a short article, like a blogpost reviewer would write, about Technology & Automotive products to help customers choose the best one.

Focus on:
1. The top 3 most recommended products and their key differences
2. Top complaints for each of those products
3. What is the worst product in the category Electronics and why you should never buy it

Use this review data:

Product: Fire TV Stick with Alexa Voice Remote, streaming media player - Previous Generation
Category: Electronics
Total Reviews: 3,710
Average Rating: 4.3/5
Sample Reviews:
I may have missed it buried in all the promotional hype somewhere but the seemingly endless volume of content and services they tell you you can access with this device all costs additional $$. Granted there is some minuscule amount of free content, not worth the cost of the device in my opinion. I tried it with Sling T.V., the Sling T.V. content w

In [16]:
# Select a meta-category
meta_category = 2

# Prepare review data using the function
review_data = prepare_review_data(meta_category)

# Format the prompt exactly as in generate_category_summary
formatted_prompt = prompt.format(
    meta_category=meta_category_names[meta_category],
    category=review_data['category'],
    review_data=review_data['review_data']
)

# Match the tokenization and generation exactly
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, **generation_config)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


Fetching product names for Industrial & DIY - Tools_and_Home_Improvement...
[INST] Based on customer reviews, write a short article, like a blogpost reviewer would write, about Industrial & DIY products to help customers choose the best one.

Focus on:
1. The top 3 most recommended products and their key differences
2. Top complaints for each of those products
3. What is the worst product in the category Tools_and_Home_Improvement and why you should never buy it

Use this review data:

Product: Product B0131RG6VK
Category: Tools_and_Home_Improvement
Total Reviews: 332
Average Rating: 4.1/5
Sample Reviews:
It seems Nest has a lot of mixed reviews across their ever growing product line. I first jumped on the Nest bandwagon about two years ago with the 2nd generation Nest Thermostat and shortly after picked up a couple Dropcams. Nest then released their highly controversial Nest Protects and I immediately bought five of them for my house. I only had an issue with one of them since and Ne

In [17]:
# Select a meta-category
meta_category = 3

# Prepare review data using the function
review_data = prepare_review_data(meta_category)

# Format the prompt exactly as in generate_category_summary
formatted_prompt = prompt.format(
    meta_category=meta_category_names[meta_category],
    category=review_data['category'],
    review_data=review_data['review_data']
)

# Match the tokenization and generation exactly
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, **generation_config)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


Fetching product names for Health & Beauty - Health_and_Household...
[INST] Based on customer reviews, write a short article, like a blogpost reviewer would write, about Health & Beauty products to help customers choose the best one.

Focus on:
1. The top 3 most recommended products and their key differences
2. Top complaints for each of those products
3. What is the worst product in the category Health_and_Household and why you should never buy it

Use this review data:

Product: Product B0026HDURA
Category: Health_and_Household
Total Reviews: 763
Average Rating: 4.3/5
Sample Reviews:
On the first night of application, about 2-3 hours later, I felt a overwhelming sense of euphoria that my lower back pain was almost 80% gone! I was ecstatic because I have been having chronic ache for more than 10 years. I continued to use it 3 times per day for the next 7-10days. It was fantastic! The pain was almost gone! At the same time I applied the cream one time on my other 3 brothers at around 

In [18]:
# Select a meta-category
meta_category = 4

# Prepare review data using the function
review_data = prepare_review_data(meta_category)

# Format the prompt exactly as in generate_category_summary
formatted_prompt = prompt.format(
    meta_category=meta_category_names[meta_category],
    category=review_data['category'],
    review_data=review_data['review_data']
)

# Match the tokenization and generation exactly
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, **generation_config)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


Fetching product names for Home & Garden - Home_and_Kitchen...
[INST] Based on customer reviews, write a short article, like a blogpost reviewer would write, about Home & Garden products to help customers choose the best one.

Focus on:
1. The top 3 most recommended products and their key differences
2. Top complaints for each of those products
3. What is the worst product in the category Home_and_Kitchen and why you should never buy it

Use this review data:

Product: Product B00FLYWNYQ
Category: Home_and_Kitchen
Total Reviews: 636
Average Rating: 4.6/5
Sample Reviews:
Not as easy to use as I thought it would be, didn't like the rice, and their return policy is not nearly as good as amazons! 20% restocking, shipping both way.<br /><br />For any meal with more than one ingredient you had to warm up the pot, steam the first ingredient, release the steam, add the second ingredient quickly, warm up the pot, re-steam, re-release the steam. took two or three times longer than the advertisi