In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Mar  8 07:23:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P0             52W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In order to use a GPU with your notebook, select the `Runtime > Change runtime type` menu, and then set the hardware accelerator to the desired option.

In [None]:
from psutil import virtual_memory
# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
!pip install qdrant-client
!pip install sentence-transformers
!pip install tqdm

Collecting qdrant-client
  Downloading qdrant_client-1.13.3-py3-none-any.whl.metadata (10 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant-client)
  Downloading grpcio_tools-1.70.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools>=1.41.0->qdrant-client)
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Downloading qdrant_client-1.13.3-py3-none-any.whl (306 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.7/306.7 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading grpcio_tools-1.70.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-an

In [None]:
import argparse
import gzip
import json
import os
import uuid

import pandas as pd
import requests
from tqdm import tqdm

# Import for sentiment and embedding
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Import Qdrant client
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

# Maps a dataset name to the Dropbox URL of its gzip-compressed JSONL file.
# `download_from_dropbox` looks a name up here and saves the file as
# "<name>.jsonl.gz"; `process_reviews` additionally looks for "meta_<category>".
# NOTE(review): the trailing "?dl=1" presumably requests the raw file rather
# than Dropbox's HTML preview page - confirm before changing the links.
DROPBOX_LINKS = {
    # Review files (one entry per product category)
    "Books": "https://www.dropbox.com/s/312wv7jtm1tpxeo/Books.jsonl.gz?dl=1",
    "Beauty_and_Personal_Care": "https://www.dropbox.com/s/w2bg91ewpziaaa3/Beauty_and_Personal_Care.jsonl.gz?dl=1",
    "Electronics": "https://www.dropbox.com/s/st07mgrwzazitru/Electronics.jsonl.gz?dl=1",
    "Home_and_Kitchen": "https://www.dropbox.com/s/oxn45ntlkxo8ju5/Home_and_Kitchen.jsonl.gz?dl=1",

    # Metadata files (keyed as "meta_<category>" to pair with the review files)
    "meta_Books": "https://www.dropbox.com/s/k6gjgea3sn68xq0/meta_Books.jsonl.gz?dl=1",
    "meta_Beauty_and_Personal_Care": "https://www.dropbox.com/s/ghblw1oskik0bui/meta_Beauty_and_Personal_Care.jsonl.gz?dl=1",
    "meta_Electronics": "https://www.dropbox.com/s/v9rso2vqr3qyxf4/meta_Electronics.jsonl.gz?dl=1",
    "meta_Home_and_Kitchen": "https://www.dropbox.com/s/h9zdk9841dgp26s/meta_Home_and_Kitchen.jsonl.gz?dl=1",
}


def download_from_dropbox(category, data_dir="data/raw"):
    """Download ``<category>.jsonl.gz`` into ``data_dir`` unless it is already there.

    The payload is streamed to a temporary ``.part`` file and renamed into place
    only after the whole body has been written. A failed or interrupted download
    therefore never leaves a truncated file that a later run would mistake for
    a complete dataset.

    Args:
        category: Key into ``DROPBOX_LINKS``; also used as the file stem.
        data_dir: Directory for the downloaded file (created if missing).

    Returns:
        Path of the local ``.jsonl.gz`` file.

    Raises:
        KeyError: ``category`` has no entry in ``DROPBOX_LINKS`` (only checked
            when a download is actually needed).
        requests.HTTPError: The server answered with an error status.
    """
    os.makedirs(data_dir, exist_ok=True)
    file_path = os.path.join(data_dir, f"{category}.jsonl.gz")

    if os.path.exists(file_path):
        print(f"{category} dataset already exists. Skipping download.")
        return file_path

    print(f"Downloading {category} dataset from Dropbox...")
    url = DROPBOX_LINKS[category]
    partial_path = f"{file_path}.part"
    try:
        # The context manager releases the connection even if writing fails.
        # The timeout bounds connection set-up and each read, not the whole transfer.
        with requests.get(url, stream=True, timeout=60) as response:
            # Without this check an HTML error page would be saved as the dataset.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, file_path)
    except BaseException:
        # Remove the incomplete file (also on Ctrl-C) so the next run retries.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    print(f"Saved to {file_path}")

    return file_path

In [None]:
def process_reviews(category, sentiment_pipeline, embedding_model,
                    input_dir="data/raw", output_dir="data/processed", max_reviews=10000):
    """
    Download, clean, label and embed up to ``max_reviews`` reviews of one category.

    Steps:
      1. Fetch ``<category>.jsonl.gz`` into ``input_dir`` with
         ``download_from_dropbox``. ``meta_<category>`` is fetched too when
         ``DROPBOX_LINKS`` has an entry for it, but it is not read here.
      2. Read the gzip JSONL file line by line. Each JSON object gets a
         ``processed_text`` field (its ``text`` lower-cased and stripped) and a
         ``category`` field (unless the record already carries one). Lines that
         are not valid JSON objects are skipped.
      3. Run ``sentiment_pipeline`` and ``embedding_model.encode`` over all texts.
      4. Write the result to ``<output_dir>/<category>_sample.parquet``.

    Args:
        category: Dataset name, a key of ``DROPBOX_LINKS``.
        sentiment_pipeline: Callable invoked as ``pipeline(texts, truncation=True)``
            that returns one mapping with a ``'label'`` key per text.
        embedding_model: Object whose ``encode(texts, batch_size=..., show_progress_bar=...)``
            returns one vector per text.
        input_dir: Directory for the raw downloads.
        output_dir: Directory for the Parquet output (created if missing).
        max_reviews: Maximum number of reviews to keep; ``None`` means no limit.

    Returns:
        A DataFrame with the original review fields plus ``processed_text``,
        ``category``, ``sentiment`` and ``embedding``; an empty DataFrame when
        no review could be read.
    """
    # Download review file (and metadata if available)
    review_file = download_from_dropbox(category, input_dir)
    meta_category = f"meta_{category}"
    if meta_category in DROPBOX_LINKS:
        download_from_dropbox(meta_category, input_dir)

    output_path = os.path.join(output_dir, f"{category}_sample.parquet")
    os.makedirs(output_dir, exist_ok=True)

    processed_reviews = []

    # Process reviews from the JSONL gzip file
    with gzip.open(review_file, 'rt', encoding='utf-8') as file:
        for line in tqdm(file, desc=f"Processing {category} reviews"):
            if max_reviews is not None and len(processed_reviews) >= max_reviews:
                break
            try:
                review = json.loads(line.strip())
            except json.JSONDecodeError:
                continue
            if not isinstance(review, dict):
                # Valid JSON but not a review record (e.g. a bare list or number).
                continue

            # 'text' may be absent or null; both are treated as an empty review.
            text = review.get('text') or ''
            if not isinstance(text, str):
                text = str(text)
            review['processed_text'] = text.lower().strip()
            # Record the category so downstream consumers do not have to guess it.
            review.setdefault('category', category)
            processed_reviews.append(review)

    if not processed_reviews:
        print(f"No reviews processed for {category}.")
        return pd.DataFrame()

    df = pd.DataFrame(processed_reviews)

    # Batch processing for sentiment and embeddings
    texts = df['processed_text'].tolist()

    print("Running sentiment analysis on reviews...")
    # truncation=True is passed so over-long inputs are cut rather than rejected
    sentiment_results = sentiment_pipeline(texts, truncation=True)

    print("Generating embeddings using MPNet...")
    embeddings = embedding_model.encode(texts, batch_size=32, show_progress_bar=True)

    # Add new columns to the DataFrame
    df['sentiment'] = [result['label'] for result in sentiment_results]
    df['embedding'] = list(embeddings)

    df.to_parquet(output_path)
    print(f"Processed {len(df)} reviews from {category} and saved to {output_path}")

    return df


In [None]:
def _stable_point_id(idx, row):
    """Return a deterministic UUID string built from the row index and all non-vector fields."""
    fields = {str(key): value for key, value in row.items() if key != "embedding"}
    # default=str covers values json cannot encode natively (numpy scalars, arrays, ...).
    fingerprint = json.dumps({"index": str(idx), "fields": fields}, sort_keys=True, default=str)
    return str(uuid.uuid5(uuid.NAMESPACE_URL, fingerprint))


def upload_to_qdrant(df, collection_name, qdrant_client, batch_size=1000):
    """
    Upload the processed reviews in ``df`` to a Qdrant collection in batches.

    Each row becomes one point holding its ``embedding`` as the vector and a
    payload with ``text``, ``sentiment`` and ``category``.

    Point ids: a non-null ``review_id`` value is used as-is. Otherwise the id
    is a UUID derived from the row index and the row's non-vector fields.
    The bare row index is not used because several DataFrames uploaded to the
    same collection all start at index 0 and would overwrite each other's
    points. Uploading the same DataFrame again produces the same ids, so a
    re-upload updates the existing points.

    Args:
        df: DataFrame with at least ``embedding`` and ``sentiment`` columns.
        collection_name: Target collection; also the ``category`` payload
            fallback when a row has no ``category`` value.
        qdrant_client: Client exposing ``upsert(collection_name=..., points=...)``.
        batch_size: Number of points sent per ``upsert`` call.
    """
    points = []
    for idx, row in df.iterrows():
        point_id = row.get("review_id")
        is_nan = isinstance(point_id, float) and point_id != point_id
        if point_id is None or is_nan:
            point_id = _stable_point_id(idx, row)
        elif hasattr(point_id, "item"):
            # numpy scalar -> plain Python value
            point_id = point_id.item()

        vector = row['embedding']
        # Hand the client a plain list of floats regardless of the container type.
        vector = vector.tolist() if hasattr(vector, "tolist") else list(vector)

        payload = {
            "text": row.get("text", row.get("processed_text", "")),
            "sentiment": row["sentiment"],
            "category": row.get("category", collection_name)
        }
        points.append(PointStruct(id=point_id, vector=vector, payload=payload))

    total_points = len(points)
    print(f"Uploading {total_points} points to Qdrant collection '{collection_name}' in batches of {batch_size}...")

    for i in range(0, total_points, batch_size):
        batch = points[i:i+batch_size]
        qdrant_client.upsert(collection_name=collection_name, points=batch)
        print(f"Uploaded batch {i//batch_size + 1} of {((total_points - 1) // batch_size) + 1}")

    print(f"Uploaded {total_points} points to Qdrant collection '{collection_name}'.")


def main():
    """Process every requested category and load the results into Qdrant Cloud.

    Command-line options (unknown options are ignored so the function also
    works inside notebook kernels, which pass their own arguments):

      --categories      categories to process
      --max_reviews     maximum number of reviews per category
      --qdrant_url      Qdrant endpoint; defaults to the QDRANT_URL environment variable
      --qdrant_api_key  Qdrant API key; defaults to the QDRANT_API_KEY environment variable

    The "amazon_reviews" collection is dropped and created again on every run.

    Raises:
        ValueError: No Qdrant URL or API key was supplied. This is checked
            before the models are loaded so a misconfigured run fails fast.
    """
    # Use parse_known_args to avoid issues in environments like Colab
    parser = argparse.ArgumentParser(
        description="Offline processing of Amazon review data with sentiment and embeddings"
    )
    parser.add_argument(
        "--categories", nargs="+",
        default=["Electronics", "Books", "Beauty_and_Personal_Care", "Home_and_Kitchen"],
        help="List of categories to process"
    )
    parser.add_argument(
        "--max_reviews", type=int, default=10000,
        help="Maximum number of reviews to process per category"
    )
    # Credentials are read from the environment rather than embedded in the
    # notebook, where they would be exposed to anyone the notebook is shared with.
    parser.add_argument(
        "--qdrant_url", type=str,
        default=os.environ.get("QDRANT_URL"),
        help="Qdrant Cloud URL (e.g., https://xxxx.aws.qdrant.tech); defaults to $QDRANT_URL"
    )
    parser.add_argument(
        "--qdrant_api_key", type=str,
        default=os.environ.get("QDRANT_API_KEY"),
        help="Qdrant API Key; defaults to $QDRANT_API_KEY"
    )
    args, _unknown = parser.parse_known_args()

    if not args.qdrant_url or not args.qdrant_api_key:
        raise ValueError(
            "Qdrant credentials are missing: pass --qdrant_url/--qdrant_api_key "
            "or set the QDRANT_URL and QDRANT_API_KEY environment variables."
        )

    collection_name = "amazon_reviews"

    # Load offline models
    print("Loading DistilBERT sentiment model...")
    sentiment_pipeline_model = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english"
    )
    print("Loading MPNet embedding model...")
    embedding_model = SentenceTransformer("all-mpnet-base-v2")

    # Connect to Qdrant Cloud with an increased timeout if needed
    q_client = QdrantClient(
        url=args.qdrant_url,
        api_key=args.qdrant_api_key,
        timeout=60  # timeout in seconds (adjust as needed)
    )

    # Start from an empty collection sized for the embedding model (768 for MPNet).
    # Explicit delete + create replaces the deprecated recreate_collection().
    if q_client.collection_exists(collection_name):
        q_client.delete_collection(collection_name=collection_name)
    q_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=768, distance=Distance.COSINE)
    )

    # Process each category and upload to Qdrant Cloud
    for category in args.categories:
        print(f"\n--- Processing category: {category} ---")
        df = process_reviews(category, sentiment_pipeline_model, embedding_model, max_reviews=args.max_reviews)
        if not df.empty:
            upload_to_qdrant(df, collection_name=collection_name, qdrant_client=q_client)


# Entry point: runs the whole pipeline when this code is executed directly.
# In a notebook cell __name__ is also "__main__", so executing the cell starts
# the run immediately.
if __name__ == "__main__":
    main()


Loading DistilBERT sentiment model...


Device set to use cuda:0


Loading MPNet embedding model...


  q_client.recreate_collection(



--- Processing category: Electronics ---
Electronics dataset already exists. Skipping download.
meta_Electronics dataset already exists. Skipping download.


Processing Electronics reviews: 10000it [00:00, 73556.66it/s]

Running sentiment analysis on reviews...





Generating embeddings using MPNet...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Processed 10000 reviews from Electronics and saved to data/processed/Electronics_sample.parquet
Uploading 10000 points to Qdrant collection 'amazon_reviews' in batches of 1000...
Uploaded batch 1 of 10
Uploaded batch 2 of 10
Uploaded batch 3 of 10
Uploaded batch 4 of 10
Uploaded batch 5 of 10
Uploaded batch 6 of 10
Uploaded batch 7 of 10
Uploaded batch 8 of 10
Uploaded batch 9 of 10
Uploaded batch 10 of 10
Uploaded 10000 points to Qdrant collection 'amazon_reviews'.

--- Processing category: Books ---
Downloading Books dataset from Dropbox...
Saved to data/raw/Books.jsonl.gz
Downloading meta_Books dataset from Dropbox...
Saved to data/raw/meta_Books.jsonl.gz


Processing Books reviews: 10000it [00:00, 40815.51it/s]


Running sentiment analysis on reviews...
Generating embeddings using MPNet...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Processed 10000 reviews from Books and saved to data/processed/Books_sample.parquet
Uploading 10000 points to Qdrant collection 'amazon_reviews' in batches of 1000...
Uploaded batch 1 of 10
Uploaded batch 2 of 10
Uploaded batch 3 of 10
Uploaded batch 4 of 10
Uploaded batch 5 of 10
Uploaded batch 6 of 10
Uploaded batch 7 of 10
Uploaded batch 8 of 10
Uploaded batch 9 of 10
Uploaded batch 10 of 10
Uploaded 10000 points to Qdrant collection 'amazon_reviews'.

--- Processing category: Beauty_and_Personal_Care ---
Downloading Beauty_and_Personal_Care dataset from Dropbox...
Saved to data/raw/Beauty_and_Personal_Care.jsonl.gz
Downloading meta_Beauty_and_Personal_Care dataset from Dropbox...
Saved to data/raw/meta_Beauty_and_Personal_Care.jsonl.gz


Processing Beauty_and_Personal_Care reviews: 10000it [00:00, 66197.04it/s]

Running sentiment analysis on reviews...





Generating embeddings using MPNet...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Processed 10000 reviews from Beauty_and_Personal_Care and saved to data/processed/Beauty_and_Personal_Care_sample.parquet
Uploading 10000 points to Qdrant collection 'amazon_reviews' in batches of 1000...
Uploaded batch 1 of 10
Uploaded batch 2 of 10
Uploaded batch 3 of 10
Uploaded batch 4 of 10
Uploaded batch 5 of 10
Uploaded batch 6 of 10
Uploaded batch 7 of 10
Uploaded batch 8 of 10
Uploaded batch 9 of 10
Uploaded batch 10 of 10
Uploaded 10000 points to Qdrant collection 'amazon_reviews'.

--- Processing category: Home_and_Kitchen ---
Downloading Home_and_Kitchen dataset from Dropbox...
Saved to data/raw/Home_and_Kitchen.jsonl.gz
Downloading meta_Home_and_Kitchen dataset from Dropbox...
Saved to data/raw/meta_Home_and_Kitchen.jsonl.gz


Processing Home_and_Kitchen reviews: 10000it [00:00, 81677.04it/s]

Running sentiment analysis on reviews...





Generating embeddings using MPNet...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Processed 10000 reviews from Home_and_Kitchen and saved to data/processed/Home_and_Kitchen_sample.parquet
Uploading 10000 points to Qdrant collection 'amazon_reviews' in batches of 1000...
Uploaded batch 1 of 10
Uploaded batch 2 of 10
Uploaded batch 3 of 10
Uploaded batch 4 of 10
Uploaded batch 5 of 10
Uploaded batch 6 of 10
Uploaded batch 7 of 10
Uploaded batch 8 of 10
Uploaded batch 9 of 10
Uploaded batch 10 of 10
Uploaded 10000 points to Qdrant collection 'amazon_reviews'.
