In [1]:
# Mount Google Drive at /content/drive. `google.colab` is only importable
# inside a Colab runtime, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# !pip install datasets --quiet

In [3]:
import re
import numpy as np
import pandas as pd
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from scipy.sparse import csr_matrix
import pickle


# Load the spaCy English pipeline and the NLTK English stop-word list once at
# module level; clean_and_lemmatize() reads both as globals.
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words("english"))  # a set, for the `not in` membership test

# Clean and lemmatize review text
def clean_and_lemmatize(text):
    """Normalize one raw review into a string of space-joined lemmas.

    Steps: lowercase the text, delete every character that is not an ASCII
    letter or whitespace, collapse whitespace runs to single spaces, run the
    module-level spaCy pipeline ``nlp``, and keep each token's lemma unless
    the lemma is in the module-level ``stop_words`` set or the token is not
    alphabetic.

    Args:
        text: Raw review text. Any non-string value (``None``, ``NaN``,
            numbers, ...) is treated as an empty review instead of raising
            ``AttributeError`` on ``.lower()``.

    Returns:
        str: The surviving lemmas joined by single spaces, or ``""`` when
        nothing is left after cleaning.
    """
    # Missing values read from a DataFrame arrive as None / float NaN.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Punctuation and digits are deleted, not replaced by a space.
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Nothing left to lemmatize: skip the (comparatively expensive) spaCy call.
    if not text:
        return ""

    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.lemma_ not in stop_words and token.is_alpha]
    return " ".join(tokens)

# Add numeric features to the dataframe
def add_features(df):
    """Attach the numeric feature columns to ``df`` and return it.

    The frame is modified in place (and also returned):

    * ``review_length`` - whitespace-separated word count of ``clean_text``.
    * ``helpful_vote`` - existing values with NaNs replaced by 0, or a
      constant 0 column when the frame has no such column.
    """
    word_counts = df["clean_text"].map(lambda review: len(review.split()))
    df["review_length"] = word_counts

    if "helpful_vote" not in df.columns:
        # No vote information available: default every review to zero votes.
        df["helpful_vote"] = 0
    else:
        df["helpful_vote"] = df["helpful_vote"].fillna(0)

    return df
# Normalize numeric features (returns both scaled features and sparse format)
def normalize_features(df, feature_cols):
    """Scale the selected columns with a freshly fitted ``MinMaxScaler``.

    Args:
        df: DataFrame holding the numeric feature columns.
        feature_cols: Column labels to scale.

    Returns:
        tuple: ``(scaled, sparse)`` - the scaler's dense output and the same
        values wrapped in a ``scipy.sparse.csr_matrix``.
    """
    dense = MinMaxScaler().fit_transform(df[feature_cols])
    return dense, csr_matrix(dense)

# Generate TF-IDF matrix
def compute_tfidf(texts, max_features=300):
    """Fit a ``TfidfVectorizer`` on ``texts`` and transform them.

    Args:
        texts: Iterable of documents to vectorize.
        max_features: Passed through to ``TfidfVectorizer(max_features=...)``.

    Returns:
        tuple: ``(vectorizer, matrix)`` - the fitted vectorizer and the
        matrix returned by its ``fit_transform``.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    doc_term_matrix = vectorizer.fit_transform(texts)
    return vectorizer, doc_term_matrix

# Generate embeddings
# Sentence-embedding model, loaded once at module level and shared by every
# generate_embeddings() call.
model = SentenceTransformer("all-MiniLM-L6-v2")


def generate_embeddings(texts, batch_size=64, show_progress_bar=True):
    """Encode texts with the module-level sentence-transformer ``model``.

    Args:
        texts: A single string, or an iterable of strings. Iterables other
            than ``list`` (e.g. a pandas Series or a generator) are turned
            into a list first so the encoder can index them by position.
        batch_size: Number of texts encoded per batch (default 64, the value
            that used to be hard-coded).
        show_progress_bar: Whether ``model.encode`` displays a progress bar
            (default True, as before).

    Returns:
        Whatever ``model.encode`` returns for the given input.
    """
    # A lone string is passed through untouched; list() would split it into
    # characters.
    if not isinstance(texts, (str, list)):
        texts = list(texts)
    return model.encode(texts, show_progress_bar=show_progress_bar, batch_size=batch_size)

# Save preprocessed data and embeddings
def save_preprocessed(df, embeddings, path_prefix):
    """Persist a preprocessed frame and its embeddings next to each other.

    Writes two files derived from ``path_prefix``:

    * ``<path_prefix>_df.pkl`` - ``df`` pickled via ``DataFrame.to_pickle``.
    * ``<path_prefix>_embeddings.npy`` - ``embeddings`` via ``np.save``.
    """
    frame_path = f"{path_prefix}_df.pkl"
    vectors_path = f"{path_prefix}_embeddings.npy"

    df.to_pickle(frame_path)
    np.save(vectors_path, embeddings)

# Load preprocessed data
def load_preprocessed(path_prefix):
    """Load a preprocessed DataFrame and its embeddings from disk.

    The file names previously read here matched neither of the writers in
    this file, so a save/load round trip always failed. The following
    layouts are now tried in order, and the first one whose two files both
    exist is loaded:

    1. ``<path_prefix>_df.pkl`` + ``<path_prefix>_embeddings.npy``
       (written by ``save_preprocessed``).
    2. ``<path_prefix>_review_df_with_clean_text.pkl`` +
       ``<path_prefix>_review_embeddings.npy`` (the names this function
       used to read; kept for backward compatibility).
    3. ``<path_prefix>/review_df_with_clean_text.pkl`` +
       ``<path_prefix>/review_embeddings.npy`` (written by
       ``preprocess_reviews(save_path=...)``, with ``path_prefix`` being
       that directory).

    Args:
        path_prefix: File-name prefix, or the output directory for layout 3.

    Returns:
        tuple: ``(df, embeddings)`` - the unpickled DataFrame and the array
        returned by ``np.load``.

    Raises:
        FileNotFoundError: If no layout has both files present.
    """
    import os  # local import: `os` is not among this file's top-level imports

    prefix = str(path_prefix)
    candidates = [
        (f"{prefix}_df.pkl", f"{prefix}_embeddings.npy"),
        (f"{prefix}_review_df_with_clean_text.pkl", f"{prefix}_review_embeddings.npy"),
        (
            os.path.join(prefix, "review_df_with_clean_text.pkl"),
            os.path.join(prefix, "review_embeddings.npy"),
        ),
    ]

    for df_path, emb_path in candidates:
        if os.path.exists(df_path) and os.path.exists(emb_path):
            # SECURITY: unpickling can execute arbitrary code - only load
            # files that this pipeline produced itself.
            df = pd.read_pickle(df_path)
            embeddings = np.load(emb_path)
            return df, embeddings

    tried = ", ".join(path for pair in candidates for path in pair)
    raise FileNotFoundError(
        f"No preprocessed data found for prefix {prefix!r}; tried: {tried}"
    )

# All-in-one preprocessing function (for convenience)
def preprocess_reviews(categories, max_rows=5000, save_path=None):
    """Download, clean, featurize and embed Amazon reviews in one call.

    For every entry of ``categories`` the matching configuration of the
    ``McAuley-Lab/Amazon-Reviews-2023`` dataset is loaded, its ``"full"``
    split is truncated to the first ``max_rows`` rows, rows without text are
    dropped and a ``category`` column is added. The per-category frames are
    concatenated, ``clean_text`` is computed with ``clean_and_lemmatize``,
    numeric features are added with ``add_features`` and sentence embeddings
    are generated from ``clean_text`` with ``generate_embeddings``.

    Args:
        categories: Mapping of Hugging Face configuration name to the label
            stored in the ``category`` column.
        max_rows: Maximum number of rows taken from each category (counted
            before rows with null text are dropped). Splits shorter than
            this are used whole instead of failing on out-of-range indices.
            ``None`` takes every row.
        save_path: Optional output directory. When given, it is created if
            missing and ``review_embeddings.npy`` and
            ``review_df_with_clean_text.pkl`` are written into it.

    Returns:
        tuple: ``(df, embeddings)``. ``df`` has the columns ``text``,
        ``rating``, ``category``, ``parent_asin``, ``title``,
        ``helpful_vote``, ``clean_text`` and ``review_length``.

    Raises:
        ValueError: If ``categories`` is empty.
    """
    import os  # local import: `os` is not among this file's top-level imports
    from datasets import load_dataset

    if not categories:
        raise ValueError("categories must contain at least one dataset configuration")

    base_columns = ["text", "rating", "category", "parent_asin", "title"]

    frames = []
    for hf_name, label in categories.items():
        print(f"Loading: {label}")
        dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", hf_name, trust_remote_code=True)
        full_split = dataset["full"]

        # Cap at the split size: selecting indices past the end is an error.
        available = len(full_split)
        n_rows = available if max_rows is None else min(max_rows, available)
        part = full_split.select(range(n_rows)).to_pandas()

        # Rename rating column if needed
        if "overall" in part.columns:
            part = part.rename(columns={"overall": "rating"})

        part = part[part["text"].notnull()].copy()
        part["category"] = label

        # Keep helpful_vote when the source provides it; otherwise
        # add_features() can only fill the column with zeros.
        columns = list(base_columns)
        if "helpful_vote" in part.columns:
            columns.append("helpful_vote")

        frames.append(part[columns])

    df = pd.concat(frames, ignore_index=True)

    # Clean the text
    df["clean_text"] = df["text"].apply(clean_and_lemmatize)

    # Add numeric features (review_length; helpful_vote NaNs -> 0, or a zero
    # column when no category supplied it)
    df = add_features(df)

    # Generate embeddings
    embeddings = generate_embeddings(df["clean_text"].tolist())

    # Save if path provided
    if save_path:
        print("Saving data...")
        # np.save / to_pickle do not create missing parent directories.
        os.makedirs(save_path, exist_ok=True)
        np.save(os.path.join(save_path, "review_embeddings.npy"), embeddings)
        df.to_pickle(os.path.join(save_path, "review_df_with_clean_text.pkl"))
        print("Data saved to:", save_path)

    return df, embeddings


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
if __name__ == "__main__":
    from datasets import load_dataset

    # Hugging Face configuration name -> label written to the `category` column.
    categories = dict(
        raw_review_All_Beauty="Beauty",
        raw_review_Amazon_Fashion="Fashion",
        raw_review_CDs_and_Vinyl="CDs_and_Vinyl",
        raw_review_Appliances="Appliances",
        raw_review_Movies_and_TV="Movies_and_TV",
    )

    # Run the whole pipeline (up to 5000 rows per category) and store the
    # results on the mounted Drive folder.
    df, embeddings = preprocess_reviews(
        categories,
        save_path="/content/drive/MyDrive/DEAssignment3",
        max_rows=5000,
    )


Loading: Beauty
Loading: Fashion
Loading: CDs_and_Vinyl
Loading: Appliances
Loading: Movies_and_TV


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Saving data...
Data saved to: /content/drive/MyDrive/DEAssignment3
