In [1]:
# Data Cleaning & Extraction for Sentiment Analysis (Amazon Reviews)
# - Uses content/Reviews.csv
# - Produces TF-IDF features and tokenized sequences
# - Saves train/val/test splits and artifacts in processed_data/



import os
import re
import json
import pickle
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

# NLTK setup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def _ensure_nltk():
    """Download the NLTK resources used by this notebook if they are missing.

    Checks the English stopword list, the WordNet corpus and the punkt
    tokenizer data, and calls ``nltk.download`` for whichever is absent.

    WordNet and punkt are looked up under both their plain resource name and
    the ``.zip`` variant. The captured cell output shows wordnet being
    "downloaded" on a run where it was already up to date, i.e. the plain
    ``corpora/wordnet`` lookup reported it missing — presumably because only
    the zip archive exists on disk (TODO confirm on the target machine).
    """

    def _resource_present(resource):
        # Accept either the unpacked resource or its zip archive.
        for candidate in (resource, resource + ".zip"):
            try:
                nltk.data.find(candidate)
            except LookupError:
                continue
            return True
        return False

    # Stopwords: probing the corpus reader raises LookupError when it is absent.
    try:
        stopwords.words("english")
    except LookupError:
        nltk.download("stopwords")

    if not _resource_present("corpora/wordnet"):
        nltk.download("wordnet")

    if not _resource_present("tokenizers/punkt"):
        nltk.download("punkt")

# Make sure NLTK data is available before the corpus objects below are built.
_ensure_nltk()
# Stored as a set; lemmatize_and_filter tests each token with `t in STOP_WORDS`.
STOP_WORDS = set(stopwords.words("english"))
# Single shared lemmatizer instance, reused by lemmatize_and_filter for every token.
LEMM = WordNetLemmatizer()



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



1) Load dataset (Amazon Reviews)


In [2]:

def load_dataset():
    """Read the first available reviews CSV from the ``content`` directory.

    The candidate files are tried in a fixed order and the first one that
    exists on disk is loaded with ``pd.read_csv``.

    Returns:
        tuple: ``(DataFrame, path)`` where ``path`` is the file that was read.

    Raises:
        FileNotFoundError: if none of the candidate files exists.
    """
    search_order = (
        os.path.join("content", "Reviews.csv"),
        os.path.join("content", "dataset_part1_random_70pct.csv"),
    )
    found = next((p for p in search_order if os.path.exists(p)), None)
    if found is None:
        raise FileNotFoundError("No dataset found in 'content/'. Expected Reviews.csv")

    print(f"Loading dataset: {found}")
    return pd.read_csv(found), found



In [3]:
# -----------------------------
# 2) Normalize columns and labels
# -----------------------------
def normalize_and_label(df):
    """Reduce a raw reviews frame to ``text`` and a binary integer ``label``.

    The text column is the first of ``Text``, ``text``, ``reviewText``,
    ``ReviewText``, ``review``, ``Review`` present in ``df``.

    Labelling:
      * With a ``Score`` column: scores >= 4 become 1, other scores become 0,
        and rows with score 3 (neutral) are dropped. Scores that cannot be
        parsed as numbers are treated as missing and dropped.
      * Otherwise, with a ``Sentiment`` column: ``"positive"`` -> 1 and
        ``"negative"`` -> 0 (case and surrounding whitespace ignored); any
        other value is dropped.

    Both paths return a fresh frame with columns ``["text", "label"]``, an
    integer ``label`` and a 0..n-1 index. The input frame is not modified.

    Raises:
        ValueError: if no text column is found, or neither ``Score`` nor
            ``Sentiment`` is present.
    """
    text_candidates = ("Text", "text", "reviewText", "ReviewText", "review", "Review")
    text_col = next((c for c in text_candidates if c in df.columns), None)
    if text_col is None:
        raise ValueError("Could not find text column. Expected 'Text' in Reviews.csv")

    if "Score" in df.columns:
        # Explicit copy so the column assignments below never write into a view of `df`.
        out = df.rename(columns={text_col: "text"})[["text", "Score"]].copy()
        # Non-numeric scores become NaN and are removed by the dropna below.
        out["Score"] = pd.to_numeric(out["Score"], errors="coerce")
        out = out.dropna(subset=["text", "Score"])
        # Drop neutral 3
        out = out[out["Score"] != 3].copy()
        out["label"] = (out["Score"] >= 4).astype(int)
        return out.drop(columns=["Score"]).reset_index(drop=True)

    if "Sentiment" in df.columns:
        out = df.rename(columns={text_col: "text"})[["text", "Sentiment"]].copy()
        normalized = out["Sentiment"].astype(str).str.strip().str.lower()
        # Unmapped values (including missing ones) become NaN and are dropped.
        out["label"] = normalized.map({"negative": 0, "positive": 1})
        out = out.dropna(subset=["text", "label"]).copy()
        # map() yields floats when NaN is present; match the Score path's int labels.
        out["label"] = out["label"].astype(int)
        return out[["text", "label"]].reset_index(drop=True)

    raise ValueError("Expected 'Score' column for Amazon Reviews dataset")



In [4]:
# -----------------------------
# 3) Text cleaning
# -----------------------------
def clean_text(s: str) -> str:
    """Lower-case ``s`` and reduce it to space-separated runs of a-z letters.

    The input is coerced with ``str()`` first, so non-string values are
    accepted. Substitutions run in order; each match is replaced by a single
    space, and the result is stripped of leading/trailing whitespace.
    """
    rules = (
        (r"<[^>]+>", " "),            # HTML tags
        (r"http\S+|www\.\S+", " "),   # URLs
        (r"[^a-z\s]", " "),           # anything that is not a-z or whitespace
        (r"\s+", " "),                # collapse whitespace runs
    )
    text = str(s).lower()
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def lemmatize_and_filter(s: str) -> str:
    """Drop stopwords and short tokens from ``s`` and lemmatize what remains.

    ``s`` is split on whitespace. Tokens found in ``STOP_WORDS`` or shorter
    than three characters are discarded; the survivors are passed through
    ``LEMM.lemmatize`` and re-joined with single spaces.
    """
    return " ".join(
        LEMM.lemmatize(word)
        for word in s.split()
        if word not in STOP_WORDS and len(word) >= 3
    )

def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``clean`` and ``proc`` text columns and drop rows that end up empty.

    ``clean`` is ``clean_text`` applied to ``text``; ``proc`` is
    ``lemmatize_and_filter`` applied to ``clean``. Works on a copy of ``df``
    and returns it with a fresh 0..n-1 index, keeping only rows whose
    ``proc`` string is non-empty.
    """
    print("Cleaning text...")
    cleaned = df["text"].apply(clean_text)

    print("Lemmatizing and removing stopwords...")
    processed = cleaned.apply(lemmatize_and_filter)

    result = df.copy()
    result["clean"] = cleaned
    result["proc"] = processed

    # Rows whose text was entirely stripped carry no signal; remove them.
    has_content = result["proc"].str.len() > 0
    return result[has_content].reset_index(drop=True)



In [5]:
# -----------------------------
# 4) Split into train/val/test
# -----------------------------
def stratified_splits(df, test_size=0.2, val_size=0.1, seed=42):
    """Split ``df`` into label-stratified train / validation / test sets.

    Features come from the ``proc`` column and targets from ``label``.
    ``test_size`` and ``val_size`` are both fractions of the full dataset;
    the test set is carved off first and the validation set is then taken
    from the remainder.

    Returns:
        tuple: ``(X_train, y_train), (X_val, y_val), (X_test, y_test)``.
    """
    labels = df["label"].values
    texts = df["proc"].values

    # First cut: hold out the test set.
    rest_texts, test_texts, rest_labels, test_labels = train_test_split(
        texts, labels, test_size=test_size, stratify=labels, random_state=seed
    )

    # val_size refers to the whole dataset, so rescale it to the remaining part.
    val_share_of_rest = val_size / (1 - test_size)
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        rest_texts,
        rest_labels,
        test_size=val_share_of_rest,
        stratify=rest_labels,
        random_state=seed,
    )

    return (train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels)



In [6]:
# -----------------------------
# 5) TF-IDF features (for ML models)
# -----------------------------
def build_tfidf(X_train, X_val, X_test, max_features=30000):
    """Fit a unigram+bigram TF-IDF vectorizer on the training texts.

    The vectorizer is fitted on ``X_train`` only and then used to transform
    the validation and test texts.

    Returns:
        tuple: ``(train_matrix, val_matrix, test_matrix, fitted_vectorizer)``.
    """
    vectorizer_options = dict(
        max_features=max_features,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.98,
    )
    vectorizer = TfidfVectorizer(**vectorizer_options)

    train_matrix = vectorizer.fit_transform(X_train)
    val_matrix, test_matrix = (vectorizer.transform(part) for part in (X_val, X_test))
    return train_matrix, val_matrix, test_matrix, vectorizer

# -----------------------------
# 6) Simple tokenizer + sequences (for DL models)
# -----------------------------
def build_vocab(texts, max_words=50000, min_freq=2):
    """Build a frequency-ranked word -> id mapping from whitespace-split texts.

    Tokens occurring fewer than ``min_freq`` times are excluded. The rest are
    ordered by descending count (ties broken alphabetically), truncated to
    ``max_words`` entries and numbered from 1; id 0 is left free for padding.

    Returns:
        tuple: ``(word_index, counts)`` where ``counts`` is the ``Counter``
        over every token seen, including those left out of ``word_index``.
    """
    from collections import Counter

    counts = Counter(token for line in texts for token in line.split())
    frequent = (word for word, freq in counts.items() if freq >= min_freq)
    ranked = sorted(frequent, key=lambda word: (-counts[word], word))
    word_index = {word: rank for rank, word in enumerate(ranked[:max_words], start=1)}
    return word_index, counts

def texts_to_padded_sequences(texts, word_index, max_len=200):
    """Encode whitespace-split texts as a fixed-width int32 id matrix.

    Each text becomes one row of length ``max_len``: token ids are looked up
    in ``word_index``, longer texts are truncated on the right and shorter
    ones are right-padded with 0. Tokens missing from ``word_index`` are also
    encoded as 0, so they are indistinguishable from padding.

    The result always has shape ``(len(texts), max_len)``, including for
    empty input, where stacking a list of rows would yield shape ``(0,)``.

    Args:
        texts: iterable of strings (a generator is materialised first).
        word_index: mapping from token to integer id.
        max_len: number of columns in the output.

    Returns:
        np.ndarray: int32 array of shape ``(n_texts, max_len)``.
    """
    if not hasattr(texts, "__len__"):
        texts = list(texts)

    # Preallocating fixes the output shape and avoids a temporary list of rows.
    padded = np.zeros((len(texts), max_len), dtype=np.int32)
    for row, line in enumerate(texts):
        ids = [word_index.get(token, 0) for token in line.split()[:max_len]]
        if ids:
            padded[row, : len(ids)] = ids
    return padded



In [9]:
# --------- RUN ---------
# Runs the full pipeline, then reloads the saved artifacts from processed_data/
# and prints their shapes as a sanity check.
# NOTE(review): run_pipeline is not defined in the cells shown here; it is
# presumably defined in an earlier cell (In [7]/[8]) — confirm before Run All.
# Tip: Set limit_rows for a faster first run (e.g., 100000). None uses all rows.
run_pipeline(limit_rows=None, seed=42)

# Minimal quick check (no extra helper loaders)
# NOTE(review): scipy.sparse, numpy, os and json are already imported in the
# first cell; these repeats are redundant there.
from scipy import sparse
import numpy as np, os, json

# TF-IDF (for Logistic Regression)
X_train_tfidf = sparse.load_npz(os.path.join("processed_data", "X_train_tfidf.npz"))
X_val_tfidf   = sparse.load_npz(os.path.join("processed_data", "X_val_tfidf.npz"))
X_test_tfidf  = sparse.load_npz(os.path.join("processed_data", "X_test_tfidf.npz"))

# Sequences + Embedding (for RNN/LSTM)
X_train_seq = np.load(os.path.join("processed_data", "X_train_seq.npy"))
X_val_seq   = np.load(os.path.join("processed_data", "X_val_seq.npy"))
X_test_seq  = np.load(os.path.join("processed_data", "X_test_seq.npy"))
# NOTE(review): the run log below reports "GloVe not found. Using random
# embeddings.", so this matrix may hold random vectors despite its file name.
emb         = np.load(os.path.join("processed_data", "embedding_matrix_glove100.npy"))

# Labels
y_train = np.load(os.path.join("processed_data", "y_train.npy"))
y_val   = np.load(os.path.join("processed_data", "y_val.npy"))
y_test  = np.load(os.path.join("processed_data", "y_test.npy"))

# Shape summary: row counts should agree across TF-IDF, sequences and labels per split.
print("Loaded artifacts:")
print("- TF-IDF shapes:", X_train_tfidf.shape, X_val_tfidf.shape, X_test_tfidf.shape)
print("- Seq shapes:",   X_train_seq.shape, X_val_seq.shape, X_test_seq.shape)
print("- Embedding:",    emb.shape)
print("- y sizes:",      y_train.shape, y_val.shape, y_test.shape)

Loading dataset: content\Reviews.csv
Rows in raw: 158506
Rows after label filtering (drop score=3): 146514
Cleaning text...
Lemmatizing and removing stopwords...
Saved cleaned dataset -> processed_data\processed_dataset.csv
Splits -> train: 102557, val: 14652, test: 29303
Saved TF-IDF features and vectorizer.
Saved tokenized padded sequences.
Saved labels.
GloVe not found. Using random embeddings. (Optional: place glove.6B.100d.txt in content/)
Saved embedding matrix (GloVe/random).

All done. Files written to ./processed_data
Loaded artifacts:
- TF-IDF shapes: (102557, 30000) (14652, 30000) (29303, 30000)
- Seq shapes: (102557, 200) (14652, 200) (29303, 200)
- Embedding: (28533, 100)
- y sizes: (102557,) (14652,) (29303,)
