In [1]:
# Data Cleaning & Extraction for Sentiment Analysis (Amazon Reviews)
# - Uses content/Reviews.csv
# - Produces TF-IDF features and tokenized sequences
# - Saves train/val/test splits and artifacts in processed_data/



import os
import re
import json
import pickle
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

# NLTK setup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def _ensure_nltk():
    try:
        _ = stopwords.words("english")
    except LookupError:
        nltk.download("stopwords")
    try:
        _ = nltk.data.find("corpora/wordnet")
    except LookupError:
        nltk.download("wordnet")
    try:
        _ = nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt")

_ensure_nltk()
STOP_WORDS = set(stopwords.words("english"))
LEMM = WordNetLemmatizer()



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



1) Load dataset (Amazon Reviews)


In [2]:

def load_dataset():
    candidates = [
        os.path.join("content", "Reviews.csv"),
        os.path.join("content", "dataset_part1_random_70pct.csv"),
    ]
    for path in candidates:
        if os.path.exists(path):
            print(f"Loading dataset: {path}")
            df = pd.read_csv(path)
            return df, path
    raise FileNotFoundError("No dataset found in 'content/'. Expected Reviews.csv")



In [3]:
# -----------------------------
# 2) Normalize columns and labels
# -----------------------------
def normalize_and_label(df):
    # Expecting columns: 'Text' and 'Score'
    # Map Score 1-2 -> 0 (negative), 4-5 -> 1 (positive), drop 3 (neutral)
    text_col = None
    for c in ["Text", "text", "reviewText", "ReviewText", "review", "Review"]:
        if c in df.columns:
            text_col = c
            break
    if text_col is None:
        raise ValueError("Could not find text column. Expected 'Text' in Reviews.csv")

    if "Score" not in df.columns:
        # If alternative sentiment column exists, try mapping
        if "Sentiment" in df.columns:
            df = df.rename(columns={text_col: "text", "Sentiment": "sentiment"})
            df["label"] = df["sentiment"].map({"negative": 0, "positive": 1})
            df = df.dropna(subset=["text", "label"])
            return df[["text", "label"]]
        else:
            raise ValueError("Expected 'Score' column for Amazon Reviews dataset")

    df = df.rename(columns={text_col: "text"})
    df = df[["text", "Score"]].dropna(subset=["text", "Score"])
    # Drop neutral 3
    df = df[df["Score"] != 3]
    df["label"] = (df["Score"] >= 4).astype(int)
    df = df.drop(columns=["Score"]).reset_index(drop=True)
    return df



In [4]:
# -----------------------------
# 3) Text cleaning
# -----------------------------
def clean_text(s: str) -> str:
    s = str(s)
    s = s.lower()
    s = re.sub(r"<[^>]+>", " ", s)            # HTML tags
    s = re.sub(r"http\S+|www\.\S+", " ", s)   # URLs
    s = re.sub(r"[^a-z\s]", " ", s)           # keep letters and space
    s = re.sub(r"\s+", " ", s).strip()
    return s

def lemmatize_and_filter(s: str) -> str:
    tokens = s.split()  # simple fast tokenization on whitespace
    kept = []
    for t in tokens:
        if t in STOP_WORDS: 
            continue
        if len(t) < 3:
            continue
        kept.append(LEMM.lemmatize(t))
    return " ".join(kept)

def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    print("Cleaning text...")
    df = df.copy()
    df["clean"] = df["text"].apply(clean_text)
    print("Lemmatizing and removing stopwords...")
    df["proc"] = df["clean"].apply(lemmatize_and_filter)
    # drop empty after processing
    df = df[df["proc"].str.len() > 0].reset_index(drop=True)
    return df



In [5]:
# -----------------------------
# 4) Split into train/val/test
# -----------------------------
def stratified_splits(df, test_size=0.2, val_size=0.1, seed=42):
    y = df["label"].values
    X = df["proc"].values

    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=seed
    )
    val_ratio = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_ratio, stratify=y_temp, random_state=seed
    )
    return (X_train, y_train), (X_val, y_val), (X_test, y_test)



In [6]:
# -----------------------------
# 5) TF-IDF features (for ML models)
# -----------------------------
def build_tfidf(X_train, X_val, X_test, max_features=30000):
    vect = TfidfVectorizer(
        max_features=max_features,
        ngram_range=(1,2),
        min_df=2,
        max_df=0.98
    )
    Xtr = vect.fit_transform(X_train)
    Xv = vect.transform(X_val)
    Xte = vect.transform(X_test)
    return Xtr, Xv, Xte, vect

# -----------------------------
# 6) Simple tokenizer + sequences (for DL models)
# -----------------------------
def build_vocab(texts, max_words=50000, min_freq=2):
    from collections import Counter
    cnt = Counter()
    for s in texts:
        cnt.update(s.split())
    # keep tokens by freq
    vocab = [w for w, f in cnt.items() if f >= min_freq]
    vocab.sort(key=lambda w: (-cnt[w], w))
    vocab = vocab[:max_words]
    word_index = {w: i+1 for i, w in enumerate(vocab)}  # 0 reserved for PAD
    return word_index, cnt

def texts_to_padded_sequences(texts, word_index, max_len=200):
    seqs = []
    for s in texts:
        ids = [word_index.get(w, 0) for w in s.split()]
        if len(ids) >= max_len:
            ids = ids[:max_len]
        else:
            ids = ids + [0] * (max_len - len(ids))
        seqs.append(ids)
    return np.array(seqs, dtype=np.int32)



In [7]:

import os, json, numpy as np
from scipy import sparse
import pickle

os.makedirs("processed_data", exist_ok=True)

# 1) Load + normalize
df_raw, used_path = load_dataset()
df = normalize_and_label(df_raw)
df = preprocess_dataframe(df)

# Persist cleaned dataset
cleaned_path = os.path.join("processed_data", "processed_dataset.csv")
df[["text", "proc", "label"]].to_csv(cleaned_path, index=False, encoding="utf-8")
print(f"Saved cleaned dataset -> {cleaned_path}")

# 2) Splits
(X_train, y_train), (X_val, y_val), (X_test, y_test) = stratified_splits(
    df, test_size=0.2, val_size=0.1, seed=42
)
print(f"Splits -> train: {len(X_train)}, val: {len(X_val)}, test: {len(X_test)}")

# 3) TF-IDF (for Logistic Regression)
Xtr_tfidf, Xv_tfidf, Xte_tfidf, tfidf_vect = build_tfidf(X_train, X_val, X_test, max_features=30000)
sparse.save_npz(os.path.join("processed_data", "X_train_tfidf.npz"), Xtr_tfidf)
sparse.save_npz(os.path.join("processed_data", "X_val_tfidf.npz"),   Xv_tfidf)
sparse.save_npz(os.path.join("processed_data", "X_test_tfidf.npz"),  Xte_tfidf)
with open(os.path.join("processed_data", "tfidf_vectorizer.pkl"), "wb") as f:
    pickle.dump(tfidf_vect, f)
print("Saved TF-IDF features and vectorizer.")

# 4) Sequences (for RNN/LSTM)
word_index, _ = build_vocab(X_train, max_words=50000, min_freq=2)
with open(os.path.join("processed_data", "word_index.json"), "w", encoding="utf-8") as f:
    json.dump(word_index, f)

max_len = 200
Xtr_seq = texts_to_padded_sequences(X_train, word_index, max_len=max_len)
Xv_seq  = texts_to_padded_sequences(X_val,   word_index, max_len=max_len)
Xte_seq = texts_to_padded_sequences(X_test,  word_index, max_len=max_len)
np.save(os.path.join("processed_data", "X_train_seq.npy"), Xtr_seq)
np.save(os.path.join("processed_data", "X_val_seq.npy"),   Xv_seq)
np.save(os.path.join("processed_data", "X_test_seq.npy"),  Xte_seq)
print("Saved tokenized padded sequences.")

# 5) Labels
np.save(os.path.join("processed_data", "y_train.npy"), y_train)
np.save(os.path.join("processed_data", "y_val.npy"),   y_val)
np.save(os.path.join("processed_data", "y_test.npy"),  y_test)
print("Saved labels.")

# 6) Embedding matrix (GloVe if available, else random)
glove_path = os.path.join("content", "glove.6B.100d.txt")
embed_dim = 100
vocab_size = (max(word_index.values()) + 1) if word_index else 1
rng = np.random.default_rng(42)
emb = rng.normal(scale=0.02, size=(vocab_size, embed_dim)).astype(np.float32)
emb[0] = 0.0  # PAD row

if os.path.exists(glove_path):
    print(f"Loading GloVe from {glove_path} ...")
    found = 0
    with open(glove_path, "r", encoding="utf8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            token, vec = parts[0], parts[1:]
            if len(vec) != embed_dim: 
                continue
            if token in word_index:
                emb[word_index[token]] = np.asarray(vec, dtype=np.float32)
                found += 1
    print(f"GloVe matches: {found}/{vocab_size-1}")
else:
    print("GloVe not found. Using random embeddings.")

np.save(os.path.join("processed_data", "embedding_matrix_glove100.npy"), emb)
print("Saved embedding matrix.")

# --------- Minimal quick check ---------
X_train_tfidf = sparse.load_npz(os.path.join("processed_data", "X_train_tfidf.npz"))
X_val_tfidf   = sparse.load_npz(os.path.join("processed_data", "X_val_tfidf.npz"))
X_test_tfidf  = sparse.load_npz(os.path.join("processed_data", "X_test_tfidf.npz"))

X_train_seq = np.load(os.path.join("processed_data", "X_train_seq.npy"))
X_val_seq   = np.load(os.path.join("processed_data", "X_val_seq.npy"))
X_test_seq  = np.load(os.path.join("processed_data", "X_test_seq.npy"))
emb         = np.load(os.path.join("processed_data", "embedding_matrix_glove100.npy"))

y_train = np.load(os.path.join("processed_data", "y_train.npy"))
y_val   = np.load(os.path.join("processed_data", "y_val.npy"))
y_test  = np.load(os.path.join("processed_data", "y_test.npy"))

print("\nLoaded artifacts:")
print("- TF-IDF shapes:", X_train_tfidf.shape, X_val_tfidf.shape, X_test_tfidf.shape)
print("- Seq shapes:",   X_train_seq.shape, X_val_seq.shape, X_test_seq.shape)
print("- Embedding:",    emb.shape)
print("- y sizes:",      y_train.shape, y_val.shape, y_test.shape)

Loading dataset: content\Reviews.csv
Cleaning text...
Lemmatizing and removing stopwords...
Saved cleaned dataset -> processed_data\processed_dataset.csv
Splits -> train: 49842, val: 7121, test: 14241
Saved TF-IDF features and vectorizer.
Saved tokenized padded sequences.
Saved labels.
GloVe not found. Using random embeddings.
Saved embedding matrix.

Loaded artifacts:
- TF-IDF shapes: (49842, 30000) (7121, 30000) (14241, 30000)
- Seq shapes: (49842, 200) (7121, 200) (14241, 200)
- Embedding: (20530, 100)
- y sizes: (49842,) (7121,) (14241,)


In [1]:
# Minimal LSTM for sentiment classification using processed_data
import os, numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from sklearn.metrics import accuracy_score, f1_score

# Reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Load data
X_train = np.load(os.path.join("processed_data", "X_train_seq.npy"))
X_val   = np.load(os.path.join("processed_data", "X_val_seq.npy"))
X_test  = np.load(os.path.join("processed_data", "X_test_seq.npy"))
y_train = np.load(os.path.join("processed_data", "y_train.npy"))
y_val   = np.load(os.path.join("processed_data", "y_val.npy"))
y_test  = np.load(os.path.join("processed_data", "y_test.npy"))
emb     = np.load(os.path.join("processed_data", "embedding_matrix_glove100.npy"))

max_len = X_train.shape[1]
vocab_size, embed_dim = emb.shape

# Model
inp = layers.Input(shape=(max_len,), dtype="int32")
embd = layers.Embedding(vocab_size, embed_dim, weights=[emb], trainable=False, mask_zero=True)(inp)
x = layers.LSTM(128)(embd)
out = layers.Dense(1, activation="sigmoid")(x)
model = models.Model(inp, out)

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

es = callbacks.EarlyStopping(monitor="val_loss", patience=1, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=256,
    callbacks=[es],
    verbose=1
)

# Evaluate
loss, acc = model.evaluate(X_test, y_test, verbose=0)
pred = (model.predict(X_test, batch_size=512, verbose=0).ravel() >= 0.5).astype(int)
print(f"Test accuracy: {acc:.4f}, F1: {f1_score(y_test, pred):.4f}")

# Save
model.save(os.path.join("processed_data", "lstm_glove.keras"))

Epoch 1/3
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m302s[0m 1s/step - accuracy: 0.8433 - loss: 0.4390 - val_accuracy: 0.8448 - val_loss: 0.4142
Epoch 2/3
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 1s/step - accuracy: 0.8453 - loss: 0.4082 - val_accuracy: 0.8438 - val_loss: 0.4007
Epoch 3/3
[1m195/195[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 1s/step - accuracy: 0.8456 - loss: 0.3969 - val_accuracy: 0.8457 - val_loss: 0.3965
Test accuracy: 0.8448, F1: 0.9149
