<a href="https://colab.research.google.com/github/mmourtias/Data-Engineering-Roadmap/blob/main/amazon_reviews_4_models_comparison_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/mmourtias/Data-Engineering-Roadmap/blob/main/amazon_reviews_4_models_comparison_ADAPTED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Amazon Reviews — Σύγκριση 4 λύσεων
**(α) Word2Vec, (β) GloVe, (γ) BERT + Μηχανική Μάθηση (feature extraction), (δ) BERT fine-tuning**



In [2]:
!pip -q install gensim transformers datasets evaluate


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from gensim.models import Word2Vec

import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
import evaluate

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x7f43f79a7d90>

In [5]:
# Step 1: Load το Amazon CSV



df = pd.read_csv("/content/Amazon_Unlocked_Mobile.csv")

print(df.shape)
df.head()


(413840, 6)


Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [6]:
# Step 1b: Φτιάχνουμε labels για sentiment από το Rating
# Συνηθισμένη επιλογή για binary sentiment:
# 1-2 -> negative (0)
# 4-5 -> positive (1)
# 3 -> neutral (το αφαιρούμε για καθαρό binary task)

df = df.dropna(subset=["Reviews", "Rating"]).copy()
df = df[df["Rating"].isin([1,2,3,4,5])].copy()

df = df[df["Rating"] != 3].copy()
df["label"] = (df["Rating"] >= 4).astype(int)

# Κρατάμε τα reviews ως input και το label ως output
df = df[["Reviews", "label"]].rename(columns={"Reviews":"text"})
print(df["label"].value_counts())
df.head()


label
1    284948
0     97059
Name: count, dtype: int64


Unnamed: 0,text,label
0,I feel so LUCKY to have found this used (phone...,1
1,"nice phone, nice up grade from my pantach revu...",1
2,Very pleased,1
3,It works good but it goes slow sometimes but i...,1
4,Great phone to replace my lost phone. The only...,1


In [7]:
# Προαιρετικό: sampling για πιο γρήγορη εκτέλεση
# MAX_SAMPLES = None για να χρησιμοποιήσεις όλο το dataset (~413k rows μετά το drop των 3-star)

MAX_SAMPLES = 100000  # π.χ. μπορώ να βάλω 100k για γρήγορη σύγκριση
if MAX_SAMPLES is not None and len(df) > MAX_SAMPLES:
    df = df.sample(n=MAX_SAMPLES, random_state=SEED).reset_index(drop=True)

print("Using rows:", len(df))


Using rows: 100000


In [8]:
# Step 1c: Train/Val/Test split (ίδιο για όλα τα μοντέλα για δίκαιη σύγκριση)
X = df["text"].astype(str).tolist()
y = df["label"].astype(int).tolist()

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SEED, stratify=y_temp
)

print(len(X_train), len(X_val), len(X_test))


80000 10000 10000


In [9]:
# Κοινό preprocessing για Word2Vec & GloVe:
# - lowercase
# - αφαίρεση στίξης/συμβόλων (μειώνει θόρυβο στο vocab)
# Για BERT δεν κάνουμε χειροκίνητο cleaning (κρατάμε raw text).

def clean_for_static_embeddings(s: str) -> str:
    s = str(s).lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def tokenize_simple(s: str):
    return clean_for_static_embeddings(s).split()

tok_train = [tokenize_simple(t) for t in X_train]
tok_val   = [tokenize_simple(t) for t in X_val]
tok_test  = [tokenize_simple(t) for t in X_test]


## (α) Word2Vec

In [10]:
# Step 2: Train Word2Vec
w2v_dim = 200
w2v_model = Word2Vec(
    sentences=tok_train,
    vector_size=w2v_dim,
    window=5,
    min_count=2,
    workers=4,
    sg=1,   # skip-gram
    seed=SEED
)
w2v_kv = w2v_model.wv
print("Word2Vec vocab size:", len(w2v_kv))


Word2Vec vocab size: 19678


In [11]:
# Step 3: Review embeddings με mean pooling

def mean_pool(tokens, model, dim):
    vecs = [model[w] for w in tokens if w in model]
    if not vecs:
        return np.zeros(dim, dtype=np.float32)
    return np.mean(vecs, axis=0).astype(np.float32)

def build_matrix(token_lists, model, dim):
    return np.vstack([mean_pool(toks, model, dim) for toks in token_lists])

Xtr_w2v = build_matrix(tok_train, w2v_kv, w2v_dim)
Xva_w2v = build_matrix(tok_val,   w2v_kv, w2v_dim)
Xte_w2v = build_matrix(tok_test,  w2v_kv, w2v_dim)
Xtr_w2v.shape


(80000, 200)

In [12]:
# Step 4: Classifier πάνω στα embeddings + metrics
clf_w2v = LogisticRegression(max_iter=2000, n_jobs=None)
clf_w2v.fit(Xtr_w2v, y_train)

pred_w2v = clf_w2v.predict(Xte_w2v)

w2v_acc = accuracy_score(y_test, pred_w2v)
w2v_f1m = f1_score(y_test, pred_w2v, average="macro")

print("Word2Vec TEST Accuracy:", w2v_acc)
print("Word2Vec TEST F1 macro:", w2v_f1m)
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred_w2v))


Word2Vec TEST Accuracy: 0.9245
Word2Vec TEST F1 macro: 0.8985855368871603

Confusion matrix:
 [[2095  442]
 [ 313 7150]]


## (β) GloVe + Logistic Regression (ίδιο pipeline με Word2Vec)

In [13]:
# Χρησιμοποιούμε pre-trained GloVe embeddings.
# Στο Colab αποφεύγουμε torchtext (συχνά έχει ασυμβατότητες με torch),
# οπότε φορτώνουμε GloVe μέσω gensim και κάνουμε mean pooling όπως στο Word2Vec.

import gensim.downloader as api

glove_name = "glove-wiki-gigaword-200"  # 200d
glove_model = api.load(glove_name)      # κατεβάζει embeddings
glove_dim = glove_model.vector_size

print("GloVe loaded:", glove_name, "dim:", glove_dim, "vocab:", len(glove_model))


GloVe loaded: glove-wiki-gigaword-200 dim: 200 vocab: 400000


In [14]:
Xtr_g = build_matrix(tok_train, glove_model, glove_dim)
Xva_g = build_matrix(tok_val,   glove_model, glove_dim)
Xte_g = build_matrix(tok_test,  glove_model, glove_dim)

clf_g = LogisticRegression(max_iter=2000)
clf_g.fit(Xtr_g, y_train)

pred_g = clf_g.predict(Xte_g)

glove_acc = accuracy_score(y_test, pred_g)
glove_f1m = f1_score(y_test, pred_g, average="macro")

print("GloVe TEST Accuracy:", glove_acc)
print("GloVe TEST F1 macro:", glove_f1m)
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred_g))


GloVe TEST Accuracy: 0.8896
GloVe TEST F1 macro: 0.8486410516507475

Confusion matrix:
 [[1847  690]
 [ 414 7049]]


## (γ) BERT + Μηχανική Μάθηση (Traditional feature extraction)

In [15]:
# Σε αυτό το workflow:
# - ΔΕΝ εκπαιδεύουμε το BERT
# - Παίρνουμε embeddings (CLS) από DistilBERT
# - Εκπαιδεύουμε μόνο έναν κλασικό ταξινομητή (Logistic Regression)

import torch
from transformers import AutoTokenizer, AutoModel

bert_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_name)
bert_encoder = AutoModel.from_pretrained(bert_name)  # PyTorch
bert_encoder.eval()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [16]:
# Παίρνουμε το [CLS] embedding (πρώτο token) από το last_hidden_state (DistilBERT).
# Σημείωση: Χρησιμοποιούμε raw text (X_train / X_val / X_test) όπως ορίστηκαν στο split.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_encoder.to(device)
bert_encoder.eval()

@torch.no_grad()
def bert_cls_embeddings(texts, batch_size=64, max_len=128):
    all_vecs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        out = bert_encoder(**enc)
        cls = out.last_hidden_state[:, 0, :].detach().cpu().numpy()  # [B, H]
        all_vecs.append(cls)
    return np.vstack(all_vecs)

# Υπολογίζουμε embeddings για train/test (με ίδια splits)
Xtr_b = bert_cls_embeddings(X_train)
Xte_b = bert_cls_embeddings(X_test)

print("BERT CLS shapes:", Xtr_b.shape, Xte_b.shape)


BERT CLS shapes: (80000, 768) (10000, 768)


In [17]:
clf_b = LogisticRegression(max_iter=2000)
clf_b.fit(Xtr_b, y_train)

pred_b = clf_b.predict(Xte_b)

bertml_acc = accuracy_score(y_test, pred_b)
bertml_f1m = f1_score(y_test, pred_b, average="macro")

print("BERT+ML TEST Accuracy:", bertml_acc)
print("BERT+ML TEST F1 macro:", bertml_f1m)
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred_b))


BERT+ML TEST Accuracy: 0.9348
BERT+ML TEST F1 macro: 0.9134348294910435

Confusion matrix:
 [[2190  347]
 [ 305 7158]]


## (δ) BERT Fine-tuning (Modern workflow)

Σε αυτό το workflow κάνουμε fine-tuning end-to-end του DistilBERT με classification head πάνω στα ίδια splits.

In [19]:
# Fine-tuning με HuggingFace Trainer (PyTorch)
# - ίδια δεδομένα/splits/μετρικές
# - μικρός αριθμός εποχών για να ολοκληρώνεται σε Colab

metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
    }

ft_tokenizer = tokenizer  # ίδιο tokenizer (distilbert-base-uncased)

# Φτιάχνουμε HF datasets από τα ήδη split lists
from datasets import Dataset

train_ds = Dataset.from_dict({"text": X_train, "label": y_train})
val_ds   = Dataset.from_dict({"text": X_val,   "label": y_val})
test_ds  = Dataset.from_dict({"text": X_test,  "label": y_test})

def tok_fn(batch):
    return ft_tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

train_ds = train_ds.map(tok_fn, batched=True)
val_ds   = val_ds.map(tok_fn, batched=True)
test_ds  = test_ds.map(tok_fn, batched=True)

cols = ["input_ids", "attention_mask", "label"]
train_ds.set_format(type="torch", columns=cols)
val_ds.set_format(type="torch", columns=cols)
test_ds.set_format(type="torch", columns=cols)

ft_model = AutoModelForSequenceClassification.from_pretrained(bert_name, num_labels=2)

args = TrainingArguments(
    output_dir="./bert_ft_out",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    seed=SEED,
    logging_steps=50,
)

trainer = Trainer(
    model=ft_model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=ft_tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.1482,0.110976,0.9652,0.953649
2,0.0717,0.121304,0.9709,0.961631


TrainOutput(global_step=10000, training_loss=0.11115442472696305, metrics={'train_runtime': 2103.8906, 'train_samples_per_second': 76.05, 'train_steps_per_second': 4.753, 'total_flos': 5298695946240000.0, 'train_loss': 0.11115442472696305, 'epoch': 2.0})

In [20]:
# Αξιολόγηση στο test set (fine-tuned model)
test_metrics = trainer.evaluate(test_ds)
bertft_acc = test_metrics["eval_accuracy"]
bertft_f1m = test_metrics["eval_f1_macro"]

print("BERT fine-tuning TEST Accuracy:", bertft_acc)
print("BERT fine-tuning TEST F1 macro:", bertft_f1m)

# Confusion matrix στο test
pred_logits = trainer.predict(test_ds).predictions
pred_ft = np.argmax(pred_logits, axis=-1)
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred_ft))


BERT fine-tuning TEST Accuracy: 0.9707
BERT fine-tuning TEST F1 macro: 0.9612769416174298

Confusion matrix:
 [[2387  150]
 [ 143 7320]]


## Σύνοψη αποτελεσμάτων (ίδιες μετρικές για δίκαιη σύγκριση)

In [21]:
results = pd.DataFrame([
    ["Word2Vec + LR", w2v_acc, w2v_f1m],
    ["GloVe + LR", glove_acc, glove_f1m],
    ["BERT (feature extraction) + LR", bertml_acc, bertml_f1m],
    ["BERT fine-tuning", bertft_acc, bertft_f1m],
], columns=["Model", "Test Accuracy", "Test F1 (macro)"])

results.sort_values("Test F1 (macro)", ascending=False)


Unnamed: 0,Model,Test Accuracy,Test F1 (macro)
3,BERT fine-tuning,0.9707,0.961277
2,BERT (feature extraction) + LR,0.9348,0.913435
0,Word2Vec + LR,0.9245,0.898586
1,GloVe + LR,0.8896,0.848641



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

