<a href="https://colab.research.google.com/github/mmourtias/Data-Engineering-Roadmap/blob/main/amazon_reviews_4_models_comparison_ADAPTED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Amazon Reviews — Σύγκριση 4 λύσεων
**(α) Word2Vec, (β) GloVe, (γ) BERT + Μηχανική Μάθηση (feature extraction), (δ) BERT fine-tuning**



In [None]:


!pip -q install gensim torchtext transformers datasets evaluate



In [None]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from gensim.models import Word2Vec

import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel, TFAutoModelForSequenceClassification

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [5]:
# Step 1: Load το Amazon CSV



df = pd.read_csv("/content/Amazon_Unlocked_Mobile.csv")

print(df.shape)
df.head()


(413840, 6)


Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [6]:
# Step 1b: Φτιάχνουμε labels για sentiment από το Rating
# Συνηθισμένη επιλογή για binary sentiment:
# 1-2 -> negative (0)
# 4-5 -> positive (1)
# 3 -> neutral (το αφαιρούμε για καθαρό binary task)

df = df.dropna(subset=["Reviews", "Rating"]).copy()
df = df[df["Rating"].isin([1,2,3,4,5])].copy()

df = df[df["Rating"] != 3].copy()
df["label"] = (df["Rating"] >= 4).astype(int)

# Κρατάμε τα reviews ως input και το label ως output
df = df[["Reviews", "label"]].rename(columns={"Reviews":"text"})
print(df["label"].value_counts())
df.head()


label
1    284948
0     97059
Name: count, dtype: int64


Unnamed: 0,text,label
0,I feel so LUCKY to have found this used (phone...,1
1,"nice phone, nice up grade from my pantach revu...",1
2,Very pleased,1
3,It works good but it goes slow sometimes but i...,1
4,Great phone to replace my lost phone. The only...,1


In [7]:
# Προαιρετικό: sampling για πιο γρήγορη εκτέλεση
# MAX_SAMPLES = None για να χρησιμοποιήσεις όλο το dataset (~413k rows μετά το drop των 3-star)

MAX_SAMPLES = None  # π.χ. μπορώ να βάλω 100k για γρήγορη σύγκριση
if MAX_SAMPLES is not None and len(df) > MAX_SAMPLES:
    df = df.sample(n=MAX_SAMPLES, random_state=SEED).reset_index(drop=True)

print("Using rows:", len(df))


Using rows: 382007


In [8]:
# Step 1c: Train/Val/Test split (ίδιο για όλα τα μοντέλα για δίκαιη σύγκριση)
X = df["text"].astype(str).tolist()
y = df["label"].astype(int).tolist()

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SEED, stratify=y_temp
)

print(len(X_train), len(X_val), len(X_test))


305605 38201 38201


In [9]:
# Κοινό preprocessing για Word2Vec & GloVe:
# - lowercase
# - αφαίρεση στίξης/συμβόλων (μειώνει θόρυβο στο vocab)
# Για BERT δεν κάνουμε χειροκίνητο cleaning (κρατάμε raw text).

def clean_for_static_embeddings(s: str) -> str:
    s = str(s).lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def tokenize_simple(s: str):
    return clean_for_static_embeddings(s).split()

tok_train = [tokenize_simple(t) for t in X_train]
tok_val   = [tokenize_simple(t) for t in X_val]
tok_test  = [tokenize_simple(t) for t in X_test]


## (α) Word2Vec

In [10]:
# Step 2: Train Word2Vec
w2v_dim = 200
w2v_model = Word2Vec(
    sentences=tok_train,
    vector_size=w2v_dim,
    window=5,
    min_count=2,
    workers=4,
    sg=1,   # skip-gram
    seed=SEED
)
w2v_kv = w2v_model.wv
print("Word2Vec vocab size:", len(w2v_kv))


Word2Vec vocab size: 39321


In [11]:
# Step 3: Review embeddings με mean pooling

def mean_pool(tokens, model, dim):
    vecs = [model[w] for w in tokens if w in model]
    if not vecs:
        return np.zeros(dim, dtype=np.float32)
    return np.mean(vecs, axis=0).astype(np.float32)

def build_matrix(token_lists, model, dim):
    return np.vstack([mean_pool(toks, model, dim) for toks in token_lists])

Xtr_w2v = build_matrix(tok_train, w2v_kv, w2v_dim)
Xva_w2v = build_matrix(tok_val,   w2v_kv, w2v_dim)
Xte_w2v = build_matrix(tok_test,  w2v_kv, w2v_dim)
Xtr_w2v.shape


(305605, 200)

In [12]:
# Step 4: Classifier πάνω στα embeddings + metrics
clf_w2v = LogisticRegression(max_iter=2000, n_jobs=None)
clf_w2v.fit(Xtr_w2v, y_train)

pred_w2v = clf_w2v.predict(Xte_w2v)

w2v_acc = accuracy_score(y_test, pred_w2v)
w2v_f1m = f1_score(y_test, pred_w2v, average="macro")

print("Word2Vec TEST Accuracy:", w2v_acc)
print("Word2Vec TEST F1 macro:", w2v_f1m)
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred_w2v))


Word2Vec TEST Accuracy: 0.9275935184942803
Word2Vec TEST F1 macro: 0.9030147619099019

Confusion matrix:
 [[ 8102  1604]
 [ 1162 27333]]


## (β) GloVe + Logistic Regression (ίδιο pipeline με Word2Vec)

In [14]:
# Χρησιμοποιούμε pre-trained GloVe (torchtext).
# Έπειτα κάνουμε mean pooling ανά review, όπως και στο Word2Vec.

import gensim.downloader as api

glove_name = "glove-wiki-gigaword-200"  # 200d, κοντά στο 6B/200
glove_model = api.load(glove_name)     # κατεβάζει embeddings
glove_dim = glove_model.vector_size

print("GloVe loaded:", glove_name, "dim:", glove_dim, "vocab:", len(glove_model))



GloVe loaded: glove-wiki-gigaword-200 dim: 200 vocab: 400000


In [15]:
Xtr_g = build_matrix(tok_train, glove_model, glove_dim)
Xva_g = build_matrix(tok_val,   glove_model, glove_dim)
Xte_g = build_matrix(tok_test,  glove_model, glove_dim)

clf_g = LogisticRegression(max_iter=2000)
clf_g.fit(Xtr_g, y_train)

pred_g = clf_g.predict(Xte_g)

glove_acc = accuracy_score(y_test, pred_g)
glove_f1m = f1_score(y_test, pred_g, average="macro")

print("GloVe TEST Accuracy:", glove_acc)
print("GloVe TEST F1 macro:", glove_f1m)
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred_g))


GloVe TEST Accuracy: 0.8927253213266668
GloVe TEST F1 macro: 0.8535862423850633

Confusion matrix:
 [[ 7176  2530]
 [ 1568 26927]]


## (γ) BERT + Μηχανική Μάθηση (Traditional feature extraction)

In [17]:
# Σε αυτό το workflow:
# - ΔΕΝ εκπαιδεύουμε το BERT
# - Παίρνουμε embeddings (CLS) από DistilBERT
# - Εκπαιδεύουμε μόνο έναν κλασικό ταξινομητή (Logistic Regression)

import torch
from transformers import AutoTokenizer, AutoModel

bert_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_name)
bert_encoder = AutoModel.from_pretrained(bert_name)  # PyTorch
bert_encoder.eval()





DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [20]:
def bert_cls_embeddings(texts, batch_size=64, max_len=128):
    # Παίρνουμε το [CLS] embedding (πρώτο token) από το last_hidden_state
    all_vecs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, padding=True, truncation=True, max_length=max_len, return_tensors="tf")
        out = bert_encoder(**enc)

        cls = out.last_hidden_state[:, 0, :]  # [B, H]
        all_vecs.append(cls.numpy())
    return np.vstack(all_vecs)

# Υπολογίζουμε embeddings για train/test (με ίδια splits)
Xtr_b = bert_cls_embeddings(X_train)
Xte_b = bert_cls_embeddings(X_test)

print(Xtr_b.shape, Xte_b.shape)


AttributeError: EagerTensor object has no attribute 'size'. 
        If you are looking for numpy-related methods, please run the following:
        tf.experimental.numpy.experimental_enable_numpy_behavior()
      

In [None]:
clf_b = LogisticRegression(max_iter=2000)
clf_b.fit(Xtr_b, y_train)

pred_b = clf_b.predict(Xte_b)

bertml_acc = accuracy_score(y_test, pred_b)
bertml_f1m = f1_score(y_test, pred_b, average="macro")

print("BERT+ML TEST Accuracy:", bertml_acc)
print("BERT+ML TEST F1 macro:", bertml_f1m)
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred_b))


## (δ) BERT Fine-tuning (Modern workflow όπως στο TF notebook)

In [None]:
# Σε αυτό το workflow:
# - Εκπαιδεύουμε end-to-end το DistilBERT + classification head
# - (όπως στο TensorFlow notebook που έστειλες, με model.fit)

ft_model = TFAutoModelForSequenceClassification.from_pretrained(bert_name, num_labels=2)


In [None]:
# Προετοιμασία tf.data datasets
def make_tf_dataset(texts, labels, batch_size=16, shuffle=False):
    enc = tokenizer(texts, truncation=True, padding=True, max_length=128)
    ds = tf.data.Dataset.from_tensor_slices((dict(enc), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=min(10000, len(texts)), seed=SEED)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

tf_train = make_tf_dataset(X_train, y_train, batch_size=16, shuffle=True)
tf_val   = make_tf_dataset(X_val,   y_val,   batch_size=32, shuffle=False)
tf_test  = make_tf_dataset(X_test,  y_test,  batch_size=32, shuffle=False)


In [None]:
# Compile + fine-tune
ft_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]
)

history = ft_model.fit(tf_train, validation_data=tf_val, epochs=2)


In [None]:
# Αξιολόγηση στο test set
logits = ft_model.predict(tf_test).logits
pred_ft = np.argmax(logits, axis=-1)

bertft_acc = accuracy_score(y_test, pred_ft)
bertft_f1m = f1_score(y_test, pred_ft, average="macro")

print("BERT fine-tuning TEST Accuracy:", bertft_acc)
print("BERT fine-tuning TEST F1 macro:", bertft_f1m)
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred_ft))


## Σύνοψη αποτελεσμάτων (ίδιες μετρικές για δίκαιη σύγκριση)

In [None]:
results = pd.DataFrame([
    ["Word2Vec + LR", w2v_acc, w2v_f1m],
    ["GloVe + LR", glove_acc, glove_f1m],
    ["BERT (feature extraction) + LR", bertml_acc, bertml_f1m],
    ["BERT fine-tuning", bertft_acc, bertft_f1m],
], columns=["Model", "Test Accuracy", "Test F1 (macro)"])

results.sort_values("Test F1 (macro)", ascending=False)
