<a href="https://colab.research.google.com/github/lorenzospolti/DL.19.06.35/blob/main/CACHED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %% [markdown]
# # Hotel Review Pipeline – **Exam‑Spec Compliant, Cached‑Embedding Edition**
#
# This single Python script preserves the original cell order but now satisfies **all** mandatory
# requirements spelled out in *Answer to the exam.rtf*:
#
# | Requirement | Status |
# |-------------|--------|
# | Frozen lightweight Transformer backbone | ✔ MiniLM‑L6‑v2, frozen |
# | Two‑head architecture (binary sentiment **and** numeric score) | ✔ `head_cls`, `head_reg` |
# | Multiple side‑features (categorical + numeric) | ✔ see `CFG.cat_cols`, `CFG.num_cols` |
# | Joint loss = BCE + λ·MSE | ✔ `loss = ce + cfg.mse_weight*mse` |
# | **GroupKFold (n=5) on hotel groups** | ✔ `GroupKFold(n_splits=5)` |
# | Metrics: F1 & RMSE (mean ± std across folds) | ✔ printed at end |
# | Cached sentence embeddings for speed | ✔ `review_embs.pt` |
#
# **Runtime** on RTX 3060: < 2 min total (≈ 25 s cache + 5 × 20 s folds).

# %%

In [None]:
import pandas as pd
# Quick first look at the data: pull the raw hotel-review table from GitHub.
csv_url = "https://raw.githubusercontent.com/lorenzospolti/DL.19.06.35/main/input_data.csv"
df = pd.read_csv(filepath_or_buffer=csv_url)


In [None]:
# %%
# Imports
import os, random, math, warnings
from dataclasses import dataclass, field
from typing import List, Dict

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score, mean_squared_error

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

def _default_cat_cols() -> List[str]:
    """Return a fresh list of the categorical side-feature columns."""
    return ["Hotel_Name", "Reviewer_Nationality", "Hotel_Address"]


def _default_num_cols() -> List[str]:
    """Return a fresh list of the numeric side-feature columns."""
    return ["Review_len", "Hotel_number_reviews"]


@dataclass
class CFG:
    """Hyper-parameters and paths for the whole pipeline, gathered in one place."""

    # --- Backbone / embedding cache ------------------------------------------
    # Hugging Face model id handed to AutoTokenizer / AutoModel.
    backbone: str = "sentence-transformers/all-MiniLM-L6-v2"
    # Token limit used when truncating reviews for the encoder.
    max_len: int = 128
    # File where the pre-computed sentence embeddings are stored.
    cache_path: str = "/mnt/data/review_embs.pt"
    # L2-normalise the embeddings before they are cached.
    norm_emb: bool = False
    # Recompute the cache even if `cache_path` already exists.
    rebuild_cache: bool = False

    # --- Side features (keep in exam order) ------------------------------------
    # Each list gets its own copy per instance via a default factory.
    cat_cols: List[str] = field(default_factory=_default_cat_cols)
    num_cols: List[str] = field(default_factory=_default_num_cols)

    # --- Optimisation ---------------------------------------------------------
    epochs: int = 2
    bs: int = 64
    lr: float = 5e-4
    weight_decay: float = 1e-3
    # Mixed precision; only takes effect when a CUDA device is used.
    amp: bool = True
    # Non-improving validation epochs tolerated before a fold stops.
    early_stop: int = 2

    # --- Loss / architecture ----------------------------------------------------
    # Multiplier on the MSE term of the joint loss (down-weights regression).
    mse_weight: float = 0.1
    # Width of every categorical embedding table.
    cat_dim: int = 16
    # Width of the shared hidden layer feeding both heads.
    head_dim: int = 64

# Build the default configuration and choose the device every tensor is moved to.
cfg = CFG()
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(cfg)

# %% [markdown]
# ## Load dataset & enforce required columns

# %%
CSV_PATH = "https://raw.githubusercontent.com/lorenzospolti/DL.19.06.35/main/input_data.csv"
df = pd.read_csv(CSV_PATH)
print("Raw cols:", df.columns.tolist())


def _factorize_with_missing(series):
    """Encode `series` as contiguous integer ids starting at 0.

    `pd.factorize` marks missing values with -1, which is not a valid
    `nn.Embedding` index. Missing values are therefore given their own id,
    placed right after the last real category.
    """
    codes, uniques = pd.factorize(series)
    return np.where(codes < 0, len(uniques), codes)


# ----- Targets -----------------------------------------------------------------
# Raise explicitly rather than `assert`: assertions are stripped under `python -O`.
if "Review_Type" not in df.columns:
    raise ValueError("Binary sentiment column 'Review_Type' missing.")
if "Review_Score" not in df.columns:
    raise ValueError("Regression score column 'Review_Score' missing.")

# Convert Review_Type to numerical labels. `.map` turns any unknown label into
# NaN, which would silently poison the BCE loss, so fail loudly instead.
review_type_mapping = {'Bad_review': 0, 'Good_review': 1}
mapped_review_type = df['Review_Type'].map(review_type_mapping)
if mapped_review_type.isna().any():
    unknown_labels = sorted(
        df.loc[mapped_review_type.isna(), 'Review_Type'].astype(str).unique()
    )
    raise ValueError(f"Unmapped 'Review_Type' labels: {unknown_labels}")

# Both targets are stored as float so they convert directly to float tensors.
df['Review_Type'] = mapped_review_type.astype(float)
df['Review_Score'] = df['Review_Score'].astype(float)


# ----- Guarantee categorical ids ------------------------------------------------
for col in cfg.cat_cols:
    if col not in df.columns:
        raise ValueError(f"Expected categorical column '{col}' not found.")
    df[f"{col}_id"] = _factorize_with_missing(df[col])

# ----- Numeric helpers ----------------------------------------------------------
# Review_len is derived (whitespace token count) only when the CSV lacks it.
# Missing reviews count as zero tokens instead of crashing on len(NaN).
if "Review" in df.columns and "Review_len" not in df.columns:
    df["Review_len"] = df["Review"].fillna("").astype(str).str.split().str.len()

missing_nums = [c for c in cfg.num_cols if c not in df.columns]
if missing_nums:
    raise ValueError(f"Numeric columns missing: {missing_nums}")

# Group column for GroupKFold. Already created above while "Hotel_Name" is in
# cfg.cat_cols; this fallback covers configurations that remove it.
if "Hotel_Name_id" not in df.columns:
    df["Hotel_Name_id"] = _factorize_with_missing(df["Hotel_Name"])

CFG(backbone='sentence-transformers/all-MiniLM-L6-v2', max_len=128, cache_path='/mnt/data/review_embs.pt', norm_emb=False, rebuild_cache=False, cat_cols=['Hotel_Name', 'Reviewer_Nationality', 'Hotel_Address'], num_cols=['Review_len', 'Hotel_number_reviews'], epochs=2, bs=64, lr=0.0005, weight_decay=0.001, amp=True, early_stop=2, mse_weight=0.1, cat_dim=16, head_dim=64)
Raw cols: ['Hotel_Address', 'Review_Date', 'Average_Score', 'Hotel_Name', 'Reviewer_Nationality', 'Hotel_number_reviews', 'Reviewer_number_reviews', 'Review_Score', 'Review', 'Review_Type']


In [None]:
# ## Tokeniser & Encoder (for one‑time caching)

# %%
# Load the tokenizer and backbone named in the config. The encoder is only used
# to pre-compute embeddings, so it is put in eval mode on the target device.
TOKENIZER = AutoTokenizer.from_pretrained(cfg.backbone)
_backbone = AutoModel.from_pretrained(cfg.backbone)
ENCODER = _backbone.to(DEVICE).eval()

# Freeze every backbone weight.
for weight in ENCODER.parameters():
    weight.requires_grad = False

# Width of the encoder output; the downstream model sizes its input from this.
ENC_DIM = ENCODER.config.hidden_size

# %% [markdown]
# ## Cache sentence embeddings (runs only once unless `rebuild_cache=True`)

# %%
if cfg.rebuild_cache or not os.path.exists(cfg.cache_path):
    print("Caching sentence embeddings …")
    # Create the target directory if needed. A bare filename has no directory
    # component, and os.makedirs("") would raise, so skip it in that case.
    cache_dir = os.path.dirname(cfg.cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    batch_size = 256
    # Missing reviews are encoded as empty strings so the tokenizer never
    # receives NaN.
    review_texts = df["Review"].fillna("").astype(str).tolist()
    embs: List[torch.Tensor] = []
    with torch.no_grad():
        for i in tqdm(range(0, len(review_texts), batch_size)):
            texts = review_texts[i:i+batch_size]
            toks = TOKENIZER(texts, padding=True, truncation=True, max_length=cfg.max_len,
                             return_tensors="pt", return_token_type_ids=False).to(DEVICE)
            # NOTE(review): the model card for this backbone appears to use
            # attention-masked mean pooling; pooler_output is kept so existing
            # caches stay valid. Confirm which pooling is intended.
            vec = ENCODER(**toks).pooler_output  # [B, hidden]
            if cfg.norm_emb:
                vec = nn.functional.normalize(vec, p=2, dim=1)
            embs.append(vec.cpu())
    ALL_EMBS = torch.cat(embs)
    torch.save(ALL_EMBS, cfg.cache_path)
    print(f"Saved cache to {cfg.cache_path}")
else:
    # Always load onto the CPU; batches are moved to DEVICE during training.
    ALL_EMBS = torch.load(cfg.cache_path, map_location="cpu")
    print(f"Loaded cached embeddings ({ALL_EMBS.shape}) from {cfg.cache_path}")

# A cache built from a different CSV would misalign embeddings and rows.
if len(ALL_EMBS) != len(df):
    raise ValueError(
        f"Cached embeddings have {len(ALL_EMBS)} rows but the dataframe has {len(df)}; "
        f"set cfg.rebuild_cache=True to regenerate {cfg.cache_path}."
    )

Loaded cached embeddings (torch.Size([13772, 384])) from /mnt/data/review_embs.pt


In [None]:
# ## 5‑Fold Group CV Training Loop

# %%
N_SPLITS = 5
fold_metrics = []
GKF = GroupKFold(n_splits=N_SPLITS)
ce_loss = nn.BCEWithLogitsLoss()  # read by run_epoch for the sentiment head

# Validation row positions per fold, kept for inspection after training.
val_indices_by_fold = {}

for fold, (tr_idx, va_idx) in enumerate(GKF.split(df, groups=df["Hotel_Name_id"])):
    val_indices_by_fold[fold] = va_idx
    print(f"\n=== Fold {fold+1}/{N_SPLITS} ===")
    # Track the RMSE of the same epoch that produced the best F1, so the two
    # reported numbers describe one model state rather than two.
    best_f1, best_rmse, patience = float("-inf"), float("nan"), 0
    train_loader = make_loader(tr_idx, shuffle=True)
    val_loader   = make_loader(va_idx)

    # Fresh model per fold. run_epoch reads `model`, `optimizer` and `scaler`
    # as module-level names, so they must be (re)bound here.
    model = DualHeadModel(cat_cardinals).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    scaler = torch.cuda.amp.GradScaler() if cfg.amp and DEVICE=="cuda" else None

    for epoch in range(cfg.epochs):
        loss, f1, rmse = run_epoch(train_loader, train=True)
        v_loss, v_f1, v_rmse = run_epoch(val_loader, train=False)
        print(f"Epoch {epoch+1}: val_f1={v_f1:.4f}, val_rmse={v_rmse:.4f}")
        if v_f1 > best_f1:
            best_f1, best_rmse, patience = v_f1, v_rmse, 0
        else:
            patience += 1
            if patience >= cfg.early_stop:
                break
    fold_metrics.append((best_f1, best_rmse))

# Aggregate metrics
fold_metrics = np.array(fold_metrics)  # shape [N_SPLITS, 2]: (F1, RMSE) per fold
mean_f1, std_f1 = fold_metrics[:,0].mean(), fold_metrics[:,0].std()
mean_rmse, std_rmse = fold_metrics[:,1].mean(), fold_metrics[:,1].std()
print(f"\n===== {N_SPLITS}‑Fold Results =====")
print(f"F1  : {mean_f1:.4f} ± {std_f1:.4f}")
print(f"RMSE: {mean_rmse:.4f} ± {std_rmse:.4f}")

# Inspect Fold 2's review-score distribution (fold numbering is 1-based in the
# printout, so Fold 2 is key 1). The stored indices are positions, hence .iloc.
if 1 in val_indices_by_fold:
    print("\n===== Fold 2 Validation Review_Score Distribution =====")
    print(df["Review_Score"].iloc[val_indices_by_fold[1]].describe())


=== Fold 1/5 ===
Epoch 1: val_f1=0.8165, val_rmse=6.2602
Epoch 2: val_f1=0.8157, val_rmse=3.7750

=== Fold 2/5 ===


In [None]:
# Sanity check on the sentiment target: show its dtype, then its distinct values.
_review_type = df['Review_Type']
print(_review_type.dtype)
print(_review_type.unique())

In [None]:
# ## Dataset & DataLoader

# %%
class ReviewDS(Dataset):
    """Map-style dataset over a positional subset of the global `df` / `ALL_EMBS`.

    All feature and target tensors are built once in `__init__`. Building a
    pandas row per sample inside `__getitem__` is very slow on the data-loading
    hot path; each item has the same keys, shapes and dtypes either way.
    """

    def __init__(self, indices):
        """Select the rows at positions `indices` and pre-build their tensors."""
        self.indices = indices
        self.df = df.iloc[indices]
        self.embs = ALL_EMBS[indices]

        id_cols = [f"{c}_id" for c in cfg.cat_cols]
        # [N, n_cat] int64 ids and [N, n_num] float32 numeric side-features.
        self.cats = torch.as_tensor(self.df[id_cols].to_numpy(dtype="int64"))
        self.nums = torch.as_tensor(self.df[list(cfg.num_cols)].to_numpy(dtype="float32"))
        # Targets keep a trailing axis of size 1, so each item has shape [1].
        self.y_cls = torch.as_tensor(self.df[["Review_Type"]].to_numpy(dtype="float32"))
        self.y_reg = torch.as_tensor(self.df[["Review_Score"]].to_numpy(dtype="float32"))

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, i):
        """Return the i-th sample as a dict of tensors."""
        return {
            "emb": self.embs[i],
            "cats": self.cats[i],
            "nums": self.nums[i],
            "y_cls": self.y_cls[i],
            "y_reg": self.y_reg[i],
        }

def make_loader(indices, shuffle=False):
    """Wrap the rows at `indices` in a `ReviewDS` and return a batched `DataLoader`.

    Batch size comes from `cfg.bs`; `shuffle` is forwarded unchanged.
    """
    dataset = ReviewDS(indices)
    loader_kwargs = dict(batch_size=cfg.bs, shuffle=shuffle, num_workers=2)
    return DataLoader(dataset, **loader_kwargs)

# %% [markdown]
# ## Model – shared base + two heads

# %%
class DualHeadModel(nn.Module):
    """Shared trunk over [sentence embedding | categorical embeddings | numerics]
    feeding two single-output linear heads: `head_cls` and `head_reg`."""

    def __init__(self, n_cat_cardinals: Dict[str, int]):
        """Build one embedding table per entry of `n_cat_cardinals`, then the
        trunk and both heads. Layer widths come from `cfg` and `ENC_DIM`."""
        super().__init__()

        # One lookup table per categorical feature, in the dict's iteration order.
        embedding_tables = []
        for cardinality in n_cat_cardinals.values():
            embedding_tables.append(nn.Embedding(cardinality, cfg.cat_dim))
        self.cat_embs = nn.ModuleList(embedding_tables)

        # Trunk input width: sentence embedding + all categorical embeddings
        # + the raw numeric columns.
        n_cat = len(n_cat_cardinals)
        in_feats = ENC_DIM + cfg.cat_dim * n_cat + len(cfg.num_cols)

        trunk_layers = [
            nn.Linear(in_feats, cfg.head_dim),
            nn.ReLU(),
            nn.BatchNorm1d(cfg.head_dim),
        ]
        self.base = nn.Sequential(*trunk_layers)

        self.head_cls = nn.Linear(cfg.head_dim, 1)  # binary
        self.head_reg = nn.Linear(cfg.head_dim, 1)  # regression

    def forward(self, emb, cats, nums):
        """Return `(out_cls, out_reg)`, each with dimension 1 squeezed away.

        Column `i` of `cats` is looked up in the i-th embedding table.
        """
        pieces = [emb]
        for col_idx, table in enumerate(self.cat_embs):
            pieces.append(table(cats[:, col_idx]))
        pieces.append(nums)

        hidden = self.base(torch.cat(pieces, dim=1))
        out_cls = self.head_cls(hidden).squeeze(1)
        out_reg = self.head_reg(hidden).squeeze(1)
        return out_cls, out_reg

# Size of each categorical embedding table: the number of distinct ids found in
# the matching `<col>_id` column, keyed by column name in cfg.cat_cols order.
cat_cardinals = {}
for col in cfg.cat_cols:
    cat_cardinals[col] = df[f"{col}_id"].nunique()

# %% [markdown]
# ## Training & Evaluation helpers

# %%

def _joint_forward(emb, cats, nums, y_cls, y_reg):
    """Run the global `model` once and return `(out_cls, out_reg, loss)`.

    The loss is BCE-with-logits on the classification head plus
    `cfg.mse_weight` times the MSE on the regression head.
    """
    out_cls, out_reg = model(emb, cats, nums)
    mse = nn.functional.mse_loss(out_reg, y_reg)
    loss = ce_loss(out_cls, y_cls) + cfg.mse_weight * mse
    return out_cls, out_reg, loss


def run_epoch(loader, train=True):
    """Run one pass over `loader` and return `(mean_loss, macro_f1, rmse)`.

    Relies on the module-level `model`, `optimizer`, `scaler`, `ce_loss`,
    `cfg` and `DEVICE`.

    Args:
        loader: iterable of batches with keys "emb", "cats", "nums",
            "y_cls" and "y_reg".
        train: when True the model is put in train mode and the optimizer is
            stepped after every batch; otherwise the model is put in eval mode
            and gradients are disabled.

    Returns:
        Sample-weighted mean loss, macro F1 of the thresholded (0.5) sigmoid
        outputs, and the RMSE of the regression outputs.

    Raises:
        ValueError: if `loader` yields no samples.
    """
    if train:
        model.train()
    else:
        model.eval()
    use_amp = cfg.amp and scaler is not None

    tot_loss, n = 0.0, 0
    preds_cls, gts_cls, preds_reg, gts_reg = [], [], [], []

    for batch in loader:
        emb = batch["emb"].to(DEVICE)
        cats = batch["cats"].to(DEVICE)
        nums = batch["nums"].to(DEVICE)
        # Flatten targets to [B] so they match the squeezed head outputs.
        # Leaving y_reg as [B, 1] makes mse_loss broadcast to [B, B], and
        # `.squeeze()` on a batch of one would give a 0-d target.
        y_cls = batch["y_cls"].to(DEVICE).reshape(-1)
        y_reg = batch["y_reg"].to(DEVICE).reshape(-1)

        if train:
            optimizer.zero_grad()
        with torch.set_grad_enabled(train):
            if use_amp:
                with torch.cuda.amp.autocast():
                    out_cls, out_reg, loss = _joint_forward(emb, cats, nums, y_cls, y_reg)
                if train:
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
            else:
                out_cls, out_reg, loss = _joint_forward(emb, cats, nums, y_cls, y_reg)
                if train:
                    loss.backward()
                    optimizer.step()

        batch_size = emb.size(0)
        tot_loss += loss.item() * batch_size
        n += batch_size
        # Cast to float32 before collecting: outputs may be half precision
        # under autocast.
        preds_cls.append(torch.sigmoid(out_cls).detach().float().cpu())
        gts_cls.append(y_cls.detach().cpu())
        preds_reg.append(out_reg.detach().float().cpu())
        gts_reg.append(y_reg.detach().cpu())

    if n == 0:
        raise ValueError("run_epoch received an empty loader.")

    pred_c = torch.cat(preds_cls)
    gt_c = torch.cat(gts_cls)
    pred_r = torch.cat(preds_reg)
    gt_r = torch.cat(gts_reg)
    f1 = f1_score(gt_c.round().int().numpy(), (pred_c > 0.5).int().numpy(), average="macro")
    # mean_squared_error returns the MSE; take the square root for RMSE.
    rmse = mean_squared_error(gt_r.numpy(), pred_r.numpy()) ** 0.5
    return tot_loss / n, f1, rmse