<a href="https://colab.research.google.com/github/lorenzospolti/DL.19.06.35/blob/main/hote_review_fast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""Hotel Review Multi‑Task Pipeline – Optimised (June 2025)
--------------------------------------------------------
This single .py file keeps exactly the model topology required in
“Answer to the exam.rtf” (frozen lightweight Transformer + 2 heads + structured
branches) but integrates all runtime tweaks **and** fixes the new PyTorch
warning by using the modern `torch.amp.autocast()` API.
"""

# ─────────────────────── Imports & global settings ───────────────────────
from __future__ import annotations

import math, random, os, pickle, json, pathlib, argparse, time
from typing import Dict, Any, List, Tuple

import numpy as np
import pandas as pd
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn import metrics as skm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModel, logging as hf_logging
import sys

# Silence HF’s roaring transformers – keep console tidy
hf_logging.set_verbosity_error()

SEED   = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

torch.manual_seed(SEED);  np.random.seed(SEED);  random.seed(SEED)
if DEVICE == "cuda":
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.set_float32_matmul_precision("high")

# ─────────────────────────────── Config ────────────────────────────────
BACKBONE  = "prajjwal1/bert-tiny"            # 2‑layer × 128‑dim – fast & small
MAX_LEN   = 128                              # covers ~95 % of reviews
CAT_COLS  = ["Hotel_Name", "Reviewer_Nationality"]
NUM_COLS  = ["Hotel_number_reviews", "days_since"]
GROUP_COL = "hotel_id"                       # fallback to Hotel_Name if absent

# Training hyper‑parameters (will be parsed from CLI when run as script)
CFG: Dict[str, Any] = dict(
    lr         = 2e-5,
    wd         = 1e-2,
    dropout    = 0.2,
    unfreeze   = 0,          # keep backbone frozen = exam requirement
    bs         = 32,
    optim      = "AdamW",
    lambda_reg = 1.0,
    clip       = 1.0,
    epochs     = 5,
    early      = 2,
)

# ─────────────────────────────── I/O helpers ──────────────────────────────

def load_data(csv_url: str) -> pd.DataFrame:
    df = pd.read_csv(csv_url)
    # Map explicit sentiment strings → {0,1}
    if df["Review_Type"].dtype == "object":
        df["Review_Type"] = df["Review_Type"].map({"Bad_review": 0, "Good_review": 1})
    # Date → days‑since
    df["Review_Date"] = pd.to_datetime(df["Review_Date"])
    df["days_since"]  = (df["Review_Date"] - df["Review_Date"].min()).dt.days
    return df

# ───────────────────── One‑off tokenisation & feature prep ──────────────────

def prepare_features(df: pd.DataFrame, tok: AutoTokenizer) -> pd.DataFrame:
    # 1) WordPiece once, cached in RAM (list[int])
    enc = tok(df["Review"].tolist(), padding="max_length", truncation=True,
              max_length=MAX_LEN, return_tensors="pt")
    df["input_ids"]      = enc["input_ids"].tolist()
    df["attention_mask"] = enc["attention_mask"].tolist()

    # 2) categorical → integer codes & stash list
    for col in CAT_COLS:
        df[col], _ = pd.factorize(df[col])
    df["cat_feats"] = df[CAT_COLS].values.tolist()

    # 3) numeric → z‑score (shared scaler)
    scaler = StandardScaler()
    df[NUM_COLS] = scaler.fit_transform(df[NUM_COLS]).astype("float32")
    df["num_feats"] = df[NUM_COLS].values.tolist()

    return df

# ───────────────────────────── Dataset class ──────────────────────────────
class ReviewDS(Dataset):
    def __init__(self, frame: pd.DataFrame):
        self.df = frame.reset_index(drop=True)
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        row = self.df.iloc[i]
        return {
            "input_ids":      torch.tensor(row.input_ids,      dtype=torch.long),
            "attention_mask": torch.tensor(row.attention_mask, dtype=torch.long),
            "cat":            torch.tensor(row.cat_feats,      dtype=torch.long),
            "num":            torch.tensor(row.num_feats,     dtype=torch.float32),
            "label":          torch.tensor(row.Review_Type,    dtype=torch.float32),
            "score":          torch.tensor(row.Review_Score,   dtype=torch.float32),
        }

# ───────────────────────────── Model definition ───────────────────────────
class MTModel(nn.Module):
    def __init__(self, cat_cardinals: Dict[str,int], n_num: int,
                 dropout: float, unfrozen: int, cat_dim: int = 32):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(BACKBONE)
        # freeze all, selectively unfreeze last N if requested
        for p in self.backbone.parameters(): p.requires_grad_(False)
        if unfrozen>0 and hasattr(self.backbone, "encoder"):
            for lyr in self.backbone.encoder.layer[-unfrozen:]:
                for p in lyr.parameters(): p.requires_grad_(True)
        hid = self.backbone.config.hidden_size

        # categorical branch
        self.cat_embeddings = nn.ModuleDict({
            n: nn.Embedding(card, cat_dim) for n, card in cat_cardinals.items()
        })
        cat_emb_dim = cat_dim * len(cat_cardinals)
        self.fc_cat = nn.Linear(hid + cat_emb_dim, 128)
        self.out_a  = nn.Linear(128, 1) # Removed Sigmoid here

        # numeric branch
        self.fc_num = nn.Linear(hid + n_num, 128)
        self.drop   = nn.Dropout(dropout)
        self.out_b  = nn.Linear(128, 1)

        self.relu, self.sig = nn.ReLU(), nn.Sigmoid()
        # simple init
        for m in self.modules():
            if isinstance(m, nn.Linear): nn.init.kaiming_normal_(m.weight)

    def forward(self, ids, mask, cat, num):
        cls = self.backbone(ids, attention_mask=mask).last_hidden_state[:, 0]

        # process categorical embeddings
        cat_vec = torch.cat([self.cat_embeddings[c](cat[:, i])
                             for i, c in enumerate(self.cat_embeddings)], dim=-1)
        x_a = self.relu(self.fc_cat(torch.cat([cls, cat_vec], 1)))
        outA = self.out_a(x_a).squeeze(1) # Sigmoid moved to loss function

        # numeric head
        x_b  = self.drop(self.relu(self.fc_num(torch.cat([cls, num], 1))))
        outB = self.out_b(x_b).squeeze(1)
        return outA, outB

# ─────────────────────────── Training / eval ──────────────────────────────

def train_fold(model: MTModel, tr_dl: DataLoader, va_dl: DataLoader, cfg: Dict[str,Any]) -> Dict[str,np.ndarray]:
    opt_cls = torch.optim.SGD if cfg["optim"] == "SGD" else torch.optim.AdamW
    opt     = opt_cls(filter(lambda p: p.requires_grad, model.parameters()),
                      lr=cfg["lr"], weight_decay=cfg["wd"])
    scaler  = torch.amp.GradScaler(enabled=DEVICE=="cuda")

    best, patience = math.inf, 0
    for _ in range(cfg["epochs"]):
        model.train()
        for b in tr_dl:
            opt.zero_grad()
            with torch.amp.autocast("cuda", enabled=DEVICE=="cuda"):
                logits, reg = model(b["input_ids"].to(DEVICE),
                                    b["attention_mask"].to(DEVICE),
                                    b["cat"].to(DEVICE),
                                    b["num"].to(DEVICE))
                loss = cfg["lambda_reg"]*F.binary_cross_entropy_with_logits(logits, b["label"].to(DEVICE)) \
                     + F.mse_loss(reg, b["score"].to(DEVICE))
            scaler.scale(loss).backward()
            if cfg["clip"]:
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg["clip"])
            scaler.step(opt);  scaler.update()

        # ─ validation ─
        model.eval();  val_loss = []
        with torch.no_grad(), torch.amp.autocast("cuda", enabled=DEVICE=="cuda"):
            for b in va_dl:
                l, r = model(b["input_ids"].to(DEVICE),
                             b["attention_mask"].to(DEVICE),
                             b["cat"].to(DEVICE),
                             b["num"].to(DEVICE))
                v = cfg["lambda_reg"]*F.binary_cross_entropy_with_logits(l, b["label"].to(DEVICE)) \
                  + F.mse_loss(r, b["score"].to(DEVICE))
                val_loss.append(v.item())
        cur = float(np.mean(val_loss))
        if cur < best: best, patience = cur, 0
        else: patience += 1
        if patience >= cfg["early"]: break

    # ─ gather predictions ─
    logits_all, reg_all, lab_all, score_all = [], [], [], []
    model.eval()
    with torch.no_grad(), torch.amp.autocast("cuda", enabled=DEVICE=="cuda"):
        for b in va_dl:
            l, r = model(b["input_ids"].to(DEVICE),
                         b["attention_mask"].to(DEVICE),
                         b["cat"].to(DEVICE),
                         b["num"].to(DEVICE))
            # Apply sigmoid for classification predictions after gathering
            logits_all.extend(torch.sigmoid(l).cpu().numpy());  lab_all.extend(b["label"].numpy())
            reg_all.extend(r.cpu().numpy());     score_all.extend(b["score"].numpy())
    return dict(cls_pred=np.array(logits_all), cls_true=np.array(lab_all),
                reg_pred=np.array(reg_all),   reg_true=np.array(score_all))


def evaluate(preds: Dict[str,np.ndarray]) -> Dict[str,float]:
    y_hat = (preds["cls_pred"] > 0.5).astype(int)
    return dict(
        accuracy  = skm.accuracy_score(preds["cls_true"], y_hat),
        precision = skm.precision_score(preds["cls_true"], y_hat, zero_division=0),
        recall    = skm.recall_score(preds["cls_true"], y_hat, zero_division=0),
        f1        = skm.f1_score(preds["cls_true"], y_hat, zero_division=0),
        mse       = skm.mean_squared_error(preds["reg_true"], preds["reg_pred"]),
        rmse      = math.sqrt(skm.mean_squared_error(preds["reg_true"], preds["reg_pred"])),
    )

# ───────────────────────────── Cross‑validation ───────────────────────────

def cross_validate(df: pd.DataFrame, cfg: Dict[str,Any]):
    tok = AutoTokenizer.from_pretrained(BACKBONE)
    df  = prepare_features(df, tok)

    cat_cardinals = {c: df[c].nunique() for c in CAT_COLS}
    n_num         = len(NUM_COLS)

    if GROUP_COL not in df.columns:
        df[GROUP_COL] = df["Hotel_Name"]   # fallback

    gkf = GroupKFold(n_splits=5)
    results: List[Dict[str,float]] = []

    for fold,(tr,va) in enumerate(gkf.split(df, groups=df[GROUP_COL])):
        model = MTModel(cat_cardinals, n_num, cfg["dropout"], cfg["unfreeze"]).to(DEVICE)
        tr_ds, va_ds = ReviewDS(df.iloc[tr]), ReviewDS(df.iloc[va])
        tr_dl = DataLoader(tr_ds, batch_size=cfg["bs"], shuffle=True,  pin_memory=True,
                           num_workers=2, persistent_workers=True)
        va_dl = DataLoader(va_ds, batch_size=cfg["bs"],               pin_memory=True,
                           num_workers=2, persistent_workers=True)
        preds = train_fold(model, tr_dl, va_dl, cfg)
        res   = evaluate(preds)
        results.append(res)
        print(f"Fold {fold+1}: ", {k:f"{v:.4f}" for k,v in res.items()})

    # aggregate
    mean = {k: float(np.mean([r[k] for r in results])) for k in results[0]}
    std  = {k: float(np.std( [r[k] for r in results])) for k in results[0]}
    print("\nMean ± SD across folds:")
    for k in mean:
        print(f"{k}: {mean[k]:.4f} ± {std[k]:.4f}")

# ───────────────────────────────── main ───────────────────────────────────
if __name__ == "__main__":
    # Check if running in a notebook environment
    if 'ipykernel' in sys.modules:
        # If in notebook, use default CFG and CSV
        cfg = CFG
        csv_url = "https://raw.githubusercontent.com/lorenzospolti/DL.19.06.35/main/input_data.csv"
    else:
        # If running as a script, parse arguments
        parser = argparse.ArgumentParser(description="Hotel review multitask pipeline (optimised)")
        parser.add_argument("--csv", type=str, required=False,
                            default="https://raw.githubusercontent.com/lorenzospolti/DL.19.06.35/main/input_data.csv",
                            help="CSV file or URL with reviews data")
        for k,v in CFG.items():
            arg_type = type(v) if not isinstance(v,bool) else lambda x: bool(int(x))
            parser.add_argument(f"--{k}", type=arg_type, default=v)
        args = vars(parser.parse_args())
        cfg = {k: args[k] for k in CFG}
        csv_url = args["csv"]


    print("Running on", DEVICE)
    df  = load_data(csv_url)
    cross_validate(df, cfg)

Running on cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Fold 1:  {'accuracy': '0.8341', 'precision': '0.7897', 'recall': '0.8838', 'f1': '0.8341', 'mse': '3.3915', 'rmse': '1.8416'}
Fold 2:  {'accuracy': '0.8468', 'precision': '0.7988', 'recall': '0.9283', 'f1': '0.8587', 'mse': '2.8658', 'rmse': '1.6929'}
Fold 3:  {'accuracy': '0.8326', 'precision': '0.8300', 'recall': '0.8390', 'f1': '0.8345', 'mse': '2.9002', 'rmse': '1.7030'}
Fold 4:  {'accuracy': '0.8406', 'precision': '0.8546', 'recall': '0.8323', 'f1': '0.8433', 'mse': '2.8829', 'rmse': '1.6979'}
Fold 5:  {'accuracy': '0.8322', 'precision': '0.8025', 'recall': '0.8885', 'f1': '0.8433', 'mse': '3.3363', 'rmse': '1.8266'}

Mean ± SD across folds:
accuracy: 0.8373 ± 0.0056
precision: 0.8151 ± 0.0239
recall: 0.8744 ± 0.0353
f1: 0.8428 ± 0.0089
mse: 3.0753 ± 0.2365
rmse: 1.7524 ± 0.0670
