In [None]:
# ╭─ Auto‑download dataset (Option B) ─╮
# Fetches the CSV once, verifies its SHA-256 digest, and only then places it at
# DEST. A file that fails verification never lands at DEST, so a later run
# cannot silently skip the check and load corrupt data.
import urllib.request, pathlib, hashlib, pandas as pd, io

# NOTE(review): "your-org" looks like a placeholder — confirm the real URL.
URL  = "https://raw.githubusercontent.com/your-org/hotel-reviews-data/main/input_data.csv"
DEST = pathlib.Path("hotel_reviews.csv")
SHA256 = "d48c1d8741ce3adc97c974654bea5f1bec83a69150451a2168342f6a91e9240b"

if not DEST.exists():
    print("⬇️  Downloading dataset …")
    # Download next to DEST under a temporary name; promote only after the
    # integrity check passes.
    tmp_path = DEST.with_name(DEST.name + ".part")
    try:
        urllib.request.urlretrieve(URL, tmp_path)
        digest = hashlib.sha256(tmp_path.read_bytes()).hexdigest()
        # Explicit raise instead of `assert`: assertions are removed under `python -O`.
        if digest != SHA256:
            raise RuntimeError(f"Checksum mismatch! Expected {SHA256}, got {digest}")
        tmp_path.replace(DEST)
    finally:
        # Clean up a partial or rejected download (no-op after a successful move).
        if tmp_path.exists():
            tmp_path.unlink()
    print("✅  Saved to", DEST)
    print("🔒 Checksum OK")

df = pd.read_csv(DEST)  # preview
print(df.head())

# Hotel Review Multi‑Task Pipeline
**Author:** Lorenzo Spolti

## 0  Load Data
Replace the file path with your dataset. The CSV is assumed to have these columns:

* `Review` — raw text
* `Review_Type` — 1 = positive, 0 = negative
* `Review_Score` — 1‑10 numeric score
* `hotel_name`, `reviewer_nationality` — categorical
* `hotel_number_reviews` — numeric; `review_date` — date, converted to a numeric *days‑since* feature in §2.3

In [None]:
import pandas as pd

# Read the review CSV from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='hotel_reviews.csv')
print(df.head(n=5))


## 1  Model Overview
We follow the exact design described in **“Answer to the exam.rtf”**:

1. **Pre‑trained lightweight Transformer** (BERT‑tiny) provides language features  
2. **WordPiece tokenisation** is inherited from the same BERT model  
3. **Two additional branches** on top of the pooled `[CLS]` representation:
   * **Head A** – binary classifier (`sigmoid`) → review type  
   * **Head B** – regression (`linear`) → review score  
4. **Structured features** are exploited by a small MLP and concatenated to the pooled text vector

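All pre‑trained weights are **frozen** when the model is constructed, so only the new layers learn unless the training cell unfreezes the top encoder layers via `UNFREEZE_TOP_N`.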



## 2  Input Pre‑processing
### 2.1  WordPiece Tokeniser

```python
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')
```

### 2.2  Categorical → Embeddings  
We factorise each categorical column and keep the integer codes.

### 2.3  Numeric → Standard Scaler  
`hotel_number_reviews` and a **days‑since** version of `review_date` are z‑scored.


In [None]:
from transformers import AutoTokenizer
from sklearn.preprocessing import StandardScaler
import torch

tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')

# Explicit truncation length. `truncation=True` alone relies on the tokenizer
# knowing the model's limit; passing `max_length` makes the cap unconditional.
# NOTE(review): 512 is the usual BERT position limit — confirm against the
# checkpoint's `max_position_embeddings`.
MAX_LEN = 512

# --- categorical ---
cat_cols = ['hotel_name', 'reviewer_nationality']
for c in cat_cols:
    # pd.factorize encodes missing values as -1, which is not a valid
    # nn.Embedding index. Giving them an explicit category keeps every code
    # >= 0 and keeps df[c].nunique() in step with the codes.
    df[c] = df[c].fillna('<missing>')
cat_maps = {c: pd.factorize(df[c])[0] for c in cat_cols}
cat_tensors = [torch.tensor(v, dtype=torch.long) for v in cat_maps.values()]

# --- numeric ---
# make 'review_date' a numeric (days since first date)
df['review_date'] = pd.to_datetime(df['review_date'])
df['days_since'] = (df['review_date'] - df['review_date'].min()).dt.days
num_cols = ['hotel_number_reviews', 'days_since']

# Keep the fitted StandardScaler under its own name: the training cell later
# rebinds `scaler` to a GradScaler, which would otherwise discard it.
# NOTE(review): the scaler is fitted on all rows, before the train/val split.
num_scaler = StandardScaler()
scaler = num_scaler  # backward-compatible alias
num_array = num_scaler.fit_transform(df[num_cols]).astype('float32')
num_tensor = torch.tensor(num_array)

# --- text ---
# Missing reviews become empty strings; the tokenizer only accepts str inputs.
review_texts = df['Review'].fillna('').astype(str).tolist()
enc = tokenizer(review_texts, padding=True, truncation=True,
                max_length=MAX_LEN, return_tensors='pt')
input_ids = enc['input_ids']
attention = enc['attention_mask']

# --- targets ---
y = torch.tensor(df['Review_Type'].values, dtype=torch.float32)
scores = torch.tensor(df['Review_Score'].values, dtype=torch.float32)

## 3  Label Mapping

In [None]:
# Already 0/1 in the CSV; nothing to do here.

## 4  Dataset & DataLoaders

In [None]:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Categorical codes, one column per feature -> [N, C]; numeric block is [N, num_features].
cats = torch.stack(cat_tensors, dim=1)
nums = num_tensor

# Stratified 80/20 split over row positions, keyed on the binary label.
idx = torch.arange(len(df))
train_idx, val_idx = train_test_split(
    idx,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


def _subset(rows):
    """Build a TensorDataset holding the given rows of every model input and target."""
    columns = (input_ids, attention, cats, nums, y, scores)
    return TensorDataset(*[col[rows] for col in columns])


train_ds = _subset(train_idx)
val_ds = _subset(val_idx)

BATCH = 16
# Shuffle only the training stream; validation uses a doubled batch size.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH)
val_loader = DataLoader(val_ds, batch_size=2 * BATCH)


## 5  Model Definition
The diagram matches the RTF answer exactly:

```
Input text        →  Transformer  →  [CLS]
Input categorical →  Embeddings –┐
Input numerical   →  MLP         ├─ concat → Dense → ReLU →    HEAD A  (sigmoid)
                                 └─────────┤
                                           └→ Dense → Dropout → HEAD B (linear)
```
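All Transformer layers are frozen; we only unfreeze the top **N** layers when `UNFREEZE_TOP_N > 0`. Note that the `ReviewModel` code below feeds both heads from one shared Dense → ReLU layer and does not include the Dropout step shown in the diagram.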


In [None]:
import torch.nn as nn
from transformers import AutoModel
import types

class ReviewModel(nn.Module):
    """Multi-task review model with a shared trunk and two linear heads.

    The pooled output of a pre-trained text encoder is concatenated with one
    embedding per categorical feature and a ReLU projection of the numeric
    features. The fused vector passes through a shared Linear + ReLU layer that
    feeds a classification head (raw logit) and a regression head.

    Args:
        pretrained: Identifier passed to ``AutoModel.from_pretrained``.
        cat_cardinals: Mapping ``feature name -> number of categories``; one
            ``nn.Embedding`` is created per entry, in mapping order. ``None``
            or an empty mapping means no categorical features.
        num_features: Width of the numeric feature vector.
        cat_dim: Embedding size used for every categorical feature.
        proj_dim: Output size of the numeric projection.
        head_dim: Width of the shared hidden layer.

    Attributes:
        hidden: Hidden size read from the encoder config.
        backbone: Alias of ``text_encoder`` (same module object).
    """

    def __init__(self,
                 pretrained='prajjwal1/bert-tiny',
                 cat_cardinals=None,
                 num_features=2,
                 cat_dim=32,
                 proj_dim=32,
                 head_dim=128):
        super().__init__()
        # Treat the None default as "no categorical features" instead of
        # failing on `.items()`.
        cat_cardinals = dict(cat_cardinals) if cat_cardinals else {}

        self.text_encoder = AutoModel.from_pretrained(pretrained)
        self.hidden = self.text_encoder.config.hidden_size

        # Freeze everything
        for p in self.text_encoder.parameters():
            p.requires_grad_(False)

        # Embeddings for categorical cols
        self.cat_embeddings = nn.ModuleDict({
            name: nn.Embedding(card, cat_dim)
            for name, card in cat_cardinals.items()
        })
        total_cat = cat_dim * len(cat_cardinals)

        # Projection for numeric
        self.num_proj = nn.Sequential(
            nn.Linear(num_features, proj_dim),
            nn.ReLU()
        )

        fused_dim = self.hidden + total_cat + proj_dim
        self.shared = nn.Sequential(
            nn.Linear(fused_dim, head_dim),
            nn.ReLU()
        )

        # Heads
        self.cls_head = nn.Sequential(nn.Linear(head_dim, 1))  # sigmoid later
        self.reg_head = nn.Linear(head_dim, 1)

        # Backbone alias for the training script. The alias is the SAME object
        # as `text_encoder`, so its config must not be replaced: doing so would
        # strip the encoder of the settings it reads at forward time.
        # `backbone.config.num_hidden_layers` is available from the real config.
        self.backbone = self.text_encoder

    def forward(self, input_ids, attention_mask, cats, nums):
        """Return ``(logit, score)``, each with the trailing size-1 dim squeezed.

        Args:
            input_ids: Token ids forwarded to the text encoder.
            attention_mask: Attention mask forwarded to the text encoder.
            cats: Integer codes; column ``i`` is looked up in the ``i``-th
                categorical embedding.
            nums: Numeric features fed to the numeric projection.
        """
        out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Prefer the encoder's pooled output; otherwise use the first token's state.
        pooled = getattr(out, 'pooler_output', None)
        if pooled is None:
            pooled = out.last_hidden_state[:, 0]

        # Fuse in fixed order: text, categorical embeddings, numeric projection.
        # Building one list keeps this valid when there are no categorical features.
        parts = [pooled]
        parts.extend(emb(cats[:, i]) for i, emb in enumerate(self.cat_embeddings.values()))
        parts.append(self.num_proj(nums))

        x = torch.cat(parts, dim=-1)
        h = self.shared(x)

        logit = self.cls_head(h).squeeze(-1)   # BCEWithLogitsLoss will apply sigmoid
        score = self.reg_head(h).squeeze(-1)
        return logit, score

# Instantiate
# One embedding-table size per categorical column: its number of distinct values.
cat_cardinals = dict((c, int(df[c].nunique())) for c in cat_cols)
# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = ReviewModel(cat_cardinals=cat_cardinals)
model = model.to(device)
print('Model instantiated. Hidden size:', model.hidden)

## 6  Training

In [None]:
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
import math

# Hyper-parameters
UNFREEZE_TOP_N = 2      # number of top encoder layers to fine-tune (0 keeps all frozen)
LR_HEAD = 1e-3          # learning rate for the newly added layers
LR_BB   = 2e-5          # learning rate for unfrozen encoder layers
REG_LAMBDA = 0.05       # weight of the regression loss in the combined loss
EPOCHS = 5
CLIP = 1.0              # max gradient norm

# Unfreeze last N layers (if any)
if UNFREEZE_TOP_N > 0:
    n_layers = model.backbone.config.num_hidden_layers
    # Clamp so that asking for more layers than exist unfreezes all of them.
    first_unfrozen = max(0, n_layers - UNFREEZE_TOP_N)
    layer_tags = tuple(f'layer.{i}.' for i in range(first_unfrozen, n_layers))
    for name, p in model.backbone.named_parameters():
        if any(tag in name for tag in layer_tags):
            p.requires_grad_(True)

# Losses
# Class weight = negatives / positives, taken from the TRAINING rows only so
# validation labels do not leak into the loss. The positive count is clamped
# to avoid division by zero, and the weight lives on the model's device.
y_train = y[train_idx]
n_pos = (y_train == 1).sum().clamp(min=1)
n_neg = (y_train == 0).sum()
pos_weight = (n_neg / n_pos).to(device=device, dtype=torch.float32)
bce = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
mse = nn.MSELoss()

# Two parameter groups: new layers and (unfrozen) encoder layers, each with its own LR.
head_params = [p for n,p in model.named_parameters() if p.requires_grad and 'text_encoder' not in n]
bb_params   = [p for n,p in model.named_parameters() if p.requires_grad and 'text_encoder' in n]
optimizer = optim.AdamW([
    {'params': head_params, 'lr': LR_HEAD},
    {'params': bb_params,   'lr': LR_BB}
])

# Loss scaling only applies to CUDA mixed precision; disable it elsewhere.
scaler = GradScaler(enabled=(device.type == 'cuda'))

def run_epoch(loader, train=True):
    """Run one pass over ``loader``, print the epoch metrics, and return them.

    Args:
        loader: Iterable of batches ``(ids, attn, cats, nums, yb, sb)``.
        train: When True, parameters are updated; when False, the model is in
            eval mode and no autograd graph is built.

    Returns:
        dict with keys ``'loss'`` (sample-weighted mean of the combined loss),
        ``'acc'``, ``'f1'`` and ``'rmse'``.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.train(train)
    mode = 'train' if train else 'val  '
    total, n = 0, 0
    y_true, y_prob = [], []
    s_true, s_pred = [], []

    # Gradients are only needed when training; disabling them during
    # validation avoids building and holding an autograd graph.
    with torch.set_grad_enabled(train):
        for batch in loader:
            ids, attn, cats, nums, yb, sb = [x.to(device) if torch.is_tensor(x) else x for x in batch]

            with autocast():
                logits, preds = model(ids, attn, cats, nums)
                cls_loss = bce(logits, yb)
                reg_loss = mse(preds, sb)
                loss = cls_loss + REG_LAMBDA * reg_loss

            if train:
                optimizer.zero_grad(set_to_none=True)
                scaler.scale(loss).backward()
                # Gradients are still multiplied by the loss scale here;
                # unscale first so CLIP applies to the true gradient norm.
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), CLIP)
                scaler.step(optimizer)
                scaler.update()

            total += loss.item()*len(ids); n += len(ids)
            # Detach before collecting: tensors that require grad cannot be
            # converted to numpy by the metric functions. Cast to float32 in
            # case autocast produced a lower-precision dtype.
            y_true.append(yb.detach().cpu())
            y_prob.append(torch.sigmoid(logits).detach().float().cpu())
            s_true.append(sb.detach().cpu())
            s_pred.append(preds.detach().float().cpu())

    if n == 0:
        raise ValueError('run_epoch received a loader with no samples')

    y_true = torch.cat(y_true).int().numpy()
    y_prob = torch.cat(y_prob)
    y_hat  = (y_prob >= 0.5).int().numpy()
    s_true = torch.cat(s_true).numpy()
    s_pred = torch.cat(s_pred).numpy()
    acc = accuracy_score(y_true, y_hat)
    f1  = f1_score(y_true, y_hat, zero_division=0)
    rmse = math.sqrt(mean_squared_error(s_true, s_pred))
    print(f'{mode} | loss {total/n:.4f} | acc {acc:.3f} | f1 {f1:.3f} | rmse {rmse:.3f}')
    return {'loss': total / n, 'acc': acc, 'f1': f1, 'rmse': rmse}

# Alternate one training pass and one validation pass per epoch.
for epoch in range(1, EPOCHS + 1):
    header = f'— Epoch {epoch}/{EPOCHS} —'
    print(header)

    run_epoch(train_loader, train=True)
    # Hand cached GPU blocks back to the allocator before validating.
    torch.cuda.empty_cache()
    run_epoch(val_loader, train=False)

## 7  Evaluation
The metrics printed after each epoch are *Accuracy*, *F1*, and *RMSE*; precision and recall are not printed separately.