# Product Length Prediction on Amazon Dataset.

In [1]:
import os
import math
import random
from typing import Dict, Tuple

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from transformers import AutoModel, AutoTokenizer

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", DEVICE)


Running on: cpu


In [2]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(421)


## Data Preparation

We will:

- Merge text fields into one input string  
- Map product-type IDs to embedding indices  
- Log-transform & standardize target values  


In [3]:
def build_type_mapping(df: pd.DataFrame, min_count: int = 10):
    counts = df["PRODUCT_TYPE_ID"].value_counts()
    id_to_index = {}
    current = 0

    for raw_id, cnt in counts.items():
        if cnt >= min_count:
            id_to_index[int(raw_id)] = current
            current += 1

    default_index = current
    return id_to_index, default_index


def compute_target_stats(df: pd.DataFrame, clip: float = 12.0):
    lengths = df["PRODUCT_LENGTH"].astype(float).values
    log_lengths = np.log(np.clip(lengths, 1e-6, None))
    log_lengths = np.clip(log_lengths, None, clip)
    mean = float(log_lengths.mean())
    std = float(log_lengths.std())
    return mean, std


In [4]:
class ProductLengthDataset(Dataset):
    def __init__(
        self, df, id_to_index, default_index,
        mean, std, is_test=False, transform_target=True
    ):
        self.df = df.reset_index(drop=True)
        self.id_to_index = id_to_index
        self.default_index = default_index
        self.mean = mean
        self.std = std
        self.is_test = is_test
        self.transform_target = transform_target

    def __len__(self):
        return len(self.df)

    def map_type(self, type_id):
        # Safely handle missing or weird values
        import pandas as pd

        if pd.isna(type_id):
            return self.default_index

        try:
            type_id_int = int(type_id)
        except (ValueError, TypeError):
            return self.default_index

        return self.id_to_index.get(type_id_int, self.default_index)

    def concat_text(self, row):
        t = str(row.get("TITLE", ""))
        bp = str(row.get("BULLET_POINTS", ""))
        desc = str(row.get("DESCRIPTION", ""))
        return f"Title: {t}, Bullet Points: {bp}, Description: {desc}"

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = self.concat_text(row)
        cat = self.map_type(row["PRODUCT_TYPE_ID"])

        if self.is_test:
            return {"text": text, "cat_id": torch.tensor(cat), "product_id": row.get("PRODUCT_ID")}

        y = float(row["PRODUCT_LENGTH"])
        if self.transform_target:
            y = math.log(max(y, 1e-6))
            y = (y - self.mean) / self.std

        return {
            "text": text,
            "cat_id": torch.tensor(cat),
            "y": torch.tensor(y, dtype=torch.float32)
        }


## Model Architecture

- Transformer encoder (BERT / RoBERTa)
- Category embedding (32d)
- MLP regressor  


In [5]:
class MLPRegressor(nn.Module):
    def __init__(self, in_dim, hidden=256, dropout=0.2):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.BatchNorm1d(hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 1)
        )

    def forward(self, x):
        return self.layers(x).squeeze(-1)


class TextCategoryRegressor(nn.Module):
    def __init__(self, backbone, num_cats, cat_dim=32):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(backbone)
        self.tokenizer = AutoTokenizer.from_pretrained(backbone)
        hidden = self.encoder.config.hidden_size

        self.cat_emb = nn.Embedding(num_cats, cat_dim)
        self.regressor = MLPRegressor(hidden + cat_dim)

    def encode_text(self, texts):
        tokens = self.tokenizer(
            texts,
            padding=True, truncation=True, max_length=128,
            return_tensors="pt"
        )
        tokens = {k: v.to(DEVICE) for k, v in tokens.items()}
        out = self.encoder(**tokens)
        return out.last_hidden_state[:, 0, :]

    def forward(self, batch):
        text_emb = self.encode_text(batch["text"])
        cat_emb = self.cat_emb(batch["cat_id"].to(DEVICE))
        x = torch.cat([text_emb, cat_emb], dim=1)
        return self.regressor(x)


In [6]:
def collate_fn(batch):
    texts = [b["text"] for b in batch]
    cat_ids = torch.stack([b["cat_id"] for b in batch])
    output = {"text": texts, "cat_id": cat_ids}

    if "y" in batch[0]:
        output["y"] = torch.stack([b["y"] for b in batch])

    if "product_id" in batch[0]:
        output["product_id"] = [b["product_id"] for b in batch]

    return output


## Training & Evaluation Loops


In [7]:
def train_epoch(model, loader, optim):
    model.train()
    loss_fn = nn.MSELoss()
    losses = []

    for batch in tqdm(loader, desc="Training"):
        optim.zero_grad()
        batch = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
        preds = model(batch)
        loss = loss_fn(preds, batch["y"])
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        losses.append(loss.item())

    return np.mean(losses)


def evaluate(model, loader, mean, std):
    model.eval()
    loss_fn = nn.MSELoss()
    mse_list, mape_sum, total = [], 0, 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validation"):
            batch = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            preds = model(batch)

            mse_list.append(loss_fn(preds, batch["y"]).item())

            # Convert back
            y_true = batch["y"].cpu().numpy() * std + mean
            y_true = np.exp(y_true)
            y_pred = preds.cpu().numpy() * std + mean
            y_pred = np.exp(y_pred)

            mape_sum += np.sum(np.abs(y_true - y_pred) / (y_true + 1e-6))
            total += len(y_true)

    return np.mean(mse_list), mape_sum / total


## Load Data, Create Split, Build Dataloaders


In [22]:
TRAIN_PATH = "train.csv"

train_full = pd.read_csv(
    TRAIN_PATH,
    engine="python",
    on_bad_lines="skip"
)

train_full = train_full.sample(n=60000, random_state=421)

val_fraction = 0.2
val_size = int(val_fraction * len(train_full))

val_df = train_full.iloc[:val_size].reset_index(drop=True)
train_df = train_full.iloc[val_size:].reset_index(drop=True)

print("Total rows used:", len(train_full))
print("Train rows:", len(train_df), " Val rows:", len(val_df))

id_to_index, default_idx = build_type_mapping(train_df)
mean, std = compute_target_stats(train_df)

train_set = ProductLengthDataset(train_df, id_to_index, default_idx, mean, std)
val_set   = ProductLengthDataset(val_df, id_to_index, default_idx, mean, std)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_set,   batch_size=32, shuffle=False, collate_fn=collate_fn)


Total rows used: 60000
Train rows: 48000  Val rows: 12000


## Train Model


In [23]:
BACKBONE = "bert-base-uncased"
model = TextCategoryRegressor(BACKBONE, len(id_to_index) + 1).to(DEVICE)

optim = torch.optim.AdamW(model.parameters(), lr=1e-4)

best_mape = 999
for epoch in range(3):
    print(f"\nEpoch {epoch+1}")
    tr_loss = train_epoch(model, train_loader, optim)
    val_mse, val_mape = evaluate(model, val_loader, mean, std)

    print("Train Loss:", tr_loss)
    print("Val MSE:", val_mse)
    print("Val MAPE:", val_mape)

    if val_mape < best_mape:
        best_mape = val_mape
        torch.save(model.state_dict(), "best.pt")
        print("Saved best model!")



Epoch 1


Training: 100%|██████████| 1500/1500 [1:42:13<00:00,  4.09s/it]
Validation: 100%|██████████| 375/375 [05:19<00:00,  1.18it/s]


Train Loss: 1.0171955492099125
Val MSE: 0.9230010840098063
Val MAPE: 1.8599287
Saved best model!

Epoch 2


Training: 100%|██████████| 1500/1500 [1:41:54<00:00,  4.08s/it]
Validation: 100%|██████████| 375/375 [05:11<00:00,  1.20it/s]


Train Loss: 0.9446689146806796
Val MSE: 0.9038403712908427
Val MAPE: 1.9146003

Epoch 3


Training: 100%|██████████| 1500/1500 [1:41:20<00:00,  4.05s/it]
Validation: 100%|██████████| 375/375 [05:14<00:00,  1.19it/s]


Train Loss: 0.9154600207805633
Val MSE: 0.9029799310564994
Val MAPE: 1.7654423
Saved best model!


## Final Evaluation on the 20% Testing Set

We now reload the best model (based on validation MAPE during training)
and compute the final metrics on the held-out 20% validation split.


In [24]:
# Load best model found during training
model.load_state_dict(torch.load("best.pt", map_location=DEVICE))
model.to(DEVICE)
model.eval()

final_mse, final_mape = evaluate(model, val_loader, mean, std)

print("Final evaluation on 20% validation split:")
print("MSE  (normalized target):", final_mse)
print("MAPE (original length):  ", final_mape)


Validation: 100%|██████████| 375/375 [05:13<00:00,  1.20it/s]

Final evaluation on 20% validation split:
MSE  (normalized target): 0.9029799310564994
MAPE (original length):   1.7654423



