This notebook builds a **GRU-based sequence encoder with attention** to generate a **dynamic user embedding** from each user's interaction history.

## Import Libraries and Load Dataset

In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import os, json, random
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data and artifact
# paths used later in this notebook live under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Set random seeds for reproducibility.
# Python's built-in `random` module is imported by this notebook as well, so it
# is seeded alongside NumPy and PyTorch to keep every RNG source deterministic.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7c7580fe1770>

In [None]:
# Data paths (Google Drive locations under the /content/drive mount)
# - DATA_PATH_MERGED: parquet file with the merged, feature-engineered query data
# - SAVE_PATH: directory where this notebook writes its artifacts
DATA_PATH_MERGED = "/content/drive/MyDrive/BT4222 Group 3/1. Data Preparation/Data/features_engineered/user_behavior_data_sampled_parsed_features_merged.parquet"
SAVE_PATH = "/content/drive/MyDrive/BT4222 Group 3/2. Models/Artifacts/gru_attention"
# Create the artifact directory up front; exist_ok avoids an error on re-runs
os.makedirs(SAVE_PATH, exist_ok = True)

In [None]:
# Load merged df as a pyarrow table, then build a pandas DataFrame from the
# table's column-name -> values dictionary.
# NOTE(review): to_pydict() presumably yields plain Python lists for the
# list-typed history columns, which the later `seq or []` checks appear to rely
# on - confirm before switching to a different table -> DataFrame conversion.
queries_table_with_engineered_merged = pq.read_table(DATA_PATH_MERGED)
queries_df = pd.DataFrame(queries_table_with_engineered_merged.to_pydict())

In [None]:
# Inspect the column names, dtypes and non-null counts of the loaded dataframe
queries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173831 entries, 0 to 173830
Data columns (total 29 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   query                      173831 non-null  object 
 1   candidate_wid_list         173831 non-null  object 
 2   candidate_label_list       173831 non-null  object 
 3   history_qry_list           173831 non-null  object 
 4   history_wid_list           173831 non-null  object 
 5   history_type_list          173831 non-null  object 
 6   history_time_list          173831 non-null  object 
 7   candidate_label_list_int   173831 non-null  object 
 8   history_type_list_ordinal  173831 non-null  object 
 9   query_list                 173831 non-null  object 
 10  history_qry_list_terms     173831 non-null  object 
 11  nunique_cats_1             173831 non-null  float64
 12  nunique_cats_2             173831 non-null  float64
 13  nunique_cats_3             17

In [None]:
# Use GPU if available, else fall back to CPU.
# `device` is kept as a plain string: it is compared against "cuda" further
# down when deciding whether the DataLoader should pin memory.
device = "cuda" if torch.cuda.is_available() else "cpu"

## Helpers

This section defines small helper utilities used throughout the notebook.

- `TIME_BINS` specifies discrete time buckets in days
- `bucketize_gaps(gaps)` converts raw time gaps to bucket indices
- `clip_recent(seq, max_len)` keeps only the most recent `max_len` interactions

In [None]:
# Time gap bins in days (adjust as needed)
TIME_BINS = [0, 1, 3, 7, 14, 30, 60, 90, 180, 365, 1000]


def bucketize_gaps(gaps):
  """Convert raw time gaps into time-bucket indices based on TIME_BINS.

  A gap g with TIME_BINS[i-1] <= g < TIME_BINS[i] maps to bucket i, and gaps
  at or above the last edge map to len(TIME_BINS). Every returned index is at
  least 1, so index 0 stays free for padding.

  Args:
    gaps: iterable of gaps (numbers or numeric strings), or None.

  Returns:
    A list with one int bucket index per gap; an empty list when gaps is None.
    Gaps that cannot be converted to float are bucketed as if they were 0.0.
  """
  if gaps is None:
    return []

  # Bucket used for values that cannot be parsed as a number
  fallback = int(np.digitize(0.0, TIME_BINS))

  buckets = []
  for gap in gaps:
    try:
      bucket = int(np.digitize(float(gap), TIME_BINS))
    except (TypeError, ValueError, OverflowError):
      # Only conversion failures are expected here; anything else should surface
      bucket = fallback
    # Gaps below the first bin edge would digitize to 0, which collides with
    # the padding index, so clamp them into the first real bucket
    buckets.append(max(bucket, 1))
  return buckets

def clip_recent(seq, max_len):
  """Keep only the most recent (last) max_len steps of a sequence.

  Used to keep sequence lengths manageable for the model.

  Args:
    seq: a sliceable sequence supporting len().
    max_len: maximum number of trailing steps to keep, or None for no limit.

  Returns:
    seq itself when it already fits (or max_len is None), otherwise a slice
    holding its last max_len elements. max_len == 0 gives an empty slice.

  Raises:
    ValueError: if max_len is negative.
  """
  if max_len is None:
    return seq
  if max_len < 0:
    raise ValueError(f"max_len must be non-negative or None, got {max_len}")
  if len(seq) <= max_len:
    return seq
  # Slice from an explicit start offset: seq[-max_len:] would return the whole
  # sequence for max_len == 0, because -0 == 0
  return seq[len(seq) - max_len:]

## Build vocabularies for embedding

Item vocabulary: `wid2idx`
- Maps each unique item ID from `history_wid_list` to an integer index
- Index 0 is reserved for padding
- `N_ITEMS` gives the vocabulary size for the item embedding layer

Interaction type vocabulary: `type2idx`
- Maps each unique interaction type from `history_type_list_ordinal` to an integer index
- Index 0 is reserved for padding
- `N_TYPES` gives the vocabulary size for the type embedding layer

Time buckets:
- `TIME_BINS` (from the helper section) defines the discrete time gap buckets
- `N_TIME` gives the vocabulary size for the time embedding layer

In [None]:
# Item vocabulary: gather every wid that occurs in any history, then number
# them from 1 in sorted order (index 0 is left free for PAD)
all_wids = {wid for history in queries_df["history_wid_list"] for wid in (history or [])}
wid2idx = dict(zip(sorted(all_wids), range(1, len(all_wids) + 1)))
N_ITEMS = 1 + len(wid2idx)

# Interaction-type vocabulary: same scheme (0=PAD, real types start at 1)
type_set = {typ for history in queries_df["history_type_list_ordinal"] for typ in (history or [])}
type2idx = dict(zip(sorted(type_set), range(1, len(type_set) + 1)))
N_TYPES = 1 + len(type2idx)

# Time-bucket vocabulary size: the PAD slot plus one slot per TIME_BINS edge
N_TIME = 1 + len(TIME_BINS)

# Persist the vocabularies and the bin edges as JSON artifacts for reuse
for filename, artifact in (
    ("wid2idx.json", wid2idx),
    ("type2idx.json", type2idx),
    ("time_bins.json", TIME_BINS),
):
  with open(os.path.join(SAVE_PATH, filename), "w") as f:
    json.dump(artifact, f)

## Dataset & Collate function

The `HistoryDataset` class converts each user's sequential interaction history into a numerical format suitable for embedding and GRU encoding.

**Design choices**:
1. **History truncation**

    `clip_recent` keeps only the last `MAX_LEN` interactions to focus on recent user behaviour and avoid excessive memory consumption

2. **Empty history handling**

    Users with no recorded history are represented by a single padded step and `true_len = 0`, ensuring model stability and avoiding shape mismatches

3. **Index mapping**

    - `history_wid_list`: Map each unique wid to item indices (`wid2idx`)
    - `history_type_list_ordinal`: Map each interaction type to interaction-type indices (`type2idx`)
    - `history_time_list`: Map raw time gaps to time-bucket indices (`bucketize_gaps`)

4. **Output**

    Returns a dictionary of PyTorch tensors for each sequence:
    - `"wid"` – item index sequence  
    - `"type"` – interaction type index sequence  
    - `"time"` – time-bucket index sequence  
    - `"true_len"` – actual unpadded sequence length  
    - `"row_id"` – original dataframe row reference

In [None]:
# Upper bound on how many (most recent) history steps are kept per user
MAX_LEN = 100

class HistoryDataset(Dataset):
  """Dataset that turns one dataframe row's interaction history into index tensors.

  Each sample is a dict with:
    - "wid": item indices looked up in wid2idx
    - "type": interaction-type indices looked up in type2idx
    - "time": time-bucket indices produced by bucketize_gaps
    - "true_len": number of steps in "wid" before any batch padding (0 for an
      empty history, which is returned as a single PAD step)
    - "row_id": the positional index that was requested

  NOTE(review): assumes the three history lists of a row have equal length and
  are aligned step by step - confirm against the data preparation.
  """

  # Dataframe columns holding the item, interaction-type and time histories
  _HISTORY_COLS = ("history_wid_list", "history_type_list_ordinal", "history_time_list")

  def __init__(self, df, wid2idx, type2idx, max_len=MAX_LEN):
    self.df, self.max_len = df, max_len
    self.wid2idx, self.type2idx = wid2idx, type2idx

  def __len__(self):
    return len(self.df)

  def _map_seq(self, seq, mapping):
    # Look every token up in `mapping`; tokens that are missing from it become
    # 0 (PAD). A None/empty sequence gives an empty list.
    indices = []
    for tok in (seq or []):
      indices.append(mapping.get(tok, 0))
    return indices

  @staticmethod
  def _as_long(values):
    # Wrap a Python value/list as an int64 tensor
    return torch.tensor(values, dtype=torch.long)

  def __getitem__(self, idx):
    # Positional row lookup
    record = self.df.iloc[idx]

    # Pull the three history lists (None -> []) and keep the latest max_len steps
    wid_seq, type_seq, time_seq = (
        clip_recent(record[col] or [], self.max_len) for col in self._HISTORY_COLS
    )

    if len(wid_seq) == 0:
      # No history: emit a single PAD step per field and a true length of 0
      wid_idx = self._as_long([0])
      type_idx = self._as_long([0])
      time_idx = self._as_long([0])
      n_steps = 0
    else:
      # Tokens -> vocabulary indices, raw time gaps -> time-bucket indices
      wid_idx = self._as_long(self._map_seq(wid_seq, self.wid2idx))
      type_idx = self._as_long(self._map_seq(type_seq, self.type2idx))
      time_idx = self._as_long(bucketize_gaps(time_seq))
      n_steps = len(wid_idx)

    return {
        "wid": wid_idx,
        "type": type_idx,
        "time": time_idx,
        "true_len": self._as_long(n_steps),  # length before padding
        "row_id": idx
    }

Users have **different sequence lengths**, but a GRU expects all sequences within a batch to be tensors of the same length.

We therefore pad sequences, sort by length (for packed sequences), and keep an index to restore the original order after the forward pass.

The `collate_fn(batch)` prepares a batch of variable-length sequences for the GRU.

**Steps:**
1. **Extract fields** from each sample in `batch`: `"wid"`, `"type"`, `"time"`, `"true_len"`, `"row_id"`
2. **Pad sequences** to equal length using `pad_sequence` with PAD index = 0
3. **Sort** by length descending to support `pack_padded_sequence`
4. **Keep restore index** so that outputs can be re-ordered back to the original batch order

**Outputs**
- `"wid"`, `"type"`, `"time"` – padded index tensors, sorted by length
- `"lens"` – sorted true lengths
- `"restore"` – indices to invert the sort
- `"row_ids_sorted"` – original dataframe row ids, in the sorted order

In [None]:
def collate_fn(batch):
  """Merge a list of dataset samples into one padded, length-sorted batch.

  Returns a dict with:
    - "wid", "type", "time": index tensors padded with 0 to a common length,
      rows ordered by descending true length
    - "lens": the true lengths in that same (sorted) order
    - "restore": permutation that maps the sorted rows back to input order
    - "row_ids_sorted": the samples' row ids in the sorted order
  """
  def _padded(field):
    # Right-pad this field's sequences with 0 (PAD) into one batch-first tensor
    sequences = [sample[field] for sample in batch]
    return pad_sequence(sequences, batch_first=True, padding_value=0)

  true_lens = torch.stack([sample["true_len"] for sample in batch], dim=0)
  row_ids = torch.tensor([sample["row_id"] for sample in batch], dtype=torch.long)

  # Longest sequences first; `order` holds the original positions
  lens_sorted, order = torch.sort(true_lens, descending=True)

  # Inverse permutation, used later to undo the sort
  restore = torch.argsort(order)

  collated = {field: _padded(field)[order] for field in ("wid", "type", "time")}
  collated["lens"] = lens_sorted
  collated["restore"] = restore
  collated["row_ids_sorted"] = row_ids[order]
  return collated

We now wrap the data into the `HistoryDataset` and `DataLoader` so that user histories can be efficiently processed in batches.

In [None]:
# Build dataset and dataloader
# - shuffle=False: batches follow dataframe row order (the export step later
#   attaches embeddings to rows by position)
# - collate_fn pads every batch and sorts it by sequence length
# - pin_memory is only enabled when running on GPU
dataset = HistoryDataset(queries_df, wid2idx, type2idx, max_len=MAX_LEN)
dataloader = DataLoader(dataset, batch_size=256, shuffle=False, num_workers=0,
                        collate_fn=collate_fn, pin_memory=(device=="cuda"))

### Generating the Embeddings

**Goal**: Generate a dense, fixed-size embedding for each user that captures their sequential interaction patterns with items.

**Design Choices**

1. **GRU (Gated Recurrent Unit)**  
   - Handles sequences of varying lengths.  
   - Captures temporal dependencies, i.e. how past actions influence future behavior.
   - Chosen over LSTM for similar performance but lower computational cost.

2. **Attention Layer**  
   - Aggregates GRU outputs into a single weighted embedding.  
   - Allows the model to focus on more important interactions instead of treating all steps equally.

3. **Embedding Sum**  
   - Each user interaction is represented by three components: product ID, interaction type, and time gap bucket.  
   - We sum their embeddings to get a combined representation for each time step.

4. **Empty History Handling**  
   - Users with no historical interactions are represented by a padded step.  
   - Ensures stability and prevents shape mismatches during batching.

5. **Batch Processing with Padding and Packing**  
   - `pad_sequence` ensures all sequences in a batch have the same length.  
   - `pack_padded_sequence` allows GRU to skip padded steps efficiently, preserving variable-length information.

6. **Precomputation for Downstream Tasks**  
   - Embeddings are computed once and stored, allowing fast usage in downstream tasks without recomputing each time.

The following cell defines the GRU-based sequence encoder architecture used to generate user embeddings.

In [None]:
class SeqEncoder(nn.Module):
  """GRU sequence encoder with attention pooling over an interaction history.

  Every timestep is the sum of an item, an interaction-type and a time-bucket
  embedding. The summed steps go through a single-layer GRU, and the GRU
  outputs are pooled into one vector per sequence with learned attention
  weights.

  Args:
    n_items: size of the item vocabulary (index 0 is the padding index).
    n_types: size of the interaction-type vocabulary (index 0 is padding).
    n_time: size of the time-bucket vocabulary (index 0 is padding).
    emb_dim: width of all embeddings, the GRU hidden state and the output.
    dropout: dropout probability applied to the pooled vector.
  """

  def __init__(self, n_items, n_types, n_time, emb_dim=64, dropout=0.1):
    super().__init__()

    # Embedding layers
    self.item_emb = nn.Embedding(n_items, emb_dim, padding_idx=0)
    self.type_emb = nn.Embedding(n_types, emb_dim, padding_idx=0)
    self.time_emb = nn.Embedding(n_time, emb_dim, padding_idx=0)

    # GRU reads the sequence of embedded steps
    self.gru = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)

    # Attention layer: one scalar score per timestep
    self.attn = nn.Linear(emb_dim, 1)

    # Dropout layer for regularisation
    self.dropout = nn.Dropout(dropout)


  def forward(self, wid, type_, time_, lens):
    """Encode a padded batch of histories into one vector per sequence.

    Args:
      wid, type_, time_: (batch, seq) index tensors, padded with 0.
      lens: (batch,) true sequence lengths. Lengths below 1 (empty histories)
        are treated as 1, i.e. the single PAD step is encoded.

    Returns:
      (batch, emb_dim) tensor of pooled sequence embeddings.
    """
    # Convert ids to embeddings, sum embeddings at each timestep
    x = self.item_emb(wid) + self.type_emb(type_) + self.time_emb(time_)
    max_steps = wid.size(1)

    # Pack padded batch (lengths must live on CPU and be >= 1)
    lens_cpu = lens.cpu().clamp_min(1)
    packed = pack_padded_sequence(x, lengths=lens_cpu, batch_first=True, enforce_sorted=False)
    packed_out, _ = self.gru(packed)
    # total_length keeps the output as wide as the input even when the input
    # was padded beyond the longest true length in the batch
    out, _ = torch.nn.utils.rnn.pad_packed_sequence(
        packed_out, batch_first=True, total_length=max_steps)

    # Build the attention mask from the (clamped) lengths rather than from
    # `wid != 0`. Every row then has at least one attendable step, so an
    # empty-history row no longer gets a uniform softmax over the padded width,
    # which made its embedding depend on the longest sequence in the batch.
    steps = torch.arange(max_steps, device=wid.device)
    valid = steps.unsqueeze(0) < lens_cpu.to(wid.device).unsqueeze(1)

    attn_scores = self.attn(out).squeeze(-1)
    attn_scores = attn_scores.masked_fill(~valid, -1e9)  #ignore PADs
    attn_weights = torch.softmax(attn_scores, dim=1).unsqueeze(-1)

    # Weighted sum of GRU outputs
    context = torch.sum(attn_weights * out, dim=1)
    return self.dropout(context)

Next, we use the model in evaluation mode to compute user embeddings in batches.

In [None]:
# NOTE(review): no training step is visible in this notebook, so the encoder
# appears to run with freshly initialised weights - confirm this is intended.
model = SeqEncoder(N_ITEMS, N_TYPES, N_TIME).to(device)
model.eval()

user_dynamic_chunks = []
row_chunks = []

with torch.no_grad():
  for batch in dataloader:
    wid = batch["wid"].to(device)
    typ = batch["type"].to(device)
    time = batch["time"].to(device)
    # Lengths stay on CPU: the encoder needs them there for packing
    lens = batch["lens"]
    # The restore index also stays on CPU. It is used to index CPU tensors
    # below, and a CPU tensor cannot be indexed with a CUDA index tensor.
    restore = batch["restore"]

    # Forward pass: Get user embeddings for batch
    u = model(wid, typ, time, lens)

    # Restore original row order within batch (after moving results to CPU)
    u = u.cpu()[restore]

    # Get original row_ids in correct order
    rows = batch["row_ids_sorted"][restore].numpy()

    user_dynamic_chunks.append(u.numpy())
    row_chunks.append(rows)

# Concatenate all batches into 1 array
user_dynamic_embs = np.concatenate(user_dynamic_chunks, axis=0)
row_ids = np.concatenate(row_chunks, axis=0)

## Export data

In [None]:
# Save the embedding matrix as a .npy file in the artifact directory
np.save(os.path.join(SAVE_PATH, "user_dynamic_embs.npy"), user_dynamic_embs)

In [None]:
# Add embeddings as a new column in a copy of dataframe.
# The column is attached by position, so first verify that the row ids
# collected during inference are exactly 0..len(df)-1 in order; otherwise the
# embeddings would silently end up on the wrong rows.
if not np.array_equal(row_ids, np.arange(len(queries_df))):
  raise ValueError(
      "Embedding order does not match dataframe row order; "
      "cannot attach embeddings by position."
  )
df_out = queries_df.copy()
df_out["user_dynamic_embs"] = list(user_dynamic_embs)

In [None]:
# Save dataframe with embeddings to parquet (original columns plus the
# "user_dynamic_embs" column) in the artifact directory
df_out.to_parquet(os.path.join(SAVE_PATH, "user_dynamic_embs.parquet"))

In [None]:
# Save a checkpoint holding the encoder weights (state_dict) together with the
# vocabulary sizes N_ITEMS, N_TYPES, N_TIME passed to SeqEncoder above.
# NOTE(review): emb_dim and dropout are not stored; the model above was built
# with the constructor defaults, so a loader presumably has to use the same
# defaults - confirm.
checkpoint = {
    'state_dict': model.state_dict(),
    'n_items': N_ITEMS,
    'n_types': N_TYPES,
    'n_time': N_TIME
}
torch.save(checkpoint, os.path.join(SAVE_PATH, "seq_encoder_ckpt.pt"))