In [10]:
from src.experiment import load_mind_data_with_neg
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

#######################################################
# 1) Load GRV Data (Item Popularity Over Time)
#######################################################

def load_cox_data_and_survival(cox_data_csv, cox_survival_csv, itemHourLog_csv):
    cox_df = pd.read_csv(cox_data_csv)
    if "T_i0" in cox_df.columns:
        t0_map = dict(zip(cox_df["item_id"], cox_df["T_i0"]))
    else:
        hour_df = pd.read_csv(itemHourLog_csv)
        tmp = hour_df.groupby("item_id")["hour_offset"].min().reset_index()
        t0_map = dict(zip(tmp["item_id"], tmp["hour_offset"]))

    surv_df = pd.read_csv(cox_survival_csv)
    grv_cols = [c for c in surv_df.columns if c.startswith("GRV_t")]
    
    def parse_off(col):
        return int(col.split("t")[-1])

    item_grv = {}
    for row in surv_df.itertuples(index=False):
        it = getattr(row, "item_id")
        d = {parse_off(c): getattr(row, c) for c in grv_cols}
        item_grv[it] = d

    cox_map = {it_id: {"T_i0": t0_map.get(it_id, 0), "grv_map": item_grv[it_id]} for it_id in item_grv}
    return cox_map

def get_grv(cox_map, item_id, current_hour, default_val=0.0):
    if item_id not in cox_map:
        return default_val
    T0 = cox_map[item_id]["T_i0"]
    offset = int(current_hour - T0)
    if offset <= 0:
        return 0.0
    grv_map = cox_map[item_id]["grv_map"]
    offsets = sorted(grv_map.keys())
    offset = min(max(offset, offsets[0]), offsets[-1])
    return grv_map.get(offset, default_val)

#######################################################
# 2) TiSASRec Model (Time-Aware Self-Attention)
#######################################################

class TiSASRec(nn.Module):
    def __init__(self, num_items, emb_dim=16, num_heads=2, num_layers=2, max_seq_len=50, dropout=0.2):
        super().__init__()
        self.max_seq_len = max_seq_len  # Ensure max length is set
        self.item_emb = nn.Embedding(num_items, emb_dim)
        self.pos_emb = nn.Embedding(max_seq_len, emb_dim)
        self.time_emb = nn.Linear(1, emb_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=num_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(emb_dim, num_items)
        self.dropout = nn.Dropout(dropout)
        nn.init.xavier_uniform_(self.item_emb.weight)
        nn.init.xavier_uniform_(self.fc.weight)

    def forward(self, item_seq, time_seq):
        batch_size, seq_len = item_seq.shape
        if seq_len > self.max_seq_len:
            item_seq = item_seq[:, -self.max_seq_len:]  # Truncate to max_seq_len
            time_seq = time_seq[:, -self.max_seq_len:]  # Truncate to match

        item_embeddings = self.item_emb(item_seq)
        pos_indices = torch.arange(item_seq.shape[1], device=item_seq.device).unsqueeze(0).expand_as(item_seq)
        pos_embeddings = self.pos_emb(pos_indices)
        time_embeddings = self.time_emb(time_seq.unsqueeze(-1))

        seq_embeddings = item_embeddings + pos_embeddings + time_embeddings
        seq_embeddings = self.encoder(seq_embeddings)
        last_output = seq_embeddings[:, -1, :]
        return self.fc(self.dropout(last_output))

class TimeAwareSessionDataset(Dataset):
    def __init__(self, df, item2idx, session_length=5):
        self.sessions = []
        self.time_diffs = []
        grouped = df.groupby("user_id")
        for _, group in grouped:
            group = group.sort_values("time")
            items = group["item_id"].values
            times = group["time"].values
            if len(items) < session_length:
                continue
            for i in range(len(items) - session_length + 1):
                session_seq = items[i: i + session_length]
                time_seq = times[i: i + session_length] - times[i]
                indices = [item2idx[it] for it in session_seq if it in item2idx]
                if len(indices) == session_length:
                    self.sessions.append(indices)
                    self.time_diffs.append(time_seq)
        self.sessions = np.array(self.sessions)
        self.time_diffs = np.array(self.time_diffs)

    def __len__(self):
        return len(self.sessions)

    def __getitem__(self, idx):
        session = self.sessions[idx]
        time_diff = self.time_diffs[idx]
        return torch.tensor(session[:-1], dtype=torch.long), torch.tensor(time_diff[:-1], dtype=torch.float), torch.tensor(session[-1], dtype=torch.long)

#######################################################
# 3) Training TiSASRec
#######################################################

def train_one_epoch_tisasrec(model, loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for item_seq, time_seq, target in loader:
        item_seq, time_seq, target = item_seq.to(device), time_seq.to(device), target.to(device)
        optimizer.zero_grad()
        logits = model(item_seq, time_seq)
        loss = loss_fn(logits, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * item_seq.size(0)
    return total_loss / len(loader.dataset)

#######################################################
# 4) GRV-Based Ranking for TiSASRec
#######################################################

def evaluate_tisasrec_ranking(model, test_df, item2idx, cox_map, gamma=0.3, K=10, device="cpu"):
    test_df = test_df.copy()
    test_df["time_hr"] = (test_df["time"] // 3600).astype(int)
    grouped = test_df.groupby("user_id")
    coverage_items, newcov = set(), 0  # `newcov` starts as a float
    hits_at_k, ndcg_at_k, total_positives = 0, 0, 0
    all_users = list(grouped.groups.keys())
    all_item_ids = test_df["item_id"].unique()
    rng = np.random.default_rng(0)

    for user_id in tqdm(all_users, desc="Evaluating TiSASRec with GRV"):
        g = grouped.get_group(user_id)
        t_hr = g["time_hr"].min()
        pos_items = g[g["label"] == 1]["item_id"].unique()
        candidate_items = np.unique(np.concatenate([pos_items, rng.choice(all_item_ids, size=50, replace=False)]))

        valid_items = [item2idx[it] for it in candidate_items if it in item2idx]
        if not valid_items:
            continue  
        if len(valid_items) > model.max_seq_len:
            valid_items = valid_items[-model.max_seq_len:]

        with torch.no_grad():
            inputs = torch.tensor(valid_items, dtype=torch.long, device=device).unsqueeze(0)
            time_inputs = torch.zeros_like(inputs, dtype=torch.float)
            logits = model(inputs, time_inputs)
            base_scores = logits.cpu().numpy().flatten()

        final_scores = [(1 - gamma) * base_score + gamma * get_grv(cox_map, it, t_hr) for it, base_score in zip(candidate_items, base_scores)]
        top_items = candidate_items[np.argsort(-np.array(final_scores))[:K]]

        # ✅ Normalize Coverage
        coverage_items.update(top_items)

        # ✅ Normalize New Item Coverage
        new_item_count = sum(1 for it in top_items if it in cox_map and cox_map[it]["T_i0"] >= 100)
        newcov += new_item_count / K  # Proportion of new items recommended

        total_positives += len(pos_items)
        hits = sum(1 for pos_it in pos_items if pos_it in top_items)
        hits_at_k += hits
        ndcg_at_k += sum(1.0 / np.log2(np.where(top_items == pos_it)[0][0] + 2) for pos_it in pos_items if pos_it in top_items)

    hr = hits_at_k / total_positives if total_positives > 0 else 0
    ndcg = ndcg_at_k / len(all_users) if len(all_users) > 0 else 0
    newcov /= len(all_users)  # Normalize new coverage by total users
    coverage = len(coverage_items) / len(item2idx)  # Normalize coverage by total items

    return hr, ndcg, coverage, newcov  # ✅ Coverage is now a proportion

#######################################################
# 5) Main Experiment for TiSASRec with GRV
#######################################################

def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load data using the correct function
    train_df, val_df, test_df = load_mind_data_with_neg("train.csv", "dev.csv", "test.csv")

    # Debug: Print column names for verification
    print(f"[DEBUG] Columns in train.csv: {train_df.columns}")

    # Ensure 'item_id' column exists
    if 'item_id' not in train_df.columns:
        raise KeyError("Column 'item_id' is missing from train.csv. Check the column names.")

    # Build item2idx mapping
    all_items = pd.concat([train_df["item_id"], val_df["item_id"], test_df["item_id"]]).unique()
    item2idx = {i: idx for idx, i in enumerate(all_items)}

    # Load GRV data
    cox_map = load_cox_data_and_survival(
        "cox_output/cox_data.csv", 
        "cox_output/cox_survival.csv", 
        "output/ItemHourLog.csv"
    )

    # Build training dataset and loader
    train_ds = TimeAwareSessionDataset(train_df, item2idx)
    train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)

    # Define model
    model = TiSASRec(len(item2idx)).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    # Train the model
    for ep in range(3):
        train_one_epoch_tisasrec(model, train_loader, optimizer, loss_fn, device)

    # **Evaluate with GRV-based ranking**
    hr, ndcg, cov, newcov = evaluate_tisasrec_ranking(
        model, test_df, item2idx, cox_map, gamma=0.1, K=10, device=device
    )

    print(f"[RESULT] HR@10={hr:.4f}, NDCG@10={ndcg:.4f}, coverage@10={cov}, new_item_coverage@10={newcov}")

if __name__ == "__main__":
    main()

[DEBUG] Columns in train.csv: Index(['user_id', 'item_id', 'time', 'label'], dtype='object')


Evaluating TiSASRec with GRV:   0%|          | 0/57900 [00:00<?, ?it/s]


TypeError: '>=' not supported between instances of 'dict' and 'int'