# Experiment Reproduction Pipeline

### Requirements install

In [None]:
# "-r" tells pip to install the packages listed in the file; without it pip
# tries to install a package literally named "requirements.txt".
!pip install -r requirements.txt

### 1) Item Hour Log generation from ReChorus TOPK generated MIND dataset

In [None]:
# Step 1 of the pipeline: runs the script that generates the item-hour log.
!python 1_item_hour_log_from_ReChorus.py

### 2) GRV generation from COX model

In [1]:
# Step 2 of the pipeline: runs the Cox/GRV script. Its traceback shows it
# loads an item-hour log CSV with pd.read_csv, so step 1 has to succeed first.
!python 2_COX_GRV.py

Traceback (most recent call last):
  File "C:\Users\Max\Experiment-Design-Project\src\TaFR-reproducible\2_COX_GRV.py", line 149, in <module>
    train_and_generate_grv_with_vitality(
  File "C:\Users\Max\Experiment-Design-Project\src\TaFR-reproducible\2_COX_GRV.py", line 27, in train_and_generate_grv_with_vitality
    df = pd.read_csv(item_hour_log_csv)
  File "C:\Users\Max\Experiment-Design-Project\venv\lib\site-packages\pandas\io\parsers\readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "C:\Users\Max\Experiment-Design-Project\venv\lib\site-packages\pandas\io\parsers\readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "C:\Users\Max\Experiment-Design-Project\venv\lib\site-packages\pandas\io\parsers\readers.py", line 1620, in __init__
    self._engine = self._make_engine(f, self.engine)
  File "C:\Users\Max\Experiment-Design-Project\venv\lib\site-packages\pandas\io\parsers\readers.py", line 1880, in _make_eng

### 3) Backbone Models applying GAMMA = 0 and GAMMA = 0.3, 0.1

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from math import log2

KeyboardInterrupt: 

#### 3.1) NeuMF

In [None]:
#######################################################
# 1) Data loading with negative samples
#######################################################

def load_mind_data_with_neg(train_csv, val_csv, test_csv):
    """
    Load the train/val/test splits and expand negative samples into rows.

    Each file is read as a tab-separated table with columns user_id, item_id,
    time and, optionally, neg_items (a string such as "[7856, 8058, ...]").
      - For each row => (user, item, time, label=1)
      - For each id in 'neg_items' => (user, neg_item, time, label=0)
    A file without a 'neg_items' column is returned with every row labelled 1.
    A file that does not exist yields an empty frame (no error is raised).

    Return dataframes for train, val, test with [user_id, item_id, time, label].
    Within one frame all positives come first, followed by all negatives.
    """
    columns = ["user_id", "item_id", "time", "label"]

    def parse_neg_items(raw):
        # An empty cell is read by pandas as NaN (a float), which has no
        # .strip(); treat it as "this row has no negatives".
        if not isinstance(raw, str):
            if pd.isna(raw):
                return []
            raw = str(raw)
        inner = raw.strip().lstrip("[").rstrip("]")
        tokens = (part.strip() for part in inner.split(","))
        return [int(tok) for tok in tokens if tok]

    def parse_dataset(filename):
        if not os.path.exists(filename):
            return pd.DataFrame(columns=columns)
        df = pd.read_csv(filename, sep="\t")
        # If there's no 'neg_items' col => no negative sampling
        if "neg_items" not in df.columns:
            df["label"] = 1
            return df[columns]

        data_pos = []
        data_neg = []
        for user, item, tval, raw in zip(
            df["user_id"], df["item_id"], df["time"], df["neg_items"]
        ):
            data_pos.append((user, item, tval, 1))
            data_neg.extend((user, neg_id, tval, 0) for neg_id in parse_neg_items(raw))
        # One frame built from both lists keeps the positives-then-negatives
        # order and avoids concatenating with a possibly empty frame.
        return pd.DataFrame(data_pos + data_neg, columns=columns)

    train_df = parse_dataset(train_csv)
    val_df = parse_dataset(val_csv)
    test_df = parse_dataset(test_csv)
    return train_df, val_df, test_df

class MindInteractionDataset(Dataset):
    """
    Pointwise dataset: one sample per row of `df`.

    `user_id` and `item_id` are translated through the supplied lookup
    dicts; `label` is stored as float and `time` as int. Indexing returns
    the tuple (user_index, item_index, label, time).
    """

    def __init__(self, df, user2idx, item2idx):
        user_col = df["user_id"]
        item_col = df["item_id"]
        self.users = user_col.map(user2idx).values
        self.items = item_col.map(item2idx).values
        self.labels = df["label"].values.astype(float)
        self.times = df["time"].values.astype(int)

    def __len__(self):
        # One label per sample, so the label array defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = (self.users[idx], self.items[idx], self.labels[idx], self.times[idx])
        return sample

#######################################################
# 2) NeuMF model
#######################################################

class NeuMF(nn.Module):
    """
    Two-branch user/item scorer.

    The GMF branch multiplies a user and an item embedding element-wise; the
    MLP branch concatenates a second pair of embeddings and passes them
    through two Linear+ReLU layers. Both branch outputs are concatenated and
    projected to a single logit per (user, item) pair.
    """

    def __init__(self, num_users, num_items, emb_dim=8, mlp_hidden=16):
        super().__init__()
        half_hidden = mlp_hidden // 2

        # Separate embedding tables for each branch.
        self.user_emb_gmf = nn.Embedding(num_users, emb_dim)
        self.item_emb_gmf = nn.Embedding(num_items, emb_dim)

        self.user_emb_mlp = nn.Embedding(num_users, emb_dim)
        self.item_emb_mlp = nn.Embedding(num_items, emb_dim)

        self.mlp = nn.Sequential(
            nn.Linear(2 * emb_dim, mlp_hidden),
            nn.ReLU(),
            nn.Linear(mlp_hidden, half_hidden),
            nn.ReLU(),
        )
        self.final = nn.Linear(emb_dim + half_hidden, 1)

        # Xavier-initialise the four embedding tables, in declaration order.
        for table in (
            self.user_emb_gmf,
            self.item_emb_gmf,
            self.user_emb_mlp,
            self.item_emb_mlp,
        ):
            nn.init.xavier_uniform_(table.weight)

    def forward(self, user_idx, item_idx):
        """Return one logit per (user_idx[i], item_idx[i]) pair as a flat tensor."""
        gmf_branch = self.user_emb_gmf(user_idx) * self.item_emb_gmf(item_idx)

        mlp_input = torch.cat(
            [self.user_emb_mlp(user_idx), self.item_emb_mlp(item_idx)], dim=1
        )
        mlp_branch = self.mlp(mlp_input)

        fused = torch.cat([gmf_branch, mlp_branch], dim=1)
        return self.final(fused).view(-1)

#######################################################
# 3) Training loop
#######################################################

def train_one_epoch(model, loader, optimizer, loss_fn, device):
    """
    Run one optimisation pass over `loader`.

    Each batch is a 4-tuple (users, items, labels, times); the times are
    ignored here. Returns the loss averaged per sample: every batch loss is
    weighted by its batch size and the sum is divided by len(loader.dataset).
    """
    model.train()
    running = 0
    for users, items, labels, _ in loader:
        user_batch = users.to(device)
        item_batch = items.to(device)
        targets = labels.float().to(device)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(user_batch, item_batch), targets)
        batch_loss.backward()
        optimizer.step()

        running += batch_loss.item() * len(targets)
    return running / len(loader.dataset)

def eval_one_epoch(model, loader, loss_fn, device):
    """
    Compute the per-sample average loss over `loader` without updating the model.

    The model is switched to eval mode and gradients are disabled. Batches
    are 4-tuples (users, items, labels, times); the times are ignored.
    """
    model.eval()
    weighted_sum = 0
    with torch.no_grad():
        for users, items, labels, _ in loader:
            targets = labels.float().to(device)
            scores = model(users.to(device), items.to(device))
            # Weight by batch size so a short last batch is not over-counted.
            weighted_sum += loss_fn(scores, targets).item() * len(targets)
    return weighted_sum / len(loader.dataset)

#######################################################
# 4) Combine with GRV, Evaluate Coverage/HR@10
#######################################################

def load_cox_data_and_survival(cox_data_csv, cox_survival_csv, itemHourLog_csv):
    """
    Build the per-item lookup used by get_grv.

    T_i0 for each item is taken from the "T_i0" column of `cox_data_csv` when
    that column exists; otherwise it is the minimum "hour_offset" per item in
    `itemHourLog_csv`. The GRV values come from the "GRV_t<offset>" columns of
    `cox_survival_csv`, keyed by the integer after the last "t".

    Returns a dict: item_id -> {"T_i0": ..., "grv_map": {offset: value}}.
    Items in the survival file without a known T_i0 get T_i0 = 0.
    """
    cox_frame = pd.read_csv(cox_data_csv)
    if "T_i0" not in cox_frame.columns:
        # Fall back to the first hour at which each item shows up in the log.
        hours = pd.read_csv(itemHourLog_csv)
        first_seen = hours.groupby("item_id")["hour_offset"].min().reset_index()
        t0_map = dict(zip(first_seen["item_id"], first_seen["hour_offset"]))
    else:
        t0_map = dict(zip(cox_frame["item_id"], cox_frame["T_i0"]))

    survival = pd.read_csv(cox_survival_csv)
    grv_cols = [name for name in survival.columns if name.startswith("GRV_t")]

    cox_map = {}
    for row in survival.itertuples(index=False):
        item = getattr(row, "item_id")
        cox_map[item] = {
            "T_i0": t0_map.get(item, 0),
            "grv_map": {int(col.split("t")[-1]): getattr(row, col) for col in grv_cols},
        }
    return cox_map

def get_grv(cox_map, item_id, current_hour, default_val=0.0):
    """
    Look up the GRV of `item_id` at `current_hour`.

    The lookup key is the whole-hour offset from the item's T_i0, clamped to
    the smallest/largest offset stored for that item.

    Returns:
        - `default_val` if the item is not in `cox_map`, if it has no GRV
          entries at all, or if the clamped offset is not a stored key;
        - 0.0 if `current_hour` is not after T_i0;
        - the stored GRV otherwise.
    """
    entry = cox_map.get(item_id)
    if entry is None:
        return default_val

    offset = int(current_hour - entry["T_i0"])
    if offset <= 0:
        return 0.0

    grv_map = entry["grv_map"]
    if not grv_map:
        # No GRV_t* values for this item: nothing to clamp against.
        return default_val

    # min()/max() give the clamp bounds without sorting the keys on every call.
    offset = min(max(offset, min(grv_map)), max(grv_map))
    return grv_map.get(offset, default_val)

def evaluate_ranking(
    model, df_test, user2idx, item2idx, cox_map,
    gamma=0.0, K=10, device=torch.device("cpu"), new_threshold=100
):
    """
    Evaluate ranking performance using HR@K, NDCG@K, Coverage@K, and New Item Coverage@K.

    For every user in `df_test` a candidate list is built from the user's
    positive items (label == 1) plus up to 50 items drawn at random from
    `item2idx` (fixed seed 0). Each candidate is scored as
        (1 - gamma) * model_score + gamma * GRV
    where the GRV is looked up at the hour of the user's earliest test row
    (`time // 3600`, so `time` is presumably in seconds -- confirm). The K
    best-scoring candidates form the recommendation list.

    Args:
        model: called as model(user_index_tensor, item_index_tensor); must
            return one score per pair.
        df_test: frame with columns user_id, item_id, time, label.
        user2idx, item2idx: raw id -> model index.
        cox_map: item id -> {"T_i0": ..., "grv_map": ...}, as read by get_grv.
        gamma: weight of the GRV term in the final score.
        K: length of the recommendation list.
        device: device the index tensors are created on.
        new_threshold: an item counts as "new" when its T_i0 >= this value.

    Returns:
        (hr, ndcg, coverage, new_item_coverage):
        hr                -- positives found in the top-K / all positives;
        ndcg              -- mean over users of DCG@K / IDCG@K;
        coverage          -- distinct recommended items / len(item2idx);
        new_item_coverage -- share of users with at least one new item in
                             their top-K.
        Users without any valid candidate are skipped but still counted in
        the per-user denominators.
    """
    df_test = df_test.copy()
    df_test["time_hr"] = (df_test["time"] // 3600).astype(int)

    grouped = df_test.groupby("user_id")
    coverage_items = set()
    new_item_hits = 0

    hits_at_k = 0
    ndcg_at_k = 0
    total_positives = 0

    all_users = list(grouped.groups.keys())
    all_item_ids = list(item2idx.keys())
    # Sampling without replacement raises when asked for more items than
    # exist, so never request more than the catalogue holds.
    n_random = min(50, len(all_item_ids))

    rng = np.random.default_rng(0)

    # Eval mode is loop-invariant; set it once instead of once per user.
    model.eval()

    for user_id in tqdm(all_users, desc="EvaluateRanking"):
        g = grouped.get_group(user_id)
        t_hr = g["time_hr"].min()  # Earliest request

        # Build a candidate set: positive items + random items
        pos_items = g[g["label"] == 1]["item_id"].unique()
        sampled = rng.choice(all_item_ids, size=n_random, replace=False)
        candidate_items = np.unique(np.concatenate([pos_items, sampled]))

        # Convert candidate items to indices, filtering valid ones
        valid_candidates = [(it, item2idx[it]) for it in candidate_items if it in item2idx]
        if not valid_candidates:
            continue

        item_ids = [it for it, _ in valid_candidates]
        item_indices = [idx for _, idx in valid_candidates]

        # Compute model scores
        with torch.no_grad():
            user_tensor = torch.tensor([user2idx[user_id]] * len(item_indices), dtype=torch.long, device=device)
            item_tensor = torch.tensor(item_indices, dtype=torch.long, device=device)
            preds = model(user_tensor, item_tensor)
        base_scores = preds.cpu().numpy().flatten()

        # Blend the model score with the GRV
        final_scores = [
            (1 - gamma) * base_score + gamma * get_grv(cox_map, it, t_hr)
            for it, base_score in zip(item_ids, base_scores)
        ]

        # Select top-K items
        top_indices = np.argsort(-np.array(final_scores))[:K]
        top_items = [item_ids[i] for i in top_indices]
        # Candidates are unique, so every recommended item has exactly one rank.
        rank_of = {it: rank for rank, it in enumerate(top_items)}

        # Coverage
        coverage_items.update(top_items)

        # New item coverage: count users who received at least one new item
        if any(it in cox_map and cox_map[it]["T_i0"] >= new_threshold for it in top_items):
            new_item_hits += 1

        # HR
        total_positives += len(pos_items)
        hit_ranks = [rank_of[pos_it] for pos_it in pos_items if pos_it in rank_of]
        hits_at_k += len(hit_ranks)

        # NDCG: a list of length K can hold at most K positives, so the ideal
        # DCG is capped at K; otherwise a perfect list scores below 1.
        dcg = sum(1.0 / log2(rank + 2) for rank in hit_ranks)
        idcg = sum(1.0 / log2(i + 2) for i in range(min(len(pos_items), K)))
        ndcg_at_k += dcg / idcg if idcg > 0 else 0

    # Normalize metrics; empty inputs give 0 instead of ZeroDivisionError
    n_users = len(all_users)
    hr = hits_at_k / total_positives if total_positives > 0 else 0
    ndcg = ndcg_at_k / n_users if n_users > 0 else 0
    coverage = len(coverage_items) / len(item2idx) if len(item2idx) > 0 else 0
    new_item_coverage = new_item_hits / n_users if n_users > 0 else 0

    return hr, ndcg, coverage, new_item_coverage

#######################################################
# 5) Main experiment
#######################################################
def neumf_experiment(
    train_csv="train.csv",
    val_csv="val.csv",
    test_csv="test.csv",
    cox_data_csv="./cox_output/cox_data.csv",
    cox_survival_csv="./cox_output/cox_survival.csv",
    itemHourLog_csv="./output/ItemHourLog.csv",
    gamma=0.0,
    epochs=3,
    batch_size=256,
    emb_dim=8,
    mlp_hidden=16,
    use_cuda=True
):
    """
    Train NeuMF on the given splits and print HR/NDCG/coverage at K=10.

    Steps: load the three splits (expanding negative samples), index all
    users and items seen in any split, train for `epochs` epochs with Adam
    (lr=1e-3) and BCE-with-logits loss, load the GRV lookup, then rank the
    test candidates with score = (1 - gamma) * model + gamma * GRV.

    Args:
        train_csv, val_csv, test_csv: tab-separated interaction files.
        cox_data_csv, cox_survival_csv, itemHourLog_csv: inputs of
            load_cox_data_and_survival.
        gamma: weight of the GRV term at ranking time (0 = model only).
        epochs, batch_size, emb_dim, mlp_hidden: training/model settings.
        use_cuda: use "cuda:0" when available, else CPU.

    Raises:
        ValueError: if the training split is empty (e.g. the file is missing).
    """
    device = torch.device("cuda:0" if use_cuda and torch.cuda.is_available() else "cpu")
    print(f"[INFO] device={device}")

    # 1) Load data with negative sampling
    train_df, val_df, test_df = load_mind_data_with_neg(train_csv, val_csv, test_csv)
    print(f"[INFO] train={len(train_df)}, val={len(val_df)}, test={len(test_df)}")

    # The loader returns an empty frame for a missing file; training on it
    # would only fail later with an unrelated division-by-zero error.
    if len(train_df) == 0:
        raise ValueError(f"No training interactions loaded from {train_csv!r}")

    # Build global user/item index
    all_users = pd.concat([train_df["user_id"], val_df["user_id"], test_df["user_id"]]).unique()
    all_items = pd.concat([train_df["item_id"], val_df["item_id"], test_df["item_id"]]).unique()
    user2idx = {user: idx for idx, user in enumerate(all_users)}
    item2idx = {item: idx for idx, item in enumerate(all_items)}

    # 2) Build PyTorch datasets (the test split is ranked directly from
    #    test_df below, so it needs no Dataset wrapper)
    train_ds = MindInteractionDataset(train_df, user2idx, item2idx)
    val_ds = MindInteractionDataset(val_df, user2idx, item2idx)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # 3) Build NeuMF
    model = NeuMF(len(user2idx), len(item2idx), emb_dim, mlp_hidden).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.BCEWithLogitsLoss()

    # 4) Train
    has_validation = len(val_ds) > 0
    for ep in range(epochs):
        tr_loss = train_one_epoch(model, train_loader, optimizer, loss_fn, device)
        # An empty validation split would divide by zero inside
        # eval_one_epoch; report NaN instead of aborting the run.
        if has_validation:
            vl_loss = eval_one_epoch(model, val_loader, loss_fn, device)
        else:
            vl_loss = float("nan")
        print(f"Epoch {ep}: train_loss={tr_loss:.4f}, val_loss={vl_loss:.4f}")

    # 5) Load cox data => cox_map
    cox_map = load_cox_data_and_survival(
        cox_data_csv, cox_survival_csv, itemHourLog_csv
    )

    # 6) Evaluate with GRV => final
    hr, ndcg, cov, newcov = evaluate_ranking(
        model, test_df, user2idx, item2idx, cox_map,
        gamma=gamma,
        K=10,
        device=device,
        new_threshold=100
    )
    print(f"[RESULT] HR@10={hr:.4f}, NDCG@10={ndcg:.4f}, coverage@10={cov}, new_item_coverage@10={newcov}")

In [None]:
# Baseline run: with gamma=0 the GRV term has zero weight, so the ranking
# uses the NeuMF score alone.
neumf_experiment(
    train_csv="train.csv",
    val_csv="dev.csv",
    test_csv="test.csv",
    cox_data_csv="./cox_output/cox_data.csv",
    cox_survival_csv="./cox_output/cox_survival.csv",
    itemHourLog_csv="./output/ItemHourLog.csv",
    gamma=0,
    epochs=3,
    batch_size=256,
    emb_dim=8,
    mlp_hidden=16,
    use_cuda=True
)

In [None]:
# GRV-blended run: identical settings to the baseline except gamma=0.3, i.e.
# final score = 0.7 * NeuMF score + 0.3 * GRV.
neumf_experiment(
    train_csv="train.csv",
    val_csv="dev.csv",
    test_csv="test.csv",
    cox_data_csv="./cox_output/cox_data.csv",
    cox_survival_csv="./cox_output/cox_survival.csv",
    itemHourLog_csv="./output/ItemHourLog.csv",
    gamma=0.3,
    epochs=3,
    batch_size=256,
    emb_dim=8,
    mlp_hidden=16,
    use_cuda=True
)

#### 3.2) GRU4REC

In [None]:
#######################################################
# 1) Load GRV Data (Item Popularity Over Time)
#######################################################

def load_cox_data_and_survival(cox_data_csv, cox_survival_csv, itemHourLog_csv):
    """
    Read the Cox outputs into a per-item lookup.

    T_i0 comes from the "T_i0" column of `cox_data_csv` if present, otherwise
    from the minimum "hour_offset" per item in `itemHourLog_csv`. Every
    "GRV_t<offset>" column of `cox_survival_csv` becomes one entry of the
    item's grv_map, keyed by the integer following the last "t".

    Returns: {item_id: {"T_i0": ..., "grv_map": {offset: value}}}; items
    without a known T_i0 get 0.
    """
    cox_frame = pd.read_csv(cox_data_csv)
    if "T_i0" not in cox_frame.columns:
        # No explicit T_i0: use the earliest logged hour of each item.
        log_frame = pd.read_csv(itemHourLog_csv)
        earliest = log_frame.groupby("item_id")["hour_offset"].min().reset_index()
        t0_map = dict(zip(earliest["item_id"], earliest["hour_offset"]))
    else:
        t0_map = dict(zip(cox_frame["item_id"], cox_frame["T_i0"]))

    survival = pd.read_csv(cox_survival_csv)
    grv_cols = []
    for name in survival.columns:
        if name.startswith("GRV_t"):
            grv_cols.append(name)

    item_grv = {}
    for row in survival.itertuples(index=False):
        per_offset = {}
        for col in grv_cols:
            per_offset[int(col.split("t")[-1])] = getattr(row, col)
        item_grv[getattr(row, "item_id")] = per_offset

    cox_map = {}
    for item, per_offset in item_grv.items():
        start_hour = t0_map[item] if item in t0_map else 0
        cox_map[item] = {"T_i0": start_hour, "grv_map": per_offset}
    return cox_map


def get_grv(cox_map, item_id, current_hour, default_val=0.0):
    """
    Return the GRV stored for `item_id` at `current_hour`.

    The hour is converted to a whole-hour offset from the item's T_i0 and
    clamped into the range of offsets stored for the item.

    Returns `default_val` for an unknown item, for an item whose grv_map is
    empty, or when the clamped offset is not a stored key; returns 0.0 when
    `current_hour` is at or before T_i0.
    """
    if item_id not in cox_map:
        return default_val
    record = cox_map[item_id]
    offset = int(current_hour - record["T_i0"])
    if offset <= 0:
        return 0.0
    grv_map = record["grv_map"]
    # An item with no GRV_t* values has no offsets to clamp to; indexing the
    # sorted key list would raise IndexError.
    if not grv_map:
        return default_val
    lowest, highest = min(grv_map), max(grv_map)
    offset = min(max(offset, lowest), highest)
    return grv_map.get(offset, default_val)


#######################################################
# 2) GRU4Rec Model (Session-Based Recommendation)
#######################################################

class GRU4Rec(nn.Module):
    """
    GRU next-item scorer.

    A sequence of item indices is embedded, run through a batch-first GRU,
    and the hidden state at the last position is passed through dropout and
    a linear layer that yields one logit per item in the vocabulary.
    """

    def __init__(self, num_items, emb_dim=16, hidden_size=16, num_layers=1, dropout=0.2):
        super().__init__()
        self.item_emb = nn.Embedding(num_items, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_size, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, num_items)

        # Xavier-initialise the embedding table and the output projection.
        for layer in (self.item_emb, self.fc):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, session_seq):
        """Map index sequences (batch, seq_len) to logits (batch, num_items)."""
        hidden_states, _ = self.gru(self.item_emb(session_seq))
        final_state = hidden_states[:, -1, :]
        return self.fc(self.dropout(final_state))


class SessionDataset(Dataset):
    """
    Sliding-window sessions per user.

    Each user's rows are ordered by `time`, and every run of
    `session_length` consecutive items becomes one sample, provided all of
    its items are present in `item2idx`. A sample is returned as
    (all indices but the last, the last index) -- i.e. input and target.
    """

    def __init__(self, df, item2idx, session_length=5):
        windows = []
        for _, user_rows in df.groupby("user_id"):
            ordered_items = user_rows.sort_values("time")["item_id"].values
            # Users with fewer than session_length rows give an empty range.
            n_windows = len(ordered_items) - session_length + 1
            for start in range(n_windows):
                window = ordered_items[start:start + session_length]
                if all(it in item2idx for it in window):
                    windows.append([item2idx[it] for it in window])
        self.sessions = np.array(windows)

    def __len__(self):
        return len(self.sessions)

    def __getitem__(self, idx):
        window = self.sessions[idx]
        history = torch.tensor(window[:-1], dtype=torch.long)
        target = torch.tensor(window[-1], dtype=torch.long)
        return history, target


#######################################################
# 3) Training & Evaluation for GRU4Rec
#######################################################

def train_one_epoch_gru4rec(model, loader, optimizer, loss_fn, device):
    """
    Run one optimisation pass over `loader`, which yields (inputs, target) batches.

    Returns the loss averaged per sample (batch losses weighted by batch
    size, divided by len(loader.dataset)).
    """
    model.train()
    running = 0
    for batch_inputs, batch_target in loader:
        batch_inputs = batch_inputs.to(device)
        batch_target = batch_target.to(device)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(batch_inputs), batch_target)
        batch_loss.backward()
        optimizer.step()

        running += batch_loss.item() * batch_inputs.size(0)
    return running / len(loader.dataset)


def eval_one_epoch_gru4rec(model, loader, loss_fn, device):
    """
    Evaluate on `loader` without updating the model.

    Returns (mean loss per sample, top-1 accuracy), both divided by
    len(loader.dataset). The prediction is the argmax over the logits.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    with torch.no_grad():
        for batch_inputs, batch_target in loader:
            batch_inputs = batch_inputs.to(device)
            batch_target = batch_target.to(device)
            logits = model(batch_inputs)
            loss_sum += loss_fn(logits, batch_target).item() * batch_inputs.size(0)
            top1 = torch.argmax(logits, dim=1)
            n_correct += (top1 == batch_target).sum().item()
    n_samples = len(loader.dataset)
    return loss_sum / n_samples, n_correct / n_samples


#######################################################
# 4) GRV-Based Ranking for GRU4Rec
#######################################################

def evaluate_gru4rec_ranking(model, test_df, item2idx, cox_map, gamma=0.3, K=10, device="cpu", new_threshold=100):
    """
    Evaluate GRU4Rec performance using HR@K, NDCG@K, Coverage@K, and New Item Coverage@K.

    For every user in `test_df` a candidate list is built from the user's
    positive items (label == 1) plus up to 50 items drawn at random from
    `item2idx` (fixed seed 0). The model is run once per user and each
    candidate is scored as
        (1 - gamma) * logit_of_that_candidate + gamma * GRV
    with the GRV looked up at the hour of the user's earliest test row.

    NOTE(review): the sequence fed to the model is the candidate list itself
    (which contains the positives), not the user's interaction history. No
    history is available through this interface, so that is left as it was --
    confirm whether a history should be passed in.

    Args:
        model: called as model(index_tensor of shape (1, L)); must return
            logits of shape (1, num_items).
        test_df: frame with columns user_id, item_id, time, label.
        item2idx: raw item id -> model index.
        cox_map: item id -> {"T_i0": ..., "grv_map": ...}, as read by get_grv.
        gamma: weight of the GRV term.
        K: length of the recommendation list.
        device: device the index tensor is created on.
        new_threshold: an item counts as "new" when its T_i0 >= this value.

    Returns:
        (hr, ndcg, coverage, new_item_coverage); users without any valid
        candidate are skipped but still counted in the per-user denominators.
    """
    test_df = test_df.copy()
    test_df["time_hr"] = (test_df["time"] // 3600).astype(int)

    grouped = test_df.groupby("user_id")
    coverage_items = set()
    new_item_hits = 0  # Number of users who received at least one new item

    hits_at_k = 0
    ndcg_at_k = 0
    total_positives = 0
    all_users = list(grouped.groups.keys())
    all_item_ids = list(item2idx.keys())
    # Sampling without replacement raises when asked for more items than exist.
    n_random = min(50, len(all_item_ids))

    rng = np.random.default_rng(0)

    # Eval mode is loop-invariant; set it once.
    model.eval()

    for user_id in tqdm(all_users, desc="Evaluating GRU4Rec with GRV"):
        g = grouped.get_group(user_id)
        t_hr = g["time_hr"].min()
        pos_items = g[g["label"] == 1]["item_id"].unique()
        sampled = rng.choice(all_item_ids, size=n_random, replace=False)
        candidate_items = np.unique(np.concatenate([pos_items, sampled]))

        valid_candidates = [(it, item2idx[it]) for it in candidate_items if it in item2idx]
        if not valid_candidates:
            continue

        item_ids = [it for it, _ in valid_candidates]
        item_indices = [idx for _, idx in valid_candidates]

        # Compute model scores
        with torch.no_grad():
            item_tensor = torch.tensor(item_indices, dtype=torch.long, device=device)
            logits = model(item_tensor.unsqueeze(0))
        # The model emits one logit per vocabulary index. Pick the logit that
        # belongs to each candidate; pairing the flattened logits with the
        # candidates positionally would score candidate j with item index j.
        base_scores = logits[0, item_tensor].cpu().numpy()

        # Blend the model score with the GRV
        final_scores = [
            (1 - gamma) * base_score + gamma * get_grv(cox_map, it, t_hr)
            for it, base_score in zip(item_ids, base_scores)
        ]

        # Select top-K items
        top_indices = np.argsort(-np.array(final_scores))[:K]
        top_items = [item_ids[i] for i in top_indices]
        # Candidates are unique, so every recommended item has exactly one rank.
        rank_of = {it: rank for rank, it in enumerate(top_items)}

        # Coverage
        coverage_items.update(top_items)

        # New item coverage: count users who received at least one new item
        if any(it in cox_map and cox_map[it]["T_i0"] >= new_threshold for it in top_items):
            new_item_hits += 1

        # HR
        total_positives += len(pos_items)
        hit_ranks = [rank_of[pos_it] for pos_it in pos_items if pos_it in rank_of]
        hits_at_k += len(hit_ranks)

        # NDCG: the ideal list can place at most K positives, so IDCG is
        # capped at K; otherwise a perfect list scores below 1.
        dcg = sum(1.0 / np.log2(rank + 2) for rank in hit_ranks)
        idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(pos_items), K)))
        ndcg_at_k += dcg / idcg if idcg > 0 else 0

    # Normalize metrics; empty inputs give 0 instead of ZeroDivisionError
    n_users = len(all_users)
    hr = hits_at_k / total_positives if total_positives > 0 else 0
    ndcg = ndcg_at_k / n_users if n_users > 0 else 0
    coverage = len(coverage_items) / len(item2idx) if len(item2idx) > 0 else 0
    new_item_coverage = new_item_hits / n_users if n_users > 0 else 0

    return hr, ndcg, coverage, new_item_coverage


#######################################################
# 5) Main Experiment for GRU4Rec with GRV
#######################################################

def gru4rec(gamma_val):
    """
    Train GRU4Rec on train.csv and print HR/NDCG/coverage at K=10 on test.csv.

    The item index covers every item seen in train/dev/test. The model is
    trained for 3 epochs with Adam (lr=1e-3) and cross-entropy loss, then the
    test candidates are ranked with
        score = (1 - gamma_val) * model + gamma_val * GRV.

    Args:
        gamma_val: weight of the GRV term at ranking time (0 = model only).
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    train_df, val_df, test_df = load_mind_data_with_neg("train.csv", "dev.csv", "test.csv")
    item2idx = {i: idx for idx, i in
                enumerate(pd.concat([train_df["item_id"], val_df["item_id"], test_df["item_id"]]).unique())}

    # The loader expands negative samples into label-0 rows. A session must
    # contain only items the user actually interacted with, so keep the
    # positives (a no-op when the training file has no negatives).
    train_pos = train_df[train_df["label"] == 1]
    train_ds = SessionDataset(train_pos, item2idx)
    train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)

    model = GRU4Rec(len(item2idx)).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    for ep in range(3):
        train_one_epoch_gru4rec(model, train_loader, optimizer, loss_fn, device)

    # Same spelling as the other experiments ("ItemHourLog.csv"); the previous
    # lower-case "itemHourLog.csv" is a different file on case-sensitive
    # file systems.
    cox_map = load_cox_data_and_survival("./cox_output/cox_data.csv", "./cox_output/cox_survival.csv",
                                         "./output/ItemHourLog.csv")

    hr, ndcg, cov, newcov = evaluate_gru4rec_ranking(model, test_df, item2idx, cox_map, gamma=gamma_val, K=10, device=device)

    print(f"[RESULT] HR@10={hr:.4f}, NDCG@10={ndcg:.4f}, coverage@10={cov:.4f}, new_item_coverage@10={newcov:.4f}")

In [None]:
# Baseline run: gamma=0 gives the GRV term zero weight at ranking time.
gru4rec(0)

In [None]:
# GRV-blended run: final score = 0.7 * model score + 0.3 * GRV.
gru4rec(0.3)

#### 3.3) TiSASRec

In [None]:
#######################################################
# 1) Load GRV Data (Item Popularity Over Time)
#######################################################

def load_cox_data_and_survival(cox_data_csv, cox_survival_csv, itemHourLog_csv):
    """
    Assemble the item -> {T_i0, GRV-by-offset} lookup from the Cox outputs.

    T_i0 is read from the "T_i0" column of `cox_data_csv`; if that column is
    absent, the minimum "hour_offset" per item in `itemHourLog_csv` is used.
    The "GRV_t<offset>" columns of `cox_survival_csv` supply the GRV values,
    keyed by the integer after the last "t" in the column name.

    Returns: {item_id: {"T_i0": ..., "grv_map": {offset: value}}}, where an
    item missing from the T_i0 source gets T_i0 = 0.
    """
    cox_table = pd.read_csv(cox_data_csv)
    has_t0 = "T_i0" in cox_table.columns
    if has_t0:
        keys, values = cox_table["item_id"], cox_table["T_i0"]
    else:
        # Derive T_i0 as the first hour each item appears in the hour log.
        hour_table = pd.read_csv(itemHourLog_csv)
        first_hours = hour_table.groupby("item_id")["hour_offset"].min().reset_index()
        keys, values = first_hours["item_id"], first_hours["hour_offset"]
    t0_map = dict(zip(keys, values))

    surv_table = pd.read_csv(cox_survival_csv)
    grv_cols = list(filter(lambda name: name.startswith("GRV_t"), surv_table.columns))

    item_grv = {}
    for record in surv_table.itertuples(index=False):
        offsets = {}
        for col in grv_cols:
            offsets[int(col.split("t")[-1])] = getattr(record, col)
        item_grv[getattr(record, "item_id")] = offsets

    cox_map = {}
    for item in item_grv:
        cox_map[item] = {"T_i0": t0_map.get(item, 0), "grv_map": item_grv[item]}
    return cox_map


def get_grv(cox_map, item_id, current_hour, default_val=0.0):
    """
    Fetch the GRV of `item_id` for the hour `current_hour`.

    The key is the whole-hour offset since the item's T_i0, clamped to the
    smallest and largest offsets stored for the item.

    Returns `default_val` when the item is unknown, when it has no GRV
    entries, or when the clamped offset is not a stored key; returns 0.0
    when `current_hour` does not lie after T_i0.
    """
    try:
        info = cox_map[item_id]
    except KeyError:
        return default_val

    offset = int(current_hour - info["T_i0"])
    if offset <= 0:
        return 0.0

    grv_map = info["grv_map"]
    if len(grv_map) == 0:
        # Without any stored offset there is no range to clamp into
        # (sorting the keys and indexing [0] would raise IndexError).
        return default_val

    first_offset = min(grv_map)
    last_offset = max(grv_map)
    if offset < first_offset:
        offset = first_offset
    elif offset > last_offset:
        offset = last_offset
    return grv_map.get(offset, default_val)


#######################################################
# 2) TiSASRec Model (Time-Aware Self-Attention)
#######################################################

class TiSASRec(nn.Module):
    """
    Transformer-encoder next-item scorer with additive time features.

    Each position's representation is the sum of an item embedding, a
    learned position embedding and a linear projection of the scalar time
    value. The encoder output at the last position is passed through
    dropout and projected to one logit per item.
    """

    def __init__(self, num_items, emb_dim=16, num_heads=2, num_layers=2, max_seq_len=50, dropout=0.2):
        super().__init__()
        # Longest sequence the position table can index; forward() trims to it.
        self.max_seq_len = max_seq_len
        self.item_emb = nn.Embedding(num_items, emb_dim)
        self.pos_emb = nn.Embedding(max_seq_len, emb_dim)
        self.time_emb = nn.Linear(1, emb_dim)
        layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=num_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.fc = nn.Linear(emb_dim, num_items)
        self.dropout = nn.Dropout(dropout)
        for module in (self.item_emb, self.fc):
            nn.init.xavier_uniform_(module.weight)

    def forward(self, item_seq, time_seq):
        """Map (batch, seq_len) item indices and time values to (batch, num_items) logits."""
        _, seq_len = item_seq.shape
        if seq_len > self.max_seq_len:
            # Keep only the most recent max_seq_len positions of both inputs.
            recent = slice(-self.max_seq_len, None)
            item_seq, time_seq = item_seq[:, recent], time_seq[:, recent]

        positions = torch.arange(item_seq.shape[1], device=item_seq.device)
        positions = positions.unsqueeze(0).expand_as(item_seq)

        hidden = (
            self.item_emb(item_seq)
            + self.pos_emb(positions)
            + self.time_emb(time_seq.unsqueeze(-1))
        )
        encoded = self.encoder(hidden)
        return self.fc(self.dropout(encoded[:, -1, :]))


class TimeAwareSessionDataset(Dataset):
    """
    Sliding-window sessions per user, with time offsets.

    Each user's rows are ordered by `time`; every run of `session_length`
    consecutive rows becomes one sample if all of its items are in
    `item2idx`. The stored times are relative to the first row of the
    window. A sample is returned as
    (item indices without the last, time offsets without the last, last index).
    """

    def __init__(self, df, item2idx, session_length=5):
        item_windows = []
        time_windows = []
        for _, user_rows in df.groupby("user_id"):
            ordered = user_rows.sort_values("time")
            user_items = ordered["item_id"].values
            user_times = ordered["time"].values
            # Users with fewer than session_length rows give an empty range.
            for start in range(len(user_items) - session_length + 1):
                stop = start + session_length
                window = user_items[start:stop]
                offsets = user_times[start:stop] - user_times[start]
                if all(it in item2idx for it in window):
                    item_windows.append([item2idx[it] for it in window])
                    time_windows.append(offsets)
        self.sessions = np.array(item_windows)
        self.time_diffs = np.array(time_windows)

    def __len__(self):
        return len(self.sessions)

    def __getitem__(self, idx):
        window = self.sessions[idx]
        offsets = self.time_diffs[idx]
        history = torch.tensor(window[:-1], dtype=torch.long)
        history_times = torch.tensor(offsets[:-1], dtype=torch.float)
        target = torch.tensor(window[-1], dtype=torch.long)
        return history, history_times, target


#######################################################
# 3) Training TiSASRec
#######################################################

def train_one_epoch_tisasrec(model, loader, optimizer, loss_fn, device):
    """
    Run one optimisation pass over `loader`, which yields
    (item_seq, time_seq, target) batches.

    Returns the loss averaged per sample (batch losses weighted by batch
    size, divided by len(loader.dataset)).
    """
    model.train()
    running = 0
    for batch in loader:
        items, times, labels = (part.to(device) for part in batch)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(items, times), labels)
        batch_loss.backward()
        optimizer.step()

        running += batch_loss.item() * items.size(0)
    return running / len(loader.dataset)


#######################################################
# 4) GRV-Based Ranking for TiSASRec
#######################################################

def evaluate_tisasrec_ranking(model, test_df, item2idx, cox_map, gamma=0.3, K=10, device="cpu"):
    """
    Evaluate TiSASRec using HR@K, NDCG@K, Coverage@K and New Item Coverage@K.

    For every user in `test_df` a candidate list is built from the user's
    positive items (label == 1) plus up to 50 items drawn at random from the
    items of `test_df` (fixed seed 0). The model is run once per user with
    all-zero time inputs and each candidate is scored as
        (1 - gamma) * logit_of_that_candidate + gamma * GRV
    with the GRV looked up at the hour of the user's earliest test row.

    NOTE(review): the sequence fed to the model is the candidate list itself
    (which contains the positives), not the user's interaction history. No
    history is available through this interface, so that is left as it was.

    NOTE(review): new-item coverage here is the average share of top-K slots
    filled by items with T_i0 >= 100, whereas evaluate_ranking and
    evaluate_gru4rec_ranking report the share of users with at least one
    such item. The two figures are not directly comparable -- confirm which
    definition is intended.

    Args:
        model: called as model(index_tensor (1, L), time_tensor (1, L));
            must return logits of shape (1, num_items) and expose
            `max_seq_len`.
        test_df: frame with columns user_id, item_id, time, label.
        item2idx: raw item id -> model index.
        cox_map: item id -> {"T_i0": ..., "grv_map": ...}, as read by get_grv.
        gamma: weight of the GRV term.
        K: length of the recommendation list.
        device: device the input tensors are created on.

    Returns:
        (hr, ndcg, coverage, newcov); users without any valid candidate are
        skipped but still counted in the per-user denominators.
    """
    test_df = test_df.copy()
    test_df["time_hr"] = (test_df["time"] // 3600).astype(int)
    grouped = test_df.groupby("user_id")
    coverage_items, newcov = set(), 0
    hits_at_k, ndcg_at_k, total_positives = 0, 0, 0
    all_users = list(grouped.groups.keys())
    all_item_ids = test_df["item_id"].unique()
    # Sampling without replacement raises when asked for more items than exist.
    n_random = min(50, len(all_item_ids))
    rng = np.random.default_rng(0)

    # Without eval mode the dropout layers stay active and the scores (and
    # therefore the metrics) change from run to run.
    model.eval()

    for user_id in tqdm(all_users, desc="Evaluating TiSASRec with GRV"):
        g = grouped.get_group(user_id)
        t_hr = g["time_hr"].min()
        pos_items = g[g["label"] == 1]["item_id"].unique()
        sampled = rng.choice(all_item_ids, size=n_random, replace=False)
        candidate_items = np.unique(np.concatenate([pos_items, sampled]))

        # Filter ids and indices together so that position j of one always
        # refers to the same item as position j of the other.
        known = np.array([it in item2idx for it in candidate_items], dtype=bool)
        item_ids = candidate_items[known]
        if len(item_ids) == 0:
            continue
        item_indices = [item2idx[it] for it in item_ids]

        with torch.no_grad():
            item_tensor = torch.tensor(item_indices, dtype=torch.long, device=device)
            # The model can only index max_seq_len positions; trimming affects
            # the input sequence only, every candidate is still scored below.
            inputs = item_tensor[-model.max_seq_len:].unsqueeze(0)
            time_inputs = torch.zeros_like(inputs, dtype=torch.float)
            logits = model(inputs, time_inputs)
            # One logit per vocabulary index: pick the logit that belongs to
            # each candidate instead of pairing them positionally.
            base_scores = logits[0, item_tensor].cpu().numpy()

        final_scores = [
            (1 - gamma) * base_score + gamma * get_grv(cox_map, it, t_hr)
            for it, base_score in zip(item_ids, base_scores)
        ]
        top_items = item_ids[np.argsort(-np.array(final_scores))[:K]]
        # Candidates are unique, so every recommended item has exactly one rank.
        rank_of = {it: rank for rank, it in enumerate(top_items)}

        # Coverage
        coverage_items.update(top_items)

        # New item coverage: proportion of new items recommended
        new_item_count = sum(1 for it in top_items if it in cox_map and cox_map[it]["T_i0"] >= 100)
        newcov += new_item_count / K

        # HR
        total_positives += len(pos_items)
        hit_ranks = [rank_of[pos_it] for pos_it in pos_items if pos_it in rank_of]
        hits_at_k += len(hit_ranks)

        # NDCG: divide the DCG by the ideal DCG (capped at K positives);
        # the raw DCG sum alone is not normalised.
        dcg = sum(1.0 / np.log2(rank + 2) for rank in hit_ranks)
        idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(pos_items), K)))
        ndcg_at_k += dcg / idcg if idcg > 0 else 0

    # Normalize metrics; empty inputs give 0 instead of ZeroDivisionError
    n_users = len(all_users)
    hr = hits_at_k / total_positives if total_positives > 0 else 0
    ndcg = ndcg_at_k / n_users if n_users > 0 else 0
    newcov = newcov / n_users if n_users > 0 else 0
    coverage = len(coverage_items) / len(item2idx) if len(item2idx) > 0 else 0

    return hr, ndcg, coverage, newcov


#######################################################
# 5) Main Experiment for TiSASRec with GRV
#######################################################

def tisas4rec(gamma):
    """
    Train TiSASRec on train.csv and print HR/NDCG/coverage at K=10 on test.csv.

    The item index covers every item seen in train/dev/test. The model is
    trained for 3 epochs with Adam (lr=1e-3) and cross-entropy loss, then the
    test candidates are ranked with score = (1 - gamma) * model + gamma * GRV.

    Args:
        gamma: weight of the GRV term at ranking time (0 = model only).

    Raises:
        KeyError: if the loaded training frame has no 'item_id' column.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load data using the correct function
    train_df, val_df, test_df = load_mind_data_with_neg("train.csv", "dev.csv", "test.csv")

    # Debug: Print column names for verification
    print(f"[DEBUG] Columns in train.csv: {train_df.columns}")

    # Ensure 'item_id' column exists
    if 'item_id' not in train_df.columns:
        raise KeyError("Column 'item_id' is missing from train.csv. Check the column names.")

    # Build item2idx mapping
    all_items = pd.concat([train_df["item_id"], val_df["item_id"], test_df["item_id"]]).unique()
    item2idx = {i: idx for idx, i in enumerate(all_items)}

    # Load GRV data
    cox_map = load_cox_data_and_survival(
        "cox_output/cox_data.csv",
        "cox_output/cox_survival.csv",
        "output/ItemHourLog.csv"
    )

    # Build training dataset and loader. The loader expands negative samples
    # into label-0 rows; a session must contain only items the user actually
    # interacted with, so keep the positives (a no-op when the training file
    # has no negatives).
    train_pos = train_df[train_df["label"] == 1]
    train_ds = TimeAwareSessionDataset(train_pos, item2idx)
    train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)

    # Define model
    model = TiSASRec(len(item2idx)).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    # Train the model
    for ep in range(3):
        train_one_epoch_tisasrec(model, train_loader, optimizer, loss_fn, device)

    # Evaluate with GRV-based ranking
    hr, ndcg, cov, newcov = evaluate_tisasrec_ranking(
        model, test_df, item2idx, cox_map, gamma=gamma, K=10, device=device
    )

    print(f"[RESULT] HR@10={hr:.4f}, NDCG@10={ndcg:.4f}, coverage@10={cov}, new_item_coverage@10={newcov}")


In [None]:
# Baseline run: gamma=0 gives the GRV term zero weight at ranking time.
tisas4rec(0)

In [None]:
# GRV-blended run: final score = 0.7 * model score + 0.3 * GRV.
tisas4rec(0.3)