In [104]:
import torch.nn as nn
import torch
import pandas as pd
import copy
import random
import numpy as np

In [105]:
# Fixed sequence length: passed to padding() when building training/evaluation
# inputs and to NovabertModel(max_len=...) to size its position embedding table.
max_len = 50
# Presumably the embedding width intended for NovabertModel.
# NOTE(review): verify that the model construction in train_model uses the same value.
embed_dim = 256


In [106]:
def set_seed(seed=42):
    """Seed every random number generator this notebook draws from.

    Args:
        seed: integer seed handed to Python's ``random``, NumPy and PyTorch
            (plus the MPS generator when that backend is available).
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    mps_ready = torch.backends.mps.is_available()
    if mps_ready:
        torch.mps.manual_seed(seed)


# Seed once with the default so the cells below start from a fixed RNG state.
set_seed()

In [107]:
# Load the cleaned review table. Later cells read its 'user_id', 'item_id',
# 'Topic' and 'review_date' columns.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is made configurable.
df = pd.read_csv('/Users/baonguyen/IU/thesis/data/clean_data/data_with_bertopic_column.csv')
# Disabled step: fill missing 'age' values with the column mode.
# df['age'].fillna(df['age'].mode()[0], inplace=True)


In [108]:
# Convert the review timestamps to datetimes (this overwrites the column on `df`
# itself) so that the sort below orders rows chronologically.
df['review_date']=pd.to_datetime(df['review_date'])
# Chronologically ordered copy of the table; the per-user item/topic lists built
# in the following cells are derived from this frame.
df_sorted = df.sort_values('review_date')

In [109]:
# Item <-> index and topic <-> index lookup tables.
#
# Index 0 is left unassigned on purpose: the notebook uses 0 as the padding /
# mask id, so real ids start at 1.
#
# Indices are assigned in order of first appearance in `df_sorted` rather than by
# iterating a set. Set iteration order is not guaranteed to be the same from one
# interpreter session to the next (str hashes are randomised per process), which
# would silently change which embedding row belongs to which item between runs
# and make saved checkpoints unusable after a kernel restart.

# item to index and vice versa
unique_item_id = set(df_sorted['item_id'])  # kept as a set; len() sizes the model vocabulary
item_to_index = {
    item: idx
    for idx, item in enumerate(dict.fromkeys(df_sorted['item_id']), start=1)
}
index_to_item = {idx: item for item, idx in item_to_index.items()}

# topic to index and vice versa
unique_Topic = set(df_sorted['Topic'])
topic_to_index = {
    topic: idx
    for idx, topic in enumerate(dict.fromkeys(df_sorted['Topic']), start=1)
}
index_to_topic = {idx: topic for topic, idx in topic_to_index.items()}


In [110]:
# Step 1: collect, per user, the list of item ids and the list of topics in the
# row order of df_sorted.
user_item_sequence = (
    df_sorted
    .groupby('user_id')[['item_id', 'Topic']]
    .agg(list)
    .to_dict(orient='index')
)

# Step 2: keep only users that have at least 2 items.
user_item_sequence = dict(
    filter(
        lambda entry: len(entry[1]['item_id']) >= 2,
        user_item_sequence.items(),
    )
)


In [111]:
# Same per-user sequences as user_item_sequence, with raw item ids and topics
# replaced by their integer indices.
user_item_to_index_sequence = {
    uid: {
        'item_id': list(map(item_to_index.__getitem__, raw['item_id'])),
        'Topic': list(map(topic_to_index.__getitem__, raw['Topic'])),
    }
    for uid, raw in user_item_sequence.items()
}


In [112]:


def mask_sequence(sequence: dict, mask_ratio: float):
    """Randomly mask positions of every user's sequence.

    Each position is masked independently when ``random.random() < mask_ratio``.
    A masked position has both its item id and its topic replaced by 0, and the
    original item id is recorded as the label for that position. Positions that
    are not masked get the label -100.

    Args:
        sequence: dict of user -> {'item_id': [...], 'Topic': [...]}.
        mask_ratio: per-position masking probability.

    Returns:
        (mask_seq, labels): deep-copied, masked sequences and a dict of
        user -> list of labels. The input ``sequence`` is not modified.
    """
    masked_by_user = {}
    targets_by_user = {}
    for uid, original in sequence.items():
        working = copy.deepcopy(original)  # never touch the caller's lists
        n_positions = len(working['item_id'])
        targets = [-100] * n_positions
        for pos in range(n_positions):
            if random.random() >= mask_ratio:
                continue  # position stays visible
            targets[pos] = working['item_id'][pos]  # remember the true item id
            working['item_id'][pos] = 0
            working['Topic'][pos] = 0
        masked_by_user[uid] = working
        targets_by_user[uid] = targets
    return masked_by_user, targets_by_user


In [113]:

# mask_seq , labels = mask_sequence(user_item_to_index_sequence,mask_ratio=0.35)


In [114]:
def padding(mask_seq, labels, max_len=64, pad_item=0, pad_topic=0, pad_label=-100):
    """
    Bring every user's item, topic and label list to exactly ``max_len`` entries.

    Lists shorter than ``max_len`` are extended on the right with the matching
    pad value; longer lists are cut down to their first ``max_len`` entries.

    Args:
        mask_seq: dict of user_id -> {'item_id': [...], 'Topic': [...]}
        labels: dict of user_id -> [...]
        max_len: desired length after padding
        pad_item: value for padding 'item_id'
        pad_topic: value for padding 'Topic'
        pad_label: value for padding labels

    Returns:
        padded_mask_seq, padded_labels (dicts keyed like ``mask_seq``)
    """
    def _fit(values, fill):
        shortfall = max_len - len(values)
        return values + [fill] * shortfall if shortfall > 0 else values[:max_len]

    fitted_seqs = {}
    fitted_labels = {}
    for uid, entry in mask_seq.items():
        fitted_seqs[uid] = {
            'item_id': _fit(entry['item_id'], pad_item),
            'Topic': _fit(entry['Topic'], pad_topic),
        }
        fitted_labels[uid] = _fit(labels[uid], pad_label)

    return fitted_seqs, fitted_labels


In [115]:
# padded_mask_seq,padded_labels = padding(mask_seq,labels,max_len=max_len)

In [116]:
# import numpy as np
# class SinusoidalPositionalEncoding(nn.Module):
#     def __init__(self, hidden_size, max_len=5000):
#         super(SinusoidalPositionalEncoding, self).__init__()
#         position = torch.arange(0, max_len).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, hidden_size, 2) * -(np.log(10000.0) / hidden_size))
#         pe = torch.zeros(max_len, hidden_size)
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0)
#         self.register_buffer('pe', pe)
#     def forward(self, x):
#         seq_len = x.size(1)
#         return self.pe[:, :seq_len, :]


In [117]:
# tensor_item_ids = torch.stack([
#     torch.tensor(user_seq['item_id']) for user_seq in padded_mask_seq.values()
# ])
# tensor_topic_ids = torch.stack([
#     torch.tensor(user_seq['Topic']) for user_seq in padded_mask_seq.values()
# ])
# tensor_labels = torch.stack([
#     torch.tensor(seq) for seq in padded_labels.values()
#     ])
# train_dataset = torch.utils.data.TensorDataset(
#     tensor_item_ids,
#     tensor_topic_ids,
#     tensor_labels
# )
# train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size=32,shuffle=True)

In [118]:
class differentiable_attn_mask(nn.Module):
    """Placeholder module for a learnable attention mask.

    It currently defines no parameters and no ``forward``; it only needs to be
    a correctly initialised ``nn.Module`` so that it can be extended later.
    """

    def __init__(self,):
        # The previous one-argument form `super(differentiable_attn_mask)` builds
        # an unbound super object, so nn.Module.__init__ was never executed and
        # the instance lacked the internal registries (_parameters, _modules, ...).
        super().__init__()

# nova bert architecture

In [119]:
class GatingFusor(nn.Module):
    """Fuse several feature vectors per position with a learned sigmoid gate.

    A single weight vector of shape [h, 1] scores each feature vector; the
    sigmoid of that score scales the vector, and the scaled vectors are summed.

    Args:
        h: size of the last (feature) dimension.
    """

    def __init__(self, h):
        super().__init__()
        initial_gate = torch.randn(h, 1)
        self.weight = nn.Parameter(initial_gate)

    def forward(self, features):  # [B, T, 2, h]
        """Return the gated sum over dim 2 of ``features`` -> [B, T, h]."""
        gate_logits = torch.matmul(features, self.weight)  # [B, T, 2, 1]
        gate_values = gate_logits.sigmoid()
        weighted = features * gate_values                  # gate broadcasts over h
        return weighted.sum(dim=2)                         # [B, T, h]

In [120]:
class NovabertCrossAttention(nn.Module):
    """Multi-head attention whose queries and keys come from fused item+topic
    features while the values come from the item embeddings alone.

    Args:
        embedding_dim: width of the incoming embeddings; split evenly across heads.
        num_heads: number of attention heads.
    """

    def __init__(self, embedding_dim, num_heads=8):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads
        # Sub-modules are created in this exact order so that seeded parameter
        # initialisation stays the same.
        self.value_proj = nn.Linear(embedding_dim, embedding_dim)
        self.query_proj = nn.Linear(embedding_dim, embedding_dim)
        self.key_proj = nn.Linear(embedding_dim, embedding_dim)
        self.fusor = GatingFusor(h=embedding_dim)

        # NOTE(review): an empty Sequential passes its input through unchanged,
        # so no output projection is applied at the moment - confirm intended.
        self.output_proj = nn.Sequential()

    def _split_heads(self, tensor):
        """[B, L, E] -> [B, H, L, D] with H = num_heads and D = head_dim."""
        n_batch, n_steps, _ = tensor.size()
        return tensor.view(n_batch, n_steps, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, item_embed, topic_embed, attn_mask=None, key_padding_mask=None):
        """Attend over the sequence.

        Args:
            item_embed: item embeddings, unpacked as [B, L, E].
            topic_embed: topic embeddings stacked with ``item_embed`` (same shape).
            attn_mask: optional additive mask, added to the scores after a
                leading dimension is inserted.
            key_padding_mask: optional boolean mask, expanded to [B, 1, 1, L];
                True marks keys that must not be attended to.

        Returns:
            Tensor of shape [B, L, E].
        """
        n_batch, n_steps, width = item_embed.size()

        # Fuse item and topic features per position: [B, L, 2, E] -> [B, L, E].
        fused = self.fusor(torch.stack((item_embed, topic_embed), dim=2))

        values = self._split_heads(self.value_proj(item_embed))
        queries = self._split_heads(self.query_proj(fused))
        keys = self._split_heads(self.key_proj(fused))

        # Scaled dot-product scores: [B, H, L, L].
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / np.sqrt(self.head_dim)

        if attn_mask is not None:
            scores += attn_mask.unsqueeze(0)
        if key_padding_mask is not None:
            blocked_keys = key_padding_mask[:, None, None, :]  # [B, 1, 1, L]
            scores = scores.masked_fill(blocked_keys, float('-inf'))

        # A row whose keys are all blocked softmaxes to NaN; map those to 0.
        weights = torch.nan_to_num(torch.softmax(scores, dim=-1), nan=0.0)
        context = torch.matmul(weights, values)  # [B, H, L, D]

        # Merge the heads back: [B, H, L, D] -> [B, L, E].
        merged = context.transpose(1, 2).contiguous().view(n_batch, n_steps, width)
        return self.output_proj(merged)

            

In [121]:
class NovabertLayer(nn.Module):
    """One encoder layer: topic-guided attention followed by a feed-forward
    network, each wrapped in dropout, a residual connection and LayerNorm.

    Args:
        embedding_dim: width of the hidden representation.
        num_heads: number of heads for the attention sub-layer.
    """

    def __init__(self, embedding_dim, num_heads):
        super().__init__()
        hidden_dim = embedding_dim * 4
        self.cross_attn = NovabertCrossAttention(embedding_dim, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, embedding_dim),
        )
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, id_embed, topic_embed, attention_mask=None,key_padding_mask = None):
        """Return the updated item representation (same shape as ``id_embed``)."""
        attended = self.cross_attn(id_embed, topic_embed, attention_mask, key_padding_mask)
        # Residual around attention, then around the feed-forward block.
        after_attention = self.norm1(id_embed + self.dropout(attended))
        transformed = self.ffn(after_attention)
        return self.norm2(after_attention + self.dropout(transformed))

In [122]:
class NovabertModel(nn.Module):
    """Stack of NovabertLayer blocks over item, topic and position embeddings.

    Args:
        num_items: number of distinct items; the embedding table and the output
            layer both have ``num_items + 1`` rows/classes (one extra for index 0).
        num_topics: number of distinct topics; the table has ``num_topics + 1`` rows.
        embedding_dim: width of all embeddings and hidden states.
        max_len: number of learned positions; inputs longer than this cannot be
            embedded.
        num_layers: number of NovabertLayer blocks.
        num_heads: attention heads per layer.
    """

    def __init__(self, num_items, num_topics, embedding_dim, max_len=64, num_layers=4, num_heads=8):
        super(NovabertModel, self).__init__()
        self.item_embedding = nn.Embedding(num_items + 1, embedding_dim)
        self.topic_embedding = nn.Embedding(num_topics + 1, embedding_dim)
        self.position_encoding = nn.Embedding(max_len, embedding_dim)

        self.nova_layer = nn.ModuleList([
            NovabertLayer(embedding_dim, num_heads)
            for _ in range(num_layers)
        ])

        self.output_layer = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(embedding_dim, num_items + 1)
        )

    def forward(self, item_ids, topic_ids, attention_mask=None, key_padding_mask=None):
        """Return per-position item logits of shape [B, T, num_items + 1].

        The encoder pass is shared with ``get_hidden`` so the two entry points
        cannot drift apart.
        """
        hidden = self.get_hidden(
            item_ids,
            topic_ids,
            attention_mask=attention_mask,
            key_padding_mask=key_padding_mask,
        )
        return self.output_layer(hidden)

    def get_hidden(self, item_ids, topic_ids, attention_mask=None, key_padding_mask=None):
        """Return the final hidden states [B, T, D], before the output layer.

        Args:
            item_ids: integer tensor [B, T] of item indices.
            topic_ids: integer tensor [B, T] of topic indices.
            attention_mask: optional mask forwarded to every layer.
            key_padding_mask: optional mask forwarded to every layer.
        """
        x = self.item_embedding(item_ids)  # [B, T, D]
        y = self.topic_embedding(topic_ids)

        # Positions 0..T-1, repeated for every row of the batch.
        position_ids = torch.arange(item_ids.size(1), dtype=torch.long, device=item_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(item_ids)  # [B, T]
        pos_emb = self.position_encoding(position_ids)

        x = x + pos_emb
        y = y + pos_emb

        # Only the item stream is updated; every layer sees the same topic stream.
        for layer in self.nova_layer:
            x = layer(x, y, attention_mask, key_padding_mask)
        return x  # [B, T, D]



In [123]:
def hit_ratio(ground_truth:list,prediction:list,k:int):
    """Fraction of cases whose true item appears in the top-``k`` predictions.

    Args:
        ground_truth: list with one true item per case.
        prediction: list with one ranked list of predicted items per case,
            aligned with ``ground_truth``.
        k: number of leading predictions to consider.

    Returns:
        hits / len(ground_truth) as a float, or 0.0 when ``ground_truth`` is
        empty (previously a ZeroDivisionError).
    """
    total = len(ground_truth)
    if total == 0:
        return 0.0
    # The per-hit debug print was removed: it flooded the cell output on every
    # evaluation pass.
    hits = sum(
        1
        for gt_item, pred in zip(ground_truth, prediction)
        if gt_item in pred[:k]
    )
    return hits / total

# -------------------------
def evaluate_model(model, val_item_sequences, k=10):
    """Leave-one-out HR@k: predict each user's last item from the items before it.

    For every user the last item is held out as the target. The preceding
    history is padded with 0 and the logits are read at the first position
    after the history.

    Relies on the notebook-level names ``max_len``, ``padding``,
    ``index_to_item`` and ``hit_ratio``.

    Args:
        model: trained model, called as ``model(items, topics, key_padding_mask=...)``.
        val_item_sequences: dict of user -> {'item_id': [...], 'Topic': [...]}
            holding integer indices.
        k: number of recommendations per user.

    Returns:
        HR@k over all users in ``val_item_sequences``.
    """
    model.eval()
    # Run on whatever device the model lives on instead of assuming 'mps'.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Keep at most max_len - 1 history items so one slot is always free for the
    # position being predicted. Previously a history of max_len or more items was
    # cut to its *oldest* max_len entries and the prediction was read from a
    # position that still held a real item.
    history_len = max(max_len - 1, 0)

    ground_truths = []
    predictions = []

    with torch.no_grad():
        for user, seq in val_item_sequences.items():
            item_seq = seq['item_id']
            topic_seq = seq['Topic']

            # Prepare input and target: most recent history, last item held out.
            target_item = item_seq[-1]
            input_items = item_seq[:-1][-history_len:] if history_len else []
            input_topics = topic_seq[:-1][-history_len:] if history_len else []
            target_pos = len(input_items)  # first slot after the history

            # Reuse the notebook's padding utility to reach exactly max_len.
            padded_seq, _ = padding(
                mask_seq={user: {'item_id': input_items, 'Topic': input_topics}},
                labels={user: []},  # labels are not needed here
                max_len=max_len
            )

            item_tensor = torch.tensor([padded_seq[user]['item_id']], dtype=torch.long, device=device)
            topic_tensor = torch.tensor([padded_seq[user]['Topic']], dtype=torch.long, device=device)
            key_padding_mask = (item_tensor == 0)

            logits = model(item_tensor, topic_tensor, key_padding_mask=key_padding_mask)[:, target_pos, :].clone()
            # Index 0 is the pad/mask id and has no entry in index_to_item, so it
            # must never be recommended (it used to raise KeyError below).
            logits[:, 0] = float('-inf')

            # Softmax is monotonic, so ranking the raw logits gives the same top-k.
            top_n = min(k, logits.size(-1) - 1)
            topk = torch.topk(logits, k=top_n).indices[0].tolist()

            ground_truths.append(index_to_item[target_item])
            predictions.append([index_to_item[i] for i in topk])
    return hit_ratio(ground_truths, predictions, k)



In [124]:
def jacobian_frobenius_regularization(
    model, item_ids, topic_ids, key_padding_mask, lambda_jacobian=0.1, num_samples=1
):
    """Stochastic penalty on the Jacobian of the hidden states w.r.t. the item
    embeddings that occur in the batch.

    For each sample a random vector ``eta`` with the shape of the non-padded
    hidden states is drawn, the vector-Jacobian product with respect to the
    item embedding table is computed, and the squared norm of its rows for the
    items present in the batch is accumulated (divided by the number of
    non-padded positions).

    The earlier version differentiated with respect to a *fresh* embedding
    lookup made after the forward pass. That tensor is not part of the graph
    that produced the hidden states, so ``autograd.grad`` returned ``None`` and
    the penalty was always exactly 0. The gradient is now taken with respect to
    ``model.item_embedding.weight``, which the hidden states do depend on.

    Args:
        model: object exposing ``get_hidden(item_ids, topic_ids, key_padding_mask=...)``
            and an ``item_embedding`` module with a ``weight`` parameter.
        item_ids: integer tensor of item indices (0 = padding/mask).
        topic_ids: integer tensor of topic indices, same shape as ``item_ids``.
        key_padding_mask: boolean tensor, True where a position is padding.
        lambda_jacobian: multiplier applied to the penalty.
        num_samples: number of random probes to average over (>= 1).

    Returns:
        Scalar tensor ``lambda_jacobian * penalty``; a zero tensor when there is
        nothing to regularise.

    Raises:
        ValueError: if ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be >= 1")

    x = model.get_hidden(item_ids, topic_ids, key_padding_mask=key_padding_mask)  # [B, T, D]
    mask = ~key_padding_mask
    x_masked = x[mask]  # [num_valid, D]
    if x_masked.numel() == 0:
        return torch.tensor(0.0, device=x.device, requires_grad=True)

    # Items that actually occur at non-padded positions (pad id 0 removed).
    used_item_indices = torch.unique(item_ids[mask])
    used_item_indices = used_item_indices[used_item_indices != 0]
    if used_item_indices.numel() == 0:
        return torch.tensor(0.0, device=x.device, requires_grad=True)

    # Without a graph (e.g. under torch.no_grad()) there is nothing to differentiate.
    if not x_masked.requires_grad:
        return torch.tensor(0.0, device=x.device, requires_grad=True)

    embedding_weight = model.item_embedding.weight
    reg = x.new_zeros(())
    for _ in range(num_samples):
        eta = torch.randn_like(x_masked)
        (vjp,) = torch.autograd.grad(
            outputs=x_masked,
            inputs=embedding_weight,
            grad_outputs=eta,
            create_graph=True,   # the penalty itself must be differentiable
            retain_graph=True,   # the graph is reused for further samples
            allow_unused=True,
        )
        if vjp is not None:
            # Only rows of items present in this batch contribute.
            reg = reg + vjp[used_item_indices].pow(2).sum() / x_masked.size(0)
    reg = reg / num_samples
    return lambda_jacobian * reg


In [125]:
import torch.optim as optim
from tqdm import tqdm
import os 
def train_model(train_users,val_users,fold_num):
    """Train a NovabertModel on one fold and track the best validation HR@10.

    Training sequences are masked once (ratio 0.6), padded to ``max_len`` and
    trained for 5 epochs with cross-entropy on the masked positions plus the
    Jacobian regulariser. After each epoch the model is evaluated on the
    validation users; whenever HR@10 improves, the weights are written to
    ``models_item_with_novabert_gatingfusor_denoise/fold_<fold_num>/best_model.pth``.

    Relies on the notebook-level names ``user_item_to_index_sequence``,
    ``unique_item_id``, ``unique_Topic``, ``max_len`` and ``embed_dim``.

    Args:
        train_users: iterable of user ids used for training.
        val_users: iterable of user ids used for validation.
        fold_num: fold number, used in progress output and in the save path.

    Returns:
        (model, best_hr): the model after the last epoch and the best HR@10 seen.
    """
    # Sets make the membership tests below O(1) instead of scanning a list per user.
    train_user_set = set(train_users)
    val_user_set = set(val_users)

    train_user_item_to_index_sequence = {
        user: seq for user, seq in user_item_to_index_sequence.items() if user in train_user_set
    }
    # Built once; it does not change between epochs.
    val_item_sequences = {
        user: seq for user, seq in user_item_to_index_sequence.items() if user in val_user_set
    }

    mask_seq , labels = mask_sequence(train_user_item_to_index_sequence,mask_ratio=0.6)
    padded_mask_seq,padded_labels = padding(mask_seq,labels,max_len=max_len)

    tensor_item_ids = torch.stack([
        torch.tensor(user_seq['item_id']) for user_seq in padded_mask_seq.values()
    ])
    tensor_topic_ids = torch.stack([
        torch.tensor(user_seq['Topic']) for user_seq in padded_mask_seq.values()
    ])
    tensor_labels = torch.stack([
        torch.tensor(seq) for seq in padded_labels.values()
    ])

    train_dataset = torch.utils.data.TensorDataset(
        tensor_item_ids,
        tensor_topic_ids,
        tensor_labels
    )
    train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size=64,shuffle=True)

    # Prefer MPS (the original target), but fall back so the notebook also runs
    # on machines without it.
    use_mps = torch.backends.mps.is_available()
    if use_mps:
        device = 'mps'
    elif torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    model = NovabertModel(
        len(unique_item_id),
        len(unique_Topic),
        embedding_dim=embed_dim,
        max_len=max_len,
        num_layers=2,
        num_heads=4,
    ).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=-100)
    best_hr = 0
    for epoch in range(5):
        model.train()
        epoch_loss = 0
        for batch in tqdm(train_dataloader,desc=f"Fold {fold_num} Epoch {epoch+1}", unit="batch"):
            item_ids, topic_ids, batch_labels = (t.to(device) for t in batch)
            # Positions holding 0 (padding and masked items) are hidden as keys.
            key_padding_mask = (item_ids == 0)

            optimizer.zero_grad()
            outputs = model(item_ids,topic_ids,key_padding_mask=key_padding_mask)

            # Flatten [B, T, C] -> [B*T, C]; label -100 is ignored by the criterion.
            loss = criterion(outputs.view(-1, outputs.size(-1)), batch_labels.view(-1))
            jacobian_loss = jacobian_frobenius_regularization(model, item_ids, topic_ids,key_padding_mask)
            loss = loss + jacobian_loss
            loss.backward()
            if use_mps:
                torch.mps.empty_cache()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss {epoch_loss}")

        # evaluate model
        model.eval()
        val_hr = evaluate_model(model, val_item_sequences,k=10)
        print(f"Fold {fold_num} Epoch {epoch+1}, Validation HR@10: {val_hr}")
        if val_hr > best_hr:
            best_hr = val_hr
            save_path = f"models_item_with_novabert_gatingfusor_denoise/fold_{fold_num}"
            os.makedirs(save_path, exist_ok=True)
            torch.save(model.state_dict(), f"{save_path}/best_model.pth")
    return model, best_hr




In [None]:
from sklearn.model_selection import KFold
# 5-fold cross-validation over users: each fold trains a fresh model, records its
# best validation HR@10 and writes it to a per-fold results file; a summary with
# the mean across folds is written at the end.
kf = KFold(n_splits=5,shuffle=True,random_state=42)
user_list = list(user_item_sequence.keys())
fold_results = {}
results_root = "results_item_with_novabert_gatingfusor_denoise"
for fold_num , (train_idx,val_idx) in enumerate(kf.split(user_list),1):
    print(f"\nStarting Fold {fold_num}...")
    train_users = [user_list[i] for i in train_idx]
    val_users = [user_list[i] for i in val_idx]
    model,val_hr =  train_model(train_users, val_users, fold_num)
    fold_results[fold_num] = val_hr
    save_path = f"{results_root}/fold_{fold_num}"
    os.makedirs(save_path, exist_ok=True)
    with open(f"{save_path}/results.txt", "w") as f:
        f.write(f"Validation HR@10: {val_hr}\n")
    # Release the fold's model before the next one is built.
    del model
    if torch.backends.mps.is_available():
        # Only meaningful (and only safe to call) when the MPS backend is present.
        torch.mps.empty_cache()

# Do not rely on the loop above having created the directory.
os.makedirs(results_root, exist_ok=True)
with open(f"{results_root}/overall_results.txt", "w") as f:
    for fold, hr in fold_results.items():
        f.write(f"Fold {fold}: HR@10 = {hr}\n")
    mean_hr = sum(fold_results.values()) / len(fold_results)
    f.write(f"\nMean HR@10 across folds: {mean_hr}")

In [23]:
# from torchinfo import summary
# model = NovabertModel(len(unique_item_id),len(unique_Topic),embedding_dim=256,max_len=max_len,num_layers=2,num_heads=4)
# summary(model)