In [186]:
import torch.nn as nn
import torch
import pandas as pd
import copy
import random
import numpy as np

In [187]:
# Sequence length used when padding/truncating user histories; it is passed to
# padding() further down and is also the default `max_len` of train_model().
max_len = 50
# Embedding width.
# NOTE(review): no later cell visible here reads `embed_dim` (train_model has its
# own `embedding_dim=256` default) — confirm whether this constant is still needed.
embed_dim = 256


In [188]:
def set_seed(seed=42):
    """Seed every random number generator this notebook draws from.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch with
    the same value, and additionally seeds the MPS backend when it is
    available.

    Args:
        seed: Integer seed shared by all generators.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    mps_available = torch.backends.mps.is_available()
    if mps_available:
        torch.mps.manual_seed(seed)


# Seed once, with the default value, before any data preparation runs.
set_seed()

In [189]:
# Load the review table that every later cell is derived from.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of the CSV.
df = pd.read_csv('/Users/baonguyen/IU/thesis/data/clean_data/data_with_bertopic_column.csv')



In [None]:
def _fill_missing_with_mode(column):
    """Fill the missing entries of one column with that column's first mode.

    When the column has no mode at all (`mode()` comes back empty) the column
    is filled with itself, which leaves it unchanged.
    """
    modes = column.mode()
    replacement = column if modes.empty else modes[0]
    return column.fillna(replacement)


# Impute column by column, then print the dtype / non-null summary.
df = df.apply(_fill_missing_with_mode, axis=0)
df.info()

In [191]:
# Parse the review timestamps (this overwrites the column on `df` itself).
df['review_date'] = pd.to_datetime(df['review_date'])

# Chronologically ordered copy whose space-separated column names are replaced
# by identifier-friendly ones.
df_sorted = df.sort_values('review_date').rename(
    columns={
        'rented for': 'rented_for',
        'body type': 'body_type',
        'bust size': 'bust_size',
    }
)

In [None]:
# Display the column index of the sorted frame as a quick check of the renamed columns.
df_sorted.columns

In [193]:
# Columns carried alongside `item_id` in every user sequence. Each one gets its
# own value<->index maps below, and the functions further down (mask_sequence,
# padding, evaluate_model) read this list as a module-level global.
side_feature =  ['rented_for','bust_size','Topic']

In [194]:
# item to index and vice versa
def _build_index_maps(values):
    """Build 1-based vocabulary maps for one column.

    Indices follow the order in which values first appear in `values`, so the
    assignment depends only on the data. Enumerating a `set` instead would make
    the indices of string-valued columns change between interpreter sessions
    (string hashing is randomised per process), which silently invalidates any
    checkpoint trained against an earlier mapping.

    Index 0 is never assigned here; mask_sequence() and padding() in this
    notebook both write 0 into sequences.

    Args:
        values: Iterable of hashable raw values (e.g. a DataFrame column).

    Returns:
        (unique_values, value_to_index, index_to_value) where `unique_values`
        is a set and the two dicts are inverse of each other.
    """
    ordered = list(dict.fromkeys(values))  # de-duplicate, keep first-seen order
    value_to_index = {value: idx for idx, value in enumerate(ordered, start=1)}
    index_to_value = {idx: value for value, idx in value_to_index.items()}
    return set(ordered), value_to_index, index_to_value


unique_item_id, item_to_index, index_to_item = _build_index_maps(df_sorted['item_id'])

# Later cells look these maps up by name through globals() (`unique_<feat>`,
# `<feat>_to_index`, `index_to_<feat>`), so the same names are published here —
# written explicitly rather than through exec() on generated source strings.
for _feat in side_feature:
    _unique, _to_index, _from_index = _build_index_maps(df_sorted[_feat])
    globals()[f'unique_{_feat}'] = _unique
    globals()[f'{_feat}_to_index'] = _to_index
    globals()[f'index_to_{_feat}'] = _from_index

In [195]:
# Collect, per user, one list for `item_id` and one for each side feature
# (groupby keeps the row order of `df_sorted` inside every group), and keep
# only the users whose history holds at least two items.
user_item_sequence = {
    user_id: history
    for user_id, history in (
        df_sorted.groupby('user_id')[['item_id', *side_feature]]
        .agg(list)
        .to_dict(orient='index')
        .items()
    )
    if len(history['item_id']) > 1
}


In [196]:
def _encode_history(history):
    """Translate one user's raw value lists into their integer indices.

    `item_id` goes through `item_to_index`; every side feature goes through the
    module-level `<feature>_to_index` dict, looked up by name.
    """
    encoded = {'item_id': [item_to_index[raw] for raw in history['item_id']]}
    for feat in side_feature:
        lookup = globals()[f"{feat}_to_index"]
        encoded[feat] = [lookup[raw] for raw in history[feat]]
    return encoded


# Same users and same keys as `user_item_sequence`, with indices instead of raw values.
user_item_to_index_sequence = {
    user: _encode_history(history)
    for user, history in user_item_sequence.items()
}


In [197]:


def mask_sequence(sequence: dict, mask_ratio: float, features=None):
    """Randomly mask positions of every user sequence for masked-item training.

    Each position is masked independently with probability `mask_ratio`
    (one `random.random()` draw per position, in iteration order). A masked
    position gets 0 written into `item_id` and into every listed feature, and
    its original item index is recorded as the label. Unmasked positions get
    the label -100. The input sequences are not modified.

    Args:
        sequence: dict of user -> {'item_id': [...], <feature>: [...], ...}.
        mask_ratio: Probability of masking each position.
        features: Names of the per-position feature lists to mask together
            with `item_id`. Defaults to the module-level `side_feature` list.

    Returns:
        (mask_seq, labels): `mask_seq` mirrors `sequence` with masked positions
        set to 0; `labels` maps user -> list of original item indices at masked
        positions and -100 everywhere else.
    """
    if features is None:
        # Backward-compatible default: the notebook-wide feature list.
        features = side_feature

    labels = {}
    mask_seq = {}
    for user, seq in sequence.items():
        masked = copy.deepcopy(seq)  # Deep copy so the caller's lists stay untouched
        user_labels = [-100] * len(seq['item_id'])
        for pos, item in enumerate(seq['item_id']):
            if random.random() < mask_ratio:
                user_labels[pos] = item          # Remember the original item index
                masked['item_id'][pos] = 0       # 0 marks a masked item
                for feature in features:
                    masked[feature][pos] = 0     # Hide the side information as well
        mask_seq[user] = masked
        labels[user] = user_labels

    return mask_seq, labels


In [198]:

# Mask every user's full history once at module level with a 35% ratio.
# NOTE(review): train_model() builds its own masked sequences internally, and no
# later cell visible here reads these two names apart from the padding call that
# follows — confirm whether this exploratory call is still needed (it also
# advances the global `random` state).
mask_seq , labels = mask_sequence(user_item_to_index_sequence,mask_ratio=0.35)


In [199]:
def padding(mask_seq, labels, max_len=64, pad_item = 0, pad_label=-100, features=None):
    """
    Bring every user sequence in mask_seq and labels to exactly max_len entries.

    Shorter sequences are padded on the right; longer ones keep only their
    last max_len entries. The inputs are not modified.

    Args:
        mask_seq: dict of user_id -> {'item_id': [...], <feature>: [...], ...}
        labels: dict of user_id -> [...]
        max_len: desired length after padding
        pad_item: value for padding 'item_id' and every feature list
        pad_label: value for padding labels
        features: names of the feature lists to pad next to 'item_id';
            defaults to the module-level `side_feature` list

    Returns:
        padded_mask_seq, padded_labels (dicts keyed like mask_seq)
    """
    if features is None:
        # Backward-compatible default: the notebook-wide feature list.
        features = side_feature

    def pad(seq, max_len, pad_value):
        missing = max_len - len(seq)
        if missing > 0:
            return seq + [pad_value] * missing
        # Already long enough: keep the most recent max_len entries.
        return seq[len(seq) - max_len:]

    padded_mask_seq = {}
    padded_labels = {}

    for user, user_seq in mask_seq.items():
        padded_user = {'item_id': pad(user_seq['item_id'], max_len, pad_item)}
        for feat in features:
            padded_user[feat] = pad(user_seq[feat], max_len, pad_item)
        padded_mask_seq[user] = padded_user
        padded_labels[user] = pad(labels[user], max_len, pad_label)

    return padded_mask_seq, padded_labels


In [200]:
# Pad/truncate the module-level masked sequences to the global `max_len`.
# NOTE(review): train_model() pads its own data internally; no later cell visible
# here reads these two module-level results — confirm whether they are still needed.
padded_mask_seq,padded_labels = padding(mask_seq,labels,max_len=max_len)

In [201]:
# import numpy as np
# class SinusoidalPositionalEncoding(nn.Module):
#     def __init__(self, hidden_size, max_len=5000):
#         super(SinusoidalPositionalEncoding, self).__init__()
#         position = torch.arange(0, max_len).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, hidden_size, 2) * -(np.log(10000.0) / hidden_size))
#         pe = torch.zeros(max_len, hidden_size)
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0)
#         self.register_buffer('pe', pe)
#     def forward(self, x):
#         seq_len = x.size(1)
#         return self.pe[:, :seq_len, :]


In [202]:
# tensor_item_ids = torch.stack([
#     torch.tensor(user_seq['item_id']) for user_seq in padded_mask_seq.values()
# ])
# tensor_topic_ids = torch.stack([
#     torch.tensor(user_seq['Topic']) for user_seq in padded_mask_seq.values()
# ])
# tensor_labels = torch.stack([
#     torch.tensor(seq) for seq in padded_labels.values()
#     ])
# train_dataset = torch.utils.data.TensorDataset(
#     tensor_item_ids,
#     tensor_topic_ids,
#     tensor_labels
# )
# train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size=32,shuffle=True)

In [203]:
class differentiable_attn_mask(nn.Module):
    """Placeholder module: it defines no parameters and no forward pass yet.

    The constructor must run `nn.Module.__init__`; the one-argument form
    `super(differentiable_attn_mask).__init__()` does not, which leaves the
    instance without the internal registries (`_parameters`, `_modules`, ...)
    that every `nn.Module` method relies on.
    """

    def __init__(self):
        super().__init__()

# nova bert architecture

In [204]:
class GatingFusor(nn.Module):
    """Fuse several feature embeddings through one learned sigmoid gate.

    A single weight vector of shape (h, 1) scores every feature embedding; the
    sigmoid of that score scales the embedding, and the scaled embeddings are
    summed over the feature axis (dim 2).
    """

    def __init__(self, h):
        """Create the gate vector for embeddings of width `h`, drawn from N(0, 1)."""
        super().__init__()
        initial_weight = torch.randn(h, 1)
        self.weight = nn.Parameter(initial_weight)

    def forward(self, features):
        """Return the gated sum of `features` over dim 2.

        NovabertCrossAttention passes a tensor stacked on dim 2, i.e.
        (batch, seq_len, num_features, h); the result drops that axis.
        """
        gate_logits = torch.matmul(features, self.weight)  # one scalar per feature embedding
        gated = torch.sigmoid(gate_logits) * features
        return gated.sum(dim=2)

In [205]:
class NovabertEmbedding(nn.Module):
    """Embedding tables for items, side features and positions.

    Every table reserves one extra row (`num + 1`) so that index 0, which the
    vocabulary maps in this notebook never assign, is a valid lookup.
    """

    def __init__(self,num_item,num_side_feature_ids:dict,embedding_dim,max_len=64):
        """
        Args:
            num_item: Number of distinct items (indices 1..num_item).
            num_side_feature_ids: dict of feature name -> number of distinct values.
            embedding_dim: Width of every embedding vector.
            max_len: Number of learnable position embeddings.
        """
        super().__init__()
        self.item_embedding = nn.Embedding(num_item + 1, embedding_dim)
        self.side_embedding_ = nn.ModuleDict({
            feat_name: nn.Embedding(num_feat + 1, embedding_dim)
            for feat_name, num_feat in num_side_feature_ids.items()
        })
        self.position_encoding = nn.Embedding(max_len, embedding_dim)

    def forward(self, item_ids, side_feature_ids: dict):
        """Embed items and side features, adding the position embedding to each.

        Args:
            item_ids: Integer tensor indexed as (batch, seq_len).
            side_feature_ids: dict of feature name -> integer tensor shaped like
                `item_ids`; must hold every feature given to the constructor.

        Returns:
            (item_embed, side_emb_list): the item embedding and a list with one
            embedding per side feature, in the constructor's feature order.
        """
        seq_len = item_ids.size(1)
        positions = torch.arange(seq_len, dtype=torch.long, device=item_ids.device)
        pos_embed = self.position_encoding(positions.unsqueeze(0).expand_as(item_ids))

        item_embed = self.item_embedding(item_ids) + pos_embed
        side_emb_list = [
            table(side_feature_ids[feat_name]) + pos_embed
            for feat_name, table in self.side_embedding_.items()
        ]
        return item_embed, side_emb_list


In [206]:
class NovabertCrossAttention(nn.Module):
    """Multi-head attention whose queries/keys come from fused features.

    The item embedding and all side-feature embeddings are fused by a
    GatingFusor; queries and keys are projections of that fused tensor, while
    values are a projection of the item embedding alone.
    """

    def __init__(self,embedding_dim,num_heads=8):
        """
        Args:
            embedding_dim: Width of the incoming embeddings.
            num_heads: Number of attention heads; must divide `embedding_dim`.

        Raises:
            ValueError: if `num_heads` is not positive or does not divide
                `embedding_dim` (otherwise the per-head `view` in forward()
                would fail later with an opaque shape error).
        """
        super(NovabertCrossAttention,self).__init__()
        if num_heads <= 0 or embedding_dim % num_heads != 0:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must be divisible by a positive num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads
        self.value_proj = nn.Linear(embedding_dim,embedding_dim)
        self.query_proj = nn.Linear(embedding_dim,embedding_dim)
        self.key_proj = nn.Linear(embedding_dim,embedding_dim)
        self.fusor = GatingFusor(h=embedding_dim)

        # NOTE(review): an empty nn.Sequential returns its input unchanged, so
        # there is currently no output projection after the heads are merged —
        # confirm whether a Linear layer was intended here. Left as-is so that
        # existing checkpoints keep loading.
        self.output_proj = nn.Sequential(

        )

    def forward(self,item_embed,side_feature_embed:list,attn_mask=None,key_padding_mask=None):
        """
        Args:
            item_embed: Tensor unpacked as (batch, seq_len, embedding_dim).
            side_feature_embed: Sequence of tensors shaped like `item_embed`.
            attn_mask: Optional tensor added to the attention scores after a
                leading axis is inserted — presumably an additive (seq_len,
                seq_len) mask shared by all batches and heads; TODO confirm.
            key_padding_mask: Optional boolean tensor indexed as
                (batch, seq_len); True marks key positions to ignore.

        Returns:
            Tensor of shape (batch, seq_len, embedding_dim).
        """
        batch_size , sequence_len , embedding_dim = item_embed.size()

        def reshape(x:torch.tensor):
            # (B, L, E) -> (B, H, L, D)
            return x.view(batch_size,sequence_len,self.num_heads,self.head_dim).transpose(1,2)

        # Stack item + side features on a new axis 2, then gate-and-sum them.
        features = torch.stack([item_embed] + list(side_feature_embed), dim=2)
        fused_features = self.fusor(features)

        V = reshape(self.value_proj(item_embed))
        Q = reshape(self.query_proj(fused_features))
        K = reshape(self.key_proj(fused_features))
        scores = torch.matmul(Q,K.transpose(-2,-1)) / np.sqrt(self.head_dim) # B,H,L,L

        if attn_mask is not None:
            # Out-of-place add: same values as `+=`, without mutating a tensor
            # that autograd has already recorded.
            scores = scores + attn_mask.unsqueeze(0)
        if key_padding_mask is not None:
            key_padding_mask = key_padding_mask.unsqueeze(1).unsqueeze(2) # B,1,1,L
            scores = scores.masked_fill(key_padding_mask,float('-inf'))

        attn_weights = torch.softmax(scores,dim=-1)
        # A row whose keys are all masked is all -inf and softmax yields NaN;
        # treat such rows as attending to nothing.
        attn_weights = torch.nan_to_num(attn_weights, nan=0.0)
        attn_output = torch.matmul(attn_weights,V)

        # Merge the heads back: (B, H, L, D) -> (B, L, E)
        attn_output = attn_output.transpose(1,2).contiguous().view(batch_size,sequence_len,embedding_dim)
        return self.output_proj(attn_output)

            

In [207]:
class NovabertLayer(nn.Module):
    """One encoder block: cross-attention, then a feed-forward network.

    Each sub-layer is wrapped in dropout, a residual connection and a
    LayerNorm applied after the addition.
    """

    def __init__(self,embedding_dim,num_heads):
        """
        Args:
            embedding_dim: Width of the hidden representation.
            num_heads: Number of attention heads for the cross-attention.
        """
        super().__init__()
        hidden_dim = embedding_dim * 4
        self.cross_attn = NovabertCrossAttention(embedding_dim, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, embedding_dim),
        )
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, id_embed,side_feature_embed:list , attention_mask=None,key_padding_mask = None):
        """Run attention and feed-forward over `id_embed`.

        `side_feature_embed`, `attention_mask` and `key_padding_mask` are
        handed to the cross-attention unchanged.
        """
        attended = self.cross_attn(id_embed, side_feature_embed, attention_mask, key_padding_mask)
        hidden = self.norm1(id_embed + self.dropout(attended))
        transformed = self.ffn(hidden)
        return self.norm2(hidden + self.dropout(transformed))

In [208]:
class NovabertModel(nn.Module):
    """Embedding layer, a stack of NovabertLayer blocks and an item classifier.

    The classifier emits `num_items + 1` logits per position, one per item
    index plus index 0.
    """

    def __init__(self, num_items,num_side_feature_ids:dict, embedding_dim, max_len=64, num_layers=4, num_heads=8):
        """
        Args:
            num_items: Number of distinct items.
            num_side_feature_ids: dict of feature name -> number of distinct values.
            embedding_dim: Width of the embeddings and hidden states.
            max_len: Number of position embeddings.
            num_layers: Number of stacked NovabertLayer blocks.
            num_heads: Attention heads per block.
        """
        super().__init__()
        self.embedding = NovabertEmbedding(
            num_item=num_items,
            num_side_feature_ids=num_side_feature_ids,
            embedding_dim=embedding_dim,
            max_len=max_len,
        )
        blocks = [NovabertLayer(embedding_dim, num_heads) for _ in range(num_layers)]
        self.nova_layer = nn.ModuleList(blocks)
        self.output_layer = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(embedding_dim, num_items + 1),
        )

    def forward(self, item_ids, side_feature_ids: dict, attention_mask=None, key_padding_mask=None):
        """Return per-position item logits.

        The side-feature embeddings are computed once and the same list is fed
        to every layer; only the item representation is updated layer by layer.
        """
        hidden, side_embeds = self.embedding(item_ids, side_feature_ids)
        for block in self.nova_layer:
            hidden = block(hidden, side_embeds, attention_mask, key_padding_mask)
        return self.output_layer(hidden)



In [209]:
def hit_ratio(ground_truth:list,prediction:list,k:int):
    """Fraction of cases whose true item is among the first k predictions.

    Args:
        ground_truth: One true item per evaluated case.
        prediction: One ranked list of predicted items per case, aligned with
            `ground_truth`.
        k: Only the first k entries of each ranked list are considered.

    Returns:
        hits / len(ground_truth), or 0.0 when `ground_truth` is empty (instead
        of raising ZeroDivisionError). The hit and total counts are printed.
    """
    total = len(ground_truth)
    hits = sum(
        1
        for gt_item, pred in zip(ground_truth, prediction)
        if gt_item in pred[:k]
    )
    print(f'{hits=},{total=}')
    if total == 0:
        return 0.0
    return hits / total

# -------------------------
def evaluate_model(model, val_item_sequences, k=10, max_len=64,  index_to_item=None, device='mps'):
    """
    Leave-one-out evaluation: hit ratio@k for the last item of every sequence.

    For each user the last item is the target and the items before it are the
    input. The input always ends with one slot holding id 0 (the id that
    mask_sequence() writes for masked items), and the prediction is read at
    that slot. The side features to feed are taken from the module-level
    `side_feature` list.

    Args:
        model: The trained model; called as
            model(item_ids, side_feature_dict, key_padding_mask=...).
        val_item_sequences: Dict of user_id -> {'item_id': [...], <feat1>: [...], ...}
            holding integer indices.
        k: Top-k for hit ratio.
        max_len: Sequence length for padding.
        index_to_item: Dict mapping item indices back to original item ids.
            When None, hits are computed on the indices themselves.
        device: Device to run the model on.

    Returns:
        Hit ratio@k.
    """
    features = side_feature or []
    model.eval()
    ground_truths = []
    predictions = []

    with torch.no_grad():
        for user, seq in val_item_sequences.items():
            item_seq = seq['item_id']
            target_item = item_seq[-1]

            # Keep at most max_len - 1 history items so that position
            # `history_len` is always a 0-filled slot. Without this cap a
            # history of max_len or more items fills the whole window and the
            # prediction would be read at a visible item instead.
            history_len = min(len(item_seq) - 1, max_len - 1)
            history = slice(len(item_seq) - 1 - history_len, len(item_seq) - 1)

            # Pad input sequence (handle all side features)
            mask_seq = {user: {'item_id': item_seq[history]}}
            for feat in features:
                mask_seq[user][feat] = seq[feat][history]
            padded_seq, _ = padding(
                mask_seq=mask_seq,
                labels={user: []},
                max_len=max_len
            )

            # Batch of one: item ids plus one tensor per side feature.
            item_tensor = torch.tensor([padded_seq[user]['item_id']], dtype=torch.long).to(device)
            side_input_dict = {
                feat: torch.tensor([padded_seq[user][feat]], dtype=torch.long).to(device)
                for feat in features
            }
            key_padding_mask = (item_tensor == 0)

            logits = model(item_tensor, side_input_dict, key_padding_mask=key_padding_mask)[:, history_len, :]

            # Class 0 is not an item (the index maps start at 1), so it is
            # excluded from the ranking; otherwise it could enter the top-k and
            # fail the index_to_item lookup. Ranking raw logits gives the same
            # order as ranking their softmax.
            candidate_logits = logits[:, 1:]
            top_k = min(k, candidate_logits.size(-1))
            topk = (torch.topk(candidate_logits, k=top_k).indices[0] + 1).tolist()

            if index_to_item is None:
                ground_truths.append(target_item)
                predictions.append(topk)
            else:
                ground_truths.append(index_to_item[target_item])
                predictions.append([index_to_item[i] for i in topk])
    return hit_ratio(ground_truths, predictions, k)


In [210]:
import torch
import torch.optim as optim
from tqdm import tqdm
import os

def train_model(
    train_users, val_users, fold_num,
    user_item_to_index_sequence, side_feature, unique_item_id,
    embedding_dim=256, max_len=max_len, num_layers=2, num_heads=4, mask_ratio=0.6,
    num_epochs=5, batch_size=64, lr=0.001
):
    """Train a NovabertModel on one fold and validate it after every epoch.

    The training sequences are masked and padded once, before the first epoch,
    so every epoch sees the same masked positions. After each epoch HR@10 is
    computed on the validation users, and the weights are written to
    `models_item_with_novabert_gatingfusor_/fold_<fold_num>/best_model.pth`
    whenever that score improves.

    Args:
        train_users: Iterable of user ids to train on.
        val_users: Iterable of user ids to validate on.
        fold_num: Fold number, used in progress output and in the save path.
        user_item_to_index_sequence: dict of user -> {'item_id': [...], <feat>: [...]}
            with integer indices.
        side_feature: Names of the side features fed to the model. Their
            vocabulary sizes are read from the module-level `unique_<feat>` sets.
        unique_item_id: Collection of distinct items; only its length is used.
        embedding_dim, max_len, num_layers, num_heads: Model hyper-parameters.
        mask_ratio: Per-position masking probability for the training data.
        num_epochs: Number of training epochs.
        batch_size: Mini-batch size of the training DataLoader.
        lr: AdamW learning rate.

    Returns:
        (model, best_hr): the model as it is after the LAST epoch (the best
        epoch's weights are only on disk) and the best validation HR@10.
    """
    # Sets make the membership tests below O(1); testing against the incoming
    # lists is quadratic in the number of users.
    train_user_set = set(train_users)
    val_user_set = set(val_users)

    train_user_item_to_index_sequence = {
        user: seq for user, seq in user_item_to_index_sequence.items() if user in train_user_set
    }
    # The validation subset does not change between epochs, so build it once.
    val_item_sequences = {
        user: seq for user, seq in user_item_to_index_sequence.items() if user in val_user_set
    }

    mask_seq, labels = mask_sequence(train_user_item_to_index_sequence, mask_ratio=mask_ratio)
    padded_mask_seq, padded_labels = padding(mask_seq, labels, max_len=max_len)

    # One (num_users, max_len) tensor for items, for each side feature, and for labels.
    tensor_item_ids = torch.stack([
        torch.tensor(user_seq['item_id']) for user_seq in padded_mask_seq.values()
    ])
    tensor_side_feats = {
        feat: torch.stack([
            torch.tensor(user_seq[feat]) for user_seq in padded_mask_seq.values()
        ])
        for feat in side_feature
    }
    tensor_labels = torch.stack([
        torch.tensor(seq) for seq in padded_labels.values()
    ])

    # Dataset column order: items, side features (in `side_feature` order), labels.
    train_dataset = torch.utils.data.TensorDataset(
        tensor_item_ids,
        *(tensor_side_feats[feat] for feat in side_feature),
        tensor_labels
    )
    train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
    device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')

    num_classes = len(unique_item_id) + 1
    num_side_feature_ids = {feat : len(globals()[f'unique_{feat}']) for feat in side_feature}
    model = NovabertModel(len(unique_item_id),num_side_feature_ids=num_side_feature_ids,embedding_dim=embedding_dim,max_len=max_len,num_layers=num_layers,num_heads=num_heads).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    # Positions labelled -100 (unmasked or padding) do not contribute to the loss.
    criterion = torch.nn.CrossEntropyLoss(ignore_index=-100)
    best_hr = 0

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Fold {fold_num} Epoch {epoch+1}", unit="batch"):
            item_ids = batch[0].to(device)
            side_ids = {feat: batch[i+1].to(device) for i, feat in enumerate(side_feature)}
            batch_labels = batch[-1].to(device)

            # Id 0 marks both masked and padded positions; they are ignored as keys.
            key_padding_mask = (item_ids == 0)
            optimizer.zero_grad()
            outputs = model(item_ids, side_ids, key_padding_mask=key_padding_mask)
            loss = criterion(outputs.view(-1, num_classes), batch_labels.view(-1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        # The printed loss is the sum of the batch losses, not their mean.
        print(f"Epoch {epoch+1}, Loss {epoch_loss:.4f}")

        model.eval()
        val_hr = evaluate_model(
            model,
            val_item_sequences,
            k=10,
            max_len=max_len,
            index_to_item=index_to_item,
            device=device
        )
        print(f"Fold {fold_num} Epoch {epoch+1}, Validation HR@10: {val_hr:.4f}")

        if val_hr > best_hr:
            best_hr = val_hr
            save_path = f"models_item_with_novabert_gatingfusor_/fold_{fold_num}"
            os.makedirs(save_path, exist_ok=True)
            torch.save(model.state_dict(), f"{save_path}/best_model.pth")
    return model, best_hr


In [None]:
from sklearn.model_selection import KFold
import os
import torch

# 5-fold cross-validation over users: every user's whole history lands either in
# the training split or in the validation split of a fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
user_list = list(user_item_sequence)
fold_results = {}

for fold_num, (train_idx, val_idx) in enumerate(kf.split(user_list), start=1):
    print(f"\nStarting Fold {fold_num}...")
    train_users = [user_list[pos] for pos in train_idx]
    val_users = [user_list[pos] for pos in val_idx]

    # Train on this fold; `val_hr` is the best validation HR@10 seen during training.
    model, val_hr = train_model(
        train_users=train_users,
        val_users=val_users,
        fold_num=fold_num,
        user_item_to_index_sequence=user_item_to_index_sequence,
        side_feature=side_feature,
        unique_item_id=unique_item_id,
    )
    fold_results[fold_num] = val_hr

    # Persist the per-fold score.
    save_path = f"results_item_with_novabert_gatingfusor_/fold_{fold_num}"
    os.makedirs(save_path, exist_ok=True)
    with open(f"{save_path}/results.txt", "w") as f:
        f.write(f"Validation HR@10: {val_hr}\n")

    # Drop the model and release cached accelerator memory before the next fold.
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif hasattr(torch, 'mps') and torch.backends.mps.is_available():
        torch.mps.empty_cache()

# Summary file: one line per fold, then the mean across folds.
with open("results_item_with_novabert_gatingfusor_/overall_results.txt", "w") as f:
    f.writelines(f"Fold {fold}: HR@10 = {hr}\n" for fold, hr in fold_results.items())
    mean_hr = sum(fold_results.values()) / len(fold_results)
    f.write(f"\nMean HR@10 across folds: {mean_hr}\n")

print(f"\nMean HR@10 across folds: {mean_hr:.4f}")
