In [1]:
import torch.nn as nn
import torch
import pandas as pd
import copy
import random
import numpy as np

In [2]:
# Sequence length used by the padding() calls below: every user history is
# padded or truncated to this many positions.
max_len = 50
# Embedding width.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed or should feed the model's hidden size.
embed_dim = 256


In [3]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch.
    The MPS generator is seeded as well when that backend is available.

    Args:
        seed: Seed value handed to each generator (default 42).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    mps_ready = torch.backends.mps.is_available()
    if mps_ready:
        torch.mps.manual_seed(seed)


# Seed once at notebook start so the cells below are repeatable.
set_seed()

In [4]:
# Load the review data set into a DataFrame.
# NOTE(review): absolute, machine-specific path — the notebook cannot run on
# another machine until this is made configurable (e.g. a DATA_DIR constant).
df = pd.read_csv('/Users/baonguyen/IU/thesis/data/clean_data/data_with_bertopic_column.csv')

In [5]:
# Parse the review dates so rows can be ordered chronologically.
df['review_date']=pd.to_datetime(df['review_date'])
# Sorting by date here is what makes the per-user item lists built in the
# following cells time-ordered sequences.
# NOTE(review): presumably rows sharing a timestamp have no guaranteed relative
# order under the default sort kind — confirm, and pass a stable kind if it matters.
df_sorted = df.sort_values('review_date')

In [6]:
# Distinct item ids; later cells use len(unique_item_id) + 1 as the model's
# vocabulary size.
unique_item_id = set(df_sorted['item_id'])

# Index 0 is left free (it serves as the padding / mask value elsewhere in the
# notebook), so real items are numbered from 1.
# Indices are assigned in first-appearance order of the date-sorted frame
# instead of by iterating the set: for string ids, set iteration order depends
# on per-process hash randomisation, which would give a different item <-> index
# mapping after every kernel restart and make saved checkpoints unusable.
item_to_index = {
    item: idx
    for idx, item in enumerate(dict.fromkeys(df_sorted['item_id']), start=1)
}
# Exact inverse of item_to_index, derived from it so the two can never disagree.
index_to_item = {idx: item for item, idx in item_to_index.items()}

In [7]:
# Step 1: collect, per user, the list of item ids in the row order of
# df_sorted. Result shape: {user_id: {'item_id': [...]}}.
user_item_sequence = (
    df_sorted
    .groupby('user_id')[['item_id']]
    .agg(list)
    .to_dict(orient='index')
)

# Step 2: keep only users that have at least 2 item ids.
user_item_sequence = dict(
    (user, val)
    for user, val in user_item_sequence.items()
    if len(val['item_id']) >= 2
)


In [8]:
# Same structure as user_item_sequence, with every raw item id replaced by its
# integer index from item_to_index.
user_item_to_index_sequence = {
    user: {'item_id': [item_to_index[raw_id] for raw_id in entry['item_id']]}
    for user, entry in user_item_sequence.items()
}

In [9]:


def mask_sequence(sequence: dict, mask_ratio: float):
    """Randomly hide items in every user's sequence for masked-item training.

    Each position is masked independently: one ``random.random()`` draw is
    made per position and the position is masked when the draw is below
    ``mask_ratio``. A masked position has its item id replaced by 0 and its
    original id recorded as the label; every other position keeps its id and
    gets the label -100.

    Args:
        sequence: Mapping user -> dict that contains an ``'item_id'`` list.
        mask_ratio: Threshold compared against each random draw.

    Returns:
        Tuple ``(mask_seq, labels)``. ``mask_seq`` is a deep copy of
        ``sequence`` with masked ids set to 0; ``labels`` maps user -> list
        aligned with that user's ``'item_id'`` list. ``sequence`` itself is
        left unmodified.
    """
    masked_sequences = {}
    targets = {}
    for user, entry in sequence.items():
        # Work on a deep copy so the caller's data stays intact.
        entry_copy = copy.deepcopy(entry)
        item_ids = entry_copy['item_id']
        user_targets = [-100] * len(item_ids)
        for position, original_id in enumerate(item_ids):
            if random.random() < mask_ratio:
                user_targets[position] = original_id  # remember what was hidden
                item_ids[position] = 0                # 0 marks a masked slot
        masked_sequences[user] = entry_copy
        targets[user] = user_targets
    return masked_sequences, targets


In [10]:
def padding(mask_seq, labels, max_len=64, pad_item=0, pad_topic=0, pad_label=-100):
    """
    Bring every user's item sequence and label list to exactly max_len entries.

    Lists shorter than max_len are right-padded with the fill value; lists
    with max_len or more entries keep only their trailing max_len elements.

    Args:
        mask_seq: dict of user_id -> {'item_id': [...]}; only 'item_id' is read
        labels: dict of user_id -> [...]; must contain every user of mask_seq
        max_len: desired length after padding / truncation
        pad_item: fill value for 'item_id'
        pad_topic: accepted but not used by this implementation
        pad_label: fill value for labels

    Returns:
        padded_mask_seq, padded_labels (dicts keyed by user_id; the entries of
        padded_mask_seq contain the 'item_id' key only)
    """
    def fit(values, fill):
        missing = max_len - len(values)
        if missing > 0:
            return values + [fill] * missing
        # Already long enough: keep the tail of the list.
        return values[len(values) - max_len:]

    padded_mask_seq = {
        user: {'item_id': fit(entry['item_id'], pad_item)}
        for user, entry in mask_seq.items()
    }
    padded_labels = {user: fit(labels[user], pad_label) for user in mask_seq}

    return padded_mask_seq, padded_labels


# BERT architecture


In [11]:
class BertEmbeddings(nn.Module):
    """Token embeddings plus learned position embeddings, then LayerNorm and dropout.

    Args:
        vocab_size: Number of rows in the token embedding table.
        hidden_size: Embedding width.
        max_len: Number of learned positions, i.e. the longest supported sequence.
        dropout: Dropout probability applied to the normalised embeddings.
    """

    def __init__(self,vocab_size , hidden_size,max_len,dropout):
        super().__init__()
        self.max_len = max_len
        self.word_embeddings = nn.Embedding(vocab_size,hidden_size)
        self.position_encoding = nn.Embedding(max_len,hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size)
        self.Dropout = nn.Dropout(dropout)

    def forward(self,input_ids):
        """Embed a batch of token ids.

        Args:
            input_ids: Long tensor of shape (batch, seq_len), seq_len <= max_len.

        Returns:
            Tensor of shape (batch, seq_len, hidden_size).

        Raises:
            ValueError: If seq_len exceeds the number of learned positions.
        """
        seq_len = input_ids.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.max_len}"
            )
        # Build positions for the actual input length; using max_len here
        # would break on any batch that is shorter than max_len.
        position_ids = torch.arange(seq_len, dtype=torch.long, device=input_ids.device).unsqueeze(0)
        word_emb = self.word_embeddings(input_ids)
        pos_emb = self.position_encoding(position_ids)  # (1, seq_len, hidden), broadcast over batch
        embeddings = self.LayerNorm(word_emb + pos_emb)
        return self.Dropout(embeddings)

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class BertSdpaSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention (no output projection).

    Args:
        hidden_size: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``hidden_size`` is not a multiple of ``num_heads``.
    """

    def __init__(self, hidden_size=512, num_heads=8, dropout=0.1):
        super().__init__()
        # Fail early: with a non-divisible size the floor division below would
        # silently drop channels and forward() would die in .view() instead.
        if hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads

        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)
        self.attn_dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask=None,key_padding_mask=None):
        """Apply self-attention.

        Args:
            x: Tensor of shape (B, T, C).
            attention_mask: Optional additive mask, added to the (B, H, T, T)
                score tensor.
            key_padding_mask: Optional boolean tensor of shape (B, T); True
                marks key positions that must not be attended to.

        Returns:
            Context tensor of shape (B, T, C).
        """
        B, T, C = x.size()

        def split_heads(projected):
            # (B, T, C) -> (B, H, T, D)
            return projected.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

        q = split_heads(self.q_proj(x))
        k = split_heads(self.k_proj(x))
        v = split_heads(self.v_proj(x))

        # Scaled dot-product scores: (B, H, T, T)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            scores += attention_mask
        if key_padding_mask is not None:
            # (B, T) -> (B, 1, 1, T) so it broadcasts over heads and queries.
            key_padding_mask = key_padding_mask.unsqueeze(1).unsqueeze(2)
            scores = scores.masked_fill(key_padding_mask,float('-inf'))
        attn_weights = F.softmax(scores, dim=-1)
        # A row whose keys are all masked softmaxes to NaN; turn it into zeros
        # so such a sequence yields a zero context instead of poisoning the batch.
        attn_weights = torch.nan_to_num(attn_weights, nan=0.0)
        attn_weights = self.attn_dropout(attn_weights)

        context = torch.matmul(attn_weights, v)  # [B, H, T, D]
        context = context.transpose(1, 2).reshape(B, T, C)
        return context

class BertSelfOutput(nn.Module):
    """Post-attention block: linear projection, dropout, residual add, LayerNorm."""

    def __init__(self, hidden_size=512, dropout=0.1):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.LayerNorm = nn.LayerNorm(hidden_size)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states`` and normalise its sum with ``input_tensor``.

        Args:
            hidden_states: Tensor to be projected (last dim = hidden_size).
            input_tensor: Residual branch added before normalisation.

        Returns:
            LayerNorm(dropout(dense(hidden_states)) + input_tensor).
        """
        projected = self.dropout(self.dense(hidden_states))
        residual_sum = projected + input_tensor
        return self.LayerNorm(residual_sum)

class BertAttention(nn.Module):
    """Self-attention followed by its projection / residual / LayerNorm block."""

    def __init__(self, hidden_size=512, num_heads=8, dropout=0.1):
        super().__init__()
        self.self = BertSdpaSelfAttention(hidden_size, num_heads, dropout)
        self.output = BertSelfOutput(hidden_size, dropout)

    def forward(self, hidden_states, attention_mask=None,key_padding_mask=None):
        """Attend over ``hidden_states`` and merge the result back into them.

        Both masks are handed to the self-attention module unchanged; the
        un-attended ``hidden_states`` act as the residual input.
        """
        context = self.self(hidden_states, attention_mask, key_padding_mask)
        merged = self.output(context, hidden_states)
        return merged

class BertIntermediate(nn.Module):
    """Feed-forward expansion: linear layer to ``intermediate_size`` plus GELU."""

    def __init__(self, hidden_size=512, intermediate_size=3072):
        super().__init__()
        self.dense = nn.Linear(hidden_size, intermediate_size)
        self.activation = nn.GELU()

    def forward(self, hidden_states):
        """Return GELU(dense(hidden_states))."""
        expanded = self.dense(hidden_states)
        return self.activation(expanded)

class BertOutput(nn.Module):
    """Feed-forward contraction: project back to ``hidden_size``, dropout, residual, LayerNorm."""

    def __init__(self, intermediate_size=3072, hidden_size=512, dropout=0.1):
        super().__init__()
        self.dense = nn.Linear(intermediate_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.LayerNorm = nn.LayerNorm(hidden_size)

    def forward(self, hidden_states, input_tensor):
        """Return LayerNorm(dropout(dense(hidden_states)) + input_tensor).

        Args:
            hidden_states: Tensor whose last dim is intermediate_size.
            input_tensor: Residual branch whose last dim is hidden_size.
        """
        contracted = self.dropout(self.dense(hidden_states))
        residual_sum = contracted + input_tensor
        return self.LayerNorm(residual_sum)

class BertLayer(nn.Module):
    """One encoder layer: attention block followed by the feed-forward block."""

    def __init__(self, hidden_size=512, intermediate_size=3072, num_heads=8, dropout=0.1):
        super().__init__()
        self.attention = BertAttention(hidden_size, num_heads, dropout)
        self.intermediate = BertIntermediate(hidden_size, intermediate_size)
        self.output = BertOutput(intermediate_size, hidden_size, dropout)

    def forward(self, hidden_states, attention_mask=None,key_padding_mask=None):
        """Run attention, then the feed-forward pair with the attention output as residual."""
        attended = self.attention(hidden_states, attention_mask, key_padding_mask)
        expanded = self.intermediate(attended)
        return self.output(expanded, attended)


In [13]:
import torch
import torch.nn as nn

class BertEncoder(nn.Module):
    """Stack of ``num_layers`` identical BertLayer modules applied in sequence."""

    def __init__(self, num_layers=2, hidden_size=512, intermediate_size=3072, num_heads=8, dropout=0.1):
        super().__init__()
        stacked = []
        for _ in range(num_layers):
            stacked.append(BertLayer(hidden_size, intermediate_size, num_heads, dropout))
        self.layer = nn.ModuleList(stacked)

    def forward(self, hidden_states, attention_mask=None,key_padding_mask=None):
        """Feed ``hidden_states`` through every layer; the same masks go to each one."""
        current = hidden_states
        for block in self.layer:
            current = block(current, attention_mask, key_padding_mask)
        return current


In [14]:
import torch
import torch.nn as nn



class BertModel(nn.Module):
    """Embeddings -> encoder stack -> per-position linear head over the vocabulary.

    Args:
        vocab_size: Size of the token vocabulary (also the head's output width).
        hidden_size: Model width.
        intermediate_size: Width of the feed-forward expansion in each layer.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers.
        max_len: Number of learned positions in the embedding module.
        dropout: Dropout probability used throughout the model.
    """

    def __init__(self,
                 vocab_size=30522,
                 hidden_size=512,
                 intermediate_size=3072,
                 num_heads=8,
                 num_layers=2,
                 max_len=512,
                 dropout=0.1):
        super().__init__()
        self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_len, dropout=dropout)
        self.encoder = BertEncoder(num_layers, hidden_size, intermediate_size, num_heads, dropout)
        head_dropout = nn.Dropout(dropout)
        head_projection = nn.Linear(hidden_size, vocab_size)
        self.output_layer = nn.Sequential(head_dropout, head_projection)

    def forward(self, input_ids, attention_mask=None,key_padding_mask=None):
        """Return the head's output for every position of ``input_ids``.

        ``attention_mask`` and ``key_padding_mask`` are forwarded to the
        encoder unchanged.
        """
        hidden = self.embeddings(input_ids)
        hidden = self.encoder(hidden, attention_mask, key_padding_mask)
        return self.output_layer(hidden)


In [None]:
from torchinfo import summary
# Build a BertModel with its default hyper-parameters and print a torchinfo
# summary of it.
# NOTE(review): these defaults differ from the arguments train_model passes to
# BertModel, so the sizes reported here are not those of the trained model.
model = BertModel()
summary(model,depth=6)

In [16]:
def hit_ratio(ground_truth:list,prediction:list,k:int):
    """Fraction of cases whose true item appears among the first k predictions.

    Args:
        ground_truth: One true item per case.
        prediction: One ranked list of predicted items per case, aligned with
            ``ground_truth``.
        k: Cut-off; only the first ``k`` entries of each list are considered.

    Returns:
        hits / len(ground_truth), or 0.0 when ``ground_truth`` is empty
        (previously this raised ZeroDivisionError).
    """
    total = len(ground_truth)
    if total == 0:
        return 0.0
    hits = sum(
        1
        for gt_item, pred in zip(ground_truth, prediction)
        if gt_item in pred[:k]
    )
    return hits / total

# -------------------------
def evaluate_model(model, val_item_sequences, k=10):
    """Leave-one-out evaluation: hide each user's last item and report HR@k.

    For every user the last item of the sequence is the target and the items
    before it are the input. The input is padded with zeros to the global
    ``max_len`` and the prediction is read at the first zero slot after the
    history, which plays the role of the mask token the model was trained on.

    Relies on the notebook globals ``max_len``, ``padding``, ``index_to_item``
    and ``hit_ratio``.

    Args:
        model: Trained model; called as ``model(ids, key_padding_mask=...)``.
        val_item_sequences: dict user -> {'item_id': [item indices]}.
        k: Cut-off for the hit ratio.

    Returns:
        HR@k over all users, or 0.0 when ``val_item_sequences`` is empty.
    """
    model.eval()
    # Use the device the model already lives on rather than assuming 'mps'.
    device = next(model.parameters()).device
    # Always leave one trailing slot free for the mask token. Without this a
    # history of max_len or more items filled the whole window and the
    # prediction was read at a real-item position, which training never
    # supervises (labels are -100 everywhere except masked slots).
    history_len = max(max_len - 1, 0)

    ground_truths = []
    predictions = []

    with torch.no_grad():
        for user, seq in val_item_sequences.items():
            item_seq = seq['item_id']

            # Prepare input and target
            target_item = item_seq[-1]
            input_items = item_seq[:-1][-history_len:] if history_len else []

            # Reuse the notebook's padding utility to reach max_len.
            padded_seq, _ = padding(
                mask_seq={user: {'item_id': input_items}},
                labels={user: []},  # labels are not needed for evaluation
                max_len=max_len
            )
            padded_items = padded_seq[user]['item_id']

            item_tensor = torch.tensor([padded_items], dtype=torch.long).to(device)
            key_padding_mask = (item_tensor == 0)

            # First zero slot right after the history.
            predict_pos = len(input_items)
            logits = model(item_tensor, key_padding_mask=key_padding_mask)[:, predict_pos, :].clone()
            # Index 0 is the pad/mask id and has no entry in index_to_item;
            # exclude it so it can never reach the top-k and raise KeyError.
            logits[:, 0] = float('-inf')

            # Softmax is monotonic, so ranking raw logits gives the same top-k.
            topk = torch.topk(logits, k=k).indices[0].tolist()
            ground_truths.append(index_to_item[target_item])
            predictions.append([index_to_item[i] for i in topk])

    if not ground_truths:
        return 0.0
    return hit_ratio(ground_truths, predictions, k)



In [17]:
import torch.optim as optim
from tqdm import tqdm
import os 
def train_model(train_users,val_users,fold_num):
    """Train one cross-validation fold and track the best validation HR@10.

    Training sequences are masked once (mask_ratio=0.6), padded to the global
    ``max_len`` and fed to a freshly initialised BertModel for 5 epochs. After
    each epoch the model is evaluated on the validation users; whenever HR@10
    improves, the weights are written to
    ``models_item_with_bert/fold_{fold_num}/best_model.pth``.

    Relies on the notebook globals ``user_item_to_index_sequence``,
    ``unique_item_id``, ``max_len``, ``mask_sequence``, ``padding``,
    ``BertModel`` and ``evaluate_model``.

    Args:
        train_users: Iterable of user ids used for training.
        val_users: Iterable of user ids used for validation.
        fold_num: Fold number, used in progress output and the checkpoint path.

    Returns:
        (model, best_hr): the model with its last-epoch weights (the best
        weights are the ones saved to disk) and the best validation HR@10.
    """
    # Sets make the membership tests below O(1) instead of scanning a list
    # once per user.
    train_user_set = set(train_users)
    val_user_set = set(val_users)

    train_user_item_to_index_sequence = {
        user: seq
        for user, seq in user_item_to_index_sequence.items()
        if user in train_user_set
    }
    # The validation split does not change between epochs, so build it once.
    val_item_sequences = {
        user: seq
        for user, seq in user_item_to_index_sequence.items()
        if user in val_user_set
    }

    mask_seq , labels = mask_sequence(train_user_item_to_index_sequence,mask_ratio=0.6)
    padded_mask_seq,padded_labels = padding(mask_seq,labels,max_len=max_len)

    tensor_item_ids = torch.stack([
        torch.tensor(user_seq['item_id']) for user_seq in padded_mask_seq.values()
    ])
    tensor_labels = torch.stack([
        torch.tensor(seq) for seq in padded_labels.values()
    ])

    train_dataset = torch.utils.data.TensorDataset(tensor_item_ids, tensor_labels)
    train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size=64,shuffle=True)

    # Prefer MPS (the original target), but fall back so the notebook also
    # runs on machines without it.
    if torch.backends.mps.is_available():
        device = torch.device('mps')
    elif torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # +1 because index 0 is reserved for the pad/mask token.
    vocab_size = len(unique_item_id) + 1
    # max_len must match the padded sequence length, so take it from the same
    # global that padding() was given instead of repeating the literal.
    model = BertModel(vocab_size=vocab_size,hidden_size=256,intermediate_size=256*12,num_heads=4,num_layers=2,max_len=max_len).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=-100)
    best_hr = 0
    for epoch in range(5):
        model.train()
        epoch_loss = 0
        for batch in tqdm(train_dataloader,desc=f"Fold {fold_num} Epoch {epoch+1}", unit="batch"):
            item_ids, batch_labels = batch
            # A batch without a single masked position has nothing to learn
            # from and its mean loss is NaN, which would poison epoch_loss.
            if not (batch_labels != -100).any():
                continue
            item_ids, batch_labels = item_ids.to(device), batch_labels.to(device)
            key_padding_mask = (item_ids == 0)

            optimizer.zero_grad()
            outputs = model(item_ids,key_padding_mask=key_padding_mask)
            loss = criterion(outputs.view(-1, vocab_size), batch_labels.view(-1))
            loss.backward()
            if device.type == 'mps':
                torch.mps.empty_cache()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss {epoch_loss}")

        # Evaluate on the held-out users of this fold.
        model.eval()
        val_hr = evaluate_model(model, val_item_sequences,k=10)
        print(f"Fold {fold_num} Epoch {epoch+1}, Validation HR@10: {val_hr}")
        if val_hr > best_hr:
            best_hr = val_hr
            save_path = f"models_item_with_bert/fold_{fold_num}"
            os.makedirs(save_path, exist_ok=True)
            torch.save(model.state_dict(), f"{save_path}/best_model.pth")
    return model, best_hr




In [None]:
from sklearn.model_selection import KFold
# 5-fold cross-validation over users: each fold trains a fresh model on 4/5 of
# the users and validates on the rest; per-fold and overall HR@10 are written
# to text files under results_item_with_bert/.
set_seed()
kf = KFold(n_splits=5,shuffle=True,random_state=42)
user_list = list(user_item_sequence.keys())
fold_results = {}
for fold_num , (train_idx,val_idx) in enumerate(kf.split(user_list),1):
    print(f"\nStarting Fold {fold_num}...")
    train_users = [user_list[i] for i in train_idx]
    val_users = [user_list[i] for i in val_idx]
    model,val_hr =  train_model(train_users, val_users, fold_num)
    fold_results[fold_num] = val_hr
    save_path = f"results_item_with_bert/fold_{fold_num}"
    os.makedirs(save_path, exist_ok=True)
    with open(f"{save_path}/results.txt", "w") as f:
        f.write(f"Validation HR@10: {val_hr}\n")
    # Drop the fold's model before the next one is built.
    del model
    # Guarded the same way set_seed() guards its MPS call, so the loop does
    # not depend on the MPS backend being present.
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()
with open("results_item_with_bert/overall_results.txt", "w") as f:
    for fold, hr in fold_results.items():
        f.write(f"Fold {fold}: HR@10 = {hr}\n")
    mean_hr = sum(fold_results.values()) / len(fold_results)
    f.write(f"\nMean HR@10 across folds: {mean_hr}")