In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load dataset
file_path = r"C:\Users\user\Desktop\CW4\online+retail\Online Retail.xlsx"
df = pd.read_excel(file_path, sheet_name='Online Retail')

# Cleaning: keep only rows that have a customer and a description, and whose
# quantity is positive
df_cleaned = df.dropna(subset=['CustomerID', 'Description'])
df_cleaned = df_cleaned[df_cleaned['Quantity'] > 0]

# Order every customer's rows by invoice date so sequences are chronological
df_sorted = df_cleaned.sort_values(by=['CustomerID', 'InvoiceDate'])

# Get all unique descriptions for fitting
all_items = df_sorted['Description'].unique()

# Fit LabelEncoder on all possible items
item_encoder = LabelEncoder()
item_encoder.fit(all_items)

# Encode the ItemID column.
# LabelEncoder numbers its classes from 0, and pad_sequences below pads with 0
# as well. Shift every item ID up by one so that 0 is reserved for padding and
# no real product is mistaken for it. To map an ItemID back to a description
# use item_encoder.inverse_transform(item_ids - 1).
df_sorted['ItemID'] = item_encoder.transform(df_sorted['Description']) + 1

# Grouping by Customer
sequential_data = df_sorted.groupby('CustomerID')['ItemID'].apply(list).reset_index(name='ItemSequence')

# Minimum sequence length
min_sequence_length = 3
sequential_data = sequential_data[sequential_data['ItemSequence'].apply(len) >= min_sequence_length]

# Pad and create sequences: short histories are left-padded with 0, long ones
# keep only their most recent `sequence_length` items (truncating='pre').
item_sequences = sequential_data['ItemSequence'].tolist()
sequence_length = 30
padded_sequences = pad_sequences(
    item_sequences, maxlen=sequence_length, padding='pre', truncating='pre', value=0
)

def create_sequences(sequences, seq_length=30, pad_value=0):
    """Build (history, next-item) pairs from item sequences without leakage.

    Every input row holds exactly ``seq_length`` items and its target is the
    item that comes immediately after that row, so a target is never part of
    its own input.

    * Sequences longer than ``seq_length`` yield one sliding window for every
      position that still has a following item.
    * Sequences with 2..``seq_length`` items yield one sample: the last item
      is the target and the preceding items, left-padded with ``pad_value``,
      form the input.
    * Sequences with fewer than two items cannot form a pair and are skipped.

    Args:
        sequences: Iterable of 1-D item-ID sequences (lists or array rows).
        seq_length: Number of items in every input row.
        pad_value: Filler used to left-pad histories shorter than
            ``seq_length``.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` has one row of ``seq_length``
        items per sample, ``y`` holds the matching targets.
    """
    X, y = [], []
    for seq in sequences:
        seq = np.asarray(seq)
        if len(seq) < 2:
            continue  # no (history, target) pair can be formed

        if len(seq) <= seq_length:
            # Hold out the last item as the target and pad the rest on the
            # left so the row still has seq_length entries.
            history = seq[:-1]
            padding = np.full(seq_length - len(history), pad_value, dtype=seq.dtype)
            X.append(np.concatenate([padding, history]))
            y.append(seq[-1])
        else:
            # Stop one window early so every window has a true next item.
            for start in range(len(seq) - seq_length):
                X.append(seq[start:start + seq_length])
                y.append(seq[start + seq_length])
    return np.array(X), np.array(y)

X, y = create_sequences(padded_sequences, seq_length=sequence_length)

# Hold-out split: the leading 80% of the samples train, the remainder tests
split_ratio = 0.8
split_index = int(split_ratio * len(X))
X_train, y_train = X[:split_index], y[:split_index]
X_test, y_test = X[split_index:], y[split_index:]

# Vocabulary: every distinct ID that occurs in the inputs or the targets
unique_items = np.unique(np.concatenate((X.ravel(), y)))
unique_item_count = unique_items.size

# Re-fit an encoder on that vocabulary (this rebinds item_encoder)
item_encoder = LabelEncoder().fit(unique_items)

# Map the targets onto contiguous class indices
y_train, y_test = item_encoder.transform(y_train), item_encoder.transform(y_test)

# Sanity check: every class index must fit within the vocabulary size
assert y_train.max() < unique_item_count, "y_train contains labels outside the valid range!"
assert y_test.max() < unique_item_count, "y_test contains labels outside the valid range!"

print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")
print(f"Vocabulary Size: {unique_item_count}")


X_train shape: (3371, 30), X_test shape: (843, 30)
y_train shape: (3371,), y_test shape: (843,)
Vocabulary Size: 3463


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np
import pandas as pd
import time

# CPU Optimization
torch.set_num_threads(16)

# Reproducibility: the classifier head is newly initialised and the training
# DataLoader shuffles, so fix the seed before either is created.
seed = 42
torch.manual_seed(seed)

# Hyperparameters
batch_size = 32
epochs = 10
learning_rate = 5e-5
sequence_length = 30
k = 10  # cut-off for the top-k ranking metrics

# BERT Tokenizer and Model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=unique_item_count,  # one output class per item in the vocabulary
    ignore_mismatched_sizes=True
)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Prepare DataLoader
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``inputs`` must support ``inputs['input_ids'][idx]`` and
    ``inputs['attention_mask'][idx]``; ``labels`` is an indexable sequence of
    class indices of the same length.
    """

    def __init__(self, inputs, labels):
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the idx-th row of each encoding field, then attach the label
        # as a long tensor.
        sample = {field: self.inputs[field][idx] for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Prepare inputs for BERT
def prepare_inputs(X):
    """Render each sequence as one string of space-separated item values."""
    texts = []
    for row in X:
        texts.append(" ".join(str(item) for item in row))
    return texts

X_train_str = prepare_inputs(X_train)
X_test_str = prepare_inputs(X_test)

# Token budget. max_length counts tokens, not items: a row of
# `sequence_length` item IDs needs at least that many tokens plus [CLS] and
# [SEP], so a cap of `sequence_length` always truncates. Truncation removes
# tokens from the right by default, i.e. the most recent purchases. Use the
# model's own limit instead; padding=True still pads only to the longest row.
max_token_length = min(tokenizer.model_max_length, 512)

train_encodings = tokenizer(X_train_str, truncation=True, padding=True, max_length=max_token_length, return_tensors='pt')
test_encodings = tokenizer(X_test_str, truncation=True, padding=True, max_length=max_token_length, return_tensors='pt')

train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

# Only the training data is shuffled; evaluation keeps its original order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training Function with Epoch Display
def train_model(model, train_loader, optimizer, epochs=10):
    """Train ``model`` on CPU, printing a live progress line for every epoch.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object exposing ``.loss``.
        train_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress and the average loss per epoch are printed.
    """
    model.train()
    device = torch.device('cpu')  # Use CPU
    model.to(device)
    steps_per_epoch = len(train_loader)

    if steps_per_epoch == 0:
        # An empty loader would make the per-epoch average divide by zero.
        print("No training batches available; skipping training.")
        return

    for epoch in range(epochs):
        total_loss = 0
        start_time = time.time()
        print(f"\nEpoch {epoch + 1}/{epochs}")

        for step, batch in enumerate(train_loader, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            # The loss is read from the model's own output (labels are passed
            # in), so no separate loss function is needed.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Countdown Display: extrapolate from the mean step time so far
            elapsed_time = time.time() - start_time
            steps_remaining = steps_per_epoch - step
            time_per_step = elapsed_time / step  # step starts at 1
            estimated_time_remaining = steps_remaining * time_per_step

            print(
                f"\rEpoch {epoch + 1}/{epochs} - {step}/{steps_per_epoch} ━━━━━━ "
                f"{int(estimated_time_remaining)}s remaining - loss: {total_loss / step:.4f}", end=""
            )

        print(f"\nEpoch {epoch + 1}/{epochs} - Average Loss: {total_loss / steps_per_epoch:.4f}")

# Evaluation Function
def evaluate_model(model, test_loader, y_true, k=10):
    """Compute top-k ranking metrics on CPU for a single-label classifier.

    Each sample has exactly one relevant item (its label), so Recall@k equals
    the hit rate and Precision@k equals the hit rate divided by ``k``.

    Args:
        model: Module whose forward accepts ``input_ids`` and
            ``attention_mask`` and returns an object exposing ``.logits``
            with one row of class scores per sample.
        test_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        y_true: Ground-truth labels; only its length is used, as the
            denominator of every metric.
        k: Cut-off of the ranked list. If the model has fewer than ``k``
            classes, all classes are ranked instead of raising.

    Returns:
        Tuple ``(precision, recall, hit_rate, mrr)``; all zeros when
        ``y_true`` is empty.
    """
    model.eval()
    device = torch.device('cpu')
    model.to(device)
    total_hits, total_mrr = 0, 0.0
    total_users = len(y_true)

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # topk raises if asked for more entries than there are classes.
            top_k = min(k, logits.size(1))
            _, top_k_indices = torch.topk(logits, top_k, dim=1)

            # True where a ranked class equals the sample's label; indices in
            # a row are distinct, so each row has at most one match.
            matches = top_k_indices == labels.unsqueeze(1)
            hit_mask = matches.any(dim=1)
            # topk returns scores in descending order, so column + 1 is the rank.
            ranks = matches.int().argmax(dim=1) + 1

            total_hits += int(hit_mask.sum())
            total_mrr += float((1.0 / ranks[hit_mask].double()).sum())

    if total_users == 0:
        # Nothing to average over; avoid dividing by zero.
        precision = recall = hit_rate = mrr = 0.0
    else:
        hit_rate = total_hits / total_users
        recall = hit_rate  # one relevant item per sample
        precision = total_hits / k / total_users
        mrr = total_mrr / total_users

    print(f"Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}, Hit Rate: {hit_rate:.4f}, MRR: {mrr:.4f}")
    return precision, recall, hit_rate, mrr

# Train the Model
train_model(model, train_loader, optimizer, epochs=epochs)

# Evaluate the Model
precision, recall, hit_rate, mrr = evaluate_model(model, test_loader, y_test, k=k)

# Save Results. Column names are derived from k so they always match the
# cut-off that was actually evaluated (identical to the old names for k = 10).
results = pd.DataFrame([{
    f"Precision@{k}": precision,
    f"Recall@{k}": recall,
    "Hit Rate": hit_rate,
    "MRR": mrr
}])
results.to_csv(r"C:\Users\user\Desktop\CW4\online+retail\bert4rec_results.csv", index=False)
print("Results saved.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/10
Epoch 1/10 - 106/106 ━━━━━━ 0s remaining - loss: 7.6337
Epoch 1/10 - Average Loss: 7.6337

Epoch 2/10
Epoch 2/10 - 106/106 ━━━━━━ 0s remaining - loss: 6.8552
Epoch 2/10 - Average Loss: 6.8552

Epoch 3/10
Epoch 3/10 - 106/106 ━━━━━━ 0s remaining - loss: 6.7312
Epoch 3/10 - Average Loss: 6.7312

Epoch 4/10
Epoch 4/10 - 106/106 ━━━━━━ 0s remaining - loss: 6.6954
Epoch 4/10 - Average Loss: 6.6954

Epoch 5/10
Epoch 5/10 - 106/106 ━━━━━━ 0s remaining - loss: 6.6672
Epoch 5/10 - Average Loss: 6.6672

Epoch 6/10
Epoch 6/10 - 106/106 ━━━━━━ 0s remaining - loss: 6.6178
Epoch 6/10 - Average Loss: 6.6178

Epoch 7/10
Epoch 7/10 - 106/106 ━━━━━━ 0s remaining - loss: 6.5489
Epoch 7/10 - Average Loss: 6.5489

Epoch 8/10
Epoch 8/10 - 106/106 ━━━━━━ 0s remaining - loss: 6.4848
Epoch 8/10 - Average Loss: 6.4848

Epoch 9/10
Epoch 9/10 - 106/106 ━━━━━━ 0s remaining - loss: 6.4681
Epoch 9/10 - Average Loss: 6.4681

Epoch 10/10
Epoch 10/10 - 106/106 ━━━━━━ 0s remaining - loss: 6.4641
Epoch 10/10 

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np
import pandas as pd
import time

# CPU Optimization
torch.set_num_threads(16)

# Reproducibility: the classifier head is newly initialised and the training
# DataLoader shuffles, so fix the seed before either is created.
seed = 42
torch.manual_seed(seed)

# Hyperparameters
batch_size = 64
epochs = 15
learning_rate = 1e-5
sequence_length = 30
k = 10  # cut-off for the top-k ranking metrics

# BERT Tokenizer and Model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=unique_item_count,  # one output class per item in the vocabulary
    ignore_mismatched_sizes=True
)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Prepare DataLoader
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``inputs`` must support ``inputs['input_ids'][idx]`` and
    ``inputs['attention_mask'][idx]``; ``labels`` is an indexable sequence of
    class indices of the same length.
    """

    def __init__(self, inputs, labels):
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the idx-th row of each encoding field, then attach the label
        # as a long tensor.
        sample = {field: self.inputs[field][idx] for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Prepare inputs for BERT
def prepare_inputs(X):
    """Render each sequence as one string of space-separated item values."""
    texts = []
    for row in X:
        texts.append(" ".join(str(item) for item in row))
    return texts

X_train_str = prepare_inputs(X_train)
X_test_str = prepare_inputs(X_test)

# Token budget. max_length counts tokens, not items: a row of
# `sequence_length` item IDs needs at least that many tokens plus [CLS] and
# [SEP], so a cap of `sequence_length` always truncates. Truncation removes
# tokens from the right by default, i.e. the most recent purchases. Use the
# model's own limit instead; padding=True still pads only to the longest row.
max_token_length = min(tokenizer.model_max_length, 512)

train_encodings = tokenizer(X_train_str, truncation=True, padding=True, max_length=max_token_length, return_tensors='pt')
test_encodings = tokenizer(X_test_str, truncation=True, padding=True, max_length=max_token_length, return_tensors='pt')

train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

# Only the training data is shuffled; evaluation keeps its original order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training Function with Epoch Display
def train_model(model, train_loader, optimizer, epochs=10):
    """Train ``model`` on CPU, printing a live progress line for every epoch.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object exposing ``.loss``.
        train_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress and the average loss per epoch are printed.
    """
    model.train()
    device = torch.device('cpu')  # Use CPU
    model.to(device)
    steps_per_epoch = len(train_loader)

    if steps_per_epoch == 0:
        # An empty loader would make the per-epoch average divide by zero.
        print("No training batches available; skipping training.")
        return

    for epoch in range(epochs):
        total_loss = 0
        start_time = time.time()
        print(f"\nEpoch {epoch + 1}/{epochs}")

        for step, batch in enumerate(train_loader, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            # The loss is read from the model's own output (labels are passed
            # in), so no separate loss function is needed.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Countdown Display: extrapolate from the mean step time so far
            elapsed_time = time.time() - start_time
            steps_remaining = steps_per_epoch - step
            time_per_step = elapsed_time / step  # step starts at 1
            estimated_time_remaining = steps_remaining * time_per_step

            print(
                f"\rEpoch {epoch + 1}/{epochs} - {step}/{steps_per_epoch} ━━━━━━ "
                f"{int(estimated_time_remaining)}s remaining - loss: {total_loss / step:.4f}", end=""
            )

        print(f"\nEpoch {epoch + 1}/{epochs} - Average Loss: {total_loss / steps_per_epoch:.4f}")

# Evaluation Function
def evaluate_model(model, test_loader, y_true, k=10):
    """Compute top-k ranking metrics on CPU for a single-label classifier.

    Each sample has exactly one relevant item (its label), so Recall@k equals
    the hit rate and Precision@k equals the hit rate divided by ``k``.

    Args:
        model: Module whose forward accepts ``input_ids`` and
            ``attention_mask`` and returns an object exposing ``.logits``
            with one row of class scores per sample.
        test_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        y_true: Ground-truth labels; only its length is used, as the
            denominator of every metric.
        k: Cut-off of the ranked list. If the model has fewer than ``k``
            classes, all classes are ranked instead of raising.

    Returns:
        Tuple ``(precision, recall, hit_rate, mrr)``; all zeros when
        ``y_true`` is empty.
    """
    model.eval()
    device = torch.device('cpu')
    model.to(device)
    total_hits, total_mrr = 0, 0.0
    total_users = len(y_true)

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # topk raises if asked for more entries than there are classes.
            top_k = min(k, logits.size(1))
            _, top_k_indices = torch.topk(logits, top_k, dim=1)

            # True where a ranked class equals the sample's label; indices in
            # a row are distinct, so each row has at most one match.
            matches = top_k_indices == labels.unsqueeze(1)
            hit_mask = matches.any(dim=1)
            # topk returns scores in descending order, so column + 1 is the rank.
            ranks = matches.int().argmax(dim=1) + 1

            total_hits += int(hit_mask.sum())
            total_mrr += float((1.0 / ranks[hit_mask].double()).sum())

    if total_users == 0:
        # Nothing to average over; avoid dividing by zero.
        precision = recall = hit_rate = mrr = 0.0
    else:
        hit_rate = total_hits / total_users
        recall = hit_rate  # one relevant item per sample
        precision = total_hits / k / total_users
        mrr = total_mrr / total_users

    print(f"Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}, Hit Rate: {hit_rate:.4f}, MRR: {mrr:.4f}")
    return precision, recall, hit_rate, mrr

# Train the Model
train_model(model, train_loader, optimizer, epochs=epochs)

# Evaluate the Model
precision, recall, hit_rate, mrr = evaluate_model(model, test_loader, y_test, k=k)

# Save Results. Column names are derived from k so they always match the
# cut-off that was actually evaluated (identical to the old names for k = 10).
results = pd.DataFrame([{
    f"Precision@{k}": precision,
    f"Recall@{k}": recall,
    "Hit Rate": hit_rate,
    "MRR": mrr
}])
results.to_csv(r"C:\Users\user\Desktop\CW4\online+retail\bert4rec_results2.csv", index=False)
print("Results saved.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/15
Epoch 1/15 - 53/53 ━━━━━━ 0s remaining - loss: 8.0923
Epoch 1/15 - Average Loss: 8.0923

Epoch 2/15
Epoch 2/15 - 53/53 ━━━━━━ 0s remaining - loss: 7.8102
Epoch 2/15 - Average Loss: 7.8102

Epoch 3/15
Epoch 3/15 - 53/53 ━━━━━━ 0s remaining - loss: 7.5384
Epoch 3/15 - Average Loss: 7.5384

Epoch 4/15
Epoch 4/15 - 53/53 ━━━━━━ 0s remaining - loss: 7.3227
Epoch 4/15 - Average Loss: 7.3227

Epoch 5/15
Epoch 5/15 - 53/53 ━━━━━━ 0s remaining - loss: 7.1493
Epoch 5/15 - Average Loss: 7.1493

Epoch 6/15
Epoch 6/15 - 53/53 ━━━━━━ 0s remaining - loss: 7.0205
Epoch 6/15 - Average Loss: 7.0205

Epoch 7/15
Epoch 7/15 - 53/53 ━━━━━━ 0s remaining - loss: 6.9169
Epoch 7/15 - Average Loss: 6.9169

Epoch 8/15
Epoch 8/15 - 53/53 ━━━━━━ 0s remaining - loss: 6.8369
Epoch 8/15 - Average Loss: 6.8369

Epoch 9/15
Epoch 9/15 - 53/53 ━━━━━━ 0s remaining - loss: 6.7661
Epoch 9/15 - Average Loss: 6.7661

Epoch 10/15
Epoch 10/15 - 53/53 ━━━━━━ 0s remaining - loss: 6.7088
Epoch 10/15 - Average Loss: 6.70

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np
import pandas as pd
import time

# CPU Optimization
torch.set_num_threads(16)

# Reproducibility: the classifier head is newly initialised and the training
# DataLoader shuffles, so fix the seed before either is created.
seed = 42
torch.manual_seed(seed)

# Hyperparameters
batch_size = 32
epochs = 20
learning_rate = 1e-4
sequence_length = 30
k = 10  # cut-off for the top-k ranking metrics

# BERT Tokenizer and Model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=unique_item_count,  # one output class per item in the vocabulary
    ignore_mismatched_sizes=True
)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Prepare DataLoader
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``inputs`` must support ``inputs['input_ids'][idx]`` and
    ``inputs['attention_mask'][idx]``; ``labels`` is an indexable sequence of
    class indices of the same length.
    """

    def __init__(self, inputs, labels):
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the idx-th row of each encoding field, then attach the label
        # as a long tensor.
        sample = {field: self.inputs[field][idx] for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Prepare inputs for BERT
def prepare_inputs(X):
    """Render each sequence as one string of space-separated item values."""
    texts = []
    for row in X:
        texts.append(" ".join(str(item) for item in row))
    return texts

X_train_str = prepare_inputs(X_train)
X_test_str = prepare_inputs(X_test)

# Token budget. max_length counts tokens, not items: a row of
# `sequence_length` item IDs needs at least that many tokens plus [CLS] and
# [SEP], so a cap of `sequence_length` always truncates. Truncation removes
# tokens from the right by default, i.e. the most recent purchases. Use the
# model's own limit instead; padding=True still pads only to the longest row.
max_token_length = min(tokenizer.model_max_length, 512)

train_encodings = tokenizer(X_train_str, truncation=True, padding=True, max_length=max_token_length, return_tensors='pt')
test_encodings = tokenizer(X_test_str, truncation=True, padding=True, max_length=max_token_length, return_tensors='pt')

train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

# Only the training data is shuffled; evaluation keeps its original order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training Function with Epoch Display
def train_model(model, train_loader, optimizer, epochs=10):
    """Train ``model`` on CPU, printing a live progress line for every epoch.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object exposing ``.loss``.
        train_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress and the average loss per epoch are printed.
    """
    model.train()
    device = torch.device('cpu')  # Use CPU
    model.to(device)
    steps_per_epoch = len(train_loader)

    if steps_per_epoch == 0:
        # An empty loader would make the per-epoch average divide by zero.
        print("No training batches available; skipping training.")
        return

    for epoch in range(epochs):
        total_loss = 0
        start_time = time.time()
        print(f"\nEpoch {epoch + 1}/{epochs}")

        for step, batch in enumerate(train_loader, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            # The loss is read from the model's own output (labels are passed
            # in), so no separate loss function is needed.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Countdown Display: extrapolate from the mean step time so far
            elapsed_time = time.time() - start_time
            steps_remaining = steps_per_epoch - step
            time_per_step = elapsed_time / step  # step starts at 1
            estimated_time_remaining = steps_remaining * time_per_step

            print(
                f"\rEpoch {epoch + 1}/{epochs} - {step}/{steps_per_epoch} ━━━━━━ "
                f"{int(estimated_time_remaining)}s remaining - loss: {total_loss / step:.4f}", end=""
            )

        print(f"\nEpoch {epoch + 1}/{epochs} - Average Loss: {total_loss / steps_per_epoch:.4f}")

# Evaluation Function
def evaluate_model(model, test_loader, y_true, k=10):
    """Compute top-k ranking metrics on CPU for a single-label classifier.

    Each sample has exactly one relevant item (its label), so Recall@k equals
    the hit rate and Precision@k equals the hit rate divided by ``k``.

    Args:
        model: Module whose forward accepts ``input_ids`` and
            ``attention_mask`` and returns an object exposing ``.logits``
            with one row of class scores per sample.
        test_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        y_true: Ground-truth labels; only its length is used, as the
            denominator of every metric.
        k: Cut-off of the ranked list. If the model has fewer than ``k``
            classes, all classes are ranked instead of raising.

    Returns:
        Tuple ``(precision, recall, hit_rate, mrr)``; all zeros when
        ``y_true`` is empty.
    """
    model.eval()
    device = torch.device('cpu')
    model.to(device)
    total_hits, total_mrr = 0, 0.0
    total_users = len(y_true)

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # topk raises if asked for more entries than there are classes.
            top_k = min(k, logits.size(1))
            _, top_k_indices = torch.topk(logits, top_k, dim=1)

            # True where a ranked class equals the sample's label; indices in
            # a row are distinct, so each row has at most one match.
            matches = top_k_indices == labels.unsqueeze(1)
            hit_mask = matches.any(dim=1)
            # topk returns scores in descending order, so column + 1 is the rank.
            ranks = matches.int().argmax(dim=1) + 1

            total_hits += int(hit_mask.sum())
            total_mrr += float((1.0 / ranks[hit_mask].double()).sum())

    if total_users == 0:
        # Nothing to average over; avoid dividing by zero.
        precision = recall = hit_rate = mrr = 0.0
    else:
        hit_rate = total_hits / total_users
        recall = hit_rate  # one relevant item per sample
        precision = total_hits / k / total_users
        mrr = total_mrr / total_users

    print(f"Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}, Hit Rate: {hit_rate:.4f}, MRR: {mrr:.4f}")
    return precision, recall, hit_rate, mrr

# Train the Model
train_model(model, train_loader, optimizer, epochs=epochs)

# Evaluate the Model
precision, recall, hit_rate, mrr = evaluate_model(model, test_loader, y_test, k=k)

# Save Results. Column names are derived from k so they always match the
# cut-off that was actually evaluated (identical to the old names for k = 10).
results = pd.DataFrame([{
    f"Precision@{k}": precision,
    f"Recall@{k}": recall,
    "Hit Rate": hit_rate,
    "MRR": mrr
}])
results.to_csv(r"C:\Users\user\Desktop\CW4\online+retail\bert4rec_results3.csv", index=False)
print("Results saved.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/20
Epoch 1/20 - 106/106 ━━━━━━ 0s remaining - loss: 7.5435
Epoch 1/20 - Average Loss: 7.5435

Epoch 2/20
Epoch 2/20 - 106/106 ━━━━━━ 0s remaining - loss: 6.8188
Epoch 2/20 - Average Loss: 6.8188

Epoch 3/20
Epoch 3/20 - 106/106 ━━━━━━ 0s remaining - loss: 6.7298
Epoch 3/20 - Average Loss: 6.7298

Epoch 4/20
Epoch 4/20 - 106/106 ━━━━━━ 0s remaining - loss: 6.6980
Epoch 4/20 - Average Loss: 6.6980

Epoch 5/20
Epoch 5/20 - 106/106 ━━━━━━ 0s remaining - loss: 6.6801
Epoch 5/20 - Average Loss: 6.6801

Epoch 6/20
Epoch 6/20 - 106/106 ━━━━━━ 0s remaining - loss: 6.6730
Epoch 6/20 - Average Loss: 6.6730

Epoch 7/20
Epoch 7/20 - 106/106 ━━━━━━ 0s remaining - loss: 6.6597
Epoch 7/20 - Average Loss: 6.6597

Epoch 8/20
Epoch 8/20 - 106/106 ━━━━━━ 0s remaining - loss: 6.6500
Epoch 8/20 - Average Loss: 6.6500

Epoch 9/20
Epoch 9/20 - 106/106 ━━━━━━ 0s remaining - loss: 6.6562
Epoch 9/20 - Average Loss: 6.6562

Epoch 10/20
Epoch 10/20 - 106/106 ━━━━━━ 0s remaining - loss: 6.6545
Epoch 10/20 

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load dataset
file_path = r"C:\Users\user\Desktop\CW4\online+retail\Online Retail.xlsx"
df = pd.read_excel(file_path, sheet_name='Online Retail')

# Cleaning: keep only rows that have a customer and a description, and whose
# quantity is positive
df_cleaned = df.dropna(subset=['CustomerID', 'Description'])
df_cleaned = df_cleaned[df_cleaned['Quantity'] > 0]

# Order every customer's rows by invoice date so sequences are chronological
df_sorted = df_cleaned.sort_values(by=['CustomerID', 'InvoiceDate'])

# Get all unique descriptions for fitting
all_items = df_sorted['Description'].unique()

# Fit LabelEncoder on all possible items
item_encoder = LabelEncoder()
item_encoder.fit(all_items)

# Encode the ItemID column.
# LabelEncoder numbers its classes from 0, and pad_sequences below pads with 0
# as well. Shift every item ID up by one so that 0 is reserved for padding and
# no real product is mistaken for it. To map an ItemID back to a description
# use item_encoder.inverse_transform(item_ids - 1).
df_sorted['ItemID'] = item_encoder.transform(df_sorted['Description']) + 1

# Grouping by Customer
sequential_data = df_sorted.groupby('CustomerID')['ItemID'].apply(list).reset_index(name='ItemSequence')

# Minimum sequence length
min_sequence_length = 3
sequential_data = sequential_data[sequential_data['ItemSequence'].apply(len) >= min_sequence_length]

# Pad and create sequences: short histories are left-padded with 0, long ones
# keep only their most recent `sequence_length` items (truncating='pre').
item_sequences = sequential_data['ItemSequence'].tolist()
sequence_length = 15
padded_sequences = pad_sequences(
    item_sequences, maxlen=sequence_length, padding='pre', truncating='pre', value=0
)

def create_sequences(sequences, seq_length=30, pad_value=0):
    """Build (history, next-item) pairs from item sequences without leakage.

    Every input row holds exactly ``seq_length`` items and its target is the
    item that comes immediately after that row, so a target is never part of
    its own input.

    * Sequences longer than ``seq_length`` yield one sliding window for every
      position that still has a following item.
    * Sequences with 2..``seq_length`` items yield one sample: the last item
      is the target and the preceding items, left-padded with ``pad_value``,
      form the input.
    * Sequences with fewer than two items cannot form a pair and are skipped.

    Args:
        sequences: Iterable of 1-D item-ID sequences (lists or array rows).
        seq_length: Number of items in every input row.
        pad_value: Filler used to left-pad histories shorter than
            ``seq_length``.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` has one row of ``seq_length``
        items per sample, ``y`` holds the matching targets.
    """
    X, y = [], []
    for seq in sequences:
        seq = np.asarray(seq)
        if len(seq) < 2:
            continue  # no (history, target) pair can be formed

        if len(seq) <= seq_length:
            # Hold out the last item as the target and pad the rest on the
            # left so the row still has seq_length entries.
            history = seq[:-1]
            padding = np.full(seq_length - len(history), pad_value, dtype=seq.dtype)
            X.append(np.concatenate([padding, history]))
            y.append(seq[-1])
        else:
            # Stop one window early so every window has a true next item.
            for start in range(len(seq) - seq_length):
                X.append(seq[start:start + seq_length])
                y.append(seq[start + seq_length])
    return np.array(X), np.array(y)

X, y = create_sequences(padded_sequences, seq_length=sequence_length)

# Hold-out split: the leading 80% of the samples train, the remainder tests
split_ratio = 0.8
split_index = int(split_ratio * len(X))
X_train, y_train = X[:split_index], y[:split_index]
X_test, y_test = X[split_index:], y[split_index:]

# Vocabulary: every distinct ID that occurs in the inputs or the targets
unique_items = np.unique(np.concatenate((X.ravel(), y)))
unique_item_count = unique_items.size

# Re-fit an encoder on that vocabulary (this rebinds item_encoder)
item_encoder = LabelEncoder().fit(unique_items)

# Map the targets onto contiguous class indices
y_train, y_test = item_encoder.transform(y_train), item_encoder.transform(y_test)

# Sanity check: every class index must fit within the vocabulary size
assert y_train.max() < unique_item_count, "y_train contains labels outside the valid range!"
assert y_test.max() < unique_item_count, "y_test contains labels outside the valid range!"

print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")
print(f"Vocabulary Size: {unique_item_count}")


X_train shape: (3371, 15), X_test shape: (843, 15)
y_train shape: (3371,), y_test shape: (843,)
Vocabulary Size: 3201


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np
import pandas as pd
import time

# CPU Optimization
torch.set_num_threads(16)

# Reproducibility: the classifier head is newly initialised and the training
# DataLoader shuffles, so fix the seed before either is created.
seed = 42
torch.manual_seed(seed)

# Hyperparameters
batch_size = 32
epochs = 8
learning_rate = 5e-5
sequence_length = 15
k = 10  # cut-off for the top-k ranking metrics

# BERT Tokenizer and Model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=unique_item_count,  # one output class per item in the vocabulary
    ignore_mismatched_sizes=True
)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Prepare DataLoader
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``inputs`` must support ``inputs['input_ids'][idx]`` and
    ``inputs['attention_mask'][idx]``; ``labels`` is an indexable sequence of
    class indices of the same length.
    """

    def __init__(self, inputs, labels):
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the idx-th row of each encoding field, then attach the label
        # as a long tensor.
        sample = {field: self.inputs[field][idx] for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Prepare inputs for BERT
def prepare_inputs(X):
    """Render each sequence as one string of space-separated item values."""
    texts = []
    for row in X:
        texts.append(" ".join(str(item) for item in row))
    return texts

X_train_str = prepare_inputs(X_train)
X_test_str = prepare_inputs(X_test)

# Token budget. max_length counts tokens, not items: a row of
# `sequence_length` item IDs needs at least that many tokens plus [CLS] and
# [SEP], so a cap of `sequence_length` always truncates. Truncation removes
# tokens from the right by default, i.e. the most recent purchases. Use the
# model's own limit instead; padding=True still pads only to the longest row.
max_token_length = min(tokenizer.model_max_length, 512)

train_encodings = tokenizer(X_train_str, truncation=True, padding=True, max_length=max_token_length, return_tensors='pt')
test_encodings = tokenizer(X_test_str, truncation=True, padding=True, max_length=max_token_length, return_tensors='pt')

train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

# Only the training data is shuffled; evaluation keeps its original order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training Function with Epoch Display
def train_model(model, train_loader, optimizer, epochs=10):
    """Train ``model`` on CPU, printing a live progress line for every epoch.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object exposing ``.loss``.
        train_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress and the average loss per epoch are printed.
    """
    model.train()
    device = torch.device('cpu')  # Use CPU
    model.to(device)
    steps_per_epoch = len(train_loader)

    if steps_per_epoch == 0:
        # An empty loader would make the per-epoch average divide by zero.
        print("No training batches available; skipping training.")
        return

    for epoch in range(epochs):
        total_loss = 0
        start_time = time.time()
        print(f"\nEpoch {epoch + 1}/{epochs}")

        for step, batch in enumerate(train_loader, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            # The loss is read from the model's own output (labels are passed
            # in), so no separate loss function is needed.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Countdown Display: extrapolate from the mean step time so far
            elapsed_time = time.time() - start_time
            steps_remaining = steps_per_epoch - step
            time_per_step = elapsed_time / step  # step starts at 1
            estimated_time_remaining = steps_remaining * time_per_step

            print(
                f"\rEpoch {epoch + 1}/{epochs} - {step}/{steps_per_epoch} ━━━━━━ "
                f"{int(estimated_time_remaining)}s remaining - loss: {total_loss / step:.4f}", end=""
            )

        print(f"\nEpoch {epoch + 1}/{epochs} - Average Loss: {total_loss / steps_per_epoch:.4f}")

# Evaluation Function
def evaluate_model(model, test_loader, y_true, k=10):
    """Compute top-k ranking metrics on CPU for a single-label classifier.

    Each sample has exactly one relevant item (its label), so Recall@k equals
    the hit rate and Precision@k equals the hit rate divided by ``k``.

    Args:
        model: Module whose forward accepts ``input_ids`` and
            ``attention_mask`` and returns an object exposing ``.logits``
            with one row of class scores per sample.
        test_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        y_true: Ground-truth labels; only its length is used, as the
            denominator of every metric.
        k: Cut-off of the ranked list. If the model has fewer than ``k``
            classes, all classes are ranked instead of raising.

    Returns:
        Tuple ``(precision, recall, hit_rate, mrr)``; all zeros when
        ``y_true`` is empty.
    """
    model.eval()
    device = torch.device('cpu')
    model.to(device)
    total_hits, total_mrr = 0, 0.0
    total_users = len(y_true)

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # topk raises if asked for more entries than there are classes.
            top_k = min(k, logits.size(1))
            _, top_k_indices = torch.topk(logits, top_k, dim=1)

            # True where a ranked class equals the sample's label; indices in
            # a row are distinct, so each row has at most one match.
            matches = top_k_indices == labels.unsqueeze(1)
            hit_mask = matches.any(dim=1)
            # topk returns scores in descending order, so column + 1 is the rank.
            ranks = matches.int().argmax(dim=1) + 1

            total_hits += int(hit_mask.sum())
            total_mrr += float((1.0 / ranks[hit_mask].double()).sum())

    if total_users == 0:
        # Nothing to average over; avoid dividing by zero.
        precision = recall = hit_rate = mrr = 0.0
    else:
        hit_rate = total_hits / total_users
        recall = hit_rate  # one relevant item per sample
        precision = total_hits / k / total_users
        mrr = total_mrr / total_users

    print(f"Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}, Hit Rate: {hit_rate:.4f}, MRR: {mrr:.4f}")
    return precision, recall, hit_rate, mrr

# Train the Model
train_model(model, train_loader, optimizer, epochs=epochs)

# Evaluate the Model
precision, recall, hit_rate, mrr = evaluate_model(model, test_loader, y_test, k=k)

# Save Results. Column names are derived from k so they always match the
# cut-off that was actually evaluated (identical to the old names for k = 10).
results = pd.DataFrame([{
    f"Precision@{k}": precision,
    f"Recall@{k}": recall,
    "Hit Rate": hit_rate,
    "MRR": mrr
}])
results.to_csv(r"C:\Users\user\Desktop\CW4\online+retail\bert4rec_results4.csv", index=False)
print("Results saved.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/8
Epoch 1/8 - 106/106 ━━━━━━ 0s remaining - loss: 7.6166
Epoch 1/8 - Average Loss: 7.6166

Epoch 2/8
Epoch 2/8 - 106/106 ━━━━━━ 0s remaining - loss: 6.8303
Epoch 2/8 - Average Loss: 6.8303

Epoch 3/8
Epoch 3/8 - 106/106 ━━━━━━ 0s remaining - loss: 6.7235
Epoch 3/8 - Average Loss: 6.7235

Epoch 4/8
Epoch 4/8 - 106/106 ━━━━━━ 0s remaining - loss: 6.6858
Epoch 4/8 - Average Loss: 6.6858

Epoch 5/8
Epoch 5/8 - 106/106 ━━━━━━ 0s remaining - loss: 6.6644
Epoch 5/8 - Average Loss: 6.6644

Epoch 6/8
Epoch 6/8 - 106/106 ━━━━━━ 0s remaining - loss: 6.6485
Epoch 6/8 - Average Loss: 6.6485

Epoch 7/8
Epoch 7/8 - 106/106 ━━━━━━ 0s remaining - loss: 6.6278
Epoch 7/8 - Average Loss: 6.6278

Epoch 8/8
Epoch 8/8 - 106/106 ━━━━━━ 0s remaining - loss: 6.6000
Epoch 8/8 - Average Loss: 6.6000
Precision@10: 0.0063, Recall@10: 0.0629, Hit Rate: 0.0629, MRR: 0.0203
Results saved.


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load dataset
file_path = r"C:\Users\user\Desktop\CW4\online+retail\Online Retail.xlsx"
df = pd.read_excel(file_path, sheet_name='Online Retail')

# Cleaning: drop rows without a customer or description, keep positive quantities only
df_cleaned = df.dropna(subset=['CustomerID', 'Description'])
df_cleaned = df_cleaned[df_cleaned['Quantity'] > 0]

# Order every customer's rows by invoice date before building sequences
df_sorted = df_cleaned.sort_values(by=['CustomerID', 'InvoiceDate'])

# Get all unique descriptions for fitting
all_items = df_sorted['Description'].unique()

# Fit LabelEncoder on all possible items
item_encoder = LabelEncoder()
item_encoder.fit(all_items)

# Encode the ItemID column.
# LabelEncoder numbers its classes from 0 and pad_sequences (below) fills with 0
# by default, so a real item would be indistinguishable from padding. Shift the
# ids by one to keep 0 reserved for padding; subtract 1 again before calling
# item_encoder.inverse_transform.
df_sorted['ItemID'] = item_encoder.transform(df_sorted['Description']) + 1

# Grouping by Customer: one chronological list of item ids per customer
sequential_data = df_sorted.groupby('CustomerID')['ItemID'].apply(list).reset_index(name='ItemSequence')

# Minimum sequence length
min_sequence_length = 3
sequential_data = sequential_data[sequential_data['ItemSequence'].apply(len) >= min_sequence_length]

# Pad and create sequences ('pre' padding puts the zeros in front of the items)
item_sequences = sequential_data['ItemSequence'].tolist()
sequence_length = 50
padded_sequences = pad_sequences(item_sequences, maxlen=sequence_length, padding='pre')

def create_sequences(sequences, seq_length=30):
    """Build (context, next-item) pairs in which the target is never part of its context.

    Args:
        sequences: iterable of item-id sequences (lists or array rows).
        seq_length: number of positions in every context row.

    Returns:
        (X, y): ``X`` is an array with one ``seq_length``-wide context per row,
        ``y`` holds the item that follows each context.

    Sequences longer than ``seq_length`` yield one sliding window per position
    that still has a following item. Shorter (or equal-length) sequences yield a
    single pair: the last item is held out as the target and the remaining
    items are left-padded with 0 up to ``seq_length``. Sequences with fewer
    than two items cannot form a pair and are skipped.
    """
    X, y = [], []
    for seq in sequences:
        items = list(seq)
        if len(items) < 2:
            # Need at least one context item and one target.
            continue

        if len(items) <= seq_length:
            # Hold the final item out as the target so it is not visible in X.
            context = items[:-1]
            X.append([0] * (seq_length - len(context)) + context)
            y.append(items[-1])
            continue

        # Stop one window early: the last full window has no following item.
        for start in range(len(items) - seq_length):
            X.append(items[start:start + seq_length])
            y.append(items[start + seq_length])
    return np.array(X), np.array(y)

# Turn the padded histories into model inputs (X) and targets (y)
X, y = create_sequences(padded_sequences, seq_length=sequence_length)

# Dataset splitting: the first 80% of rows (in their existing order) train, the rest test
split_ratio = 0.8
split_index = int(split_ratio * len(X))
X_train, y_train = X[:split_index], y[:split_index]
X_test, y_test = X[split_index:], y[split_index:]

# Vocabulary: every distinct id that occurs in the inputs or in the targets
unique_items = np.unique(np.concatenate((X.ravel(), y)))
unique_item_count = unique_items.size

# Fit a fresh LabelEncoder on those ids; this rebinds item_encoder
item_encoder = LabelEncoder().fit(unique_items)

# Only the targets are re-encoded; X keeps the ids it already has
y_train, y_test = (item_encoder.transform(part) for part in (y_train, y_test))

# Sanity check: every label must be a valid class index
assert y_train.max() < unique_item_count, "y_train contains labels outside the valid range!"
assert y_test.max() < unique_item_count, "y_test contains labels outside the valid range!"

print(
    f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}",
    f"Vocabulary Size: {unique_item_count}",
    sep="\n",
)


X_train shape: (3371, 50), X_test shape: (843, 50)
y_train shape: (3371,), y_test shape: (843,)
Vocabulary Size: 3581


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np
import pandas as pd
import time

# Reproducibility: the classification head created by from_pretrained below is
# newly initialised and the training DataLoader shuffles, so seed torch's RNG
# before either happens.
seed = 42
torch.manual_seed(seed)

# CPU Optimization
torch.set_num_threads(16)

# Hyperparameters
batch_size = 64
epochs = 10
learning_rate = 3e-5
sequence_length = 50
k = 10  # cutoff for the top-k ranking metrics

# BERT Tokenizer and Model (one output class per entry of the item vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=unique_item_count,
    ignore_mismatched_sizes=True
)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Prepare DataLoader
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one label per sample.

    ``inputs`` is read as ``inputs['input_ids'][idx]`` and
    ``inputs['attention_mask'][idx]``; ``labels`` is any indexable collection
    whose entries are converted to ``torch.long`` tensors on access.
    """

    def __init__(self, inputs, labels):
        self.labels = labels
        self.inputs = inputs

    def __len__(self):
        # The dataset size is defined by the labels, not by the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {name: self.inputs[name][idx] for name in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Prepare inputs for BERT
def prepare_inputs(X):
    """Render every sequence in ``X`` as one string of its elements joined by single spaces."""
    texts = []
    for seq in X:
        texts.append(" ".join(str(token) for token in seq))
    return texts

X_train_str = prepare_inputs(X_train)
X_test_str = prepare_inputs(X_test)

# max_length counts tokenizer tokens (special tokens included), not items, so a
# row of sequence_length ids never fits and is always truncated. The default
# truncation side is the right, which would drop the end of each row, i.e. the
# most recent purchases; truncate from the left so the oldest/padding positions
# are the ones that get dropped.
# NOTE(review): this makes the final element of each row visible to the model;
# verify that the target is not also the last element of its input row.
tokenizer.truncation_side = "left"

train_encodings = tokenizer(X_train_str, truncation=True, padding=True, max_length=sequence_length, return_tensors='pt')
test_encodings = tokenizer(X_test_str, truncation=True, padding=True, max_length=sequence_length, return_tensors='pt')

train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

# Only the training data is shuffled; evaluation keeps the dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training Function with Epoch Display
def train_model(model, train_loader, optimizer, epochs=10):
    """Train ``model`` on the CPU, printing a running loss and time estimate per step.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose result exposes the training loss as ``.loss``.
        train_loader: sized iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        optimizer: optimizer used to update the model after every batch.
        epochs: number of passes over ``train_loader``.

    Returns:
        list[float]: the average loss of every epoch, in order.

    Raises:
        ValueError: if ``train_loader`` contains no batches.
    """
    model.train()
    device = torch.device('cpu')  # Use CPU
    model.to(device)
    steps_per_epoch = len(train_loader)
    if steps_per_epoch == 0:
        # Without this guard the per-epoch average below divides by zero.
        raise ValueError("train_loader is empty; nothing to train on.")

    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0
        start_time = time.time()
        print(f"\nEpoch {epoch + 1}/{epochs}")

        for step, batch in enumerate(train_loader, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            # The loss is read from the model output, which receives the labels.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Countdown Display: extrapolate remaining time from the mean step time so far
            elapsed_time = time.time() - start_time
            steps_remaining = steps_per_epoch - step
            estimated_time_remaining = steps_remaining * (elapsed_time / step)

            # "\r" rewrites the same console line on every step
            print(
                f"\rEpoch {epoch + 1}/{epochs} - {step}/{steps_per_epoch} ━━━━━━ "
                f"{int(estimated_time_remaining)}s remaining - loss: {total_loss / step:.4f}", end=""
            )

        average_loss = total_loss / steps_per_epoch
        epoch_losses.append(average_loss)
        print(f"\nEpoch {epoch + 1}/{epochs} - Average Loss: {average_loss:.4f}")

    return epoch_losses

# Evaluation Function
def evaluate_model(model, test_loader, y_true, k=10):
    """Compute top-k ranking metrics on the CPU with one relevant item per sample.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` whose
            result exposes per-class scores as ``.logits``.
        test_loader: iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        y_true: kept for backward compatibility and no longer read. The metrics
            are averaged over the samples actually yielded by ``test_loader``,
            so they cannot be skewed by a ``y_true`` of a different length.
        k: ranking cutoff. If it exceeds the number of classes, all classes are
            ranked; precision is still divided by the requested ``k``.

    Returns:
        tuple: ``(precision, recall, hit_rate, mrr)`` as floats; all 0.0 when the
        loader yields no samples. Recall equals the hit rate because every
        sample has exactly one relevant item.
    """
    model.eval()
    device = torch.device('cpu')
    model.to(device)
    total_hits, total_mrr, total_users = 0, 0.0, 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            # torch.topk raises when k is larger than the ranked dimension.
            top_k = min(k, logits.size(1))
            _, top_k_indices = torch.topk(logits, top_k, dim=1)

            # top-k indices are distinct per row, so each row has at most one match.
            matches = top_k_indices == labels.unsqueeze(1)
            hit_mask = matches.any(dim=1)
            ranks = matches.float().argmax(dim=1) + 1  # only meaningful where hit_mask is True

            total_hits += int(hit_mask.sum())
            total_mrr += ranks[hit_mask].double().reciprocal().sum().item()
            total_users += labels.size(0)

    if total_users == 0:
        precision = recall = hit_rate = mrr = 0.0
    else:
        hit_rate = total_hits / total_users
        recall = hit_rate
        precision = total_hits / (k * total_users)
        mrr = total_mrr / total_users

    print(f"Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}, Hit Rate: {hit_rate:.4f}, MRR: {mrr:.4f}")
    return precision, recall, hit_rate, mrr

# Train the Model
train_model(model, train_loader, optimizer, epochs=epochs)

# Evaluate the Model
precision, recall, hit_rate, mrr = evaluate_model(model, test_loader, y_test, k=k)

# Save Results
# The column labels are built from k (the cutoff passed to evaluate_model above)
# so the CSV header cannot disagree with the cutoff that was actually evaluated.
results = pd.DataFrame([{
    f"Precision@{k}": precision,
    f"Recall@{k}": recall,
    "Hit Rate": hit_rate,
    "MRR": mrr
}])
results.to_csv(r"C:\Users\user\Desktop\CW4\online+retail\bert4rec_results5.csv", index=False)
print("Results saved.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/10
Epoch 1/10 - 53/53 ━━━━━━ 0s remaining - loss: 7.9491
Epoch 1/10 - Average Loss: 7.9491

Epoch 2/10
Epoch 2/10 - 53/53 ━━━━━━ 0s remaining - loss: 7.2850
Epoch 2/10 - Average Loss: 7.2850

Epoch 3/10
Epoch 3/10 - 53/53 ━━━━━━ 0s remaining - loss: 6.9244
Epoch 3/10 - Average Loss: 6.9244

Epoch 4/10
Epoch 4/10 - 53/53 ━━━━━━ 0s remaining - loss: 6.7692
Epoch 4/10 - Average Loss: 6.7692

Epoch 5/10
Epoch 5/10 - 53/53 ━━━━━━ 0s remaining - loss: 6.6649
Epoch 5/10 - Average Loss: 6.6649

Epoch 6/10
Epoch 6/10 - 53/53 ━━━━━━ 0s remaining - loss: 6.5925
Epoch 6/10 - Average Loss: 6.5925

Epoch 7/10
Epoch 7/10 - 53/53 ━━━━━━ 0s remaining - loss: 6.5285
Epoch 7/10 - Average Loss: 6.5285

Epoch 8/10
Epoch 8/10 - 53/53 ━━━━━━ 0s remaining - loss: 6.4726
Epoch 8/10 - Average Loss: 6.4726

Epoch 9/10
Epoch 9/10 - 53/53 ━━━━━━ 0s remaining - loss: 6.4180
Epoch 9/10 - Average Loss: 6.4180

Epoch 10/10
Epoch 10/10 - 53/53 ━━━━━━ 0s remaining - loss: 6.3657
Epoch 10/10 - Average Loss: 6.36

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load dataset
file_path = r"C:\Users\user\Desktop\CW4\online+retail\Online Retail.xlsx"
df = pd.read_excel(file_path, sheet_name='Online Retail')

# Cleaning: drop rows without a customer or description, keep positive quantities only
df_cleaned = df.dropna(subset=['CustomerID', 'Description'])
df_cleaned = df_cleaned[df_cleaned['Quantity'] > 0]

# Order every customer's rows by invoice date before building sequences
df_sorted = df_cleaned.sort_values(by=['CustomerID', 'InvoiceDate'])

# Get all unique descriptions for fitting
all_items = df_sorted['Description'].unique()

# Fit LabelEncoder on all possible items
item_encoder = LabelEncoder()
item_encoder.fit(all_items)

# Encode the ItemID column.
# LabelEncoder numbers its classes from 0 and pad_sequences (below) fills with 0
# by default, so a real item would be indistinguishable from padding. Shift the
# ids by one to keep 0 reserved for padding; subtract 1 again before calling
# item_encoder.inverse_transform.
df_sorted['ItemID'] = item_encoder.transform(df_sorted['Description']) + 1

# Grouping by Customer: one chronological list of item ids per customer
sequential_data = df_sorted.groupby('CustomerID')['ItemID'].apply(list).reset_index(name='ItemSequence')

# Minimum sequence length
min_sequence_length = 3
sequential_data = sequential_data[sequential_data['ItemSequence'].apply(len) >= min_sequence_length]

# Pad and create sequences ('pre' padding puts the zeros in front of the items)
item_sequences = sequential_data['ItemSequence'].tolist()
sequence_length = 40
padded_sequences = pad_sequences(item_sequences, maxlen=sequence_length, padding='pre')

def create_sequences(sequences, seq_length=30):
    """Build (context, next-item) pairs in which the target is never part of its context.

    Args:
        sequences: iterable of item-id sequences (lists or array rows).
        seq_length: number of positions in every context row.

    Returns:
        (X, y): ``X`` is an array with one ``seq_length``-wide context per row,
        ``y`` holds the item that follows each context.

    Sequences longer than ``seq_length`` yield one sliding window per position
    that still has a following item. Shorter (or equal-length) sequences yield a
    single pair: the last item is held out as the target and the remaining
    items are left-padded with 0 up to ``seq_length``. Sequences with fewer
    than two items cannot form a pair and are skipped.
    """
    X, y = [], []
    for seq in sequences:
        items = list(seq)
        if len(items) < 2:
            # Need at least one context item and one target.
            continue

        if len(items) <= seq_length:
            # Hold the final item out as the target so it is not visible in X.
            context = items[:-1]
            X.append([0] * (seq_length - len(context)) + context)
            y.append(items[-1])
            continue

        # Stop one window early: the last full window has no following item.
        for start in range(len(items) - seq_length):
            X.append(items[start:start + seq_length])
            y.append(items[start + seq_length])
    return np.array(X), np.array(y)

# Turn the padded histories into model inputs (X) and targets (y)
X, y = create_sequences(padded_sequences, seq_length=sequence_length)

# Dataset splitting: the first 80% of rows (in their existing order) train, the rest test
split_ratio = 0.8
split_index = int(split_ratio * len(X))
X_train, y_train = X[:split_index], y[:split_index]
X_test, y_test = X[split_index:], y[split_index:]

# Vocabulary: every distinct id that occurs in the inputs or in the targets
unique_items = np.unique(np.concatenate((X.ravel(), y)))
unique_item_count = unique_items.size

# Fit a fresh LabelEncoder on those ids; this rebinds item_encoder
item_encoder = LabelEncoder().fit(unique_items)

# Only the targets are re-encoded; X keeps the ids it already has
y_train, y_test = (item_encoder.transform(part) for part in (y_train, y_test))

# Sanity check: every label must be a valid class index
assert y_train.max() < unique_item_count, "y_train contains labels outside the valid range!"
assert y_test.max() < unique_item_count, "y_test contains labels outside the valid range!"

print(
    f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}",
    f"Vocabulary Size: {unique_item_count}",
    sep="\n",
)


X_train shape: (3371, 40), X_test shape: (843, 40)
y_train shape: (3371,), y_test shape: (843,)
Vocabulary Size: 3533


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np
import pandas as pd
import time

# Reproducibility: the classification head created by from_pretrained below is
# newly initialised and the training DataLoader shuffles, so seed torch's RNG
# before either happens.
seed = 42
torch.manual_seed(seed)

# CPU Optimization
torch.set_num_threads(16)

# Hyperparameters
batch_size = 64
epochs = 20
learning_rate = 3e-5
sequence_length = 40
k = 10  # cutoff for the top-k ranking metrics

# BERT Tokenizer and Model (one output class per entry of the item vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=unique_item_count,
    ignore_mismatched_sizes=True
)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Prepare DataLoader
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one label per sample.

    ``inputs`` is read as ``inputs['input_ids'][idx]`` and
    ``inputs['attention_mask'][idx]``; ``labels`` is any indexable collection
    whose entries are converted to ``torch.long`` tensors on access.
    """

    def __init__(self, inputs, labels):
        self.labels = labels
        self.inputs = inputs

    def __len__(self):
        # The dataset size is defined by the labels, not by the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {name: self.inputs[name][idx] for name in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Prepare inputs for BERT
def prepare_inputs(X):
    """Render every sequence in ``X`` as one string of its elements joined by single spaces."""
    texts = []
    for seq in X:
        texts.append(" ".join(str(token) for token in seq))
    return texts

X_train_str = prepare_inputs(X_train)
X_test_str = prepare_inputs(X_test)

# max_length counts tokenizer tokens (special tokens included), not items, so a
# row of sequence_length ids never fits and is always truncated. The default
# truncation side is the right, which would drop the end of each row, i.e. the
# most recent purchases; truncate from the left so the oldest/padding positions
# are the ones that get dropped.
# NOTE(review): this makes the final element of each row visible to the model;
# verify that the target is not also the last element of its input row.
tokenizer.truncation_side = "left"

train_encodings = tokenizer(X_train_str, truncation=True, padding=True, max_length=sequence_length, return_tensors='pt')
test_encodings = tokenizer(X_test_str, truncation=True, padding=True, max_length=sequence_length, return_tensors='pt')

train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

# Only the training data is shuffled; evaluation keeps the dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training Function with Epoch Display
def train_model(model, train_loader, optimizer, epochs=10):
    """Train ``model`` on the CPU, printing a running loss and time estimate per step.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose result exposes the training loss as ``.loss``.
        train_loader: sized iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        optimizer: optimizer used to update the model after every batch.
        epochs: number of passes over ``train_loader``.

    Returns:
        list[float]: the average loss of every epoch, in order.

    Raises:
        ValueError: if ``train_loader`` contains no batches.
    """
    model.train()
    device = torch.device('cpu')  # Use CPU
    model.to(device)
    steps_per_epoch = len(train_loader)
    if steps_per_epoch == 0:
        # Without this guard the per-epoch average below divides by zero.
        raise ValueError("train_loader is empty; nothing to train on.")

    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0
        start_time = time.time()
        print(f"\nEpoch {epoch + 1}/{epochs}")

        for step, batch in enumerate(train_loader, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            # The loss is read from the model output, which receives the labels.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Countdown Display: extrapolate remaining time from the mean step time so far
            elapsed_time = time.time() - start_time
            steps_remaining = steps_per_epoch - step
            estimated_time_remaining = steps_remaining * (elapsed_time / step)

            # "\r" rewrites the same console line on every step
            print(
                f"\rEpoch {epoch + 1}/{epochs} - {step}/{steps_per_epoch} ━━━━━━ "
                f"{int(estimated_time_remaining)}s remaining - loss: {total_loss / step:.4f}", end=""
            )

        average_loss = total_loss / steps_per_epoch
        epoch_losses.append(average_loss)
        print(f"\nEpoch {epoch + 1}/{epochs} - Average Loss: {average_loss:.4f}")

    return epoch_losses

# Evaluation Function
def evaluate_model(model, test_loader, y_true, k=10):
    """Compute top-k ranking metrics on the CPU with one relevant item per sample.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` whose
            result exposes per-class scores as ``.logits``.
        test_loader: iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        y_true: kept for backward compatibility and no longer read. The metrics
            are averaged over the samples actually yielded by ``test_loader``,
            so they cannot be skewed by a ``y_true`` of a different length.
        k: ranking cutoff. If it exceeds the number of classes, all classes are
            ranked; precision is still divided by the requested ``k``.

    Returns:
        tuple: ``(precision, recall, hit_rate, mrr)`` as floats; all 0.0 when the
        loader yields no samples. Recall equals the hit rate because every
        sample has exactly one relevant item.
    """
    model.eval()
    device = torch.device('cpu')
    model.to(device)
    total_hits, total_mrr, total_users = 0, 0.0, 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            # torch.topk raises when k is larger than the ranked dimension.
            top_k = min(k, logits.size(1))
            _, top_k_indices = torch.topk(logits, top_k, dim=1)

            # top-k indices are distinct per row, so each row has at most one match.
            matches = top_k_indices == labels.unsqueeze(1)
            hit_mask = matches.any(dim=1)
            ranks = matches.float().argmax(dim=1) + 1  # only meaningful where hit_mask is True

            total_hits += int(hit_mask.sum())
            total_mrr += ranks[hit_mask].double().reciprocal().sum().item()
            total_users += labels.size(0)

    if total_users == 0:
        precision = recall = hit_rate = mrr = 0.0
    else:
        hit_rate = total_hits / total_users
        recall = hit_rate
        precision = total_hits / (k * total_users)
        mrr = total_mrr / total_users

    print(f"Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}, Hit Rate: {hit_rate:.4f}, MRR: {mrr:.4f}")
    return precision, recall, hit_rate, mrr

# Train the Model
train_model(model, train_loader, optimizer, epochs=epochs)

# Evaluate the Model
precision, recall, hit_rate, mrr = evaluate_model(model, test_loader, y_test, k=k)

# Save Results
# The column labels are built from k (the cutoff passed to evaluate_model above)
# so the CSV header cannot disagree with the cutoff that was actually evaluated.
results = pd.DataFrame([{
    f"Precision@{k}": precision,
    f"Recall@{k}": recall,
    "Hit Rate": hit_rate,
    "MRR": mrr
}])
results.to_csv(r"C:\Users\user\Desktop\CW4\online+retail\bert4rec_results6.csv", index=False)
print("Results saved.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/20
Epoch 1/20 - 53/53 ━━━━━━ 0s remaining - loss: 7.9619
Epoch 1/20 - Average Loss: 7.9619

Epoch 2/20
Epoch 2/20 - 53/53 ━━━━━━ 0s remaining - loss: 7.3467
Epoch 2/20 - Average Loss: 7.3467

Epoch 3/20
Epoch 3/20 - 53/53 ━━━━━━ 0s remaining - loss: 6.9509
Epoch 3/20 - Average Loss: 6.9509

Epoch 4/20
Epoch 4/20 - 53/53 ━━━━━━ 0s remaining - loss: 6.7492
Epoch 4/20 - Average Loss: 6.7492

Epoch 5/20
Epoch 5/20 - 53/53 ━━━━━━ 0s remaining - loss: 6.6392
Epoch 5/20 - Average Loss: 6.6392

Epoch 6/20
Epoch 6/20 - 53/53 ━━━━━━ 0s remaining - loss: 6.5597
Epoch 6/20 - Average Loss: 6.5597

Epoch 7/20
Epoch 7/20 - 53/53 ━━━━━━ 0s remaining - loss: 6.4972
Epoch 7/20 - Average Loss: 6.4972

Epoch 8/20
Epoch 8/20 - 53/53 ━━━━━━ 0s remaining - loss: 6.4333
Epoch 8/20 - Average Loss: 6.4333

Epoch 9/20
Epoch 9/20 - 53/53 ━━━━━━ 0s remaining - loss: 6.3701
Epoch 9/20 - Average Loss: 6.3701

Epoch 10/20
Epoch 10/20 - 53/53 ━━━━━━ 0s remaining - loss: 6.3155
Epoch 10/20 - Average Loss: 6.31

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load dataset
file_path = r"C:\Users\user\Desktop\CW4\online+retail\Online Retail.xlsx"
df = pd.read_excel(file_path, sheet_name='Online Retail')

# Cleaning: drop rows without a customer or description, keep positive quantities only
df_cleaned = df.dropna(subset=['CustomerID', 'Description'])
df_cleaned = df_cleaned[df_cleaned['Quantity'] > 0]

# Order every customer's rows by invoice date before building sequences
df_sorted = df_cleaned.sort_values(by=['CustomerID', 'InvoiceDate'])

# Get all unique descriptions for fitting
all_items = df_sorted['Description'].unique()

# Fit LabelEncoder on all possible items
item_encoder = LabelEncoder()
item_encoder.fit(all_items)

# Encode the ItemID column.
# LabelEncoder numbers its classes from 0 and pad_sequences (below) fills with 0
# by default, so a real item would be indistinguishable from padding. Shift the
# ids by one to keep 0 reserved for padding; subtract 1 again before calling
# item_encoder.inverse_transform.
df_sorted['ItemID'] = item_encoder.transform(df_sorted['Description']) + 1

# Grouping by Customer: one chronological list of item ids per customer
sequential_data = df_sorted.groupby('CustomerID')['ItemID'].apply(list).reset_index(name='ItemSequence')

# Minimum sequence length
min_sequence_length = 3
sequential_data = sequential_data[sequential_data['ItemSequence'].apply(len) >= min_sequence_length]

# Pad and create sequences ('pre' padding puts the zeros in front of the items)
item_sequences = sequential_data['ItemSequence'].tolist()
sequence_length = 50
padded_sequences = pad_sequences(item_sequences, maxlen=sequence_length, padding='pre')

def create_sequences(sequences, seq_length=30):
    """Build (context, next-item) pairs in which the target is never part of its context.

    Args:
        sequences: iterable of item-id sequences (lists or array rows).
        seq_length: number of positions in every context row.

    Returns:
        (X, y): ``X`` is an array with one ``seq_length``-wide context per row,
        ``y`` holds the item that follows each context.

    Sequences longer than ``seq_length`` yield one sliding window per position
    that still has a following item. Shorter (or equal-length) sequences yield a
    single pair: the last item is held out as the target and the remaining
    items are left-padded with 0 up to ``seq_length``. Sequences with fewer
    than two items cannot form a pair and are skipped.
    """
    X, y = [], []
    for seq in sequences:
        items = list(seq)
        if len(items) < 2:
            # Need at least one context item and one target.
            continue

        if len(items) <= seq_length:
            # Hold the final item out as the target so it is not visible in X.
            context = items[:-1]
            X.append([0] * (seq_length - len(context)) + context)
            y.append(items[-1])
            continue

        # Stop one window early: the last full window has no following item.
        for start in range(len(items) - seq_length):
            X.append(items[start:start + seq_length])
            y.append(items[start + seq_length])
    return np.array(X), np.array(y)

# Turn the padded histories into model inputs (X) and targets (y)
X, y = create_sequences(padded_sequences, seq_length=sequence_length)

# Dataset splitting: the first 80% of rows (in their existing order) train, the rest test
split_ratio = 0.8
split_index = int(split_ratio * len(X))
X_train, y_train = X[:split_index], y[:split_index]
X_test, y_test = X[split_index:], y[split_index:]

# Vocabulary: every distinct id that occurs in the inputs or in the targets
unique_items = np.unique(np.concatenate((X.ravel(), y)))
unique_item_count = unique_items.size

# Fit a fresh LabelEncoder on those ids; this rebinds item_encoder
item_encoder = LabelEncoder().fit(unique_items)

# Only the targets are re-encoded; X keeps the ids it already has
y_train, y_test = (item_encoder.transform(part) for part in (y_train, y_test))

# Sanity check: every label must be a valid class index
assert y_train.max() < unique_item_count, "y_train contains labels outside the valid range!"
assert y_test.max() < unique_item_count, "y_test contains labels outside the valid range!"

print(
    f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}",
    f"Vocabulary Size: {unique_item_count}",
    sep="\n",
)


X_train shape: (3371, 50), X_test shape: (843, 50)
y_train shape: (3371,), y_test shape: (843,)
Vocabulary Size: 3581


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np
import pandas as pd
import time

# Reproducibility: the classification head created by from_pretrained below is
# newly initialised and the training DataLoader shuffles, so seed torch's RNG
# before either happens.
seed = 42
torch.manual_seed(seed)

# CPU Optimization
torch.set_num_threads(16)

# Hyperparameters
batch_size = 32
epochs = 15
learning_rate = 1e-5
sequence_length = 50
k = 10  # cutoff for the top-k ranking metrics

# BERT Tokenizer and Model (one output class per entry of the item vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=unique_item_count,
    ignore_mismatched_sizes=True
)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Prepare DataLoader
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one label per sample.

    ``inputs`` is read as ``inputs['input_ids'][idx]`` and
    ``inputs['attention_mask'][idx]``; ``labels`` is any indexable collection
    whose entries are converted to ``torch.long`` tensors on access.
    """

    def __init__(self, inputs, labels):
        self.labels = labels
        self.inputs = inputs

    def __len__(self):
        # The dataset size is defined by the labels, not by the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {name: self.inputs[name][idx] for name in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Prepare inputs for BERT
def prepare_inputs(X):
    """Render every sequence in ``X`` as one string of its elements joined by single spaces."""
    texts = []
    for seq in X:
        texts.append(" ".join(str(token) for token in seq))
    return texts

X_train_str = prepare_inputs(X_train)
X_test_str = prepare_inputs(X_test)

# max_length counts tokenizer tokens (special tokens included), not items, so a
# row of sequence_length ids never fits and is always truncated. The default
# truncation side is the right, which would drop the end of each row, i.e. the
# most recent purchases; truncate from the left so the oldest/padding positions
# are the ones that get dropped.
# NOTE(review): this makes the final element of each row visible to the model;
# verify that the target is not also the last element of its input row.
tokenizer.truncation_side = "left"

train_encodings = tokenizer(X_train_str, truncation=True, padding=True, max_length=sequence_length, return_tensors='pt')
test_encodings = tokenizer(X_test_str, truncation=True, padding=True, max_length=sequence_length, return_tensors='pt')

train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

# Only the training data is shuffled; evaluation keeps the dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training Function with Epoch Display
def train_model(model, train_loader, optimizer, epochs=10):
    """Train ``model`` on the CPU, printing a running loss and time estimate per step.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose result exposes the training loss as ``.loss``.
        train_loader: sized iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        optimizer: optimizer used to update the model after every batch.
        epochs: number of passes over ``train_loader``.

    Returns:
        list[float]: the average loss of every epoch, in order.

    Raises:
        ValueError: if ``train_loader`` contains no batches.
    """
    model.train()
    device = torch.device('cpu')  # Use CPU
    model.to(device)
    steps_per_epoch = len(train_loader)
    if steps_per_epoch == 0:
        # Without this guard the per-epoch average below divides by zero.
        raise ValueError("train_loader is empty; nothing to train on.")

    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0
        start_time = time.time()
        print(f"\nEpoch {epoch + 1}/{epochs}")

        for step, batch in enumerate(train_loader, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            # The loss is read from the model output, which receives the labels.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Countdown Display: extrapolate remaining time from the mean step time so far
            elapsed_time = time.time() - start_time
            steps_remaining = steps_per_epoch - step
            estimated_time_remaining = steps_remaining * (elapsed_time / step)

            # "\r" rewrites the same console line on every step
            print(
                f"\rEpoch {epoch + 1}/{epochs} - {step}/{steps_per_epoch} ━━━━━━ "
                f"{int(estimated_time_remaining)}s remaining - loss: {total_loss / step:.4f}", end=""
            )

        average_loss = total_loss / steps_per_epoch
        epoch_losses.append(average_loss)
        print(f"\nEpoch {epoch + 1}/{epochs} - Average Loss: {average_loss:.4f}")

    return epoch_losses

# Evaluation Function
def evaluate_model(model, test_loader, y_true, k=10):
    """Compute top-k ranking metrics on the CPU with one relevant item per sample.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` whose
            result exposes per-class scores as ``.logits``.
        test_loader: iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.
        y_true: kept for backward compatibility and no longer read. The metrics
            are averaged over the samples actually yielded by ``test_loader``,
            so they cannot be skewed by a ``y_true`` of a different length.
        k: ranking cutoff. If it exceeds the number of classes, all classes are
            ranked; precision is still divided by the requested ``k``.

    Returns:
        tuple: ``(precision, recall, hit_rate, mrr)`` as floats; all 0.0 when the
        loader yields no samples. Recall equals the hit rate because every
        sample has exactly one relevant item.
    """
    model.eval()
    device = torch.device('cpu')
    model.to(device)
    total_hits, total_mrr, total_users = 0, 0.0, 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            # torch.topk raises when k is larger than the ranked dimension.
            top_k = min(k, logits.size(1))
            _, top_k_indices = torch.topk(logits, top_k, dim=1)

            # top-k indices are distinct per row, so each row has at most one match.
            matches = top_k_indices == labels.unsqueeze(1)
            hit_mask = matches.any(dim=1)
            ranks = matches.float().argmax(dim=1) + 1  # only meaningful where hit_mask is True

            total_hits += int(hit_mask.sum())
            total_mrr += ranks[hit_mask].double().reciprocal().sum().item()
            total_users += labels.size(0)

    if total_users == 0:
        precision = recall = hit_rate = mrr = 0.0
    else:
        hit_rate = total_hits / total_users
        recall = hit_rate
        precision = total_hits / (k * total_users)
        mrr = total_mrr / total_users

    print(f"Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}, Hit Rate: {hit_rate:.4f}, MRR: {mrr:.4f}")
    return precision, recall, hit_rate, mrr

# Train the Model
train_model(model, train_loader, optimizer, epochs=epochs)

# Evaluate the Model
precision, recall, hit_rate, mrr = evaluate_model(model, test_loader, y_test, k=k)

# Save Results
# The column labels are built from k (the cutoff passed to evaluate_model above)
# so the CSV header cannot disagree with the cutoff that was actually evaluated.
results = pd.DataFrame([{
    f"Precision@{k}": precision,
    f"Recall@{k}": recall,
    "Hit Rate": hit_rate,
    "MRR": mrr
}])
results.to_csv(r"C:\Users\user\Desktop\CW4\online+retail\bert4rec_results7.csv", index=False)
print("Results saved.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/15
Epoch 1/15 - 106/106 ━━━━━━ 0s remaining - loss: 8.0777
Epoch 1/15 - Average Loss: 8.0777

Epoch 2/15
Epoch 2/15 - 106/106 ━━━━━━ 0s remaining - loss: 7.5810
Epoch 2/15 - Average Loss: 7.5810

Epoch 3/15
Epoch 3/15 - 106/106 ━━━━━━ 0s remaining - loss: 7.1895
Epoch 3/15 - Average Loss: 7.1895

Epoch 4/15
Epoch 4/15 - 106/106 ━━━━━━ 0s remaining - loss: 6.9770
Epoch 4/15 - Average Loss: 6.9770

Epoch 5/15
Epoch 5/15 - 106/106 ━━━━━━ 0s remaining - loss: 6.8323
Epoch 5/15 - Average Loss: 6.8323

Epoch 6/15
Epoch 6/15 - 106/106 ━━━━━━ 0s remaining - loss: 6.7372
Epoch 6/15 - Average Loss: 6.7372

Epoch 7/15
Epoch 7/15 - 106/106 ━━━━━━ 0s remaining - loss: 6.6699
Epoch 7/15 - Average Loss: 6.6699

Epoch 8/15
Epoch 8/15 - 106/106 ━━━━━━ 0s remaining - loss: 6.6096
Epoch 8/15 - Average Loss: 6.6096

Epoch 9/15
Epoch 9/15 - 106/106 ━━━━━━ 0s remaining - loss: 6.5790
Epoch 9/15 - Average Loss: 6.5790

Epoch 10/15
Epoch 10/15 - 106/106 ━━━━━━ 0s remaining - loss: 6.5463
Epoch 10/15 