In [1]:
import datasets
datasets.logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


# Step 1: Data Preparation

## 1.1 Load the Amazon Reviews 2023 dataset for the selected domains

In [3]:
from datasets import load_dataset

# Amazon Reviews 2023 categories used throughout the notebook.
domains = ["All_Beauty", "Video_Games", "Baby_Products"]

# dfs maps each domain name to {"reviews": ..., "metadata": ...}.
# Reviews are loaded without a split argument (later cells index them with
# ['full']); metadata is loaded directly as its "full" split.
dfs = {}
for domain in domains:
    domain_data = {}
    dfs[domain] = domain_data
    domain_data["reviews"] = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        f"raw_review_{domain}",
        trust_remote_code=True,
    )
    domain_data['metadata'] = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        f"raw_meta_{domain}",
        split="full",
        trust_remote_code=True,
    )



## 1.2 Preprocess the data

In [4]:
# Show the first review and the first metadata record of every domain.
for domain in domains:
    domain_data = dfs[domain]
    print(f"Domain: {domain}")
    print("Reviews:")
    print(domain_data['reviews']["full"][0])
    print("Metadata:")
    print(domain_data['metadata'][0])
    print()

# Report how many reviews each domain contains.
for domain in domains:
    num_reviews = len(dfs[domain]['reviews']['full'])
    print(f"Domain: {domain}")
    print(f"Dataset size: {num_reviews}")



def _has_images(example):
    """Return True when a metadata record carries at least one image URL.

    The metadata rows expose `images` as a dict of parallel lists
    (e.g. {'hi_res': [...], 'large': [...]}), so `len(example['images'])`
    counts the dict keys and is non-zero even for items without any image.
    Look inside the lists instead and ignore None placeholders.
    """
    images = example['images']
    if isinstance(images, dict):
        return any(url for urls in images.values() for url in (urls or []))
    # Fallback for a plain list-of-images layout.
    return len(images) > 0


# Count items with images, then keep only the reviews that refer to them.
# The metadata filter is computed once per domain and reused for both steps.
for domain in domains:
    metadata = dfs[domain]['metadata']
    items_with_images = metadata.filter(_has_images)
    print(f"Domain: {domain}")
    print(f"Items with images: {len(items_with_images)}")
    print(f"Total items: {len(metadata)}")
    print()

    # A set gives constant-time membership tests inside the review filter.
    items_with_images_ids = set(items_with_images['parent_asin'])
    dfs[domain]['reviews'] = dfs[domain]['reviews'].filter(
        lambda example: example['parent_asin'] in items_with_images_ids
    )

Domain: All_Beauty
Reviews:
{'rating': 5.0, 'title': 'Such a lovely scent but not overpowering.', 'text': "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!", 'images': [], 'asin': 'B00YQ6X8EO', 'parent_asin': 'B00YQ6X8EO', 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ', 'timestamp': 1588687728923, 'helpful_vote': 0, 'verified_purchase': True}
Metadata:
{'main_category': 'All Beauty', 'title': 'Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)', 'average_rating': 4.8, 'rating_number': 10, 'features': [], 'description': [], 'price': 'None', 'images': {'hi_res': [None, 'https://m.media-amazon.com/images/I/71i77AuI9xL._SL1500_.jpg'], 'large': ['https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg', 'https://m.media-amazon.com/images/I/41w2yznfuZL

Filter: 100%|██████████| 112590/112590 [00:03<00:00, 28456.24 examples/s]


Domain: All_Beauty
Items with images: 112590
Total items: 112590



Filter: 100%|██████████| 137269/137269 [00:05<00:00, 23033.07 examples/s]


Domain: Video_Games
Items with images: 137269
Total items: 137269



Filter: 100%|██████████| 217724/217724 [00:09<00:00, 22835.49 examples/s]


Domain: Baby_Products
Items with images: 217724
Total items: 217724



Filter: 100%|██████████| 112590/112590 [00:03<00:00, 31777.25 examples/s]
Filter: 100%|██████████| 701528/701528 [00:04<00:00, 152402.66 examples/s]
Filter: 100%|██████████| 137269/137269 [00:05<00:00, 25977.19 examples/s]
Filter: 100%|██████████| 4624615/4624615 [00:29<00:00, 154642.96 examples/s]
Filter: 100%|██████████| 217724/217724 [00:08<00:00, 25326.42 examples/s]
Filter: 100%|██████████| 6028884/6028884 [00:36<00:00, 163430.41 examples/s]


In [5]:
import re

def preprocess_reviews(examples):
    """Normalise one batch of reviews in place and return it.

    `examples` is a column-oriented batch (column name -> list of values).
    The function:
      * lowercases "text" and strips every character that is neither a word
        character nor whitespace,
      * adds "title_tokens", the whitespace-split words of each "title"
        (the title itself is left unchanged),
      * joins each "features" list into one space-separated string, but only
        when the batch has a "features" column.
    """
    non_word = re.compile(r'[^\w\s]')

    cleaned_text = []
    for body in examples["text"]:
        cleaned_text.append(non_word.sub("", body.lower()))
    examples["text"] = cleaned_text

    examples['title_tokens'] = [heading.split() for heading in examples['title']]

    if 'features' in examples:
        examples['features'] = [' '.join(parts) for parts in examples['features']]

    return examples

def preprocess_metadata(examples):
    """Normalise one batch of item metadata in place and return it.

    `examples` is a column-oriented batch (column name -> list of values).
    The function:
      * lowercases "title" and strips every character that is neither a word
        character nor whitespace,
      * adds "title_tokens", the whitespace-split words of the cleaned title,
      * joins each "features" list into one space-separated string.
    """
    non_word = re.compile(r'[^\w\s]')

    cleaned_titles = [non_word.sub('', heading.lower()) for heading in examples['title']]
    examples['title'] = cleaned_titles
    examples['title_tokens'] = [heading.split() for heading in cleaned_titles]

    flattened = []
    for parts in examples['features']:
        flattened.append(' '.join(parts))
    examples['features'] = flattened

    return examples

# Run both preprocessors over every domain, batched and with 4 worker processes.
for domain in domains:
    domain_data = dfs[domain]
    domain_data['reviews'] = domain_data['reviews'].map(
        preprocess_reviews,
        batched=True,
        num_proc=4,
    )
    domain_data['metadata'] = domain_data['metadata'].map(
        preprocess_metadata,
        batched=True,
        num_proc=4,
    )

Map (num_proc=4): 100%|██████████| 701528/701528 [00:09<00:00, 72830.73 examples/s]
Map (num_proc=4): 100%|██████████| 112590/112590 [00:00<00:00, 173055.65 examples/s]
Map (num_proc=4): 100%|██████████| 4624615/4624615 [01:05<00:00, 70257.41 examples/s]
Map (num_proc=4): 100%|██████████| 137269/137269 [00:01<00:00, 87825.34 examples/s]
Map (num_proc=4): 100%|██████████| 6028884/6028884 [01:25<00:00, 70342.90 examples/s]
Map (num_proc=4): 100%|██████████| 217724/217724 [00:02<00:00, 89488.21 examples/s] 


## 1.3 Split the data into train, validation and test using temporal split

In [6]:
from datasets import DatasetDict

# Temporal 80/10/10 split: the oldest reviews form the training set, the next
# slice the validation set, and the most recent reviews the test set.
# (The previous random train_test_split mixed past and future interactions,
# which contradicts the "temporal split" this section describes.)
for domain in domains:
    # Order reviews chronologically by their 'timestamp' column.
    reviews_by_time = dfs[domain]['reviews']['full'].sort('timestamp')

    total = len(reviews_by_time)
    train_size = int(0.8 * total)
    val_size = int(0.1 * total)
    val_end = train_size + val_size  # the test split takes whatever remains

    # NOTE: this replaces the {'full': ...} DatasetDict, so the cell can only
    # be run once per kernel session without reloading the data.
    dfs[domain]['reviews'] = DatasetDict({
        'train': reviews_by_time.select(range(train_size)),
        'validation': reviews_by_time.select(range(train_size, val_end)),
        'test': reviews_by_time.select(range(val_end, total)),
    })

In [7]:
# Report the number of reviews in each split, per domain.
for domain in domains:
    review_splits = dfs[domain]['reviews']
    print(f"Domain: {domain}")
    print("Train set size:", len(review_splits['train']))
    print("Validation set size:", len(review_splits['validation']))
    print("Test set size:", len(review_splits['test']))
    print()

Domain: All_Beauty
Train set size: 561222
Validation set size: 70152
Test set size: 70154

Domain: Video_Games
Train set size: 3699692
Validation set size: 462461
Test set size: 462462

Domain: Baby_Products
Train set size: 4823107
Validation set size: 602888
Test set size: 602889



# Step 2: Baseline Models Implementation

## 2.1 Prepare the data for BERT4Rec

In [9]:
import torch
from transformers import BertConfig, BertForSequenceClassification
from transformers import BertTokenizerFast

def prepare_bert_input(examples, tokenizer, max_seq_length):
    """Tokenize the "text" column of a batch for a BERT-style model.

    Args:
        examples: column-oriented batch containing a 'text' entry.
        tokenizer: callable tokenizer; invoked with truncation and
            'max_length' padding and asked for PyTorch tensors.
        max_seq_length: length every sequence is truncated/padded to.

    Returns:
        A dict with the tokenizer's 'input_ids', 'attention_mask' and
        'token_type_ids' outputs; any other tokenizer output is dropped.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=max_seq_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    wanted = ('input_ids', 'attention_mask', 'token_type_ids')
    return {field: encoded[field] for field in wanted}

# Tokenizer settings shared by every domain and split.
max_seq_length = 128
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Prefer the Apple-silicon GPU backend (MPS) when present, otherwise use the CPU.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(device)

# Tokenize train, validation and test (in that order) for every domain.
for domain in domains:
    review_splits = dfs[domain]['reviews']
    for split_name in ('train', 'validation', 'test'):
        review_splits[split_name] = review_splits[split_name].map(
            lambda examples: prepare_bert_input(examples, tokenizer, max_seq_length),
            batched=True,
            batch_size=256,
            num_proc=4
        )





mps


Map (num_proc=4): 100%|██████████| 561222/561222 [00:26<00:00, 21201.81 examples/s]
Map (num_proc=4): 100%|██████████| 70152/70152 [00:03<00:00, 20899.38 examples/s]
Map (num_proc=4): 100%|██████████| 70154/70154 [00:03<00:00, 20676.01 examples/s]
Map (num_proc=4): 100%|██████████| 3699692/3699692 [03:45<00:00, 16410.91 examples/s]
Map (num_proc=4): 100%|██████████| 462461/462461 [00:28<00:00, 16191.67 examples/s]
Map (num_proc=4): 100%|██████████| 462462/462462 [00:27<00:00, 16607.54 examples/s]
Map (num_proc=4): 100%|██████████| 4823107/4823107 [04:05<00:00, 19629.69 examples/s]
Map (num_proc=4): 100%|██████████| 602888/602888 [00:30<00:00, 19489.38 examples/s]
Map (num_proc=4): 100%|██████████| 602889/602889 [00:30<00:00, 19463.10 examples/s]


## 2.2 Define the BERT4Rec model

In [10]:
def create_bert4rec_model(num_items):
    """Build a BERT sequence classifier with one output label per item.

    Only the 'bert-base-uncased' *configuration* is fetched; the model is
    instantiated from that config rather than loaded from a checkpoint.
    NOTE(review): this presumably means the weights start untrained —
    confirm that training from scratch is intended.

    Args:
        num_items: number of classes (`num_labels`) of the classifier head.

    Returns:
        A `BertForSequenceClassification` instance.
    """
    return BertForSequenceClassification(
        BertConfig.from_pretrained('bert-base-uncased', num_labels=num_items)
    )

## 2.3 Train and evaluate BERT4Rec Model

In [15]:
from tqdm import tqdm

# Check if MPS device is available
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Using MPS device.")
else:
    device = torch.device('cpu')
    print("MPS device not found, using CPU.")

accumulation_steps = 4  # Adjust this value based on your memory constraints


def _build_loader(split, item_to_idx, shuffle=False, batch_size=64):
    """Wrap one tokenized review split in a DataLoader.

    Each batch is (input_ids, attention_mask, token_type_ids, labels), where
    the label of a review is the index of its 'parent_asin' in `item_to_idx`.
    pin_memory is left at its default: the notebook only runs on MPS or CPU,
    where pinned host memory is not used.
    """
    labels = [item_to_idx[asin] for asin in split['parent_asin']]
    dataset = torch.utils.data.TensorDataset(
        torch.tensor(split['input_ids']),
        torch.tensor(split['attention_mask']),
        torch.tensor(split['token_type_ids']),
        torch.tensor(labels, dtype=torch.long),
    )
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def _evaluate(model, loader, loss_fn, device):
    """Return (mean batch loss, mean batch accuracy) of `model` over `loader`."""
    model.eval()
    total_loss = 0.0
    total_accuracy = 0.0
    with torch.no_grad():
        for batch in loader:
            input_ids, attention_mask, token_type_ids, labels = (t.to(device) for t in batch)
            logits = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).logits
            total_loss += loss_fn(logits, labels).item()
            total_accuracy += (logits.argmax(dim=1) == labels).float().mean().item()
    num_batches = max(len(loader), 1)  # avoid dividing by zero on an empty split
    return total_loss / num_batches, total_accuracy / num_batches


for domain in domains:
    # Class index of every item. The classifier has one label per metadata row,
    # so the target of a review is the position of its item in the metadata.
    # (Previously every target was the constant 0, which made the loss and the
    # accuracy meaningless.)
    # NOTE(review): confirm that "predict the reviewed item" is the intended task.
    item_to_idx = {asin: idx for idx, asin in enumerate(dfs[domain]['metadata']['parent_asin'])}
    num_items = len(dfs[domain]['metadata'])
    model = create_bert4rec_model(num_items).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    loss_fn = torch.nn.CrossEntropyLoss()

    review_splits = dfs[domain]['reviews']
    train_loader = _build_loader(review_splits['train'], item_to_idx, shuffle=True)
    val_loader = _build_loader(review_splits['validation'], item_to_idx)

    for epoch in range(3):
        model.train()
        train_loss = 0.0
        train_accuracy = 0.0
        # Start every epoch with clean gradients. Inside the loop they are only
        # cleared right after an optimizer step, so that `accumulation_steps`
        # micro-batches really accumulate (zeroing on every iteration threw
        # away all but the last micro-batch).
        optimizer.zero_grad()

        with tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch") as train_progress:
            for i, batch in enumerate(train_progress):
                input_ids, attention_mask, token_type_ids, labels = (t.to(device) for t in batch)

                outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                logits = outputs.logits
                loss = loss_fn(logits, labels)
                # Scale only the gradient; the logged loss stays unscaled.
                (loss / accumulation_steps).backward()

                # Step every `accumulation_steps` batches and once more at the
                # end of the epoch so trailing gradients are not discarded.
                if (i + 1) % accumulation_steps == 0 or (i + 1) == len(train_loader):
                    optimizer.step()
                    optimizer.zero_grad()

                train_loss += loss.item()
                train_accuracy += (logits.argmax(dim=1) == labels).float().mean().item()

                # Running averages over the batches processed so far.
                train_progress.set_postfix(loss=train_loss / (i + 1), accuracy=train_accuracy / (i + 1))

        val_loss, val_accuracy = _evaluate(model, val_loader, loss_fn, device)
        print(f"Epoch {epoch+1}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

    test_loader = _build_loader(review_splits['test'], item_to_idx)
    test_loss, test_accuracy = _evaluate(model, test_loader, loss_fn, device)
    print(f"Domain: {domain}")
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print()

Using MPS device.


Epoch 1:   0%|          | 0/8770 [01:26<?, ?batch/s]


KeyboardInterrupt: 