## Preparation Phase

In [1]:
%pip install recbole
%pip install kmeans_pytorch



In [2]:
import os
import numpy as np
import torch
import yaml
import pickle
import numpy as np
from tqdm import tqdm

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender.sasrec import SASRec
from recbole.trainer import Trainer
from recbole.utils import init_logger, get_model, get_trainer
from recbole.model.general_recommender.ldiffrec import LDiffRec

  from .autonotebook import tqdm as notebook_tqdm


## Data Preparation

In [8]:
import os
import numpy as np
import torch
import yaml
import pickle
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender.sasrec import SASRec
from recbole.trainer import Trainer
from recbole.utils import get_trainer

# Step 1: Data preparation
domains = ["All_Beauty", "Video_Games", "Baby_Products"]
datasets = {}
for domain in domains:
    datasets[domain] = {}
    datasets[domain]["reviews"] = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"0core_timestamp_w_his_{domain}", trust_remote_code=True)
    datasets[domain]['metadata'] = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_meta_{domain}", split="full", trust_remote_code=True)

def preprocess_metadata(examples):
    examples['features'] = [' '.join(features) for features in examples['features']]
    return examples

for domain in domains:
    datasets[domain]['metadata'] = datasets[domain]['metadata'].map(preprocess_metadata, batched=True, num_proc=4)

for domain in domains:
    items_with_images_ids = set(datasets[domain]['metadata'].filter(lambda example: len(example['images']) > 0)['parent_asin'])
    datasets[domain]['reviews'] = datasets[domain]['reviews'].filter(lambda example: example['parent_asin'] in items_with_images_ids)


## Baseline Model Implementation

In [None]:

# Step 2: Baseline Models Implementation
tokenizer = AutoTokenizer.from_pretrained("hyp1231/blair-roberta-base")
blair_model = AutoModel.from_pretrained("hyp1231/blair-roberta-base")
blair_model = blair_model.to('cuda') if torch.cuda.is_available() else blair_model.to('cpu')
info_model = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {info_model}")

for domain in tqdm(domains, desc="Processing domains"):
    data_dir = f"dataset/{domain}"
    os.makedirs(data_dir, exist_ok=True)

    inter_path = os.path.join(data_dir, f"{domain}.inter")

    if os.path.exists(inter_path):
        print(f"Interaction dataset already exists for {domain}. Skipping creation.")
    else:
        df = datasets[domain]['reviews']['train'].to_pandas()
        # Ensure the .inter file has the correct headers
        header = "user_id:token\titem_id:token\trating:float\ttimestamp:float\thistory:token_seq"
        df.to_csv(inter_path, index=False, sep='\t', header=False)
        with open(inter_path, 'r') as original:
            data = original.read()
        with open(inter_path, 'w') as modified:
            modified.write(header + '\n' + data)

def extract_item_embeddings(domain, item_ids, batch_size=1024, save_interval=10000):
    item_metadata = datasets[domain]['metadata'].filter(lambda example: example['parent_asin'] in item_ids)
    item_texts = [example['title'] + ' ' + ' '.join(example['features']) for example in item_metadata]
    item_embeddings = {}
    count = 0
    for i in tqdm(range(0, len(item_texts), batch_size), desc="Extracting embeddings"):
        batch_texts = item_texts[i:i+batch_size]
        encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=64, return_tensors='pt')
        encoded_inputs = {k: v.to(blair_model.device) for k, v in encoded_inputs.items()}
        with torch.no_grad():
            batch_embeddings = blair_model(**encoded_inputs).last_hidden_state[:, 0].cpu().numpy()
        for item_id, embedding in zip(item_ids[i:i+batch_size], batch_embeddings):
            item_embeddings[item_id] = embedding
        count += len(batch_embeddings)
        
        if count >= save_interval:
            save_embeddings(domain, item_embeddings, count)
            count = 0
    
    save_embeddings(domain, item_embeddings, count)
    return item_embeddings

def save_embeddings(domain, embeddings, count):
    embeddings_file = f"item_embeddings_{domain}.pkl"
    if os.path.exists(embeddings_file):
        with open(embeddings_file, "rb") as f:
            existing_embeddings = pickle.load(f)
        existing_embeddings.update(embeddings)
        with open(embeddings_file, "wb") as f:
            pickle.dump(existing_embeddings, f)
    else:
        with open(embeddings_file, "wb") as f:
            pickle.dump(embeddings, f)
    print(f"Saved {count} embeddings to {embeddings_file}")

item_embeddings = {}
for domain in domains:
    embeddings_file = f"item_embeddings_{domain}.pkl"
    if os.path.exists(embeddings_file):
        print(f"Loading saved item embeddings for {domain}")
        with open(embeddings_file, "rb") as f:
            item_embeddings[domain] = pickle.load(f)
    else:
        print(f"Extracting item embeddings for {domain}")
        item_ids = set(datasets[domain]['metadata']['parent_asin'])
        item_embeddings[domain] = extract_item_embeddings(domain, item_ids)
        with open(embeddings_file, "wb") as f:
            pickle.dump(item_embeddings[domain], f)

# Load the existing config.yaml
with open('config.yaml', 'r') as f:
    config_dict = yaml.safe_load(f)

if torch.cuda.is_available():
    config_dict['gpu_id'] = 'cuda:0'
else:
    config_dict['gpu_id'] = '0'

config_dict['train_neg_sample_args'] = None

for domain in tqdm(domains, desc="Processing domains"):
    dataset_path = f"dataset/{domain}"
    config_dict["dataset"] = domain

    # Create Config object
    config = Config(model="SASRec", config_dict=config_dict)
    print(f"Config for {domain}: {config}")

    dataset = create_dataset(config)
    train_data, valid_data, test_data = data_preparation(config, dataset)

    class BLAIRSASRec(SASRec):
        def __init__(self, config, dataset):
            super().__init__(config, dataset)
            self.item_embeddings = torch.tensor(np.array(list(item_embeddings[domain].values())), dtype=torch.float32).to(config['device'])
            self.item_embedding_size = self.item_embeddings.shape[1]
            self.position_embedding = torch.nn.Embedding(config['MAX_ITEM_LIST_LENGTH'], self.item_embedding_size)
            self.trm_encoder = torch.nn.TransformerEncoder(
                torch.nn.TransformerEncoderLayer(d_model=self.item_embedding_size, nhead=config['n_heads']),
                num_layers=config['num_layers']
            )

        def forward(self, item_seq, item_seq_len):
            item_emb = self.item_embeddings[item_seq].to(self.device)
            position_ids = torch.arange(item_seq.size(1), dtype=torch.long, device=item_seq.device).unsqueeze(0).expand(item_seq.size(0), -1)
            position_embedding = self.position_embedding(position_ids)
            seq_emb = item_emb + position_embedding
            seq_emb = self.dropout(seq_emb)

            mask = (torch.arange(seq_emb.size(1), device=item_seq.device).unsqueeze(0).expand(seq_emb.size(0), -1) < item_seq_len.unsqueeze(-1))
            seq_emb *= mask.unsqueeze(-1)

            batch_size = seq_emb.size(0)
            x = seq_emb.transpose(0, 1)  # Transpose to shape [seq_len, batch_size, embed_size]
            mask = ~mask  # Invert the mask
            seq_output = self.trm_encoder(x, src_key_padding_mask=mask)
            seq_output = seq_output.transpose(0, 1)  # Transpose back to shape [batch_size, seq_len, embed_size]

            return seq_output

    model = BLAIRSASRec(config, train_data.dataset).to(config['device'])
    trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

    best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)
    test_result = trainer.evaluate(test_data)

    print(f"Domain: {domain}")
    print(f"Best valid result: {best_valid_result}")
    print(f"Test result: {test_result}")


Using device: mps


Processing domains: 100%|██████████| 3/3 [00:00<00:00, 4634.59it/s]

Interaction dataset already exists for All_Beauty. Skipping creation.
Interaction dataset already exists for Video_Games. Skipping creation.
Interaction dataset already exists for Baby_Products. Skipping creation.
Loading saved item embeddings for All_Beauty





Loading saved item embeddings for Video_Games
Loading saved item embeddings for Baby_Products


Processing domains:   0%|          | 0/3 [00:00<?, ?it/s]

Config for All_Beauty: 
[1;35mGeneral Hyper Parameters:
[0m[1;36mgpu_id[0m =[1;33m mps[0m
[1;36muse_gpu[0m =[1;33m True[0m
[1;36mseed[0m =[1;33m 2020[0m
[1;36mstate[0m =[1;33m INFO[0m
[1;36mreproducibility[0m =[1;33m True[0m
[1;36mdata_path[0m =[1;33m dataset/All_Beauty[0m
[1;36mcheckpoint_dir[0m =[1;33m saved[0m
[1;36mshow_progress[0m =[1;33m True[0m
[1;36msave_dataset[0m =[1;33m False[0m
[1;36mdataset_save_path[0m =[1;33m None[0m
[1;36msave_dataloaders[0m =[1;33m False[0m
[1;36mdataloaders_save_path[0m =[1;33m None[0m
[1;36mlog_wandb[0m =[1;33m False[0m

[1;35mTraining Hyper Parameters:
[0m[1;36mepochs[0m =[1;33m 300[0m
[1;36mtrain_batch_size[0m =[1;33m 2048[0m
[1;36mlearner[0m =[1;33m adam[0m
[1;36mlearning_rate[0m =[1;33m 0.001[0m
[1;36mtrain_neg_sample_args[0m =[1;33m {'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}[0m
[1;36meval_step[0m =[1;33m 1[0m

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[field].fillna(value="", inplace=True)
  split_point = np.cumsum(feat[field].agg(len))[:-1]
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

: 

In [4]:
# from datasets import load_dataset

# # Load the dataset
# dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", split="full", trust_remote_code=True)

# # Filter items with available image URLs
# items_with_images = [item for item in dataset if len(item['images']['large']) > 0]

# # Create a dictionary mapping item IDs to image URLs
# item_image_urls = {item['parent_asin']: item['images']['large'][0] for item in items_with_images}

In [7]:
%pip install clip

Collecting clip
  Downloading clip-0.2.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25ldone
[?25h  Created wheel for clip: filename=clip-0.2.0-py3-none-any.whl size=6988 sha256=0705ef2f8eb720369fec73d091db104e77e459895a90468de5d255fde6e9476c
  Stored in directory: /Users/marcviolides/Library/Caches/pip/wheels/6c/fd/54/9d4e15cf829b871199a7cd3597e869a514d1624a0a43076896
Successfully built clip
Installing collected packages: clip
Successfully installed clip-0.2.0


In [13]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from tqdm import tqdm
import requests
from io import BytesIO
import numpy as np
import os
from datasets import load_dataset


# Set the device to MPS if available, else use CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load the pre-trained CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model.to(device)
model.eval()

# Create the embeddings directory if it doesn't exist
os.makedirs("embeddings", exist_ok=True)

for domain in domains:
    print(f"Processing domain: {domain}")
    item_image_urls = {item['parent_asin']: item['images']['large'] for item in datasets[domain]['metadata'] if len(item['images']['large']) > 0}
    
    domain_dir = os.path.join("embeddings", domain)
    os.makedirs(domain_dir, exist_ok=True)

    # Generate CLIP embeddings for the images and save them to disk
    for item_id, image_urls in tqdm(item_image_urls.items(), desc=f"Generating CLIP embeddings for {domain}"):
        embedding_path = os.path.join(domain_dir, f"{item_id}.npy")
        
        # Check if the embedding already exists
        if os.path.exists(embedding_path):
            print(f"Embedding for item {item_id} already exists. Skipping.")
            continue

        try:
            all_image_features = []
            for image_url in image_urls:
                # Download the image from the URL
                response = requests.get(image_url)
                image = Image.open(BytesIO(response.content)).convert("RGB")

                # Preprocess the image
                inputs = processor(images=image, return_tensors="pt").to(device)

                # Generate CLIP embedding
                with torch.no_grad():
                    image_features = model.get_image_features(**inputs)
                    all_image_features.append(image_features.squeeze().cpu().numpy())

            # Aggregate the image embeddings (e.g., by averaging)
            if all_image_features:
                aggregated_image_features = np.mean(all_image_features, axis=0)

                # Save the aggregated embedding to disk
                np.save(embedding_path, aggregated_image_features)
        except Exception as e:
            print(f"Error processing item {item_id}: {str(e)}")
            continue


Using device: mps
Processing domain: All_Beauty


Generating CLIP embeddings for All_Beauty:   5%|▍         | 5380/112590 [00:00<00:01, 53743.54it/s]

Embedding for item B01CUPMQZE already exists. Skipping.
Embedding for item B076WQZGPM already exists. Skipping.
Embedding for item B000B658RI already exists. Skipping.
Embedding for item B088FKY3VD already exists. Skipping.
Embedding for item B07NGFDN6G already exists. Skipping.
Embedding for item B07G9GWFSM already exists. Skipping.
Embedding for item B08XZ97HFY already exists. Skipping.
Embedding for item B08DNQTTQK already exists. Skipping.
Embedding for item B01ERJEGS6 already exists. Skipping.
Embedding for item B08P7LXKP7 already exists. Skipping.
Embedding for item B01555WAOS already exists. Skipping.
Embedding for item B07GGYR3LJ already exists. Skipping.
Embedding for item B06XJZ7955 already exists. Skipping.
Embedding for item B07FVZVQKV already exists. Skipping.
Embedding for item B00FCQHYEW already exists. Skipping.
Embedding for item B07ZJW55Z5 already exists. Skipping.
Embedding for item B08X7JHQG5 already exists. Skipping.
Embedding for item B07WFSQXL5 already exists. Sk

Generating CLIP embeddings for All_Beauty:   9%|▉         | 10507/112590 [32:02<5:11:16,  5.47it/s] 


KeyboardInterrupt: 