# Step 0: Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer, MT5EncoderModel
from tqdm import tqdm
import torch
import time

In [2]:
df = pd.read_csv("../../p1_data/cleaned/amazon_clean.csv")

# Step 1: Select Device (GPU/CPU)

In [3]:
def select_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("GPU Nvidia cuda will be our device")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Apple Silicon Chip will be our device")
    else:
        device = torch.device("cpu")
        print("CPU will be our device")
    
    return device
device = select_device()

Apple Silicon Chip will be our device


# Step 2: Get Model and Tokenizer

In [4]:
def select_model_and_tokenizer(model_name):
    ## Model
    # Select Model [for mt5 just import the encoder]
    if "mt5" in model_name:
        model = MT5EncoderModel.from_pretrained(model_name)
    else:
        model = AutoModel.from_pretrained(model_name)

    # Move model to device
    model.to(device)
    model.eval()
    ####################################################
    ## Tokenizer
    # Select Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)    

    return tokenizer, model

# Step 3: Tokenization and Embedding function

In [5]:
def tokenization_and_embedding(dataframe, model_name, batch_size=100):
    # Starting timer
    total_start_time = time.time()
    
    # Set model and tokenizer
    tokenizer, model = select_model_and_tokenizer(model_name)
    
    # Convert dataframe's "cleaned_review" column to list of reviews
    reviews_data = dataframe["cleaned_review"].astype(str).to_list()
    
    
    # ------------------- Tokenization and Embedding in batches ------------------------------------

    # List of all embeddings from the batches
    all_embeddings = []

    # Batcheting
    with torch.no_grad():
        for i in tqdm(range(0, len(reviews_data), batch_size), desc="Processing Batches"):
            batch_reviews = reviews_data[i: i+batch_size]


    # Tokenizing each batch and save into "inputs"
            inputs = tokenizer(
            batch_reviews,
            max_length = 256,
            padding = 'max_length',
            truncation = True,
            return_tensors = 'pt'
            )
            

    # Move inputs of each batch to the same device as the model
            inputs = {k: v.to(device) for k, v in inputs.items()}

    
    # Embedding each batch
            #Forward pass
            outputs = model(**inputs)
        
            # Mean pooling to get sentence embeddings
            embeddings = outputs.last_hidden_state.mean(dim=1)
        
            # Move to CPU and convert to numpy
            batch_embeddings = embeddings.cpu().detach().numpy()
        
            # Append to all embeddings list
            all_embeddings.append(batch_embeddings)


    # Ending Timer
    total_end_time = time.time()
    total_elapsed_time = total_end_time - total_start_time
            
    print(f"The total time used by {model_name} model is {total_elapsed_time} seconds, which equals to {total_elapsed_time/60} minutes, and {total_elapsed_time/3600} hours")
            
    # Combine all batches
    final_embeddings = np.vstack(all_embeddings)

    
    return final_embeddings

# Step 4: Experiment for four models

In [6]:
# Define the models we want to use
models = [
    "bert-base-uncased",
    "distilbert-base-uncased", 
    "xlm-roberta-base",
    "google/mt5-base"
]

# Dictionary to store all embeddings
all_embeddings_dict = {}

In [7]:
for model_name in models:
    try:
        embeddings = tokenization_and_embedding(df, model_name)
        all_embeddings_dict[model_name] = embeddings

        # Save embeddings
        saving_name = model_name.replace("/", "_")
        np.save(f"embeddings_{saving_name}.npy", embeddings)
        print(f"üíæ Saved: embeddings_{saving_name}.npy")
    except Exception as e:
        print(f"‚ùå Error with {model_name}: {str(e)}")
        continue

Processing Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3633/3633 [2:09:41<00:00,  2.14s/it]


The total time used by bert-base-uncased model is 7782.892159938812 seconds, which equals to 129.71486933231353 minutes, and 2.161914488871892 hours
üíæ Saved: embeddings_bert-base-uncased.npy


Processing Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3633/3633 [1:03:50<00:00,  1.05s/it]


The total time used by distilbert-base-uncased model is 3831.9693439006805 seconds, which equals to 63.866155731678006 minutes, and 1.0644359288613001 hours
üíæ Saved: embeddings_distilbert-base-uncased.npy


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Processing Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3633/3633 [2:06:25<00:00,  2.09s/it]


The total time used by xlm-roberta-base model is 7914.656589984894 seconds, which equals to 131.9109431664149 minutes, and 2.1985157194402483 hours
üíæ Saved: embeddings_xlm-roberta-base.npy


config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

‚ùå Error with google/mt5-base: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwe