In [None]:
import os
import time
import pandas as pd
import numpy as np
import torch
import tiktoken
from transformers import AutoTokenizer, AutoModel, AutoFeatureExtractor

# Hugging Face checkpoint names of the transformer models used to embed text.
# Every entry is loaded by name via AutoTokenizer / AutoModel further down.
options_text = [
    # English checkpoints
    "j-hartmann/emotion-english-distilroberta-base",
    "distilbert/distilroberta-base",
    "j-hartmann/emotion-english-roberta-large",
    "FacebookAI/roberta-large",
    # Spanish and multilingual checkpoints
    "bertin-project/bertin-roberta-base-spanish",  # RoBERTa Spanish
    "dccuchile/bert-base-spanish-wwm-cased",  # BETO
    "google-bert/bert-base-multilingual-cased",  # multilingual BERT
]


In [None]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """
    Count the tokens that the named tiktoken encoding produces for ``string``.

    Falsy input (``None`` or an empty string) is reported as 0 tokens without
    looking up the encoding at all.
    """
    if string:
        tokens = tiktoken.get_encoding(encoding_name).encode(string)
        return len(tokens)
    return 0


In [None]:
def compute_transformer_embedding(text: str, tokenizer, model, device, max_length=None):
    """
    Embed ``text`` with a transformer encoder and return a single 1-D vector.

    The text is tokenized with truncation enabled, run through ``model``
    without gradient tracking, and the last-layer hidden state at sequence
    position 0 is returned. For BERT-style tokenizers that position holds the
    ``[CLS]`` token; for RoBERTa-style tokenizers it holds ``<s>``.

    Args:
        text: Text to embed.
        tokenizer: Callable tokenizer that returns a mapping of tensors when
            invoked with ``return_tensors='pt'``.
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the input tensors are moved to. The model itself is not
            moved here; the caller is expected to have placed it already.
        max_length: Optional cap on the tokenized length. ``None`` (the
            default) defers to the tokenizer's own limit, exactly as before.
            NOTE(review): per the Hugging Face docs, a tokenizer with no
            predefined maximum length does not truncate when ``max_length`` is
            unset -- pass an explicit value for such checkpoints.

    Returns:
        numpy.ndarray: The position-0 hidden state flattened to one dimension.
    """
    # Tokenize; padding is a no-op for a single string but kept for list input.
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**encoded)

    # Slice position 0 before leaving the device so that only that slice, not
    # the whole (batch, seq, hidden) tensor, is copied to host memory.
    first_token_state = outputs.last_hidden_state[:, 0, :]
    return first_token_state.detach().cpu().numpy().reshape(-1)

In [None]:
def process_xlsx_transformers(xlsx_file, models_list, encoding_name="cl100k_base"):
    """
    Compute and save one transformer embedding per (row, model) pair.

    The Excel file is expected to have the columns id, subject, question and
    transcript; only ``id`` and ``transcript`` are read here. For each row:
      - newlines in the transcript are replaced by spaces,
      - its tokens are counted with tiktoken (only used to detect empty text),
      - every model in ``models_list`` embeds the transcript,
      - the embedding is written with ``np.save``.

    Output layout (relative to the current working directory):
        transformer_embeddings_<model_identifier>/<file_id>-<model_identifier>.npy
    where <model_identifier> is the part of the model name after the slash
    (or the full name when there is no slash) and <file_id> is the ``id`` cell
    with its extension removed. Rows whose transcript is missing or empty get
    ``<file_id>-<model_identifier>-EMPTY.npy`` holding a zero-length array.

    Args:
        xlsx_file: Path (or anything ``pd.read_excel`` accepts) of the workbook.
        models_list: Iterable of Hugging Face model names to load.
        encoding_name: tiktoken encoding used for the token count.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    df = pd.read_excel(xlsx_file)

    # Run on the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load every model once up front, and remember its identifier and output
    # folder so neither has to be re-derived for each row.
    loaded_models = {}
    for model_name in models_list:
        print(f"Loading model {model_name} ...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        model.to(device)
        model.eval()
        # Use the part after the slash as model identifier (or the full name).
        model_identifier = model_name.split('/')[1] if '/' in model_name else model_name
        folder_name = f"transformer_embeddings_{model_identifier}"
        os.makedirs(folder_name, exist_ok=True)
        loaded_models[model_name] = (tokenizer, model, model_identifier, folder_name)

    for _, row in df.iterrows():
        # Excel may hand back numeric ids; coerce to str so splitext cannot
        # raise TypeError, then drop any file extension.
        file_id = os.path.splitext(str(row['id']))[0]

        transcript = row['transcript']
        if pd.isna(transcript) or transcript == "":
            transcript = ""
            print("SKATA: transcript is empty")
        else:
            # Cells that are not text (e.g. a bare number) are stringified
            # rather than crashing on .replace.
            transcript = str(transcript).replace("\n", " ")

        # The count is only used to tell empty transcripts apart;
        # num_tokens_from_string already returns 0 for empty input.
        token_count = num_tokens_from_string(transcript, encoding_name)

        for tokenizer, model, model_identifier, output_folder in loaded_models.values():
            if token_count == 0:
                output_filename = f"{file_id}-{model_identifier}-EMPTY.npy"
                embedding = np.empty((0,))
            else:
                output_filename = f"{file_id}-{model_identifier}.npy"
                embedding = compute_transformer_embedding(transcript, tokenizer, model, device)

            output_filepath = os.path.join(output_folder, output_filename)
            with open(output_filepath, 'wb') as f:
                np.save(f, embedding)
            print(f"Saved embedding to {output_filepath}")

In [None]:
# Embed every transcript in the workbook with all configured models and
# report how long the whole run took.
excel_path = "transcriptions.xlsx"  # Workbook to process.
tic = time.perf_counter()
process_xlsx_transformers(excel_path, options_text)
toc = time.perf_counter()
elapsed_minutes = round((toc - tic) / 60, 2)
print('Duration:', elapsed_minutes, 'min')