In [17]:
import os
import time
import pandas as pd
import numpy as np
import torch
import tiktoken
from transformers import AutoTokenizer, AutoModel, AutoFeatureExtractor

# Hugging Face model identifiers; one embedding per transcript is produced for each.
MODELS_LIST = [
    "Qwen/Qwen3-Embedding-0.6B",
    "FacebookAI/roberta-large",
    # RoBERTa Spanish
    "bertin-project/bertin-roberta-base-spanish",
    # BETO
    "dccuchile/bert-base-spanish-wwm-cased",
    # multilingual BERT
    "google-bert/bert-base-multilingual-cased",
]

# Transcripts are read from INPUT_DIR; embeddings are written beneath it,
# in an 'embeddings' sub-directory that is created on first run.
INPUT_DIR = "/Volumes/mgialou/Portrait/Embeddings/Portrait Transcripts"
OUTPUT_BASE_DIR = os.path.join(INPUT_DIR, "embeddings")
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)

In [18]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """
    Count the tokens `string` occupies under the tiktoken encoding
    called `encoding_name`.

    A falsy `string` (empty or None) yields 0 without looking up the encoding.
    """
    if string:
        tokens = tiktoken.get_encoding(encoding_name).encode(string)
        return len(tokens)
    return 0


In [19]:
def compute_transformer_embedding(text: str, tokenizer, model, device):
    """
    Embed a single text string with a transformer model.

    The text is tokenized (with truncation enabled), passed through `model`
    without gradient tracking, and the last-layer hidden state at sequence
    position 0 is returned.

    Args:
        text: The string to embed.
        tokenizer: Callable invoked as
            `tokenizer(text, return_tensors='pt', truncation=True, padding=True)`
            and expected to return a mapping of tensors.
        model: Callable invoked as `model(**inputs)` whose result exposes a
            `last_hidden_state` tensor indexed as (batch, position, feature).
        device: Device the tokenized inputs are moved to before the forward pass.

    Returns:
        A 1-D NumPy array holding the position-0 hidden state, flattened.

    NOTE(review): position 0 is the [CLS]/<s> token for BERT/RoBERTa-style
    encoders. Some embedding models are documented to use a different pooling
    (e.g. last-token) -- confirm this is appropriate for every model passed in.
    """
    # Tokenize text. Adjust truncation and padding as needed.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Forward pass without gradient calculation.
    with torch.no_grad():
        outputs = model(**inputs)

    # Slice out position 0 *before* moving to the CPU so only that vector,
    # not the whole sequence of hidden states, is transferred and converted.
    first_token = outputs.last_hidden_state[:, 0, :].detach().cpu()

    # NumPy has no bfloat16 dtype, so Tensor.numpy() raises on bf16 tensors;
    # upcast in that case only, leaving other dtypes exactly as produced.
    if first_token.dtype == torch.bfloat16:
        first_token = first_token.float()

    return first_token.numpy().reshape(-1)

In [20]:
def load_transformer_models(models_list, device):
    """
    Instantiate the tokenizer and model for every name in `models_list`.

    Each model is moved to `device` and put into eval mode. The returned dict
    maps the full model name to a `(tokenizer, model, model_id)` tuple, where
    `model_id` is the portion of the name after its last '/'.
    """
    loaded = {}
    for name in models_list:
        print(f"Loading model {name}...")
        tok = AutoTokenizer.from_pretrained(name)
        net = AutoModel.from_pretrained(name)
        net.to(device)
        net.eval()
        # short identifier (no organisation prefix) for use in filenames
        short_id = name.rpartition('/')[2]
        loaded[name] = (tok, net, short_id)
    return loaded

In [21]:
def process_user_file(xlsx_path, output_dir, transformers_models, encoding_name='cl100k_base'):
    """
    Process a single user's Excel file of transcripts.

    For every row of the workbook and every loaded model, one `.npy` file is
    written to `output_dir`:
      - `<questionnaire>_<question>-<model_id>.npy` holding the embedding, or
      - `<questionnaire>_<question>-<model_id>-EMPTY.npy` holding a zero-length
        array when the row has no transcript text.

    Args:
        xlsx_path: Path of the Excel file; read with `pd.read_excel`. The
            columns `questionnaire`, `question` and `transcript_whisper` are used.
        output_dir: Existing directory the `.npy` files are saved into.
        transformers_models: Dict mapping model_name ->
            (tokenizer, model, model_id), as built by `load_transformer_models`.
        encoding_name: tiktoken encoding used to count tokens when deciding
            whether a transcript is empty.
    """
    df = pd.read_excel(xlsx_path)
    for _, row in df.iterrows():
        # Empty Excel cells are read by pandas as NaN, which is truthy, so
        # `value or ''` lets a float NaN through to the tokenizer. Normalise
        # missing values to '' and everything else to a string explicitly.
        raw_text = row.get('transcript_whisper', '')
        text = '' if pd.isna(raw_text) else str(raw_text)

        # build filename stem
        questionnaire = str(row.get('questionnaire', ''))
        question = str(row.get('question', ''))
        file_stem = f"{questionnaire}_{question}"
        # count tokens
        token_count = num_tokens_from_string(text, encoding_name) if text else 0

        for model_name, (tokenizer, model, model_id) in transformers_models.items():
            if token_count == 0:
                filename = f"{file_stem}-{model_id}-EMPTY.npy"
                embedding = np.empty((0,))
            else:
                filename = f"{file_stem}-{model_id}.npy"
                # Use the model's own device object: rebuilding it from
                # `.type` alone drops the index (e.g. 'cuda:1' -> 'cuda') and
                # could place the inputs on a different GPU than the model.
                embedding = compute_transformer_embedding(text, tokenizer, model, model.device)

            save_path = os.path.join(output_dir, filename)
            np.save(save_path, embedding)
            print(f"Saved embedding to {save_path}")


In [22]:
# Driver cell: load every model once, then embed each user's transcript workbook.
tic = time.perf_counter()

# determine device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# load models once
transformers_models = load_transformer_models(MODELS_LIST, device)

# Iterate over all Excel files in INPUT_DIR. os.listdir gives no ordering
# guarantee, so sort to make the processing order reproducible between runs.
for fname in sorted(os.listdir(INPUT_DIR)):
    # Hidden files (e.g. '._*' sidecar files on external/network volumes) and
    # '~$*' lock files left by an open workbook can carry the same suffix but
    # are not readable workbooks; for dot-files the derived user_id would also
    # be '.', sending output into the base directory. Skip them.
    if fname.startswith(('.', '~$')):
        continue
    if not fname.endswith('_transcripts_Model.xlsx'):
        continue
    # the user id is the part of the filename before the first underscore
    user_id = fname.split('_')[0]
    user_input_path = os.path.join(INPUT_DIR, fname)
    user_output_dir = os.path.join(OUTPUT_BASE_DIR, user_id)
    os.makedirs(user_output_dir, exist_ok=True)

    print(f"\nProcessing user {user_id} from {user_input_path}...")
    process_user_file(user_input_path, user_output_dir, transformers_models)

toc = time.perf_counter()
print(f"\nAll users processed in {round((toc - tic)/60, 2)} min.")

Loading model Qwen/Qwen3-Embedding-0.6B...


Cancellation requested; stopping current tasks.


KeyboardInterrupt: 