In [40]:
import os
import time
import pandas as pd
import numpy as np
import torch
import tiktoken
import warnings
from torch import Tensor
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoFeatureExtractor
from transformers.utils import is_flash_attn_2_available
from typing import Dict, Optional, List, Union

# Notebook-wide configuration: warning filter, model list and I/O locations.

# Suppress HF hub symlink warnings (optional): any warning whose message
# matches the regular expression below is ignored.
warnings.filterwarnings(
    "ignore",
    message=".*cache-system uses symlinks by default.*",
)

# Model identifiers to use for text embeddings. Each entry is passed
# unchanged to `load_transformer_models`; the part after the last '/' is used
# there as the model identifier for output paths.
MODELS_LIST = [
    "Qwen/Qwen3-Embedding-0.6B",
    "FacebookAI/roberta-large",
    "bertin-project/bertin-roberta-base-spanish",  # RoBERTa Spanish
    "dccuchile/bert-base-spanish-wwm-cased",  # BETO
    "google-bert/bert-base-multilingual-cased",  # multilingual BERT
]

# Base directories
# Folder that is scanned for the per-user transcript workbooks
# (Windows drive-letter path).
INPUT_DIR = 'W:/Portrait/Embeddings/Portrait Transcripts'
# Alternative location for non-Windows machines; swap it in by uncommenting.
# NOTE(review): the original note said "For linux", but '/Volumes/...' looks
# like a macOS-style mount point - confirm which platform this is meant for.
#INPUT_DIR = '/Volumes/mgialou/Portrait/Embeddings/Portrait Transcripts'

# All embeddings are written below <INPUT_DIR>/embeddings. The directory is
# created as soon as this cell runs (no error if it already exists).
OUTPUT_BASE_DIR = os.path.join(INPUT_DIR, 'embeddings')
os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)

In [41]:
class Qwen3Embedding():
    """
    Inference wrapper that turns text into L2-normalised sentence embeddings
    using a causal-LM style embedding checkpoint (last-token pooling).

    Parameters
    ----------
    model_name_or_path : str
        Identifier or local path handed to ``AutoModel`` / ``AutoTokenizer``.
    instruction : str, optional
        Default task description prepended to queries (see
        ``get_detailed_instruct``). A generic retrieval instruction is used
        when omitted.
    use_fp16 : bool
        Load the weights in ``torch.float16`` when True (default), otherwise
        in ``torch.float32``.
    use_cuda : bool
        Move the model to the current CUDA device after loading.
    max_length : int
        Truncation length (in tokens) applied by ``encode``.
    """

    def __init__(self, model_name_or_path, instruction=None,  use_fp16: bool = True, use_cuda: bool = True, max_length=8192):
        if instruction is None:
            instruction = 'Given a web search query, retrieve relevant passages that answer the query'
        self.instruction = instruction

        # Honour `use_fp16`: previously the flag was accepted but the model
        # was always loaded in half precision.
        dtype = torch.float16 if use_fp16 else torch.float32
        load_kwargs = {"trust_remote_code": True, "torch_dtype": dtype}
        # FlashAttention-2 needs a GPU and half-precision weights, so it is
        # only requested when both are in use and the package is installed.
        if use_cuda and use_fp16 and is_flash_attn_2_available():
            load_kwargs["attn_implementation"] = "flash_attention_2"
        self.model = AutoModel.from_pretrained(model_name_or_path, **load_kwargs)
        if use_cuda:
            self.model = self.model.cuda()
        # The wrapper is inference-only; make sure dropout etc. are disabled.
        self.model.eval()

        # Left padding puts the last real token of every sequence in the final
        # column, which is the fast path in `last_token_pool`.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, padding_side='left')
        self.max_length = max_length

    def last_token_pool(self, last_hidden_states: Tensor,
        attention_mask: Tensor) -> Tensor:
        """
        Return the hidden state of the last non-padding token of each sequence.

        ``last_hidden_states`` is indexed as (batch, position, feature) and
        ``attention_mask`` as (batch, position) with 1 marking real tokens.
        """
        # If every row has a real token in the last column the batch is
        # left-padded (or unpadded) and the last column is the answer.
        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padding:
            return last_hidden_states[:, -1]
        # Right-padded batch: pick, per row, the position of the last real token.
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

    def get_detailed_instruct(self, task_description: str, query: str) -> str:
        """
        Format ``query`` with its task instruction; falls back to the
        instance-level default instruction when ``task_description`` is None.
        """
        if task_description is None:
            task_description = self.instruction
        return f'Instruct: {task_description}\nQuery:{query}'

    @torch.no_grad()
    def encode(self, sentences: Union[List[str], str], is_query: bool = False, instruction=None, dim: int = -1):
        """
        Embed one string or a list of strings.

        Parameters
        ----------
        sentences : str or list of str
            Text(s) to embed; a single string is treated as a batch of one.
        is_query : bool
            When True each text is wrapped with ``get_detailed_instruct``.
        instruction : str, optional
            Per-call instruction used for queries (default: instance default).
        dim : int
            If not -1, keep only the first ``dim`` features before normalising.

        Returns
        -------
        torch.Tensor
            One L2-normalised row per input text, on the model's device.
            Runs under ``torch.no_grad()``, so the result carries no
            autograd graph.
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        if is_query:
            sentences = [self.get_detailed_instruct(instruction, sent) for sent in sentences]
        inputs = self.tokenizer(sentences, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt')
        inputs = inputs.to(self.model.device)
        model_outputs = self.model(**inputs)
        output = self.last_token_pool(model_outputs.last_hidden_state, inputs['attention_mask'])
        if dim != -1:
            # Truncate first, then normalise, so the shortened vector is unit length.
            output = output[:, :dim]
        output = F.normalize(output, p=2, dim=1)
        return output

In [42]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """
    Count the tokens that the named tiktoken encoding produces for ``string``.

    Falsy input (empty string or None) yields 0 without looking up an encoding.
    """
    if string:
        token_ids = tiktoken.get_encoding(encoding_name).encode(string)
        return len(token_ids)
    return 0


In [43]:
def compute_transformer_embedding(text: str, tokenizer, model, device, max_length: Optional[int] = None):
    """
    Embed ``text`` with an encoder model and return the first-token vector.

    The text is tokenized with truncation enabled, run through ``model``
    without gradient tracking, and the hidden state at position 0 (the [CLS]
    token, assumed to be the first token) is returned.

    Parameters
    ----------
    text : str
        Text to embed.
    tokenizer, model
        Callable tokenizer returning a mapping of tensors, and a model whose
        output exposes ``last_hidden_state``.
    device
        Device the input tensors are moved to; should match the model's device.
    max_length : int, optional
        Forwarded to the tokenizer as the truncation length. ``None`` (default)
        leaves the limit to the tokenizer's own configuration.

    Returns
    -------
    numpy.ndarray
        Flattened 1-D array with the first-token hidden state.
    """
    # Tokenize text. Truncation keeps long transcripts within the model limit.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Forward pass without gradient calculation.
    with torch.no_grad():
        outputs = model(**inputs)

    # Slice the first token on the model's device first, so only that vector
    # (not the full per-token hidden state) is copied to host memory.
    cls_state = outputs.last_hidden_state[:, 0, :]
    embedding = cls_state.detach().cpu().numpy().reshape(-1)
    return embedding

In [44]:
def load_transformer_models(models_list, device):
    """
    Build the registry of embedding back-ends, keyed by full model name.

    Each value is a 3-tuple whose last element is the model identifier (the
    text after the final '/' of the model name):
      - names starting with "qwen/qwen3-embedding" (case-insensitive):
        ("qwen", Qwen3Embedding instance, model_id); the wrapper chooses CUDA
        by itself when available and does not use ``device``;
      - every other name: (tokenizer, model, model_id), with the model moved
        to ``device`` and put in eval mode.
    """
    registry = {}
    for model_name in models_list:
        print(f"Loading model {model_name}...")
        # Last path component doubles as a short label for output paths.
        short_name = model_name.rsplit('/', 1)[-1]
        is_qwen = model_name.lower().startswith("qwen/qwen3-embedding")
        if is_qwen:
            print(f"Loading Qwen3Embedding for {model_name}")
            wrapper = Qwen3Embedding(
                model_name_or_path=model_name,
                use_fp16=True,
                use_cuda=torch.cuda.is_available(),
            )
            # The literal "qwen" in the tokenizer slot marks this entry.
            entry = ("qwen", wrapper, short_name)
        else:
            hf_tokenizer = AutoTokenizer.from_pretrained(model_name)
            hf_model = AutoModel.from_pretrained(model_name)
            hf_model.to(device)
            hf_model.eval()
            entry = (hf_tokenizer, hf_model, short_name)
        registry[model_name] = entry
    return registry

In [45]:
def process_user_file(xlsx_path, output_dir, transformers_models, encoding_name='cl100k_base'):
    """
    Process a single user's Excel file of transcripts.

    For every row of the workbook:
      - reads columns: questionnaire, question, transcript_whisper
      - generates one embedding per entry of ``transformers_models``
      - saves it as <output_dir>/<model_id>/<questionnaire>_<question>.npy

    Rows whose transcript is missing or empty produce an empty array
    (shape ``(0,)``) so that every row still has a file per model.

    Parameters
    ----------
    xlsx_path : str
        Path of the workbook, read with ``pandas.read_excel``.
    output_dir : str
        Directory under which the per-model sub-folders are created.
    transformers_models : dict
        Mapping model_name -> (tokenizer, model, model_id). An entry whose
        first element is the string "qwen" is treated as a wrapper exposing
        ``encode(text, is_query=False)``; any other entry goes through
        ``compute_transformer_embedding``.
    encoding_name : str
        tiktoken encoding used to count tokens (only to detect empty text).
    """
    df = pd.read_excel(xlsx_path)
    for _, row in df.iterrows():
        raw_text = row.get('transcript_whisper', '')
        # Empty Excel cells are read as NaN, which is truthy, so `x or ''`
        # does not catch them; test explicitly and coerce to str.
        text = '' if raw_text is None or pd.isna(raw_text) else str(raw_text)
        # build filename stem
        questionnaire = str(row.get('questionnaire', ''))
        question = str(row.get('question', ''))
        file_stem = f"{questionnaire}_{question}"
        # count tokens
        token_count = num_tokens_from_string(text, encoding_name) if text else 0

        for tokenizer, model, model_id in transformers_models.values():
            # create per-model folder under the caller-supplied output
            # directory (previously this read a notebook global by accident)
            model_dir = os.path.join(output_dir, model_id)
            os.makedirs(model_dir, exist_ok=True)

            filename = f"{file_stem}.npy"
            if token_count == 0:
                print(f"Empty transcript for {file_stem}; saving an empty embedding.")
                embedding = np.empty((0,))
            else:
                if tokenizer == "qwen":
                    with torch.no_grad():
                        # The wrapper's encode returns a torch.Tensor
                        embedding = model.encode(text, is_query=False).cpu().numpy().reshape(-1)
                else:
                    # Pass the model's full device (index included) so inputs
                    # land on the same GPU as the weights.
                    embedding = compute_transformer_embedding(text, tokenizer, model, model.device)

            save_path = os.path.join(model_dir, filename)
            np.save(save_path, embedding)
            print(f"Saved embedding to {save_path}")


In [None]:
# Driver cell: load every model once, then embed each user's transcript
# workbook found in INPUT_DIR and report the total wall-clock time.
tic = time.perf_counter()

# determine device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# load models once
transformers_models = load_transformer_models(MODELS_LIST, device)

# iterate over all Excel files in INPUT_DIR; os.listdir returns entries in
# arbitrary order, so sort them to make runs and logs reproducible
for fname in sorted(os.listdir(INPUT_DIR)):
    # Skip hidden files and the "~$..." lock files Office creates next to an
    # open workbook (they match the suffix below but are not readable).
    if fname.startswith(('.', '~$')):
        continue
    if not fname.endswith('_transcripts_Model.xlsx'):
        continue
    # File names look like <user_id>_transcripts_Model.xlsx
    user_id = fname.split('_')[0]
    user_input_path = os.path.join(INPUT_DIR, fname)
    # Other cells may read `user_output_dir` as a global, so keep this name.
    user_output_dir = os.path.join(OUTPUT_BASE_DIR, user_id)
    os.makedirs(user_output_dir, exist_ok=True)
    print(f"\nProcessing user {user_id} from {user_input_path}...")
    process_user_file(user_input_path, user_output_dir, transformers_models)

toc = time.perf_counter()
print(f"\nAll users processed in {round((toc - tic)/60, 2)} min.")

Loading model Qwen/Qwen3-Embedding-0.6B...
Loading Qwen3Embedding for Qwen/Qwen3-Embedding-0.6B
Loading model FacebookAI/roberta-large...


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model bertin-project/bertin-roberta-base-spanish...


Some weights of RobertaModel were not initialized from the model checkpoint at bertin-project/bertin-roberta-base-spanish and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model dccuchile/bert-base-spanish-wwm-cased...


Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model google-bert/bert-base-multilingual-cased...

Processing user 12HSC from W:/Portrait/Embeddings/Portrait Transcripts\12HSC_transcripts_Model.xlsx...
Saved embedding to W:/Portrait/Embeddings/Portrait Transcripts\embeddings\12HSC\Qwen3-Embedding-0.6B\GR_Survey_q1.npy
Saved embedding to W:/Portrait/Embeddings/Portrait Transcripts\embeddings\12HSC\roberta-large\GR_Survey_q1.npy
Saved embedding to W:/Portrait/Embeddings/Portrait Transcripts\embeddings\12HSC\bertin-roberta-base-spanish\GR_Survey_q1.npy
Saved embedding to W:/Portrait/Embeddings/Portrait Transcripts\embeddings\12HSC\bert-base-spanish-wwm-cased\GR_Survey_q1.npy
Saved embedding to W:/Portrait/Embeddings/Portrait Transcripts\embeddings\12HSC\bert-base-multilingual-cased\GR_Survey_q1.npy
Saved embedding to W:/Portrait/Embeddings/Portrait Transcripts\embeddings\12HSC\Qwen3-Embedding-0.6B\GR_Survey_q2.npy
Saved embedding to W:/Portrait/Embeddings/Portrait Transcripts\embeddings\12HSC\roberta-large\GR_Survey_q2.npy
Save