In [None]:
import re
import os
import uuid
from transformers import AutoTokenizer, AutoModel

def document_chunker(directory_path,
                     model_name,
                     paragraph_separator='\n\n',
                     chunk_size=256,
                     separator=' ',
                     secondary_chunking_regex=r'\S+?[\.,;!?]',
                     chunk_overlap=0):
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load tokenizer for the specified model
    documents = {}  # Initialize dictionary to store results

    # Read each file in the specified directory
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        base = os.path.basename(file_path)
        sku = os.path.splitext(base)[0]
        print(file_path, base, sku)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='latin-1') as file:
                text = file.read()

            # Generate a unique identifier for the document
            doc_id = str(uuid.uuid4())

            # Process each file using the existing chunking logic
            paragraphs = re.split(paragraph_separator, text)
            all_chunks = {}
            for paragraph in paragraphs:
                words = paragraph.split(separator)
                current_chunk = ""
                chunks = []

                for word in words:
                    print("words")
                    new_chunk = current_chunk + (separator if current_chunk else '') + word
                    if len(tokenizer.tokenize(new_chunk)) <= chunk_size:
                        print("tokenizer")
                        current_chunk = new_chunk
                    else:
                        if current_chunk:
                            chunks.append(current_chunk)
                        current_chunk = word

                if current_chunk:
                    chunks.append(current_chunk)

                refined_chunks = []
                for chunk in chunks:
                    if len(tokenizer.tokenize(chunk)) > chunk_size:
                        print("tokenizer 2")
                        sub_chunks = re.split(secondary_chunking_regex, chunk)
                        sub_chunk_accum = ""
                        for sub_chunk in sub_chunks:
                            if sub_chunk_accum and len(tokenizer.tokenize(sub_chunk_accum + sub_chunk + ' ')) > chunk_size:
                                refined_chunks.append(sub_chunk_accum.strip())
                                sub_chunk_accum = sub_chunk
                            else:
                                sub_chunk_accum += (sub_chunk + ' ')
                        if sub_chunk_accum:
                            refined_chunks.append(sub_chunk_accum.strip())
                    else:
                        refined_chunks.append(chunk)

                final_chunks = []
                if chunk_overlap > 0 and len(refined_chunks) > 1:
                    for i in range(len(refined_chunks) - 1):
                        final_chunks.append(refined_chunks[i])
                        overlap_start = max(0, len(refined_chunks[i]) - chunk_overlap)
                        overlap_end = min(chunk_overlap, len(refined_chunks[i+1]))
                        overlap_chunk = refined_chunks[i][overlap_start:] + ' ' + refined_chunks[i+1][:overlap_end]
                        final_chunks.append(overlap_chunk)
                    final_chunks.append(refined_chunks[-1])
                else:
                    final_chunks = refined_chunks

                # Assign a UUID for each chunk and structure it with text and metadata
                for chunk in final_chunks:
                    chunk_id = str(uuid.uuid4())
                    all_chunks[chunk_id] = {"text": chunk, "metadata": {"file_name":sku}}  # Initialize metadata as dict
                    print(all_chunks)

            # Map the document UUID to its chunk dictionary
            documents[doc_id] = all_chunks

    return documents

In [None]:
document_chunker(directory_path="/home/paborsan/Documents/RAG_Data/",
                            model_name='TheBloke/mistral-7b-instruct-v0.2.Q5_K_M.gguf',
                            chunk_size=256)

In [None]:
import fitz  # alias de PyMuPDF

def extraer_texto_pdf_fitz(ruta_pdf: str) -> str:
    """
    Extrae texto conservando espacios usando PyMuPDF.
    """
    texto = []
    doc = fitz.open(ruta_pdf)
    for pagina in doc:
        # get_text("text") preserva la posición y espacios más fielmente
        texto.append(pagina.get_text("text"))
    return "".join(texto)

# Ejemplo de uso:
if __name__ == "__main__":
    ruta = r"/home/paborsan/Documents/RAG_Data/Deep Learning by Ian Goodfellow, Yoshua Bengio, Aaron Courville (z-lib.org).pdf"
    texto = extraer_texto_pdf_fitz(ruta)
    print(texto)

In [None]:
def document_chunker(directory_path,
                     model_name,
                     paragraph_separator='\n\n',
                     chunk_size=256,
                     separator=' ',
                     secondary_chunking_regex=r'\S+?[\.,;!?]',
                     chunk_overlap=0):
    """
    Lee todos los archivos de texto en un directorio y los fragmenta en chunks.
    Retorna un dict {doc_id: {chunk_id: {'text': ..., 'metadata': {...}}}}
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    documents = {}

    def _chunk_text(text):
        """
        Fragmenta un texto en chunks de tamaño máximo `chunk_size` tokens,
        aplicando overlap y una división secundaria con regex si es necesario.
        """
        # 1) División inicial por palabras
        words = text.split(separator)
        chunks = []
        current = []
        current_len = 0

        for word in words:
            token_count = len(tokenizer.tokenize(word + separator))
            if current_len + token_count <= chunk_size:
                current.append(word)
                current_len += token_count
            else:
                chunks.append(separator.join(current))
                current = [word]
                current_len = token_count
        if current:
            chunks.append(separator.join(current))

        # 2) Refinar los chunks demasiado largos con regex secundaria
        refined = []
        for chunk in chunks:
            if len(tokenizer.tokenize(chunk)) <= chunk_size:
                refined.append(chunk)
            else:
                parts = re.split(secondary_chunking_regex, chunk)
                part_accum = []
                accum_len = 0
                for part in parts:
                    part_tokens = len(tokenizer.tokenize(part))
                    if accum_len + part_tokens > chunk_size:
                        refined.append(separator.join(part_accum).strip())
                        part_accum = [part]
                        accum_len = part_tokens
                    else:
                        part_accum.append(part)
                        accum_len += part_tokens
                if part_accum:
                    refined.append(separator.join(part_accum).strip())

        # 3) Aplicar overlap si se solicita
        if chunk_overlap > 0 and len(refined) > 1:
            overlapped = []
            for i in range(len(refined) - 1):
                overlapped.append(refined[i])
                # Construir pedazo solapado
                left = refined[i].split(separator)[-chunk_overlap:]
                right = refined[i+1].split(separator)[:chunk_overlap]
                overlapped.append(separator.join(left + right))
            overlapped.append(refined[-1])
            return overlapped

        return refined

    # Procesar cada archivo en el directorio
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            continue

        base = os.path.basename(file_path)
        sku = os.path.splitext(base)[0]
        with open(file_path, 'r', encoding='latin-1') as f:
            text = f.read()

        doc_id = str(uuid.uuid4())
        all_chunks = {}

        # Separar por párrafos y luego fragmentar cada uno
        paragraphs = re.split(paragraph_separator, text)
        x = 0
        for para in paragraphs:
            para = para.strip()
            if not para:
                continue
            sub_chunks = _chunk_text(para)
            for chunk_text in sub_chunks:
                chunk_id = str(uuid.uuid4())
                all_chunks[chunk_id] = {
                    'text': chunk_text,
                    'metadata': {'file_name': sku}
                }
                if x % 1000 == 0:
                    print(sub_chunks)
                x += 1

        documents[doc_id] = all_chunks

    return documents

In [None]:
document_chunker(directory_path="/home/paborsan/Documents/RAG_Data/",
                            model_name='heBloke/TheBloke/Mistral-7B-Instruct-v0.2-GGUF',
                            chunk_size=256)

In [5]:
model_name = "BAAI/bge-small-en-v1.5"
directory_path = "/home/paborsan/Documents/RAG_Data/"
paragraph_separator='\n\n'
chunk_size=256
separator=' '
secondary_chunking_regex=r'\S+?[\.,;!?]'
chunk_overlap= 0

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
import re
import uuid
import torch
import fitz

# Configuración de cuantización para 4-bit (NF4)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4", # nf4 o fp4
    bnb_4bit_use_double_quant=True, # Doble cuantización para más ahorro de memoria
    bnb_4bit_compute_dtype=torch.bfloat16 # Tipo de datos para los cálculos internos
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto" # Para distribuir el modelo si tienes múltiples GPUs
)

"""
Lee todos los archivos de texto en un directorio y los fragmenta en chunks.
Retorna un dict {doc_id: {chunk_id: {'text': ..., 'metadata': {...}}}}
"""
tokenizer = AutoTokenizer.from_pretrained(model_name)
documents = {}

def _chunk_text(text):
    """
    Fragmenta un texto en chunks de tamaño máximo `chunk_size` tokens,
    aplicando overlap y una división secundaria con regex si es necesario.
    """
    # 1) División inicial por palabras
    words = text.split(separator)
    chunks = []
    current = []
    current_len = 0

    for word in words:
        token_count = len(tokenizer.tokenize(word + separator))
        if current_len + token_count <= chunk_size:
            current.append(word)
            current_len += token_count
        else:
            chunks.append(separator.join(current))
            current = [word]
            current_len = token_count
    if current:
        chunks.append(separator.join(current))

    # 2) Refinar los chunks demasiado largos con regex secundaria
    refined = []
    for chunk in chunks:
        if len(tokenizer.tokenize(chunk)) <= chunk_size:
            refined.append(chunk)
        else:
            parts = re.split(secondary_chunking_regex, chunk)
            part_accum = []
            accum_len = 0
            for part in parts:
                part_tokens = len(tokenizer.tokenize(part))
                if accum_len + part_tokens > chunk_size:
                    refined.append(separator.join(part_accum).strip())
                    part_accum = [part]
                    accum_len = part_tokens
                else:
                    part_accum.append(part)
                    accum_len += part_tokens
            if part_accum:
                refined.append(separator.join(part_accum).strip())

    # 3) Aplicar overlap si se solicita
    if chunk_overlap > 0 and len(refined) > 1:
        overlapped = []
        for i in range(len(refined) - 1):
            overlapped.append(refined[i])
            # Construir pedazo solapado
            left = refined[i].split(separator)[-chunk_overlap:]
            right = refined[i+1].split(separator)[:chunk_overlap]
            overlapped.append(separator.join(left + right))
        overlapped.append(refined[-1])
        return overlapped

    return refined

# Procesar cada archivo en el directorio
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    if not os.path.isfile(file_path):
        continue

    base = os.path.basename(file_path)
    sku = os.path.splitext(base)[0]

    doc   = fitz.open(file_path)
    text  = ""
    for page in doc:
        text += page.get_text("text")

    doc_id = str(uuid.uuid4())
    all_chunks = {}

    # Separar por párrafos y luego fragmentar cada uno
    paragraphs = re.split(paragraph_separator, text)
    x = 0
    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        sub_chunks = _chunk_text(para)
        for chunk_text in sub_chunks:
            chunk_id = str(uuid.uuid4())
            all_chunks[chunk_id] = {
                'text': chunk_text,
                'metadata': {'file_name': sku}
            }
            if x % 1000 == 0:
                print(sub_chunks)
            x += 1

    documents[doc_id] = all_chunks

tokenizer.save_pretrained("model/tokenizer")
model.save_pretrained(
    "model/embedding",
    max_shard_size="1GB",
    safe_serialization=False
)

documents

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at BAAI/bge-small-en-v1.5 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['Deep Learning\nIan Goodfellow\nYoshua Bengio\nAaron Courville\nContents\nWebsite\nvii\nAcknowledgments\nviii\nNotation\nxi\n1\nIntroduction\n1\n1.1\nWho Should Read This Book? . . . . . . . . . . . . . . . . . . . .\n8\n1.2\nHistorical Trends in Deep Learning . . . . . . . . . . . . . . . . .\n11\nI\nApplied Math and Machine Learning Basics\n29\n2\nLinear Algebra\n31\n2.1\nScalars, Vectors, Matrices and Tensors . . . . . . . . . . . . . . .\n31\n2.2\nMultiplying Matrices and Vectors . . . . . . . . . . . . . . . . . .\n34\n2.3\nIdentity and Inverse Matrices\n. . . . . . . . . . . . . . . . . . . .\n36\n2.4\nLinear Dependence and Span\n. . . . . . . . . . . . . . . . . . . .\n37\n2.5\nNorms . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n39\n2.6\nSpecial Kinds of Matrices and', 'Vectors\n. . . . . . . . . . . . . . .\n40\n2.7\nEigendecomposition . . . . . . . . . . . . . . . . . . . . . . . . . .\n42\n2.8\nSingular Value Decomposition . . . . . . . . . . . . . . . .

{'555b06a2-0e75-46fc-8246-26e87ebeeb28': {'9ed789be-ea87-4db2-bf37-aacc3a3182b3': {'text': 'Deep Learning\nIan Goodfellow\nYoshua Bengio\nAaron Courville\nContents\nWebsite\nvii\nAcknowledgments\nviii\nNotation\nxi\n1\nIntroduction\n1\n1.1\nWho Should Read This Book? . . . . . . . . . . . . . . . . . . . .\n8\n1.2\nHistorical Trends in Deep Learning . . . . . . . . . . . . . . . . .\n11\nI\nApplied Math and Machine Learning Basics\n29\n2\nLinear Algebra\n31\n2.1\nScalars, Vectors, Matrices and Tensors . . . . . . . . . . . . . . .\n31\n2.2\nMultiplying Matrices and Vectors . . . . . . . . . . . . . . . . . .\n34\n2.3\nIdentity and Inverse Matrices\n. . . . . . . . . . . . . . . . . . . .\n36\n2.4\nLinear Dependence and Span\n. . . . . . . . . . . . . . . . . . . .\n37\n2.5\nNorms . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .\n39\n2.6\nSpecial Kinds of Matrices and',
   'metadata': {'file_name': 'Deep Learning by Ian Goodfellow, Yoshua Bengio, Aaron Courville (z-lib.

In [None]:
import torch

tokenizer = AutoTokenizer.from_pretrained("/home/paborsan/Documents/Projects/local_rag/local_rag/src/model/tokenizer", local_files_only=True) 
model = AutoModelForCausalLM.from_pretrained("/home/paborsan/Documents/Projects/local_rag/local_rag/src/model/embedding", local_files_only=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
inputs = {k: v.to(device) for k, v in inputs.items()} 

with torch.no_grad():
    # Asegúrate de pedir los hidden states para poder extraer el embedding
    outputs = model(**inputs, output_hidden_states=True)
    
    # Extrae el embedding del último estado oculto, promedia sobre la dimensión de la secuencia
    # y elimina las dimensiones de tamaño 1.
    embeddings = outputs.hidden_states[-1].mean(dim=1).squeeze()

# Si `text` fue una sola cadena, `embeddings` será un tensor 1D.
# Si `text` fue una lista de cadenas (un lote), `embeddings` será 2D.
# `.tolist()` lo convierte a una lista de Python (o lista de listas para lotes).
compute_embeddings_result = embeddings.tolist()

In [None]:
def compute_embeddings(text):
    tokenizer = AutoTokenizer.from_pretrained("/home/paborsan/Documents/Projects/local_rag/local_rag/src/model/tokenizer", local_files_only=True) 
    model = AutoModelForCausalLM.from_pretrained("/home/paborsan/Documents/Projects/local_rag/local_rag/src/model/embedding", local_files_only=True)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512) 
    
    # Generate the embeddings 
    with torch.no_grad():    
        embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze()

    return embeddings.tolist()

In [None]:
def create_vector_store(doc_store):
    vector_store = {}
    for doc_id, chunks in doc_store.items():
        doc_vectors = {}
        for chunk_id, chunk_dict in chunks.items():
            # Generate an embedding for each chunk of text
            doc_vectors[chunk_id] = compute_embeddings(chunk_dict.get("text"))
        # Store the document's chunk embeddings mapped by their chunk UUIDs
        vector_store[doc_id] = doc_vectors
    return vector_store

In [7]:
import torch
from sentence_transformers import SentenceTransformer # ¡Nueva importación!

# --- Carga Global del Modelo de Embeddings (¡Una sola vez!) ---
# Define el dispositivo a usar (GPU si está disponible, si no, CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo para embeddings: {device}")

# Nombre del modelo de SentenceTransformer
MODEL_NAME = "BAAI/bge-small-en-v1.5"

# Declara la variable global para el modelo
GLOBAL_EMBEDDING_MODEL = None

try:
    print(f"Cargando modelo de embeddings: {MODEL_NAME} en {device}...")
    # SentenceTransformer carga el modelo automáticamente al dispositivo disponible (GPU si está).
    # No necesitas AutoTokenizer o AutoModel aquí, SentenceTransformer lo maneja.
    # El parámetro 'device' asegura que el modelo se cargue directamente en la GPU si está disponible.
    GLOBAL_EMBEDDING_MODEL = SentenceTransformer(MODEL_NAME, device=str(device))
    GLOBAL_EMBEDDING_MODEL.eval() # Poner en modo evaluación para inferencia
    print("Modelo de embeddings cargado exitosamente.")

except Exception as e:
    print(f"Error al cargar el modelo de embeddings. Asegúrate de tener conexión a internet o los archivos descargados localmente: {e}")
    GLOBAL_EMBEDDING_MODEL = None
    # Si el modelo no se carga, el script no puede continuar.
    # Para Jupyter, considera no usar exit(1) para no detener el kernel,
    # pero asegúrate de manejar el caso donde GLOBAL_EMBEDDING_MODEL es None.
    # raise e # O relanza la excepción si quieres un error más visible.


def compute_embeddings_batch(texts: list[str], batch_size: int = 32) -> list[list[float]]:
    """
    Calcula embeddings para una lista de textos en lotes utilizando el modelo BGE.

    Args:
        texts: Una lista de cadenas de texto a embeber.
        batch_size: El número de textos a procesar en cada lote.

    Returns:
        Una lista de embeddings, donde cada embedding es una lista de flotantes.
    """
    if GLOBAL_EMBEDDING_MODEL is None:
        raise RuntimeError("El modelo de embeddings no se cargó correctamente. No se pueden generar embeddings.")

    print(f"Generando embeddings para {len(texts)} textos en lotes de {batch_size}...")
    
    # El método .encode() de SentenceTransformer es altamente optimizado:
    # maneja la tokenización, el paso por el modelo y el pooling (pooling por defecto para BGE es CLS o mean)
    # y devuelve tensores de PyTorch.
    # También maneja automáticamente el movimiento a la GPU si el modelo está en ella.
    embeddings = GLOBAL_EMBEDDING_MODEL.encode(
        texts, # ¡Pasamos directamente la lista de textos!
        batch_size=batch_size,
        show_progress_bar=True, # Muestra una barra de progreso, útil para muchos chunks
        convert_to_tensor=True, # Devuelve tensores de PyTorch
        normalize_embeddings=True # BGE está diseñado para que los embeddings estén normalizados
    )
    
    # Convierte los tensores PyTorch a listas de Python.
    return embeddings.tolist()


def create_vector_store_optimized(doc_store: dict) -> dict:
    """
    Crea un almacén de vectores a partir de un diccionario de documentos,
    generando embeddings para cada fragmento de texto en lotes.

    Args:
        doc_store: Un diccionario donde las claves son IDs de documentos y los valores
                   son diccionarios de fragmentos (chunks), donde cada fragmento
                   contiene un campo 'text'.

    Returns:
        Un diccionario que mapea IDs de documentos a diccionarios de IDs de fragmentos
        a sus respectivos embeddings.
    """
    vector_store = {}
    
    # Recolectar todos los textos y sus metadatos
    all_texts_to_embed = []
    chunk_metadata_map = [] 

    for doc_id, chunks in doc_store.items():
        for chunk_id, chunk_dict in chunks.items():
            text = chunk_dict.get("text")
            if text:
                all_texts_to_embed.append(text)
                chunk_metadata_map.append((doc_id, chunk_id))

    # Generar todos los embeddings en lotes
    print(f"Iniciando generación de embeddings para {len(all_texts_to_embed)} fragmentos con {MODEL_NAME}...")
    if all_texts_to_embed:
        # Para BGE-small, puedes probar con batch_size más grandes como 64, 128 o incluso 256
        # si tu GPU lo soporta, para mayor velocidad.
        embeddings = compute_embeddings_batch(all_texts_to_embed, batch_size=64) 
    else:
        embeddings = []
    print("Embeddings generados.")

    # Asignar los embeddings de vuelta a la estructura original
    for i, (doc_id, chunk_id) in enumerate(chunk_metadata_map):
        if doc_id not in vector_store:
            vector_store[doc_id] = {}
        vector_store[doc_id][chunk_id] = embeddings[i]
        
    return vector_store


vec_store = create_vector_store_optimized(documents)

Usando dispositivo para embeddings: cuda
Cargando modelo de embeddings: BAAI/bge-small-en-v1.5 en cuda...
Modelo de embeddings cargado exitosamente.
Iniciando generación de embeddings para 1621 fragmentos con BAAI/bge-small-en-v1.5...
Generando embeddings para 1621 textos en lotes de 64...


Batches: 100%|██████████| 26/26 [00:02<00:00, 11.50it/s]

Embeddings generados.





In [8]:
import torch
import numpy as np # Keep numpy import if you still use it elsewhere, but it won't be for tensor ops here
from sentence_transformers import SentenceTransformer # Keep if you're loading the model in a global scope

# --- Assume GLOBAL_EMBEDDING_MODEL and device are already loaded and defined as before ---
# Example (ensure this part runs once at the top of your notebook):
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# GLOBAL_EMBEDDING_MODEL = SentenceTransformer("BAAI/bge-small-en-v1.5", device=str(device))
# GLOBAL_EMBEDDING_MODEL.eval()


def compute_embeddings(text: str) -> list[float]:
    """
    Helper function to compute a single embedding.
    This function internally uses the GLOBAL_EMBEDDING_MODEL.
    """
    if GLOBAL_EMBEDDING_MODEL is None:
        raise RuntimeError("Embedding model not loaded.")
    # .encode() handles batching internally, even for a single string.
    # We want a PyTorch tensor here to keep it on the GPU for comparison.
    embedding_tensor = GLOBAL_EMBEDDING_MODEL.encode(
        [text],
        convert_to_tensor=True,
        normalize_embeddings=True # Crucial for cosine similarity
    )
    return embedding_tensor[0] # Return the single tensor for the query, not a list of lists


def compute_matches(vector_store: dict, query_str: str, top_k: int) -> list:
    """
    This function takes in a vector store dictionary, a query string, and an int 'top_k'.
    It computes embeddings for the query string and then calculates the cosine similarity
    against every chunk embedding in the dictionary, all on the GPU.
    The top_k matches are returned based on the highest similarity scores.
    """
    if GLOBAL_EMBEDDING_MODEL is None:
        raise RuntimeError("The embedding model is not loaded. Cannot compute matches.")

    # 1. Get the embedding for the query string
    # Ensure this is a PyTorch tensor and on the correct device (GPU)
    query_embedding_tensor = compute_embeddings(query_str).to(device)

    scores_list = []
    metadata_list = []

    # 2. Collect all chunk embeddings into a single PyTorch tensor on the GPU
    # This loop is for collecting, not for individual similarity calculations
    for doc_id, chunks in vector_store.items():
        for chunk_id, chunk_embedding_list in chunks.items():
            # Convert list to PyTorch tensor and move to GPU
            chunk_embedding_tensor = torch.tensor(chunk_embedding_list, dtype=torch.float32).to(device)
            scores_list.append(chunk_embedding_tensor)
            metadata_list.append((doc_id, chunk_id))

    if not scores_list:
        print("Warning: Vector store is empty. No matches to compute.")
        return []

    # Stack all chunk embeddings into a single 2D tensor (matrix)
    # Shape: (num_chunks, embedding_dimension)
    all_chunk_embeddings_matrix = torch.stack(scores_list)

    # 3. Calculate cosine similarity (or dot product, since embeddings are normalized)
    # query_embedding_tensor shape: (embedding_dimension) or (1, embedding_dimension)
    # all_chunk_embeddings_matrix shape: (num_chunks, embedding_dimension)
    
    # Perform dot product: (num_chunks, embedding_dim) @ (embedding_dim) -> (num_chunks)
    # The result will be a 1D tensor of scores, still on the GPU
    similarity_scores = torch.matmul(all_chunk_embeddings_matrix, query_embedding_tensor)

    # 4. Sort scores and return the top_k results
    # topk returns values and indices, both as PyTorch tensors
    top_scores, top_indices = torch.topk(similarity_scores, k=min(top_k, len(metadata_list)))

    # Convert results to Python lists for easy return and further processing
    top_results = []
    for i in range(len(top_scores)):
        score = top_scores[i].item() # .item() extracts scalar from 0-dim tensor
        doc_id, chunk_id = metadata_list[top_indices[i].item()]
        top_results.append((score, doc_id, chunk_id))

    return top_results

In [17]:
top_resultados = compute_matches(vector_store=vec_store,
                query_str="Convolutional Networks",
                top_k=3)

In [18]:
top_resultados

[(0.8371599316596985,
  '555b06a2-0e75-46fc-8246-26e87ebeeb28',
  'dca340f9-68a3-4046-8f33-37e73e33f300'),
 (0.8355835676193237,
  '555b06a2-0e75-46fc-8246-26e87ebeeb28',
  'ee273193-722c-4eab-9429-629de8e39bd4'),
 (0.8163367509841919,
  '555b06a2-0e75-46fc-8246-26e87ebeeb28',
  '7830ec90-d14f-4bcf-8c01-3ff541e74e74')]

In [19]:
documents['555b06a2-0e75-46fc-8246-26e87ebeeb28']['dca340f9-68a3-4046-8f33-37e73e33f300']

{'text': 'Turner.\n• Chapter\n,9 Convolutional Networks: Martín Arjovsky, Eugene Brevdo, Kon-\nstantin Divilov, Eric Jensen, Mehdi Mirza, Alex Paino, Marjorie Sayer, Ryan\nStout and Wentao Wu.\n• Chapter\n,\n10 Sequence Modeling: Recurrent and Recursive Nets: Gökçen\nEraslan, Steven Hickson, Razvan Pascanu, Lorenzo von Ritter, Rui Rodrigues,\nDmitriy Serdyuk, Dongyu Shi and Kaiyu Yang.\n• Chapter\n,\n: Daniel Beckstein.\n11 Practical Methodology\n• Chapter\n,\n: George Dahl, Vladimir Nekrasov and Ribana\n12 Applications\nRoscher.\n• Chapter\n,\n13 Linear Factor Models: Jayanth Koushik.\nix\nCONTENTS\n• Chapter\n,\n: Kunal Ghosh.\n15 Representation Learning\n• Chapter\n,\n: Minh Lê\n16 Structured Probabilistic Models for Deep Learning\nand Anton Varfolom.\n• Chapter\n,\n18 Confronting the Partition Function: Sam Bowman.\n• Chapter\n,\n: Yujia Bao.\n19 Approximate Inference\n• Chapter\n,\n20 Deep Generative Models: Nicolas Chapados, Daniel Galvez,\nWenming Ma, Fady Medhat, Shakir Mohamed