In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub repository id and the GGUF file inside it; both are passed to
# `from_pretrained` below.
model_id = "unsloth/Llama-3.2-3B-Instruct-GGUF"
filename = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"

# Load the causal LM and its tokenizer from the GGUF file. The recorded output
# of this cell shows the GGUF tensors being converted and de-quantized during
# the load (255 tensors, roughly 1.5 minutes in that run).
# `model` and `tokenizer` are reused by later cells.
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)

  from .autonotebook import tqdm as notebook_tqdm
Converting and de-quantizing GGUF tensors...: 100%|██████████| 255/255 [01:21<00:00,  3.13it/s]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [8]:
# Clone pygguf only when the checkout is missing, so re-running this cell does
# not fail with "destination path 'pygguf' already exists".
![ -d pygguf ] || git clone https://github.com/99991/pygguf.git
# Use %pip (not !pip) so the install targets the environment of the running
# kernel.
# NOTE(review): pygguf installs under the distribution name `gguf`; the recorded
# output shows it uninstalling gguf 0.17.1. Code written against the official
# `gguf` package API may stop working after this cell - confirm which package
# each later cell expects.
%pip install -e ./pygguf


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


fatal: destination path 'pygguf' already exists and is not an empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Obtaining file:///home/giorgio6846/Code/Sign-AI/Sign-chris/notebooks/pygguf
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: gguf
  Building editable for gguf (pyproject.toml) ... [?25ldone
[?25h  Created wheel for gguf: filename=gguf-0.1.0-0.editable-py3-none-any.whl size=5166 sha256=0fdbbaa9a1f09b48ff7860a3e2290e4dbbdecf85a89535ffc47b3426ee9d5c92
  Stored in directory: /tmp/pip-ephem-wheel-cache-0uztnxn8/wheels/b0/eb/bd/1cba46bf6571db3045945cb9fdd7ab7c0f21669dd9033615d2
Successfully built gguf
Installing collected packages: gguf
  Attempting uninstall: gguf
    Found existing installation: gguf 0.17.1
    Uninstalling gguf-0.17.1:
      Successfully uninstalled gguf-0.17.1
Successfully installed gguf-0.1.0


In [1]:
import torch
from llama_cpp import Llama
import gguf
import numpy as np

In [None]:
def load_embedding_weights_from_gguf(model_path):
    """
    Load the token-embedding matrix directly from a GGUF file.

    The tensor named 'token_embd.weight' is located with ``gguf.GGUFReader``
    and then de-quantized with ``gguf.quants.dequantize``. ``GGUFReader``
    exposes a tensor's ``data`` as it is stored in the file; for quantized
    tensor types that is packed block data, not float weights, so it must be
    de-quantized explicitly before it can serve as an embedding matrix.

    This is a best-effort loader: every failure is printed (with a traceback
    for unexpected errors) and reported to the caller as ``None`` rather than
    raised.

    NOTE(review): relies on the ``GGUFReader`` / ``quants.dequantize`` API of
    the ``gguf`` module that is imported in the notebook - confirm the
    installed package provides both.

    Args:
        model_path (str): Path to the model's .gguf file.

    Returns:
        numpy.ndarray | None: The de-quantized embedding weights, expected to
            be a 2-D array of shape (vocab_size, embedding_dim), or ``None``
            if the file cannot be read or the tensor is not present.
    """
    import traceback

    # Usual name of the token-embedding tensor in GGUF files.
    embedding_tensor_name = 'token_embd.weight'

    try:
        # 1. Open the GGUF file.
        reader = gguf.GGUFReader(model_path, 'r')

        # 2. Find the embedding tensor by name.
        found_tensor = next(
            (tensor for tensor in reader.tensors if tensor.name == embedding_tensor_name),
            None,
        )
        if found_tensor is None:
            # Caught by the generic handler below, so the caller still gets None.
            raise ValueError(f"No se encontró el tensor de embedding con el nombre '{embedding_tensor_name}' en {model_path}")
        print(f"Tensor encontrado: {found_tensor.name}, Shape: {found_tensor.shape}, Tipo: {found_tensor.tensor_type}")

        # 3. Convert the stored (possibly quantized) data into float weights.
        embedding_weights = gguf.quants.dequantize(found_tensor.data, found_tensor.tensor_type)

        # Report what was loaded so the caller can sanity-check the layout.
        print(f"Pesos del embedding cargados. Shape: {embedding_weights.shape}, Dtype: {embedding_weights.dtype}")

        return embedding_weights

    except FileNotFoundError:
        print(f"Error: No se pudo encontrar el archivo {model_path}")
    except Exception as e:
        # Deliberate best-effort: report and fall through to `return None`.
        print(f"Error al cargar el archivo GGUF: {e}")
        traceback.print_exc()
    return None

In [2]:
def load_embedding_from_gguf(model_path: str) -> np.ndarray:
    """Load the 'token_embd.weight' tensor from a GGUF file, oriented as (vocab_size, n_embd).

    Uses ``gguf.load_gguf`` / ``gguf.load_gguf_tensor`` (the API of the pygguf
    checkout installed earlier in this notebook). The hidden size is read from
    the file metadata so the orientation check is not tied to one model.

    Args:
        model_path: Path to the .gguf model file.

    Returns:
        The embedding matrix. If the loaded tensor's first dimension equals the
        model's embedding length it is transposed, so rows index tokens.

    Raises:
        ValueError: If 'token_embd.weight' is not present in the file.
        OSError: If the file cannot be opened.
    """
    # 1) Open the file in binary mode; the tensor must be read while it is open.
    with open(model_path, "rb") as f:
        info, tensorinfo = gguf.load_gguf(f)  # metadata + tensor descriptors

        # 2) Load the 'token_embd.weight' tensor directly.
        try:
            weights = gguf.load_gguf_tensor(f, tensorinfo, "token_embd.weight")
        except KeyError as err:
            raise ValueError("No se encontró 'token_embd.weight' en tensorinfo") from err

    # 3) Determine the embedding length. GGUF metadata stores it under
    #    "<architecture>.embedding_length"; the previous lookup key
    #    ("llama.n_embd") and the 3072 default are kept only as fallbacks.
    arch = info.get("general.architecture", "llama")
    n_embd = info.get(f"{arch}.embedding_length", info.get("llama.n_embd", 3072))

    # 4) weights.shape should be (n_embd, vocab_size) or (vocab_size, n_embd);
    #    transpose the former so the result is always (vocab_size, n_embd).
    if weights.shape[0] == n_embd:
        weights = weights.T

    print(f"Embedding final: shape={weights.shape}, dtype={weights.dtype}")
    return weights


In [3]:
# Absolute, machine-specific path to the local Q4_K_M GGUF file.
model_path = "/home/giorgio6846/Code/Sign-AI/local_models/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf"
# Embedding matrix reused by the decoding cells further down; the recorded
# output of this cell reports shape (128256, 3072) and dtype float32.
W = load_embedding_from_gguf(model_path)

Embedding final: shape=(128256, 3072), dtype=float32


In [24]:
# --- Usage example ---
# NOTE(review): the recorded output of this cell is an AttributeError about
# `tensor_by_name`, which the current loader does not call - the output looks
# stale; re-run the cell to refresh it.
model_path = "/home/giorgio6846/Code/Sign-AI/local_models/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf"
embedding_weights = load_embedding_weights_from_gguf(model_path)
# The loader signals failure by returning None instead of raising.
if embedding_weights is not None:
    print(f"Shape final: {embedding_weights.shape}")
else:
    print("No se pudieron cargar los pesos del embedding.")


AttributeError: 'GGUFReader' object has no attribute 'tensor_by_name'

In [5]:
# Same GGUF file as `model_path` in the cells above, loaded here through
# llama-cpp-python with embedding mode enabled. The recorded output warns that
# n_ctx=1024 is below the model's training context (131072).
path_model = "/home/giorgio6846/Code/Sign-AI/local_models/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf"
model_cpp = Llama(model_path=path_model, n_ctx=1024, embedding=True, logits_all=True, verbose=False,) # n_gpu_layers=-1 is left commented out

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


In [6]:
# Sample sentence that the later cells try to recover from its embeddings.
target = "un texto normal como cualquiera"
# Embed it with the llama.cpp model; the recorded shape is [8, 3072]
# (presumably one row per token - TODO confirm).
embeddings_cpp = torch.tensor(model_cpp.embed(target))
embeddings_cpp.shape

torch.Size([8, 3072])

In [6]:
# Tokenize the same sentence with the Hugging Face tokenizer and look the ids up
# in the Hugging Face model's input-embedding layer.
# Depends on `tokenizer` and `model` from the first cell and on `target`.
input_ids = tokenizer(target, return_tensors="pt").input_ids
input_ids = input_ids.to("cpu")
# input_ids[0] drops the leading batch dimension before the lookup.
embeddings = model.get_input_embeddings()(input_ids[0])

In [7]:
# Element-wise exact equality between the llama.cpp embeddings and the Hugging
# Face input embeddings; the recorded result is False everywhere shown.
# NOTE(review): exact float equality is very strict, and the two tensors may not
# represent the same quantity (output vs. input embeddings) - TODO confirm.
embeddings_cpp == embeddings

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [15]:
import torch.nn.functional as F

In [21]:
def embeddings_to_text_gpu(embeddings: torch.Tensor, model, tokenizer) -> str:
    """Decode embeddings to text via cosine nearest-neighbour search over the vocabulary.

    Each row of ``embeddings`` is matched against every row of the vocabulary
    embedding matrix; the id of the most similar row is taken as the token.

    Notes:
        * Despite the name, the computation is pinned to the CPU.
        * The vocabulary matrix is read from the notebook-level ``W`` (cast to
          bfloat16), not from ``model``; ``model`` is currently unused and is
          kept only so existing calls keep working.
        * ``embeddings`` is cast to the matrix dtype, so callers no longer have
          to pre-cast to bfloat16 to avoid a dtype-mismatch error in the matmul.

    Args:
        embeddings: Tensor of shape [T, D], one row per position to decode.
        model: Unused.
        tokenizer: Object exposing ``decode(token_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the selected token ids.
    """
    device = "cpu"  # torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Vocabulary embedding matrix, taken from the global `W`.  [V, D]
    embedding_matrix = torch.tensor(W, dtype=torch.bfloat16).to(device)

    # Same device *and* dtype as the matrix: torch.matmul rejects mixed dtypes.
    embeddings = embeddings.to(device=device, dtype=embedding_matrix.dtype)

    # L2-normalise both sides so the dot product below is cosine similarity.
    embedding_matrix_norm = F.normalize(embedding_matrix, p=2, dim=1)  # [V, D]
    print(embedding_matrix_norm.shape)

    embeddings_norm = F.normalize(embeddings, p=2, dim=1)  # [T, D]

    similarities = torch.matmul(embeddings_norm, embedding_matrix_norm.T)  # [T, V]

    # Best-matching vocabulary row for each position.
    token_ids = torch.argmax(similarities, dim=1).tolist()
    print(token_ids)

    return tokenizer.decode(token_ids,)  # skip_special_tokens=True)

In [20]:
# Inspect the torch dtype W maps to. `as_tensor` wraps the existing array
# instead of copying the whole embedding matrix just to read its dtype.
torch.as_tensor(W).dtype

torch.float32

In [23]:
import torch
import torch.nn.functional as F
import numpy as np

def embeddings_to_text(
    embeddings: torch.Tensor,
    W: np.ndarray,
    tokenizer,
    normalize: bool = False,
) -> str:
    """Decode embeddings to text by nearest-neighbour lookup in an embedding matrix.

    Args:
        embeddings: Tensor of shape [T, D], one row per position to decode.
        W: Vocabulary embedding matrix of shape [V, D].
        tokenizer: Object exposing ``decode(token_ids)``.
        normalize: When False (default, the original behaviour) rows are
            scored with the raw dot product, which favours vocabulary rows
            with a large norm. When True both sides are L2-normalised first,
            i.e. scoring is by cosine similarity.

    Returns:
        Whatever ``tokenizer.decode`` returns for the selected token ids.
    """
    # 1) Force float32 on both sides. `copy=False` avoids duplicating W when it
    #    is already float32; the resulting tensor is only read, never written.
    E = embeddings.to(torch.float32)                          # [T, D]
    M = torch.from_numpy(W.astype(np.float32, copy=False))    # [V, D]

    if normalize:
        E = F.normalize(E, p=2, dim=1)
        M = F.normalize(M, p=2, dim=1)

    # 2) Score every position against every vocabulary row and keep the best.
    sims = E @ M.T                                            # [T, V]
    token_ids = sims.argmax(dim=1).tolist()

    return tokenizer.decode(token_ids)

# Usage:
# W was already loaded with load_gguf_tensor → float32 (vocab_size, 3072)
# NOTE(review): the text printed in the recorded run does not reproduce
# `target`; presumably `model_cpp.embed` returns contextual output embeddings
# rather than rows of the input embedding table - TODO confirm.
emb = torch.tensor(model_cpp.embed(target))      # [T, D]
text = embeddings_to_text(emb, W, model_cpp.tokenizer_)
print(text)


Theited de
 elquiera de


In [22]:
# Decode the llama.cpp embeddings with the cosine-similarity variant. The input
# is cast to bfloat16, the dtype the function uses for its vocabulary matrix.
# The recorded result does not reproduce `target`.
embeddings_to_text_gpu(embeddings_cpp.to(dtype=torch.bfloat16), model_cpp, model_cpp.tokenizer_)

torch.Size([128256, 3072])
[791, 1639, 15482, 271, 10566, 447, 26919, 11158]


'Theited sobre\n\n estequiera más'