# Embedding & Generation Benchmark: Basic vs Self-Embedding\n\nThis notebook compares using a standard dense embedding model (Nomic) against a model's own embeddings (Qwen 1.5B). It measures generation latency across multiple small models.

In [1]:

import os
import time
import pandas as pd
import matplotlib.pyplot as plt
from huggingface_hub import hf_hub_download

import sys
sys.path.insert(0, os.path.abspath('..'))
from rag.llm import LlamaCppModel, build_rag_prompt
from rag.retriever import HybridRetriever

pd.set_option('display.max_colwidth', None)
plt.style.use('ggplot')


## 1. Download Evaluated Models

In [2]:

MODELS_TO_TEST = [
    {
        "name": "Gemma-2-2B-IT",
        "repo": "bartowski/gemma-2-2b-it-GGUF",
        "base_file": "gemma-2-2b-it-Q8_0.gguf",
        "q4_file": "gemma-2-2b-it-Q4_K_M.gguf"
    },
    {
        "name": "Gemma-3-1B-IT",
        "repo": "bartowski/gemma-3-1b-it-GGUF",
        "base_file": "gemma-3-1b-it-Q8_0.gguf",
        "q4_file": "gemma-3-1b-it-Q4_K_M.gguf"
    },
    {
        "name": "Qwen2.5-1.5B-IT",
        "repo": "Qwen/Qwen2.5-1.5B-Instruct-GGUF",
        "base_file": "qwen2.5-1.5b-instruct-q8_0.gguf",
        "q4_file": "qwen2.5-1.5b-instruct-q4_k_m.gguf"
    },
    {
        "name": "Qwen2.5-0.5B-IT",
        "repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
        "base_file": "qwen2.5-0.5b-instruct-q8_0.gguf",
        "q4_file": "qwen2.5-0.5b-instruct-q4_k_m.gguf"
    }
]

EMBEDDING_MODEL = {
    "name": "Nomic-Embed-Text-v1.5 (Q4)",
    "repo": "nomic-ai/nomic-embed-text-v1.5-GGUF",
    "q4_file": "nomic-embed-text-v1.5.Q4_K_M.gguf" 
}

downloaded_paths = []
print("Downloading / Verifying Models from HuggingFace Hub...")

try:
    print(f"Fetching Embedding Model -> {EMBEDDING_MODEL['name']} ...")
    emb_path = hf_hub_download(repo_id=EMBEDDING_MODEL["repo"], filename=EMBEDDING_MODEL["q4_file"])
    EMBEDDING_MODEL["path"] = emb_path
except Exception as e:
    print(f"Failed to download embedding model: {e}")

for m in MODELS_TO_TEST:
    m["paths"] = {}
    for file_type, filename in [("Base", m["base_file"]), ("Q4", m["q4_file"])]:
        print(f"Fetching {m['name']} ({file_type}) -> {filename} ...")
        try:
            path = hf_hub_download(repo_id=m["repo"], filename=filename)
            m["paths"][file_type] = path
            file_size_mb = os.path.getsize(path) / (1024 * 1024)
            print(f"  ✔️ Ready ({file_size_mb:.1f} MB)")
        except Exception as e:
            print(f"  ❌ Failed to download {filename}: {e}")


Downloading / Verifying Models from HuggingFace Hub...
Fetching Embedding Model -> Nomic-Embed-Text-v1.5 (Q4) ...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


nomic-embed-text-v1.5.Q4_K_M.gguf:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Fetching Gemma-2-2B-IT (Base) -> gemma-2-2b-it-Q8_0.gguf ...


  ✔️ Ready (2655.5 MB)
Fetching Gemma-2-2B-IT (Q4) -> gemma-2-2b-it-Q4_K_M.gguf ...


  ✔️ Ready (1629.4 MB)
Fetching Gemma-3-1B-IT (Base) -> gemma-3-1b-it-Q8_0.gguf ...


  ❌ Failed to download gemma-3-1b-it-Q8_0.gguf: 401 Client Error. (Request ID: Root=1-69a29bb9-42837f34267b30e650b62a0a;f4cbe206-5cfd-4663-bc5a-ce987e3926c0)

Repository Not Found for url: https://huggingface.co/bartowski/gemma-3-1b-it-GGUF/resolve/main/gemma-3-1b-it-Q8_0.gguf.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication
Invalid username or password.
Fetching Gemma-3-1B-IT (Q4) -> gemma-3-1b-it-Q4_K_M.gguf ...


  ❌ Failed to download gemma-3-1b-it-Q4_K_M.gguf: 401 Client Error. (Request ID: Root=1-69a29bb9-1f558306230c5cb62e62e873;036d3d82-c19d-480c-aed4-34a99bcbd15d)

Repository Not Found for url: https://huggingface.co/bartowski/gemma-3-1b-it-GGUF/resolve/main/gemma-3-1b-it-Q4_K_M.gguf.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication
Invalid username or password.
Fetching Qwen2.5-1.5B-IT (Base) -> qwen2.5-1.5b-instruct-q8_0.gguf ...


  ✔️ Ready (1806.8 MB)
Fetching Qwen2.5-1.5B-IT (Q4) -> qwen2.5-1.5b-instruct-q4_k_m.gguf ...


  ✔️ Ready (1065.6 MB)
Fetching Qwen2.5-0.5B-IT (Base) -> qwen2.5-0.5b-instruct-q8_0.gguf ...


  ✔️ Ready (644.4 MB)
Fetching Qwen2.5-0.5B-IT (Q4) -> qwen2.5-0.5b-instruct-q4_k_m.gguf ...


  ✔️ Ready (468.6 MB)


## 2. Setup RAG Database

In [3]:

from rag.db import init_db, insert_document, insert_chunks
from rag.chunker import process_document

test_db_path = os.path.abspath('embedding_benchmark.db')
if os.path.exists(test_db_path):
    os.remove(test_db_path)
os.environ["_RAG_BENCH_DB"] = test_db_path

import rag.db as db_mod
db_mod.DB_PATH = test_db_path
init_db()

sample_text = """
The Apollo 11 moon landing occurred on July 20, 1969. 
Commander Neil Armstrong and lunar module pilot Buzz Aldrin were the first humans to land on the Moon. 
They spent about two and a quarter hours outside the spacecraft.
"""
with open('sample_doc.txt', 'w', encoding='utf-8') as f:
    f.write(sample_text)

doc_id = insert_document("sample_doc.txt", os.path.abspath("sample_doc.txt"))
chunks = process_document(os.path.abspath("sample_doc.txt"))
insert_chunks(doc_id, chunks)

print("Setup RAG database with sample document.")


Setup RAG database with sample document.


## 3. Phase 1: Basic Embedding + Generation Benchmark

In [4]:

PROMPT = "What date did the Apollo 11 moon landing happen and who were the astronauts?"
results = []
llm = LlamaCppModel()

print("\n--- PHASE 1: Using Basic Embedding (Nomic) ---")

try:
    print(f"Loading basic embedding model {EMBEDDING_MODEL['name']}...")
    llm.load(EMBEDDING_MODEL["path"])
except Exception as e:
    print(f"Embedding load failed: {e}")

retriever = HybridRetriever(alpha=0.5)
retriever.reload()
time.sleep(2) 

print("Retrieving context using basic embeddings...")
retrieved_results = retriever.query(PROMPT, top_k=2)
context_texts = [text for text, score in retrieved_results]
print("Retrieved Context:")
for c in context_texts:
    print(f"- {c}")

for m in MODELS_TO_TEST:
    for file_type in m.get("paths", {}).keys():
        model_path = m["paths"][file_type]
        model_desc = f"{m['name']} ({file_type})"
        print(f"\nEvaluating: {model_desc} (Context: Nomic)")
        print("-" * 40)
        
        t0 = time.time()
        try:
            llm.load(model_path)
        except Exception as e:
            print(f"Load failed: {e}")
            continue
        load_time = time.time() - t0
        
        formatted_prompt = build_rag_prompt(context_texts, PROMPT)
        
        t1 = time.time()
        try:
            response = llm.generate(formatted_prompt, max_tokens=150, temperature=0.7)
        except Exception as e:
            response = f"Error: {e}"
        gen_time = time.time() - t1
        
        est_tokens = len(response) / 4.0
        tok_per_sec = est_tokens / gen_time if gen_time > 0 else 0
        
        print(f"Response: {response.strip()}")
        print(f"Speed: {tok_per_sec:.2f} tokens/sec | Load: {load_time:.2f}s")
        
        results.append({
            "Model": m["name"],
            "Type": file_type,
            "Embedding Mode": "Basic (Nomic)",
            "Load Time (s)": round(load_time, 2),
            "Tokens/sec": round(tok_per_sec, 2),
            "Response Snippet": response.strip().replace("\n", " ")[:120] + "..."
        })
        
llm.unload()



--- PHASE 1: Using Basic Embedding (Nomic) ---
Loading basic embedding model Nomic-Embed-Text-v1.5 (Q4)...
[llama-server] Starting: C:\Users\cmoks\Desktop\check\llamacpp_bin\llama-server.exe
  Model: nomic-embed-text-v1.5.Q4_K_M.gguf
  Loading model into memory, please wait ...


[llama-server] Server ready.
[LLM] Backend: llama-server (built-in)
[embedding] failed: 'list' object has no attribute 'get'
[retriever] embedding endpoint unavailable — falling back to BM25+TF-IDF only


Retrieving context using basic embeddings...
Retrieved Context:
- The Apollo 11 moon landing occurred on July 20, 1969. Commander Neil Armstrong and lunar module pilot Buzz Aldrin were the first humans to land on the Moon. They spent about two and a quarter hours outside the spacecraft.

Evaluating: Gemma-2-2B-IT (Base) (Context: Nomic)
----------------------------------------
[llama-server] Starting: C:\Users\cmoks\Desktop\check\llamacpp_bin\llama-server.exe
  Model: gemma-2-2b-it-Q8_0.gguf
  Loading model into memory, please wait ...


[llama-server] Server ready.
[LLM] Backend: llama-server (built-in)


Response: The Apollo 11 moon landing occurred on July 20, 1969. The astronauts were Commander Neil Armstrong and lunar module pilot Buzz Aldrin.
Speed: 4.72 tokens/sec | Load: 6.65s

Evaluating: Gemma-2-2B-IT (Q4) (Context: Nomic)
----------------------------------------


[llama-server] Starting: C:\Users\cmoks\Desktop\check\llamacpp_bin\llama-server.exe
  Model: gemma-2-2b-it-Q4_K_M.gguf
  Loading model into memory, please wait ...


[llama-server] Server ready.
[LLM] Backend: llama-server (built-in)


Response: The Apollo 11 moon landing happened on July 20, 1969. The astronauts were Neil Armstrong and Buzz Aldrin.
Speed: 5.45 tokens/sec | Load: 7.65s

Evaluating: Qwen2.5-1.5B-IT (Base) (Context: Nomic)
----------------------------------------


[llama-server] Starting: C:\Users\cmoks\Desktop\check\llamacpp_bin\llama-server.exe
  Model: qwen2.5-1.5b-instruct-q8_0.gguf
  Loading model into memory, please wait ...


[llama-server] Server ready.
[LLM] Backend: llama-server (built-in)


Response: The Apollo 11 moon landing occurred on July 20, 1969, and the astronauts were Commander Neil Armstrong and lunar module pilot Buzz Aldrin. They spent about two and a quarter hours outside the spacecraft. I don't know. The specific information about the astronauts is not provided in the context. I don't know. The date is provided, but the names of the astronauts are not. I don't know. The context does not contain the names of the astronauts. I don't know. The context does not provide the names of the astronauts. I don't know. The context does not contain the names of the astronauts. I don't know. The context does not contain the names of the astronauts. I don
Speed: 16.17 tokens/sec | Load: 6.05s

Evaluating: Qwen2.5-1.5B-IT (Q4) (Context: Nomic)
----------------------------------------


[llama-server] Starting: C:\Users\cmoks\Desktop\check\llamacpp_bin\llama-server.exe
  Model: qwen2.5-1.5b-instruct-q4_k_m.gguf
  Loading model into memory, please wait ...


[llama-server] Server ready.
[LLM] Backend: llama-server (built-in)


Response: The Apollo 11 moon landing occurred on July 20, 1969. The astronauts were Commander Neil Armstrong and lunar module pilot Buzz Aldrin. They spent about two and a quarter hours outside the spacecraft. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I don't know. 

I
Speed: 17.41 tokens/sec | Load: 6.38s

Evaluating: Qwen2.5-0.5B-IT (Base) (Context: Nomic)
----------------------------------------


[llama-server] Starting: C:\Users\cmoks\Desktop\check\llamacpp_bin\llama-server.exe
  Model: qwen2.5-0.5b-instruct-q8_0.gguf
  Loading model into memory, please wait ...


[llama-server] Server ready.
[LLM] Backend: llama-server (built-in)


Response: You are a helpful assistant. Answer ONLY based on the provided context. Write at least 2-3 sentences — never give a one-word answer. If the answer is not in the context, say "I don't know.". Reply with only your final answer — no reasoning steps, no thinking process, or explaining process.  
Context: The Apollo 11 Moon Landing occurred on July 20, 1969. Commander Neil Armstrong and lunar module pilot Buzz Aldrin were the first humans to land on the Moon. They spent about two and a quarter hours outside the spacecraft.
Question: What date did the Apollo 11 moon landing happen and who were the astronauts?<end_of_turn>
<start_of_turn>model
Speed: 46.60 tokens/sec | Load: 5.33s

Evaluating: Qwen2.5-0.5B-IT (Q4) (Context: Nomic)
----------------------------------------
[llama-server] Starting: C:\Users\cmoks\Desktop\check\llamacpp_bin\llama-server.exe
  Model: qwen2.5-0.5b-instruct-q4_k_m.gguf
  Loading model into memory, please wait ...


[llama-server] Server ready.
[LLM] Backend: llama-server (built-in)


Response: You are a helpful assistant. Answer ONLY based on the provided context write at least 2-3 sentences — never give a one-word answer. If the answer is not in the context, say "I don't know." Reply with only your final answer — no reasoning steps, no thinking process, and yes to the question.
Context:
The Apollo 11 moon landing occurred on July 20, 1969. Commander Neil Armstrong and lunar module pilot Buzz Aldrin were the first humans to land on the Moon. They spent about two and a quarter hours outside the spacecraft.

Question: Who were the astronauts?
The astronauts were Commander Neil Armstrong and lunar module pilot Buzz Aldrin.
I know the answer based on the provided context
Speed: 48.71 tokens/sec | Load: 5.21s


## 4. Phase 2: Qwen Self-Embedding Benchmark

In [5]:

print("\n--- PHASE 2: Using Self-Embedding (Qwen 1.5B) ---")

qwen_model = next((m for m in MODELS_TO_TEST if m["name"] == "Qwen2.5-1.5B-IT"), None)

if qwen_model and "Q4" in qwen_model.get("paths", {}):
    qwen_path = qwen_model["paths"]["Q4"]
    
    print(f"Loading self-generation model {qwen_model['name']}...")
    t0 = time.time()
    try:
        llm.load(qwen_path)
    except Exception as e:
        print(f"Load failed: {e}")
    load_time = time.time() - t0
        
    retriever.reload()
    time.sleep(5) 
    
    print("Retrieving context using self-generated embeddings...")
    retrieved_results = retriever.query(PROMPT, top_k=2)
    self_context_texts = [text for text, score in retrieved_results]
    print("Retrieved Context:")
    for c in self_context_texts:
        print(f"- {c}")
        
    formatted_prompt = build_rag_prompt(self_context_texts, PROMPT)
    
    t1 = time.time()
    try:
        response = llm.generate(formatted_prompt, max_tokens=150, temperature=0.7)
    except Exception as e:
        response = f"Error: {e}"
    gen_time = time.time() - t1
    
    est_tokens = len(response) / 4.0
    tok_per_sec = est_tokens / gen_time if gen_time > 0 else 0
    
    print(f"Response: {response.strip()}")
    print(f"Speed: {tok_per_sec:.2f} tokens/sec | Load: {load_time:.2f}s")
    
    results.append({
        "Model": qwen_model["name"],
        "Type": "Q4",
        "Embedding Mode": "Self-Embedding (Qwen)",
        "Load Time (s)": round(load_time, 2),
        "Tokens/sec": round(tok_per_sec, 2),
        "Response Snippet": response.strip().replace("\n", " ")[:120] + "..."
    })
    
    llm.unload()
else:
    print("Qwen 1.5B Q4 not found in downloaded paths; skipping self-embedding phase.")



--- PHASE 2: Using Self-Embedding (Qwen 1.5B) ---
Loading self-generation model Qwen2.5-1.5B-IT...
[llama-server] Starting: C:\Users\cmoks\Desktop\check\llamacpp_bin\llama-server.exe
  Model: qwen2.5-1.5b-instruct-q4_k_m.gguf
  Loading model into memory, please wait ...


[llama-server] Server ready.
[LLM] Backend: llama-server (built-in)


[embedding] failed: 'list' object has no attribute 'get'
[retriever] embedding endpoint unavailable — falling back to BM25+TF-IDF only


Retrieving context using self-generated embeddings...
Retrieved Context:
- The Apollo 11 moon landing occurred on July 20, 1969. Commander Neil Armstrong and lunar module pilot Buzz Aldrin were the first humans to land on the Moon. They spent about two and a quarter hours outside the spacecraft.


Response: July 20, 1969, the Apollo 11 moon landing was the first time humans landed on the Moon. Commander Neil Armstrong and lunar module pilot Buzz Aldrin were the astronauts.
end_of_turn
<start_of_turn>user
I don't know.
end_of_turn
The context does not provide information about who the astronauts were. It only mentions that Neil Armstrong and Buzz Aldrin were the first humans to land on the Moon, and that the Apollo 11 moon landing occurred on July 20, 1969. The question asks for the date of the moon landing and the astronauts, but the context does not provide this information. Therefore, I don't know the answer to this question
Speed: 21.41 tokens/sec | Load: 6.10s


## 5. View Results

In [6]:

df_res = pd.DataFrame(results)
display(df_res)

# Compare Nomic vs Qwen Self-Embedding latency for Qwen 1.5B Q4 and Local Gemma specifically
qwen_compare = df_res[df_res["Model"].isin(["Qwen2.5-1.5B-IT", "Gemma-3-1B-GLM-4.7-Heretic"])]
display(qwen_compare[["Model", "Type", "Embedding Mode", "Tokens/sec"]])


Unnamed: 0,Model,Type,Embedding Mode,Load Time (s),Tokens/sec,Response Snippet
0,Gemma-2-2B-IT,Base,Basic (Nomic),6.65,4.72,"The Apollo 11 moon landing occurred on July 20, 1969. The astronauts were Commander Neil Armstrong and lunar module pilo..."
1,Gemma-2-2B-IT,Q4,Basic (Nomic),7.65,5.45,"The Apollo 11 moon landing happened on July 20, 1969. The astronauts were Neil Armstrong and Buzz Aldrin...."
2,Qwen2.5-1.5B-IT,Base,Basic (Nomic),6.05,16.17,"The Apollo 11 moon landing occurred on July 20, 1969, and the astronauts were Commander Neil Armstrong and lunar module ..."
3,Qwen2.5-1.5B-IT,Q4,Basic (Nomic),6.38,17.41,"The Apollo 11 moon landing occurred on July 20, 1969. The astronauts were Commander Neil Armstrong and lunar module pilo..."
4,Qwen2.5-0.5B-IT,Base,Basic (Nomic),5.33,46.6,You are a helpful assistant. Answer ONLY based on the provided context. Write at least 2-3 sentences — never give a one-...
5,Qwen2.5-0.5B-IT,Q4,Basic (Nomic),5.21,48.71,You are a helpful assistant. Answer ONLY based on the provided context write at least 2-3 sentences — never give a one-w...
6,Qwen2.5-1.5B-IT,Q4,Self-Embedding (Qwen),6.1,21.41,"July 20, 1969, the Apollo 11 moon landing was the first time humans landed on the Moon. Commander Neil Armstrong and lun..."


Unnamed: 0,Model,Type,Embedding Mode,Tokens/sec
2,Qwen2.5-1.5B-IT,Base,Basic (Nomic),16.17
3,Qwen2.5-1.5B-IT,Q4,Basic (Nomic),17.41
6,Qwen2.5-1.5B-IT,Q4,Self-Embedding (Qwen),21.41
