In [2]:
from llama_cpp import Llama
import llama_cpp
from datasets import load_dataset
import evaluate
import requests
import json
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter Notebook
import time

In [4]:
print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())

# Path to your GGUF model (adjust path if needed)
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-gguf/llama-3.1-8B-Q8_0.gguf"

# Initialize Llama with GPU layers offloaded
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_gpu_layers=80,     # Offload to GPU! Adjust as needed based on your VRAM
    verbose=False         # Prints backend info
)

GPU offload supported: True


In [5]:
# Load test data
dataset = load_dataset("gigaword", split="test[:100]")  # Limit to 100 for fast eval

# Initialize ROUGE metric
rouge = evaluate.load('rouge')

In [6]:
def summarize_with_llama_cpp(document, max_tokens=50):
    prompt_template = (
        "You are an AI assistant specialized in summarizing news articles. "
        "Summarize the following news sentence into a concise headline.\n\n"

        "Here is an example:\n"
        "News: Japan 's nec corp. and UNK computer corp. of the united states said wednesday they had agreed to join forces in supercomputer sales.\n"
        "Headline: Nec UNK in computer sales tie-up\n\n"

        "Now summarize the following news:\n\n"

        "News: {document}\n\n"
        "Headline:"
    )
    
    prompt = prompt_template.format(document=document)

    payload = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.3,
        "stream": False,
        "seed": 0,
    }

    response = llm(**payload)

    summary = response['choices'][0]['text'].strip()
    return summary


In [7]:
# Generate summaries and evaluate
references = []
predictions = []

start = time.time()

for item in dataset:

    doc = item['document']
    ref_summary = item['summary']

    pred_summary = summarize_with_llama_cpp(doc)

    if pred_summary:
        references.append(ref_summary)
        predictions.append(pred_summary)
    
end = time.time()

In [9]:
# Evaluate with ROUGE
results = rouge.compute(predictions=predictions, references=references)

print("llama.cpp (Llama-3.1-8B) float16 Summarization Results:")

print(f"\nNumber of examples: {len(references)}")
print(f"\nElapsed time: {end - start:.2f} s")

print("\nROUGE Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llama.cpp (Llama-3.1-8B) float16 Summarization Results:

Number of examples: 100

Elapsed time: 162.28 s

ROUGE Results:
rouge1: 0.1792
rouge2: 0.0601
rougeL: 0.1577
rougeLsum: 0.1629


### Now trying the 8bit quantization

In [None]:
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-gguf/llama-3.1-8B-Q8_0.gguf"

llm = Llama(
    model_path=model_path,
    n_ctx=8192,         # 8K tokens context is fine unless you need more
    n_gpu_layers=80,    # L4 GPU 24 GB can usually handle **60-80 layers** for 8B models
    verbose=False
)

In [11]:
# Generate summaries and evaluate
references = []
predictions = []

start = time.time()

for item in dataset:

    doc = item['document']
    ref_summary = item['summary']

    pred_summary = summarize_with_llama_cpp(doc)

    if pred_summary:
        references.append(ref_summary)
        predictions.append(pred_summary)
    
end = time.time()

In [12]:
# Evaluate with ROUGE
results = rouge.compute(predictions=predictions, references=references)

print("llama.cpp (Llama-3.1-8B) 8Bit Summarization Results:")

print(f"\nNumber of examples: {len(references)}")
print(f"\nElapsed time: {end - start:.2f} s")

print("\nROUGE Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llama.cpp (Llama-3.1-8B) 8Bit Summarization Results:

Number of examples: 100

Elapsed time: 161.84 s

ROUGE Results:
rouge1: 0.1788
rouge2: 0.0608
rougeL: 0.1580
rougeLsum: 0.1631


### Now 4Bit quantization

In [16]:
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-gguf/llama-3.1-8B-Q4_K_M.gguf"

llm = Llama(
    model_path=model_path,
    n_ctx=8192,         # 8K tokens context is fine unless you need more
    n_gpu_layers=80,    # L4 GPU 24 GB can usually handle **60-80 layers** for 8B models
    verbose=False
)

In [17]:
# Generate summaries and evaluate
references = []
predictions = []

start = time.time()

for item in dataset:

    doc = item['document']
    ref_summary = item['summary']

    pred_summary = summarize_with_llama_cpp(doc)

    if pred_summary:
        references.append(ref_summary)
        predictions.append(pred_summary)
    
end = time.time()

In [18]:
# Evaluate with ROUGE
results = rouge.compute(predictions=predictions, references=references)

print("llama.cpp (Llama-3.1-8B) 4Bit Summarization Results:")

print(f"\nNumber of examples: {len(references)}")
print(f"\nElapsed time: {end - start:.2f} s")

print("\nROUGE Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llama.cpp (Llama-3.1-8B) 4Bit Summarization Results:

Number of examples: 98

Elapsed time: 64.42 s

ROUGE Results:
rouge1: 0.2699
rouge2: 0.1160
rougeL: 0.2491
rougeLsum: 0.2501


### LLaMA 3.1 8B Summarization Benchmark

<small>

#### 📝 Tested Models
| **Model File**                      | **Precision** | **Quantization Scheme** | **Notes**                                     |
|-------------------------------------|---------------|-------------------------|-----------------------------------------------|
| llama-3.1-8B-f16.gguf               | float16       | Full Precision          | Largest model, highest theoretical accuracy, slowest inference |
| llama-3.1-8B-Q8_0.gguf              | 8-bit         | Q8_0                   | Reduced size, good balance of speed and quality |
| llama-3.1-8B-Q4_K_M.gguf            | 4-bit         | Q4_K_M                 | Highly optimized 4-bit quantization, best summarization quality in tests |

---

#### 📊 Summarization Results

| **Platform / Model**               | **Elapsed Time (s)** | **ROUGE-1** | **ROUGE-2** | **ROUGE-L** | **ROUGE-Lsum** |
|------------------------------------|----------------------|-------------|-------------|-------------|----------------|
| Ollama (LLaMA 3.1 8B Q4_K_M)       | 49.06                | 0.2886      | 0.1040      | 0.2632      | 0.2658         |
| llama.cpp (Q4_K_M)                 | 64.42                | 0.2699      | 0.1160      | 0.2491      | 0.2501         |
| llama.cpp (Q8_0)                   | 161.84               | 0.1788      | 0.0608      | 0.1580      | 0.1631         |
| llama.cpp (float16)                | 161.65               | 0.1801      | 0.0599      | 0.1591      | 0.1629         |

---

#### 🔍 Summary of Insights

- **4-bit quantized models (Q4_K_M)** in both **Ollama** and **llama.cpp** delivered **better summarization quality** and **faster inference** than higher-precision models.
- The **Q4_K_M quantization scheme** preserves summarization performance surprisingly well and matches Ollama's results.
- **8-bit (Q8_0)** and **float16** models performed worse in ROUGE scores, despite having more precision. This may be due to:
  - Differences in **prompt formatting**
  - **Sampling parameters**
  - Potential model variant differences (instruction-tuned vs base models)

---

#### ✅ Recommendations

1. Use **Q4_K_M quantized models** in llama.cpp for comparable performance to Ollama.
2. Match **prompt templates** used in Ollama:

</small>

### 🧠 Q4_K_M vs Q8_0 Quantization Comparison

<small>

#### What Are They?
Both **Q4_K_M** and **Q8_0** are quantization methods used to compress model weights for faster inference and lower memory usage.

---

#### Q4_K_M (4-bit Quantization, Optimized)

| Feature          | Description |
|------------------|-------------|
| **Precision**    | 4-bit |
| **Quantization Type** | "K" series, specifically **K_M** (multi-purpose optimized) |
| **Compression**  | Very high (significantly smaller than 8-bit) |
| **Speed**        | Extremely fast, ideal for CPU/GPU |
| **Memory Usage** | Very low (fits on smaller GPUs like 6-8GB VRAM) |
| **Accuracy**     | Preserves high accuracy in **instruction-tuned tasks** like **summarization**, **chat**, and **QA** |
| **Best Use Cases** | Chatbots, summarization, reasoning tasks |
| **Notes**        | Uses **groupwise quantization** and **per-channel scaling** for better accuracy retention despite low precision |

---

#### Q8_0 (8-bit Quantization, General Purpose)

| Feature          | Description |
|------------------|-------------|
| **Precision**    | 8-bit |
| **Quantization Type** | Uniform 8-bit |
| **Compression**  | Moderate (smaller than float16 but larger than 4-bit) |
| **Speed**        | Faster than float16, but slower than Q4_K_M |
| **Memory Usage** | Moderate (needs more VRAM, typically 12GB+) |
| **Accuracy**     | Higher precision retention in general, but not optimized for specific tasks |
| **Best Use Cases** | Complex reasoning, precision-sensitive tasks |
| **Notes**        | General-purpose quantization without task-specific optimizations |

---

#### ⚖️ Comparison Table: Q4_K_M vs Q8_0

| Feature          | **Q4_K_M**                  | **Q8_0**                  |
|------------------|-----------------------------|---------------------------|
| **Precision**    | 4-bit                       | 8-bit                    |
| **Size**         | Very small                  | Medium                   |
| **Speed**        | Very fast (low latency)     | Fast (higher latency than 4-bit) |
| **VRAM/Memory**  | Very low usage (fits on smaller GPUs/CPUs) | Medium (requires more VRAM) |
| **Accuracy**     | High for summarization, chat, reasoning (optimized quantization) | General higher precision (not task-optimized) |
| **Task Tuning**  | Task-specific optimizations (instruction-following, summarization) | General-purpose |
| **Best Use**     | Chatbots, summarization, QA tasks with constrained resources | Complex reasoning or precision-sensitive tasks |
| **Ollama Default?** | ✅ Frequently used (Q4_K_M or Q4_K_S) | ❌ Usually not used |

---

#### ✅ Why Q4_K_M Outperformed Q8_0 in Summarization
- **Q4_K_M** is optimized for **task-specific performance**, often giving better results for **instruction-tuned models**, **summarization**, and **chat** tasks.
- **Q8_0** retains more raw precision but isn't tuned for these tasks, leading to lower scores in ROUGE evaluation.
- **Q4_K_M** also runs significantly faster with less resource usage.

---

</small>

### 🧠 How Q4_K_M Is Optimized

<small>

Q4_K_M is part of the advanced **K series** quantization schemes, designed to balance **speed**, **size**, and **accuracy**. It introduces several optimizations to maintain high task performance despite being a 4-bit quantization.

#### Key Optimizations

| Optimization                   | Description |
|--------------------------------|-------------|
| **Groupwise Quantization**     | Weights are divided into small groups (e.g., 32 or 64) and quantized individually, improving precision retention. |
| **Per-Channel Scaling**        | Each group or channel has its own scale factor, ensuring finer control over the quantization process. |
| **Mixed Weight Packing (M)**   | Uses different packing strategies optimized for different layers (e.g., attention vs MLP layers). |
| **Dynamic Zero Points**        | Zero points are dynamically computed within groups, reducing quantization bias. |
| **Efficient SIMD Utilization** | The packed format is optimized for vectorized operations on CPU and GPU, increasing inference speed. |

#### Why It Works Well
- Optimized for **instruction-following**, **summarization**, and **chat** tasks.
- Preserves task-critical accuracy despite aggressive compression.
- Runs **very efficiently** on both CPU and GPU.

#### Q4_K_M vs Q8_0

| Feature            | Q4_K_M                   | Q8_0                 |
|--------------------|--------------------------|----------------------|
| Precision          | 4-bit                    | 8-bit               |
| Compression        | High                     | Medium              |
| Accuracy Retention | High (task-optimized)    | High (general)      |
| Speed              | Very fast                | Fast                |
| Memory Usage       | Very low                 | Medium              |
| Task Tuning        | Summarization, Chat, QA  | General-purpose     |
| Ollama Use         | ✅ Often used (default)  | ❌ Less common       |

</small>

In [1]:
from datasets import load_dataset
import evaluate
import requests
import json
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter Notebook
import time

In [2]:
from datasets import load_dataset
import random
import json

# Load SQuAD v2 dataset (validation split)
squad_v2 = load_dataset("squad_v2")

# Set random seed for reproducibility
random.seed(42)

# Convert the validation split to a list and sample 100 random questions
validation_list = list(squad_v2["validation"])
sampled_questions = random.sample(validation_list, 100)

print(sampled_questions[1])

{'id': '57111b95a58dae1900cd6c53', 'title': 'Huguenot', 'context': 'Frederick William, Elector of Brandenburg, invited Huguenots to settle in his realms, and a number of their descendants rose to positions of prominence in Prussia. Several prominent German military, cultural, and political figures were ethnic Huguenot, including poet Theodor Fontane, General Hermann von François, the hero of the First World War Battle of Tannenberg, Luftwaffe General and fighter ace Adolf Galland, Luftwaffe flying ace Hans-Joachim Marseille, and famed U-boat captain Lothar von Arnauld de la Perière. The last Prime Minister of the (East) German Democratic Republic, Lothar de Maizière, is also a descendant of a Huguenot family, as is the German Federal Minister of the Interior, Thomas de Maizière.', 'question': 'Who was the final Prime Minister of East Germany?', 'answers': {'text': ['Lothar de Maizière', 'Lothar de Maizière', 'Lothar de Maizière'], 'answer_start': [588, 588, 588]}}


In [7]:
# Count how many sampled questions are answerable
answerable_count = sum(1 for q in sampled_questions if len(q['answers']['text']) > 0)

# Optionally, count unanswerable too
unanswerable_count = len(sampled_questions) - answerable_count

# Print results
print(f"Total sampled questions: {len(sampled_questions)}")
print(f"Answerable questions: {answerable_count}")
print(f"Unanswerable questions: {unanswerable_count}")

Total sampled questions: 100
Answerable questions: 48
Unanswerable questions: 52


In [None]:
from evaluate import load

# Load the SQuAD evaluation metric
squad_metric = load("squad")

def qa_with_llama_cpp(example, max_tokens=50, verbose=True):

    context = example['context']
    question = example['question']
    ground_truth_answers = example['answers']

    prompt_template = (
        "You are a question answering assistant. Given the context, answer the question. "
        "If the answer isn't in the context, say 'I don't know'.\n\n"

        "Here is an example:\n"
        "Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France.\n"
        "Question: What is the name of the region the Normans gave their name to?\n"
        "Answer: Normandy\n\n"

        "Context: {context}\n\n"
        "Question: {question}\n\n"
        "Answer:"
    )
    
    prompt = prompt_template.format(context=context, question=question)

    payload = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "stream": False,
        "seed": 0,
    }

    # Send to llama.cpp
    response = llm(**payload)
    prediction = response['choices'][0]['text'].strip()


    return {
        "question": question,
        "prediction": prediction,
        "ground_truths": ground_truth_answers['text'],
    }

In [None]:
def evaluate_qa_with_llama_cpp(dataset, qa_function, skip_unanswerable=True, verbose=True):
    """
    Evaluates a QA function using the SQuAD metric (Exact Match and F1).

    Returns:
        results: Dictionary with exact_match and f1 score.
        num_references: Number of evaluated references.
        elapsed_time: Total time taken.
    """
    from evaluate import load
    import time

    # Load the SQuAD evaluation metric
    squad_metric = load("squad")

    references = []
    predictions = []

    start = time.time()

    for example in dataset:
        if skip_unanswerable and len(example['answers']['text']) == 0:
            continue

        result = qa_function(example)

        if result['prediction'] is not None:
            predictions.append({
                "id": example['id'],
                "prediction_text": result['prediction']
            })

            references.append({
                "id": example['id'],
                "answers": {
                    "text": result['ground_truths']
                }
            })

    end = time.time()

    # Compute EM and F1 scores
    results = squad_metric.compute(predictions=predictions, references=references)
    elapsed_time = end - start

    if verbose:
        print("\nllama.cpp QA Evaluation Results")
        print("-" * 50)
        print(f"Number of evaluated examples: {len(references)}")
        print(f"Elapsed time: {elapsed_time:.2f} seconds\n")
        for key, value in results.items():
            print(f"{key}: {value:.4f}")

    return results, len(references), elapsed_time

In [5]:
from llama_cpp import Llama
import llama_cpp

print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())

# Path to your GGUF model (adjust path if needed)
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-gguf/llama-3.1-8B-f16.gguf"

# Initialize Llama with GPU layers offloaded
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_gpu_layers=80,     # Offload to GPU! Adjust as needed based on your VRAM
    verbose=False         # Prints backend info
)

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA L4, compute capability 8.9, VMM: yes


GPU offload supported: True


In [9]:
results, num_examples, elapsed_time = evaluate_qa_with_llama_cpp(
    sampled_questions,
    qa_with_llama_cpp,
    skip_unanswerable=True,
    verbose=False   # We print it manually below
)

# Print final report
print("llama.cpp (Llama-3.1-8B) float16 QA Results:")

print(f"\nNumber of examples: {num_examples}")
print(f"Elapsed time: {elapsed_time:.2f} seconds")

print("\nQA Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llama.cpp (Llama-3.1-8B) float16 QA Results:

Number of examples: 48
Elapsed time: 150.84 seconds

QA Evaluation Results:
exact_match: 0.0000
f1: 20.5061


In [1]:
from llama_cpp import Llama
import llama_cpp

print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())

# Path to your GGUF model (adjust path if needed)
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-gguf/llama-3.1-8B-Q8_0.gguf"

# Initialize Llama with GPU layers offloaded
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_gpu_layers=80,     # Offload to GPU! Adjust as needed based on your VRAM
    verbose=False         # Prints backend info
)

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA L4, compute capability 8.9, VMM: yes


GPU offload supported: True


In [6]:
results, num_examples, elapsed_time = evaluate_qa_with_llama_cpp(
    sampled_questions,
    qa_with_llama_cpp,
    skip_unanswerable=True,
    verbose=False   # We print it manually below
)

# Print final report
print("llama.cpp (Llama-3.1-8B) 8Bit QA Results:")

print(f"\nNumber of examples: {num_examples}")
print(f"Elapsed time: {elapsed_time:.2f} seconds")

print("\nQA Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llama.cpp (Llama-3.1-8B) 8Bit QA Results:

Number of examples: 48
Elapsed time: 89.03 seconds

QA Evaluation Results:
exact_match: 0.0000
f1: 20.4193


In [5]:
from llama_cpp import Llama
import llama_cpp

print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())

# Path to your GGUF model (adjust path if needed)
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-gguf/llama-3.1-8B-Q4_K_M.gguf"

# Initialize Llama with GPU layers offloaded
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_gpu_layers=80,     # Offload to GPU! Adjust as needed based on your VRAM
    verbose=False         # Prints backend info
)

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA L4, compute capability 8.9, VMM: yes


GPU offload supported: True


In [6]:
results, num_examples, elapsed_time = evaluate_qa_with_llama_cpp(
    sampled_questions,
    qa_with_llama_cpp,
    skip_unanswerable=True,
    verbose=False   # We print it manually below
)

# Print final report
print("llama.cpp (Llama-3.1-8B) 4Bit QA Results:")

print(f"\nNumber of examples: {num_examples}")
print(f"Elapsed time: {elapsed_time:.2f} seconds")

print("\nQA Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llama.cpp (Llama-3.1-8B) 4Bit QA Results:

Number of examples: 48
Elapsed time: 58.31 seconds

QA Evaluation Results:
exact_match: 0.0000
f1: 19.4507
