In [1]:
from llama_cpp import Llama
import llama_cpp
from datasets import load_dataset
import evaluate
import requests
import json
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter Notebook
import time

2025-04-04 15:04:30.440395: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743779070.459804  158193 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743779070.467218  158193 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743779070.488881  158193 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743779070.488897  158193 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743779070.488900  158193 computation_placer.cc:177] computation placer alr

In [2]:
print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())

# Path to the float16 GGUF model (adjust path if needed)
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct-gguf/llama-3.1-8B-Instruct-f16.gguf"

# Initialize Llama with GPU layers offloaded. The module-level `llm` created here
# is the object that summarize_news_few_shot() calls.
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_gpu_layers=-1,     # -1 is meant to offload every layer (see llama-cpp-python docs); lower it if VRAM is tight
    verbose=False         # False is meant to suppress backend info; set True to print it
)

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA L4, compute capability 8.9, VMM: yes


GPU offload supported: True


In [3]:
# Load test data
dataset = load_dataset("gigaword", split="test[:100]")  # Limit to 100 for fast eval

# Initialize ROUGE metric
rouge = evaluate.load('rouge')

In [4]:
def summarize_news_few_shot(document, max_tokens=20):
    """
    Produce a one-line headline for `document` using a few-shot completion prompt.

    The prompt consists of a short instruction, two worked News/Headline pairs
    and the article itself; it is sent to the module-level `llm` object.

    Args:
        document: Article text to turn into a headline.
        max_tokens: Generation cap forwarded to `llm`.

    Returns:
        The first line of the whitespace-stripped completion text.
    """
    instruction = (
        "You are a headline generation assistant. Given a news article, produce a concise and informative headline.\n\n"
    )
    worked_examples = (
        "Here is an example:\n"
        "News: Japan's NEC Corp. and UNK Computer Corp. of the United States said Wednesday they had agreed to join forces in supercomputer sales.\n"
        "Headline: NEC and UNK in supercomputer sales deal\n\n"
        "News: Scientists have discovered a new exoplanet that appears to have water on its surface, raising hopes it may be habitable.\n"
        "Headline: New exoplanet may support life\n\n"
    )
    # The prompt ends on an open "Headline:" label for the model to complete.
    query = f"News: {document}\n" "Headline:"

    completion = llm(
        prompt=instruction + worked_examples + query,
        max_tokens=max_tokens,
        temperature=0.2,
        top_p=1.0,
        stream=False
    )

    generated = completion["choices"][0]["text"].strip()
    # Keep only what precedes the first newline (the whole text if there is none).
    headline, _, _ = generated.partition("\n")
    return headline


summarize_news_few_shot(dataset[19]["document"]
)

'France names unchanged team for second test'

In [12]:
dataset[1]["summary"]

'sri lanka closes schools as war escalates'

In [5]:
# Generate summaries and evaluate
# Generate a headline for every test article and collect (reference, prediction) pairs
references = []
predictions = []

# perf_counter is monotonic, so the measurement is unaffected by wall-clock adjustments
start = time.perf_counter()

for item in dataset:
    # Keep every example, including empty generations: dropping them would score
    # each model on a different subset and hide failed generations from ROUGE.
    references.append(item['summary'])
    predictions.append(summarize_news_few_shot(item['document']))

end = time.perf_counter()

In [6]:
import pandas as pd

# Create a DataFrame to store the results
df = pd.DataFrame({
    'Reference': references,
    'Prediction': predictions
})

In [7]:
df

Unnamed: 0,Reference,Prediction
0,nec UNK in computer sales tie-up,NEC and UNK in supercomputer sales deal
1,sri lanka closes schools as war escalates,Sri Lanka closes schools amid escalating conflict
2,protesters target french research ship,Five arrested in anti-nuclear protest
3,us september factory orders up #.# percent,US factory orders rise in September
4,bank of UNK UNK for calm in financial markets,Japan urges calm after US orders Daiwa Bank cl...
...,...,...
95,notre dame cathedral square to be named after ...,Notre Dame square to be renamed after Pope Joh...
96,somali warlords stronghold tense after us-back...,Somali town prepares for clashes as warlords flee
97,press lambasts sorry french display,France's World Cup opener ends in draw
98,algerian press freedom at risk despite editor ...,Algeria's press freedom still at risk despite ...


### float16 quantization

In [8]:
# Evaluate with ROUGE
results = rouge.compute(predictions=predictions, references=references)

print("llama.cpp (Llama-3.1-8B) float16 Summarization Results:")

print(f"\nNumber of examples: {len(references)}")
print(f"\nElapsed time: {end - start:.2f} s")

print("\nROUGE Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llama.cpp (Llama-3.1-8B) float16 Summarization Results:

Number of examples: 100

Elapsed time: 124.30 s

ROUGE Results:
rouge1: 0.3443
rouge2: 0.1337
rougeL: 0.3257
rougeLsum: 0.3290


### Now trying the 8bit quantization

In [5]:
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct-gguf/llama-3.1-8B-Instruct-Q8_0.gguf"

# Release the previously loaded model before building the next one. The right-hand
# side of `llm = Llama(...)` is evaluated before `llm` is rebound, so without this
# both models would be alive at the same time while the new one loads.
if "llm" in globals():
    llm.close()

llm = Llama(
    model_path=model_path,
    n_ctx=8192,         # 8K-token context window
    n_gpu_layers=-1,    # -1 is meant to offload every layer, same setting as the float16 run
    verbose=False
)

In [6]:
# Generate summaries and evaluate
# Generate a headline for every test article and collect (reference, prediction) pairs
references = []
predictions = []

# perf_counter is monotonic, so the measurement is unaffected by wall-clock adjustments
start = time.perf_counter()

for item in dataset:
    # Keep every example, including empty generations: dropping them would score
    # each model on a different subset and hide failed generations from ROUGE.
    references.append(item['summary'])
    predictions.append(summarize_news_few_shot(item['document']))

end = time.perf_counter()

In [7]:
# Evaluate with ROUGE
results = rouge.compute(predictions=predictions, references=references)

print("llama.cpp (Llama-3.1-8B) 8Bit Summarization Results:")

print(f"\nNumber of examples: {len(references)}")
print(f"\nElapsed time: {end - start:.2f} s")

print("\nROUGE Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llama.cpp (Llama-3.1-8B) 8Bit Summarization Results:

Number of examples: 100

Elapsed time: 72.05 s

ROUGE Results:
rouge1: 0.3551
rouge2: 0.1375
rougeL: 0.3360
rougeLsum: 0.3359


### Now 4Bit quantization

In [4]:
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct-gguf/llama-3.1-8B-Instruct-Q4_K_M.gguf"

# Release the previously loaded model before building the next one. The right-hand
# side of `llm = Llama(...)` is evaluated before `llm` is rebound, so without this
# both models would be alive at the same time while the new one loads.
if "llm" in globals():
    llm.close()

llm = Llama(
    model_path=model_path,
    n_ctx=8192,         # 8K-token context window
    n_gpu_layers=-1,    # same offload setting as the float16 and Q8_0 runs, so all three are loaded alike
    verbose=False
)

In [5]:
# Generate summaries and evaluate
# Generate a headline for every test article and collect (reference, prediction) pairs
references = []
predictions = []

# perf_counter is monotonic, so the measurement is unaffected by wall-clock adjustments
start = time.perf_counter()

for item in dataset:
    # Keep every example, including empty generations: dropping them would score
    # each model on a different subset and hide failed generations from ROUGE.
    references.append(item['summary'])
    predictions.append(summarize_news_few_shot(item['document']))

end = time.perf_counter()

In [6]:
# Evaluate with ROUGE
results = rouge.compute(predictions=predictions, references=references)

print("llama.cpp (Llama-3.1-8B) 4Bit Summarization Results:")

print(f"\nNumber of examples: {len(references)}")
print(f"\nElapsed time: {end - start:.2f} s")

print("\nROUGE Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llama.cpp (Llama-3.1-8B) 4Bit Summarization Results:

Number of examples: 100

Elapsed time: 46.13 s

ROUGE Results:
rouge1: 0.3342
rouge2: 0.1195
rougeL: 0.3140
rougeLsum: 0.3130


### LLaMA 3.1 8B Summarization Benchmark

<small>

#### 📝 Tested Models
| **Model File**                      | **Precision** | **Quantization Scheme** | **Notes**                                     |
|-------------------------------------|---------------|-------------------------|-----------------------------------------------|
| llama-3.1-8B-Instruct-f16.gguf      | float16       | None (unquantized half precision) | Largest model, highest theoretical accuracy, slowest inference |
| llama-3.1-8B-Instruct-Q8_0.gguf     | 8-bit         | Q8_0                   | Reduced size, good balance of speed and quality |
| llama-3.1-8B-Instruct-Q4_K_M.gguf   | 4-bit         | Q4_K_M                 | Smallest of the three files; best llama.cpp summarization quality in the results table below |

---

#### 📊 Summarization Results

| **Platform / Model**               | **Elapsed Time (s)** | **ROUGE-1** | **ROUGE-2** | **ROUGE-L** | **ROUGE-Lsum** |
|------------------------------------|----------------------|-------------|-------------|-------------|----------------|
| Ollama (LLaMA 3.1 8B Q4_K_M)       | 49.06                | 0.2886      | 0.1040      | 0.2632      | 0.2658         |
| llama.cpp (Q4_K_M)                 | 64.42                | 0.2699      | 0.1160      | 0.2491      | 0.2501         |
| llama.cpp (Q8_0)                   | 161.84               | 0.1788      | 0.0608      | 0.1580      | 0.1631         |
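| llama.cpp (float16)                | 161.65               | 0.1801      | 0.0599      | 0.1591      | 0.1629         |

> **Note:** The llama.cpp rows above appear to come from a different run than the cell outputs recorded in this notebook. Those outputs report 124.30 s / ROUGE-1 0.3443 for float16, 72.05 s / ROUGE-1 0.3551 for Q8_0, and 46.13 s / ROUGE-1 0.3342 for Q4_K_M; in them Q8_0 scores highest on every ROUGE metric and Q4_K_M is the fastest. Re-run the benchmark before relying on the conclusions below.

---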

#### 🔍 Summary of Insights

- **4-bit quantized models (Q4_K_M)** in both **Ollama** and **llama.cpp** delivered **better summarization quality** and **faster inference** than higher-precision models.
- The **Q4_K_M quantization scheme** preserves summarization performance surprisingly well and matches Ollama's results.
- **8-bit (Q8_0)** and **float16** models performed worse in ROUGE scores, despite having more precision. This may be due to:
  - Differences in **prompt formatting**
  - **Sampling parameters**
  - Potential model variant differences (instruction-tuned vs base models)

---

#### ✅ Recommendations

1. Use **Q4_K_M quantized models** in llama.cpp for comparable performance to Ollama.
2. Match the **prompt templates** used in Ollama when comparing results across platforms.

</small>

### 🧠 Q4_K_M vs Q8_0 Quantization Comparison

<small>

#### What Are They?
Both **Q4_K_M** and **Q8_0** are quantization methods used to compress model weights for faster inference and lower memory usage.

---

#### Q4_K_M (4-bit Quantization, Optimized)

| Feature          | Description |
|------------------|-------------|
| **Precision**    | 4-bit |
| **Quantization Type** | "K" series (k-quants), specifically the **K_M** ("medium") variant |
| **Compression**  | Very high (significantly smaller than 8-bit) |
| **Speed**        | Extremely fast, ideal for CPU/GPU |
| **Memory Usage** | Very low (fits on smaller GPUs like 6-8GB VRAM) |
| **Accuracy**     | Preserves high accuracy in **instruction-tuned tasks** like **summarization**, **chat**, and **QA** |
| **Best Use Cases** | Chatbots, summarization, reasoning tasks |
| **Notes**        | Uses **groupwise quantization** and **per-channel scaling** for better accuracy retention despite low precision |

---

#### Q8_0 (8-bit Quantization, General Purpose)

| Feature          | Description |
|------------------|-------------|
| **Precision**    | 8-bit |
| **Quantization Type** | Uniform 8-bit |
| **Compression**  | Moderate (smaller than float16 but larger than 4-bit) |
| **Speed**        | Faster than float16, but slower than Q4_K_M |
| **Memory Usage** | Moderate (needs more VRAM, typically 12GB+) |
| **Accuracy**     | Higher precision retention in general, but not optimized for specific tasks |
| **Best Use Cases** | Complex reasoning, precision-sensitive tasks |
| **Notes**        | General-purpose quantization without task-specific optimizations |

---

#### ⚖️ Comparison Table: Q4_K_M vs Q8_0

| Feature          | **Q4_K_M**                  | **Q8_0**                  |
|------------------|-----------------------------|---------------------------|
| **Precision**    | 4-bit                       | 8-bit                    |
| **Size**         | Very small                  | Medium                   |
| **Speed**        | Very fast (low latency)     | Fast (higher latency than 4-bit) |
| **VRAM/Memory**  | Very low usage (fits on smaller GPUs/CPUs) | Medium (requires more VRAM) |
| **Accuracy**     | High for summarization, chat, reasoning (optimized quantization) | General higher precision (not task-optimized) |
| **Task Tuning**  | Task-specific optimizations (instruction-following, summarization) | General-purpose |
| **Best Use**     | Chatbots, summarization, QA tasks with constrained resources | Complex reasoning or precision-sensitive tasks |
| **Ollama Default?** | ✅ Frequently used (Q4_K_M or Q4_K_S) | ❌ Usually not used |

---

#### ✅ Why Q4_K_M Outperformed Q8_0 in Summarization
- **Q4_K_M** is optimized for **task-specific performance**, often giving better results for **instruction-tuned models**, **summarization**, and **chat** tasks.
- **Q8_0** retains more raw precision but isn't tuned for these tasks, leading to lower scores in ROUGE evaluation.
- **Q4_K_M** also runs significantly faster with less resource usage.

---

</small>

### 🧠 How Q4_K_M Is Optimized

<small>

Q4_K_M is part of the advanced **K series** quantization schemes, designed to balance **speed**, **size**, and **accuracy**. It introduces several optimizations to maintain high task performance despite being a 4-bit quantization.

#### Key Optimizations

| Optimization                   | Description |
|--------------------------------|-------------|
| **Groupwise Quantization**     | Weights are divided into small groups (e.g., 32 or 64) and quantized individually, improving precision retention. |
| **Per-Channel Scaling**        | Each group or channel has its own scale factor, ensuring finer control over the quantization process. |
| **Mixed Weight Packing (M)**   | Uses different packing strategies optimized for different layers (e.g., attention vs MLP layers). |
| **Dynamic Zero Points**        | Zero points are dynamically computed within groups, reducing quantization bias. |
| **Efficient SIMD Utilization** | The packed format is optimized for vectorized operations on CPU and GPU, increasing inference speed. |

#### Why It Works Well
- Optimized for **instruction-following**, **summarization**, and **chat** tasks.
- Preserves task-critical accuracy despite aggressive compression.
- Runs **very efficiently** on both CPU and GPU.

#### Q4_K_M vs Q8_0

| Feature            | Q4_K_M                   | Q8_0                 |
|--------------------|--------------------------|----------------------|
| Precision          | 4-bit                    | 8-bit               |
| Compression        | High                     | Medium              |
| Accuracy Retention | High (task-optimized)    | High (general)      |
| Speed              | Very fast                | Fast                |
| Memory Usage       | Very low                 | Medium              |
| Task Tuning        | Summarization, Chat, QA  | General-purpose     |
| Ollama Use         | ✅ Often used (default)  | ❌ Less common       |

</small>

In [1]:
from datasets import load_dataset
import random
import json

# Load SQuAD v2 dataset (validation split)
squad_v2 = load_dataset("squad_v2")

# Set random seed for reproducibility
random.seed(42)

# Convert the validation split to a list and sample 200 random questions
validation_list = list(squad_v2["validation"])
sampled_questions = random.sample(validation_list, 200)

questions_with_answers = [i for i in sampled_questions if len(i['answers']['text']) > 0]

len(questions_with_answers)


101

In [2]:
import time
from evaluate import load
import re
import string
import csv
import time
from evaluate import load

def normalize_answer(s):
    """Return `s` lower-cased, with punctuation, English articles and surplus whitespace removed."""
    # Lower-case first, then delete every character listed in string.punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)
    text = s.lower().translate(punctuation_table)

    # Replace standalone articles with a space; the pattern is lower-case only,
    # so this relies on the lower-casing above.
    text = re.sub(r'\b(a|an|the)\b', ' ', text)

    # Collapse every whitespace run to one space and trim both ends.
    return ' '.join(text.split())

def clean_prediction(prediction):
    """
    Trim a raw llama.cpp completion down to the answer text and normalize it.

    The text is cut, in turn, at the first blank line, the first "Context:"
    label and the first "Question:" label; whatever precedes them is passed
    through `normalize_answer`.
    """
    for marker in ("\n\n", "\nContext:", "Context:", "Question:"):
        # partition() returns the text unchanged when the marker is absent.
        prediction, _, _ = prediction.partition(marker)

    return normalize_answer(prediction)


def compute_exact_match(prediction, ground_truths):
    """Return 1 when `prediction` is a member of `ground_truths`, otherwise 0."""
    if prediction in ground_truths:
        return 1
    return 0

def compute_f1(prediction, ground_truths):
    """
    Compute the maximum token-level F1 of `prediction` over all ground truths.

    Token overlap is counted as a multiset intersection, so a token that is
    repeated in both strings is credited once per matching occurrence. (A plain
    set intersection counts it once while precision and recall are still
    divided by the full token counts, which under-reports the score.)

    Args:
        prediction: Predicted answer string.
        ground_truths: Sequence of reference answer strings.

    Returns:
        When the prediction has no tokens: 1 if no ground truth has tokens
        either, else 0. Otherwise the best F1 over the ground truths, or 0 when
        `ground_truths` is empty or nothing overlaps.
    """
    from collections import Counter  # local import: the cell's import list does not provide it

    def get_tokens(s):
        return normalize_answer(s).split()

    pred_tokens = get_tokens(prediction)
    if not pred_tokens:
        return int(not any(get_tokens(gt) for gt in ground_truths))

    pred_counts = Counter(pred_tokens)
    best = 0  # also the result for an empty `ground_truths`, where max() would raise

    for gt in ground_truths:
        gt_tokens = get_tokens(gt)
        # Counter & Counter keeps the minimum count per token (multiset intersection).
        num_same = sum((pred_counts & Counter(gt_tokens)).values())

        if num_same == 0:
            continue

        precision = num_same / len(pred_tokens)
        recall = num_same / len(gt_tokens)
        best = max(best, 2 * precision * recall / (precision + recall))

    return best


def qa_with_llama_cpp(example, max_tokens=50, verbose=True):
    """
    Answer one question-answering example with the module-level `llm` object.

    Args:
        example: Mapping with 'context', 'question' and 'answers' entries;
            'answers' must itself provide a 'text' entry.
        max_tokens: Generation cap forwarded to `llm`.
        verbose: Accepted for interface compatibility; not read by this function.

    Returns:
        dict with the 'question', the cleaned and normalized 'prediction', and
        the reference 'ground_truths' taken from example['answers']['text'].
    """
    passage = example['context']
    question = example['question']
    gold = example['answers']

    # Instruction + one worked example + the open slot for the current example.
    template = "".join([
        "You are a question answering assistant. Given the context, answer the question. ",
        "If the answer isn't in the context, say 'I don't know'.\n\n",
        "Here is an example:\n",
        "Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni)...\n",
        "Question: What is the name of the region the Normans gave their name to?\n",
        "Answer: Normandy\n\n",
        "Context: {context}\n\n",
        "Question: {question}\n\n",
        "Answer:",
    ])

    # temperature 0.0 with a fixed seed, passed as explicit keyword arguments
    completion = llm(
        prompt=template.format(context=passage, question=question),
        max_tokens=max_tokens,
        temperature=0.0,
        stream=False,
        seed=0,
    )
    raw_text = completion['choices'][0]['text'].strip()

    return {
        "question": question,
        "prediction": clean_prediction(raw_text),
        "ground_truths": gold['text'],
    }


def evaluate_qa_with_llama_cpp(dataset, qa_function, save_path=None, skip_unanswerable=True):
    """
    Run `qa_function` over `dataset`, score the answers and optionally dump per-example rows.

    Args:
        dataset: Iterable of examples exposing 'id' and 'answers'
            (with 'text' and 'answer_start' entries).
        qa_function: Callable taking one example and returning a dict with
            'prediction' and 'ground_truths'.
        save_path: When truthy, per-example results are written there as CSV.
        skip_unanswerable: When true, examples whose answer list is empty are skipped.

    Returns:
        Tuple of (output of the "squad" metric's compute(), number of scored
        examples, seconds spent in the generation loop).
    """
    squad_metric = load("squad")
    references, predictions, per_example_results = [], [], []

    start = time.time()

    for example in dataset:
        if skip_unanswerable and len(example['answers']['text']) == 0:
            continue

        result = qa_function(example)
        if result['prediction'] is None:
            continue

        pred_text = normalize_answer(result['prediction'])
        gold_texts = [normalize_answer(ans) for ans in result['ground_truths']]

        # Per-example scores use the local helpers; the aggregate comes from the metric below.
        em = compute_exact_match(pred_text, gold_texts)
        f1 = compute_f1(pred_text, gold_texts)

        example_id = example['id']
        answer_starts = example['answers']['answer_start']

        predictions.append({"id": example_id, "prediction_text": pred_text})
        references.append({
            "id": example_id,
            "answers": {"text": result['ground_truths'], "answer_start": answer_starts},
        })
        per_example_results.append({
            'id': example_id,
            'prediction_text': pred_text,
            'ground_truth_text': "; ".join(gold_texts),
            'answer_start': "; ".join(map(str, answer_starts)),
            'exact_match': em,
            'f1_score': round(f1, 4),
        })

    end = time.time()

    # Aggregate metrics over everything that was scored
    results = squad_metric.compute(predictions=predictions, references=references)
    elapsed_time = end - start

    # Optional CSV dump, one row per scored example
    if save_path:
        columns = ['id', 'prediction_text', 'ground_truth_text', 'answer_start', 'exact_match', 'f1_score']
        with open(save_path, mode='w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=columns)
            writer.writeheader()
            writer.writerows(per_example_results)

    return results, len(references), elapsed_time


2025-03-15 07:52:16.539537: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742025136.557367  403701 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742025136.562764  403701 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742025136.577713  403701 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742025136.577729  403701 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742025136.577731  403701 computation_placer.cc:177] computation placer alr

In [10]:
from llama_cpp import Llama
import llama_cpp

print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())

# Path to the float16 GGUF model (adjust path if needed)
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct-gguf/llama-3.1-8B-Instruct-f16.gguf"

# Initialize Llama with GPU layers offloaded. The module-level `llm` created here
# is the object that qa_with_llama_cpp() calls.
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_gpu_layers=-1,     # -1 is meant to offload every layer (see llama-cpp-python docs); lower it if VRAM is tight
    verbose=False,         # False is meant to suppress backend info; set True to print it
)

GPU offload supported: True


In [11]:
results, num_examples, elapsed_time = evaluate_qa_with_llama_cpp(
    sampled_questions,
    qa_with_llama_cpp,
    skip_unanswerable=True,
    save_path="evaluation_with_predictions_and_references_f16.csv"
)

# Print final report
print("llama.cpp (Llama-3.1-8B-Instruct) float16 QA Results:")

print(f"\nNumber of examples: {num_examples}")
print(f"Elapsed time: {elapsed_time:.2f} seconds")

print("\nQA Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llm.close()

llama.cpp (Llama-3.1-8B-Instruct) float16 QA Results:

Number of examples: 101
Elapsed time: 300.17 seconds

QA Evaluation Results:
exact_match: 68.3168
f1: 84.6438


In [4]:
from llama_cpp import Llama
import llama_cpp

print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())

# Path to the 8-bit GGUF model (adjust path if needed)
# NOTE(review): other cells in this notebook reference "...-Q8_0.gguf" for the
# 8-bit model; confirm that this "-8bit.gguf" file is the intended one.
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct-gguf/llama-3.1-8B-Instruct-8bit.gguf"

# Initialize Llama with GPU layers offloaded. The module-level `llm` created here
# is the object that qa_with_llama_cpp() calls.
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_gpu_layers=-1,     # -1 is meant to offload every layer (see llama-cpp-python docs); lower it if VRAM is tight
    verbose=False         # False is meant to suppress backend info; set True to print it
)

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA L4, compute capability 8.9, VMM: yes


GPU offload supported: True


In [5]:
results, num_examples, elapsed_time = evaluate_qa_with_llama_cpp(
    sampled_questions,
    qa_with_llama_cpp,
    skip_unanswerable=True,
    save_path="evaluation_with_predictions_and_references_8bit.csv"
)

# Print final report
print("llama.cpp (Llama-3.1-8B-Instruct) 8Bit QA Results:")

print(f"\nNumber of examples: {num_examples}")
print(f"Elapsed time: {elapsed_time:.2f} seconds")

print("\nQA Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llm.close()

llama.cpp (Llama-3.1-8B-Instruct) 8Bit QA Results:

Number of examples: 101
Elapsed time: 178.62 seconds

QA Evaluation Results:
exact_match: 70.2970
f1: 85.1146


In [6]:
from llama_cpp import Llama
import llama_cpp

print("GPU offload supported:", llama_cpp.llama_supports_gpu_offload())

# Path to the Q4_K_M GGUF model (adjust path if needed)
model_path = "/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct-gguf/llama-3.1-8B-Instruct-Q4_K_M.gguf"

# Initialize Llama with GPU layers offloaded. The module-level `llm` created here
# is the object that qa_with_llama_cpp() calls.
llm = Llama(
    model_path=model_path,
    n_ctx=8192,
    n_gpu_layers=-1,     # -1 is meant to offload every layer (see llama-cpp-python docs); lower it if VRAM is tight
    verbose=False         # False is meant to suppress backend info; set True to print it
)

GPU offload supported: True


In [8]:
results, num_examples, elapsed_time = evaluate_qa_with_llama_cpp(
    sampled_questions,
    qa_with_llama_cpp,
    skip_unanswerable=True,
    save_path="evaluation_with_predictions_and_references_4bit.csv"
)

# Print final report
print("llama.cpp (Llama-3.1-8B-Instruct) 4Bit QA Results:")

print(f"\nNumber of examples: {num_examples}")
print(f"Elapsed time: {elapsed_time:.2f} seconds")

print("\nQA Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

llm.close()

llama.cpp (Llama-3.1-8B-Instruct) 4Bit QA Results:

Number of examples: 101
Elapsed time: 111.71 seconds

QA Evaluation Results:
exact_match: 67.3267
f1: 84.0383


### Benchmarking Q&A

In [1]:
from datasets import load_dataset
import random
import json
import time
from evaluate import load
import re
import string
import csv
import time
from evaluate import load
import pandas as pd

# Load SQuAD v2 dataset (validation split)
squad_v2 = load_dataset("squad_v2")

# Set random seed for reproducibility
random.seed(42)

# Convert the validation split to a list and sample 200 random questions
validation_list = list(squad_v2["validation"])
sampled_questions = random.sample(validation_list, 200)

questions_with_answers = [i for i in sampled_questions if len(i['answers']['text']) > 0]

len(questions_with_answers)

2025-03-15 10:11:38.232516: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742033498.260125  423345 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742033498.268426  423345 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742033498.290468  423345 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742033498.290492  423345 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742033498.290495  423345 computation_placer.cc:177] computation placer alr

101

In [2]:
def normalize_answer(s):
    """Return `s` lower-cased, with punctuation, English articles and surplus whitespace removed."""
    # Lower-case first, then delete every character listed in string.punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)
    text = s.lower().translate(punctuation_table)

    # Replace standalone articles with a space; the pattern is lower-case only,
    # so this relies on the lower-casing above.
    text = re.sub(r'\b(a|an|the)\b', ' ', text)

    # Collapse every whitespace run to one space and trim both ends.
    return ' '.join(text.split())

def clean_prediction(prediction):
    """
    Trim a raw llama.cpp completion down to the answer text and normalize it.

    The text is cut, in turn, at the first blank line, the first "Context:"
    label and the first "Question:" label; whatever precedes them is passed
    through `normalize_answer`.
    """
    for marker in ("\n\n", "\nContext:", "Context:", "Question:"):
        # partition() returns the text unchanged when the marker is absent.
        prediction, _, _ = prediction.partition(marker)

    return normalize_answer(prediction)


def compute_exact_match(prediction, ground_truths):
    """Return 1 when `prediction` is a member of `ground_truths`, otherwise 0."""
    if prediction in ground_truths:
        return 1
    return 0

def compute_f1(prediction, ground_truths):
    """
    Compute the maximum token-level F1 of `prediction` over all ground truths.

    Token overlap is counted as a multiset intersection, so a token that is
    repeated in both strings is credited once per matching occurrence. (A plain
    set intersection counts it once while precision and recall are still
    divided by the full token counts, which under-reports the score.)

    Args:
        prediction: Predicted answer string.
        ground_truths: Sequence of reference answer strings.

    Returns:
        When the prediction has no tokens: 1 if no ground truth has tokens
        either, else 0. Otherwise the best F1 over the ground truths, or 0 when
        `ground_truths` is empty or nothing overlaps.
    """
    from collections import Counter  # local import: the cell's import list does not provide it

    def get_tokens(s):
        return normalize_answer(s).split()

    pred_tokens = get_tokens(prediction)
    if not pred_tokens:
        return int(not any(get_tokens(gt) for gt in ground_truths))

    pred_counts = Counter(pred_tokens)
    best = 0  # also the result for an empty `ground_truths`, where max() would raise

    for gt in ground_truths:
        gt_tokens = get_tokens(gt)
        # Counter & Counter keeps the minimum count per token (multiset intersection).
        num_same = sum((pred_counts & Counter(gt_tokens)).values())

        if num_same == 0:
            continue

        precision = num_same / len(pred_tokens)
        recall = num_same / len(gt_tokens)
        best = max(best, 2 * precision * recall / (precision + recall))

    return best

def qa_prompt(example):
    """
    Build the few-shot question-answering prompt for a single example.

    Args:
        example: Mapping providing 'context' and 'question' strings.

    Returns:
        The instruction, one worked example about the Normans, and the
        example's own context and question, ending on an open "Answer:" label.
    """
    template_parts = [
        "You are a question answering assistant. Given the context, answer the question. ",
        "If the answer isn't in the context, respond 'I don't know'.\n\n",
        "Here is an example:\n",
        "Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni)...\n",
        "Question: What is the name of the region the Normans gave their name to?\n",
        "Answer: Normandy\n\n",
        "Context: {context}\n\n",
        "Question: {question}\n\n",
        "Answer:",
    ]
    fields = {"context": example['context'], "question": example['question']}

    # Only the two placeholders above are substituted; field values are inserted verbatim.
    return "".join(template_parts).format(**fields)

In [3]:
from benchmark.benchmark import ModelBenchmark
import os

# Which GGUF file to benchmark; `model_name` is reused later for the output file name.
model_name = "llama-3.1-8B-Instruct-Q8_0"
llama_model_path = f"/home/ubuntu/fast_llm_inference/llama-3.1-8B-Instruct-gguf/{model_name}.gguf"

benchmark = ModelBenchmark(
    backend="llama.cpp",
    llama_model_path=llama_model_path,
    llama_gpu_layers=-1,
    max_tokens=10,
    model_size=os.path.getsize(llama_model_path) / 1e6,  # file size in bytes -> MB
)

# One prompt per answerable question, in the same order as `questions_with_answers`.
results = benchmark.benchmark([qa_prompt(question) for question in questions_with_answers])

# Replace each raw generation with its cleaned, normalized form.
results["Generated Answer"] = results["Generated Answer"].apply(clean_prediction)

llama_init_from_model: n_ctx_per_seq (4096) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


In [4]:
results.head(2)

Unnamed: 0,Prompt Length,Question (Prompt),Generated Answer,FTL (s),ATL (s),GL (s),TPS (tokens/s),SPS (sentences/s),Memory Usage (MB),Model Size (MB),KV-Cache Size Estimation (MB),Total Energy (Wh),Energy per Token (J/token),Energy per Sentence (J/sentence),Energy per Second (W)
0,1131,You are a question answering assistant. Given ...,lothar de maizière,0.195,0.195,0.7802,5.13,1.28,9122.06,8540.770976,581.289024,0.009585,8.626822,34.507287,44.23
1,609,You are a question answering assistant. Given ...,some complexity classes,0.0528,0.0528,0.4227,18.93,2.37,9122.06,8540.770976,581.289024,0.007052,3.173491,25.38793,60.06


In [None]:
# Score every generated answer against its reference answers.
# Rows of `results` are matched to `questions_with_answers` purely by position,
# so refuse to score if the two do not line up one-to-one.
if len(results) != len(questions_with_answers):
    raise ValueError(
        f"results has {len(results)} rows but there are "
        f"{len(questions_with_answers)} questions"
    )

exact_matches = []
f1_scores = []

for generated, question in zip(results["Generated Answer"], questions_with_answers):
    pred = normalize_answer(generated)
    gt_answers = [normalize_answer(ans) for ans in question["answers"]["text"]]

    exact_matches.append(compute_exact_match(pred, gt_answers))
    f1_scores.append(compute_f1(pred, gt_answers))

# Assigning whole columns is positional, so it does not depend on the index labels
# of `results` (the earlier mix of .iloc reads and .loc writes did).
results["Exact Match"] = exact_matches
results["F1"] = f1_scores

# Save results to CSV; create the output directory first so to_csv cannot fail on a missing folder
os.makedirs("results_cpp", exist_ok=True)
results.to_csv(f"results_cpp/{model_name}_Q&A", index=False)

In [None]:
# Select column 0 and columns 4 to the end
first_column = results.iloc[:, [0]]
remaining_columns = results.iloc[:, 4:]

# Combine into a new DataFrame
selected_columns = pd.concat([first_column, remaining_columns], axis=1)

# Compute means for numeric columns only
column_means = selected_columns.mean(numeric_only=True)

# Display results
print("Q&A (squad_v2) LLama.cpp (Llama-3.1-8B-Instruct) Q8_0 Results:")

print(f"\nNumber of examples: {len(questions_with_answers)}")

print("\nMean Scores:\n")
for column, mean_value in column_means.items():
    print(f"{column}: {mean_value:.4f}")

Q&A (squad_v2) LLama.cpp (Llama-3.1-8B-Instruct) Q8_0 Results:

Number of examples: 101

Mean Scores:

Prompt Length: 1248.2970
ATL (s): 0.0736
GL (s): 0.5008
TPS (tokens/s): 14.5372
SPS (sentences/s): 2.6915
Memory Usage (MB): 9135.4461
Model Size (MB): 8540.7710
KV-Cache Size Estimation (MB): 594.6752
Total Energy (Wh): 0.0095
Energy per Token (J/token): 5.0064
Energy per Sentence (J/sentence): 29.2293
Energy per Second (W): 68.5086
Exact Match: 0.6733
F1: 0.8498


### LLaMA 3.1 8B Instruct - Quantization Benchmark (SQuAD v2 Q&A)

<small>


| **Metric**                        | **4-bit (Q4_K_M)**              | **8-bit (Q8_0)**              | **fp16**                      |
|-----------------------------------|---------------------------------|-------------------------------|-------------------------------|
| **Quantization Technique**        | 4-bit Group Quantization (Q4_K_M) | 8-bit Quantization (Q8_0)    | FP16 (Half-Precision Float)   |
| **Number of Examples**            | 101                             | 101                           | 101                           |
| **Prompt Length (avg)**           | 1248.2970 tokens                | 1248.2970 tokens              | 1248.2970 tokens              |
| **ATL (Average Token Latency)**   | **0.0539 s/token**              | 0.0736 s/token                | 0.1077 s/token                |
| **GL (Generation Latency)**       | **0.3762 s**                    | 0.5008 s                      | 0.7518 s                      |
| **TPS (Tokens/sec)**              | **19.8857**                     | 14.5372                       | 9.7952                        |
| **SPS (Sentences/sec)**           | **3.3143**                      | 2.6915                        | 1.7902                        |
| **Memory Usage (MB)**             | **5933.5451 MB**                | 9135.4461 MB                  | 15912.1392 MB                 |
| **Model Size (MB)**               | **4920.7344 MB**                | 8540.7710 MB                  | 16068.8913 MB                 |
| **Total Energy (Wh)**             | **0.0072 Wh**                   | 0.0095 Wh                     | 0.0147 Wh                     |
| **Energy per Token (J/token)**    | **3.6804**                      | 5.0064                        | 7.5682                        |
| **Energy per Sentence (J/sentence)** | **22.9896**                   | 29.2293                       | 44.9827                       |
| **Energy per Second (W)**         | **68.7679 W**                   | 68.5086 W                     | 70.4359 W                     |
| **Exact Match (EM)**              | 0.6634                          | **0.6733**                    | 0.6634                        |
| **F1 Score**                      | **0.8539**                      | 0.8498                        | 0.8399                        |


</small>