### Creating the evaluation pipeline

In [None]:
# Candidate values for the evaluation matrix.
# Note: `model_names` and `backends` are reassigned by later cells, and the
# name `task` is reused as a loop variable in the benchmark sweep cell.
model_names = [
    "llama-3.1-8B-Instruct",
    "llama-3.1-8B-Instruct-4bit",
    "llama-3.1-8B-Instruct-8bit",
]
backends = [
    "vllm",
    "huggingface",
    "llama-cpp",
]
task = [
    "qa",
    "sql",
    "summarization",
]
# Not referenced by any of the cells visible in this notebook section.
use_cases = [
    "batch",
    "server",
]

In [None]:
import multiprocessing
from benchmark.benchmark import ModelBenchmark

def run_benchmark(backend, model_name, task, samples=5,
                  models_dir="/home/ubuntu/fast_llm_inference/models"):
    """Run one benchmark configuration and release GPU memory afterwards.

    Builds a ``ModelBenchmark`` for the given backend/model/task combination
    (always with ``verbose=True``), runs it, and then drops the benchmark
    object and empties the CUDA caching allocator, even if the run fails.

    Args:
        backend: Backend name forwarded to ``ModelBenchmark``.
        model_name: Model name; also the sub-directory of ``models_dir`` that
            is passed as ``model_path``.
        task: Task name forwarded to ``ModelBenchmark``.
        samples: Value forwarded to ``ModelBenchmark.run(samples=...)``.
            Defaults to 5, the value that was previously hard-coded.
        models_dir: Directory containing one sub-directory per model.
            Defaults to the path that was previously hard-coded.

    Raises:
        Whatever ``ModelBenchmark(...)`` or ``ModelBenchmark.run`` raises;
        exceptions are propagated after cleanup.
    """
    print(f"Running benchmark for {model_name} with {backend} on {task}")
    bm = ModelBenchmark(
        backend=backend,
        model_name=model_name,
        model_path=f"{models_dir}/{model_name}",
        task=task,
        verbose=True,
    )
    try:
        bm.run(samples=samples)
    finally:
        # Clean up on success *and* failure so a crashed run does not keep
        # the model alive when this function is called in-process.
        del bm  # make sure to dereference
        import gc
        import torch  # imported lazily, as in the original cell

        # Collect reference cycles first so that tensors still reachable only
        # through cycles are freed before the cache is emptied.
        gc.collect()
        torch.cuda.empty_cache()

# Sweep configuration for this cell. These assignments rebind `model_names`
# and `backends` from the first cell, restricting the sweep to one backend.
model_names = ["llama-3.1-8B-Instruct-4bit", "llama-3.1-8B-Instruct-8bit", "llama-3.1-8B-Instruct"]
backends = ["huggingface"]
tasks = ["qa", "sql", "summarization"]

# (backend, model_name, task, exitcode) for every child that did not exit cleanly.
failed_runs = []

# Run every combination sequentially, each in its own child process.
# Note: the loop variable `task` rebinds the `task` list defined in the first cell.
for backend in backends:
    for model_name in model_names:
        for task in tasks:
            p = multiprocessing.Process(target=run_benchmark, args=(backend, model_name, task))
            p.start()
            p.join()  # wait until finished before next
            # A non-zero exit code means the child raised or was killed
            # (negative values are signal numbers). Report it instead of
            # silently moving on, but keep the sweep going.
            if p.exitcode != 0:
                failed_runs.append((backend, model_name, task, p.exitcode))
                print(
                    f"Benchmark FAILED for {model_name} with {backend} on {task} "
                    f"(exit code {p.exitcode})"
                )

if failed_runs:
    print(f"{len(failed_runs)} benchmark run(s) failed: {failed_runs}")


In [None]:
from benchmark.backends.backend_factory import get_backend

# Build a vLLM backend for the 4-bit model, with max_tokens set to 10.
be = get_backend(name="vllm",
                 model_path="/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit",
                 max_tokens=10)

# Load the model explicitly before the backend is used for generation.
be.load_model()

In [9]:
# Smoke test: complete a short open-ended prompt (the trailing space is part of the prompt).
be.generate(
    prompt="Is the pope the highest ",
)

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

' authority in the Catholic Church?'

In [1]:
from benchmark.benchmark import ModelBenchmark

# Backend names to choose from; this rebinds `backends` from earlier cells.
backends = [
    "huggingface",
    "vllm",
    "deepspeed_mii",
    "llama-cpp",
]

# Benchmark object for the first backend in the list ("huggingface"),
# configured for the QA task with verbose output disabled.
bm = ModelBenchmark(backend=backends[0],
                    model_name="llama-3.1-8B-Instruct",
                    model_path="/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct",
                    task="qa",
                    verbose=False)

2025-04-25 12:57:32.618729: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745585852.636204  490515 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745585852.641550  490515 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745585852.657139  490515 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745585852.657157  490515 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745585852.657159  490515 computation_placer.cc:177] computation placer alr

[2025-04-25 12:57:39,395] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
INFO 04-25 12:57:42 [__init__.py:239] Automatically detected platform cuda.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [2]:
# One-shot question-answering prompt. Each list element below is one line of
# the final prompt; an empty string yields a blank line. Adjacent string
# literals without a comma between them belong to the same prompt line.
test_prompt_qa = "\n".join([
    # Instruction.
    "You are a question answering assistant. Given the context, answer the question. "
    "If the answer isn't in the context, respond 'I don't know'. Provide the answer in a single line.",
    "",
    # Worked example.
    "Here is an example:",
    "Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni)...",
    "Question: What is the name of the region the Normans gave their name to?",
    "Answer: Normandy",
    "",
    # Actual query; the prompt ends on "Answer:" so the model completes it.
    "Context: At the end of the 19th century, the United States was a nation of farmers. "
    "By 1920, more Americans lived in cities than on farms. "
    "The industrial revolution had changed the way people lived and worked. "
    "The United States was becoming a modern nation of cities and factories. "
    "The most important change was the rise of the automobile. "
    "The automobile changed the way people lived. "
    "It changed the way people worked. "
    "It changed the way people traveled. "
    "It changed the way people thought about themselves and their country.",
    "Question: What was the most important change in the United States at the end of the 19th century?",
    "Answer:",
])

# One-shot text-to-SQL prompt. Each list element is one line of the final
# prompt; an empty string yields a blank line. The same three-table schema is
# listed for both the worked example and the actual question.
test_prompt_sql = "\n".join([
    # Instruction.
    "You are a SQL query generation assistant. Given a natural language question, generate the corresponding SQL query.",
    "Only generate valid SQL statements, no explanations or extra text.",
    "",
    # Worked example.
    "Here is an example:",
    "",
    "Question: How many heads of the departments are older than 56?",
    "",
    "Tables in the database:",
    "Table 'department': columns = Department_ID, Name, Creation, Ranking, Budget_in_Billions, Num_Employees",
    "Table 'head': columns = head_ID, name, born_state, age",
    "Table 'management': columns = department_ID, head_ID, temporary_acting",
    "",
    "SQL: SELECT count(*) FROM head WHERE age > 56",
    "",
    # Actual query; the prompt ends on "SQL:" so the model completes it.
    "Question: How many departments have a budget greater than 5 billion?",
    "",
    "Tables in the database:",
    "Table 'department': columns = Department_ID, Name, Creation, Ranking, Budget_in_Billions, Num_Employees",
    "Table 'head': columns = head_ID, name, born_state, age",
    "Table 'management': columns = department_ID, head_ID, temporary_acting",
    "",
    "SQL:",
])

# One-shot news-summarization prompt built from adjacent string literals.
# Adjacent literals are concatenated with nothing in between, so every section
# boundary needs its own explicit "\n\n".
test_prompt_summarization = (
    "You are a news summarization assistant. Given a full news article, produce a concise and informative summary in 2–3 sentences.\n\n"

    "Example:\n\n"

    "Article:\n"
    "(CNN) -- The partnership started as a single shop on Oxford Street in London, opened in 1864 by John Lewis. "
    "Today the partnership is an organization with bases throughout the UK, with supermarkets and department stores, "
    "employing approximately 67,100 people. All 67,100 permanent staff are Partners who own 26 John Lewis department stores, "
    "183 Waitrose supermarkets, an online and catalogue business, John Lewis Direct a direct services company - Greenbee, "
    "three production units and a farm. Every Partner receives the same scale of bonus, based on a fixed percentage of their annual wage. "
    "The bonus for 2006 was 18% equivalent to 9 weeks pay, which was rolled out for every employee. "
    "Chairman Sir Stuart Hampson retired at the end of March 2007, his successor is Charlie Mayfield. Hampson's salary for January 26, "
    # Fix: separator added so the article no longer runs into "Summary:".
    "2006 to January 26, 2007 was $1.66 million which included the partnership bonus of $250,000.\n\n"

    "Summary:\n"
    "John Lewis Partnership began as a shop on London's Oxford street in 1864 .\n"
    # Fix: separator added so the example summary no longer runs into the next instruction.
    "All 67,100 employees are partners in the organization and own shares .\n\n"

    "Now summarize the following article:\n\n"

    "Article: The newly released iPhone 14 has a larger screen and improved camera features. "
    "It is available in multiple colors and storage options. The battery life has also been extended, "
    "making it more efficient for daily use. The price starts at $799, and pre-orders are available now. The phone is expected to be "
    "a popular choice among consumers, especially those looking to upgrade from older models. The hardware improvements include a faster processor, "
    "better graphics performance, and enhanced security features. The iPhone 14 also supports 5G connectivity, allowing for faster internet speeds. "
    "Overall, the iPhone 14 is a significant upgrade over its predecessor, the iPhone 13, and is expected to be a top seller this holiday season.\n\n"
    # The prompt ends on "Summary:" so the model completes it.
    "Summary:"
)



# Send each few-shot prompt through the benchmark's single-prompt entry point,
# one call per task, in the order QA -> SQL -> summarization.
# NOTE(review): the summarization call passes task="sum", whereas the task
# lists elsewhere in this notebook use "summarization" — confirm which
# spelling generate_single expects.
qa = bm.generate_single(
    prompt=test_prompt_qa,
    task="qa",
)
sql = bm.generate_single(
    prompt=test_prompt_sql,
    task="sql",
)
summarization = bm.generate_single(
    prompt=test_prompt_summarization,
    task="sum",
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [3]:
qa[0]

'The rise of the automobile.'

In [4]:
sql[0]

'SELECT count(*) FROM department WHERE Budget_in_Billions > 5'

In [5]:
summarization[0]

'The iPhone 14 has a larger screen, improved camera features, and extended battery life, starting at $799. It also has a faster processor, better graphics, and enhanced security, and supports 5G connectivity. The phone is expected to be a popular choice among consumers, especially those looking to upgrade from older models.'