## Exercise 1

```bash
vllm serve Qwen/Qwen3-1.7B \
  --port 8000 \
  --max-model-len 8192 \
  --max-num-seqs 32 \
  --gpu-memory-utilization 0.85

vllm serve Qwen/Qwen3-1.7B \
  --port 8000 \
  --max-model-len 8192 \
  --max-num-seqs 32 \
  --gpu-memory-utilization 0.85 \
  --quantization bitsandbytes
```

(EngineCore_DP0 pid=29361) INFO 01-21 16:28:44 [monitor.py:34] torch.compile takes 12.04 s in total
(EngineCore_DP0 pid=29361) INFO 01-21 16:28:45 [gpu_worker.py:358] Available KV cache memory: 9.04 GiB
(EngineCore_DP0 pid=29361) INFO 01-21 16:28:46 [kv_cache_utils.py:1305] GPU KV cache size: 84,608 tokens
(EngineCore_DP0 pid=29361) INFO 01-21 16:28:46 [kv_cache_utils.py:1310] Maximum concurrency for 8,192 tokens per request: 10.33x


(EngineCore_DP0 pid=34017) INFO 01-21 16:40:15 [gpu_worker.py:358] Available KV cache memory: 10.57 GiB
(EngineCore_DP0 pid=34017) INFO 01-21 16:40:16 [kv_cache_utils.py:1305] GPU KV cache size: 98,976 tokens
(EngineCore_DP0 pid=34017) INFO 01-21 16:40:16 [kv_cache_utils.py:1310] Maximum concurrency for 8,192 tokens per request: 12.08x



In [8]:
import time
from openai import OpenAI

def benchmark_vllm_chat(
    prompts,
    base_url="http://localhost:8000/v1",
    model="",
    max_completion_tokens=200,
):
    """
    Benchmarks vLLM OpenAI-compatible chat completions.

    Requests are sent one after another (no client-side concurrency), so the
    reported numbers describe per-request latency of sequential calls.

    Args:
        prompts (list[str]): List of user prompts.
        base_url (str): vLLM server URL.
        model (str): Model name (empty string = default server model).
        max_completion_tokens (int): Max tokens per response.

    Returns:
        dict with timing statistics: ``num_requests``, ``total_time_sec``,
        ``avg_latency_sec``, ``min_latency_sec`` and ``max_latency_sec``.
        When ``prompts`` is empty no request is sent, ``total_time_sec`` is
        0.0 and the three latency statistics are ``None``.
    """

    # Nothing to measure: avoid ZeroDivisionError / min() on an empty list.
    if len(prompts) == 0:
        return {
            "num_requests": 0,
            "total_time_sec": 0.0,
            "avg_latency_sec": None,
            "min_latency_sec": None,
            "max_latency_sec": None,
        }

    client = OpenAI(api_key="EMPTY", base_url=base_url)

    # Client construction is deliberately excluded from the measured time.
    start_time = time.perf_counter()
    latencies = []

    for prompt in prompts:
        req_start = time.perf_counter()

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "developer", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
            max_completion_tokens=max_completion_tokens,
            # Disable the model's "thinking" mode via the chat template.
            extra_body={"chat_template_kwargs": {"enable_thinking": False}},
        )

        # Touch the generated text so the full response is read before the
        # request is counted as finished.
        _ = response.choices[0].message.content
        latencies.append(time.perf_counter() - req_start)

    total_time = time.perf_counter() - start_time

    return {
        "num_requests": len(prompts),
        "total_time_sec": total_time,
        "avg_latency_sec": sum(latencies) / len(latencies),
        "min_latency_sec": min(latencies),
        "max_latency_sec": max(latencies),
    }


In [9]:
# Benchmark workload: ten short questions, each sent as its own request.
prompts = [
    "Explain what MLOps is.",
    "What is LLMOps?",
    "Explain KV cache in transformers.",
    "What is dynamic batching?",
    "Explain model quantization.",
    "What is bitsandbytes?",
    "What is FlashAttention?",
    "Explain prefix caching.",
    "What is vLLM?",
    "Difference between FP16 and INT4.",
]

# Uses the function defaults (local server URL, 200 completion tokens).
results = benchmark_vllm_chat(prompts)

# Bare last expression so the notebook displays the statistics dict.
results

{'num_requests': 10,
 'total_time_sec': 31.157270540000354,
 'avg_latency_sec': 3.1157256762000087,
 'min_latency_sec': 3.0992684600000757,
 'max_latency_sec': 3.1314260409999406}

# TODO task 1: benchmark other quantization settings, change the benchmarking code, change the prompts

## Exercise 2 (tool calling)

```bash
vllm serve Qwen/Qwen3-1.7B \
  --port 8000 \
  --max-model-len 8192 \
  --max-num-seqs 32 \
  --gpu-memory-utilization 0.85 \
  --enable-auto-tool-choice \
  --tool-call-parser hermes
```

In [11]:
import datetime
import json
from typing import Callable
from openai import OpenAI


def _run_tool_call(tool_call, tool_name_to_func: dict[str, Callable]) -> str:
    """Execute one model-requested tool call and return the tool message content.

    The result of the tool is JSON-encoded. Any failure (unknown tool name,
    arguments that are not a JSON object, exception raised by the tool) is
    printed and returned as a JSON-encoded "ERROR: ..." string instead of
    being raised, so the model sees what went wrong on the next turn.
    """
    func_name = tool_call.function.name
    func = tool_name_to_func.get(func_name)

    if func is None:
        error = (
            f"Unknown tool {func_name!r}. "
            f"Available tools: {sorted(tool_name_to_func)}"
        )
    else:
        try:
            # An empty arguments string is treated as "no arguments".
            func_args = json.loads(tool_call.function.arguments or "{}")
            if not isinstance(func_args, dict):
                raise TypeError("tool arguments must be a JSON object")
            # default=str keeps non-JSON-native results (e.g. dates) usable.
            return json.dumps(func(**func_args), default=str)
        except Exception as e:
            error = f"Tool {func_name!r} failed. Reason: {type(e).__name__}: {e}"

    print(f"Tool call failed: {error}")
    print()
    return json.dumps(f"ERROR: {error}")


def make_llm_request(
    prompt: str,
    tool_definitions: list[dict],
    tool_name_to_func: dict[str, Callable],
) -> str:
    """Answer `prompt` with the local vLLM server, letting the model call tools.

    The conversation is driven in a loop: every assistant message is appended
    to the history; if it contains tool calls they are executed and their
    results appended as "tool" messages, otherwise its content is returned.

    Args:
        prompt: The user question.
        tool_definitions: Tool schemas passed as `tools` on every request.
        tool_name_to_func: Maps each tool name to the Python callable that
            implements it; the callable receives the decoded JSON arguments
            as keyword arguments.

    Returns:
        The content of the first assistant message without tool calls, or a
        "Could not resolve request, ..." string if the model is still calling
        tools after 10 rounds.
    """
    client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
    messages = [
        {"role": "developer", "content": "You are a weather assistant."},
        {"role": "user", "content": prompt},
    ]

    # guard: loop limit, we return as soon as we get an answer
    for _ in range(10):
        response = client.chat.completions.create(
            model="",
            messages=messages,
            tools=tool_definitions,  # always pass all tools in this example
            tool_choice="auto",
            max_completion_tokens=1000,
            extra_body={"chat_template_kwargs": {"enable_thinking": False}},
        )
        resp_message = response.choices[0].message
        messages.append(resp_message.model_dump())

        print(f"Generated message: {resp_message.model_dump()}")
        print()

        if not resp_message.tool_calls:
            # no tool calls, we're done
            return resp_message.content

        # run every requested tool (assume only "function" tools) and feed
        # the results back, each tied to its call via tool_call_id
        for tool_call in resp_message.tool_calls:
            messages.append(
                {
                    "role": "tool",
                    "content": _run_tool_call(tool_call, tool_name_to_func),
                    "tool_call_id": tool_call.id,
                }
            )

    # we should not get here
    last_response = resp_message.content
    return f"Could not resolve request, last response: {last_response}"


def get_tool_definitions() -> tuple[list[dict], dict[str, Callable]]:
    """Build the weather-assistant tools.

    Returns:
        A pair: the list of function-tool schemas (current date, weather
        forecast) and a dict mapping each tool name to the Python function
        that implements it.
    """
    date_description = (
        "The date to get the weather for, "
        'in the format "Year-Month-Day" (YYYY-MM-DD). '
        "At most 4 weeks into the future."
    )

    # Tool without arguments.
    current_date_schema = {
        "type": "function",
        "function": {
            "name": "get_current_date",
            "description": 'Get current date in the format "Year-Month-Day" (YYYY-MM-DD).',
            "parameters": {},
        },
    }

    # Tool with three mandatory string arguments.
    forecast_properties = {
        "country": {
            "type": "string",
            "description": "The country the city is in.",
        },
        "city": {
            "type": "string",
            "description": "The city to get the weather for.",
        },
        "date": {
            "type": "string",
            "description": date_description,
        },
    }
    forecast_schema = {
        "type": "function",
        "function": {
            "name": "get_weather_forecast",
            "description": "Get weather forecast at given country, city, and date.",
            "parameters": {
                "type": "object",
                "properties": forecast_properties,
                "required": ["country", "city", "date"],
            },
        },
    }

    implementations = {
        "get_current_date": current_date_tool,
        "get_weather_forecast": weather_forecast_tool,
    }

    return [current_date_schema, forecast_schema], implementations


def current_date_tool() -> str:
    """Tool implementation: today's date as a YYYY-MM-DD string."""
    today = datetime.date.today()
    return today.isoformat()


def weather_forecast_tool(country: str, city: str, date: str) -> str:
    """Tool implementation: canned forecast that depends only on the country.

    `city` and `date` are accepted to match the tool schema but are not used.
    """
    rainy_countries = {"united kingdom", "uk", "england"}
    return "Fog and rain" if country.lower() in rainy_countries else "Sunshine"


# Three questions with different time horizons; the last one asks for a date
# beyond the "at most 4 weeks" limit stated in the forecast tool description.
weather_prompts = [
    "What will be weather in Birmingham in two weeks?",
    "What will be weather in Warsaw the day after tomorrow?",
    "What will be weather in New York in two months?",
]

for prompt_index, prompt in enumerate(weather_prompts):
    if prompt_index > 0:
        print()  # blank line between consecutive answers
    response = make_llm_request(prompt, *get_tool_definitions())
    print("Response:\n", response)

Generated message: {'content': None, 'refusal': None, 'role': 'assistant', 'annotations': None, 'audio': None, 'function_call': None, 'tool_calls': [{'id': 'chatcmpl-tool-a4ae362e79ab9155', 'function': {'arguments': '{}', 'name': 'get_current_date'}, 'type': 'function'}], 'reasoning': None, 'reasoning_content': None}

Generated message: {'content': None, 'refusal': None, 'role': 'assistant', 'annotations': None, 'audio': None, 'function_call': None, 'tool_calls': [{'id': 'chatcmpl-tool-91c115de7ffa4728', 'function': {'arguments': '{"country": "United Kingdom", "city": "Birmingham", "date": "2026-01-21"}', 'name': 'get_weather_forecast'}, 'type': 'function'}], 'reasoning': None, 'reasoning_content': None}

Generated message: {'content': 'The weather in Birmingham, United Kingdom, on January 21, 2026, is expected to be fog and rain.', 'refusal': None, 'role': 'assistant', 'annotations': None, 'audio': None, 'function_call': None, 'tool_calls': [], 'reasoning': None, 'reasoning_content': 

In [44]:
import polars as pl

def _remote_reader_schema(name: str, file_kind: str) -> dict:
    """Build the function-tool schema for a "first n rows of a remote file" tool."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": (
                f"Read a {file_kind} file from a URL and return the first n rows as text. "
                "n can be at most 20"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": f"Public URL to a {file_kind} file",
                    },
                    "n": {
                        "type": "integer",
                        "description": "Maximum number of rows to return",
                        # The default must lie inside [minimum, maximum].
                        "default": 20,
                        "minimum": 0,
                        "maximum": 20,
                    },
                },
                "required": ["url"],
            },
        },
    }


def get_dataset_tool_definitions() -> tuple[list[dict], dict[str, Callable]]:
    """Build the dataset-reading tools (remote CSV and remote Parquet).

    Returns:
        A pair: the list of function-tool schemas and a dict mapping each
        tool name to the Python function that implements it. In both schemas
        only `url` is required; `n` is optional and limited to 0..20.
    """
    tool_definitions = [
        _remote_reader_schema("read_remote_csv", "CSV"),
        _remote_reader_schema("read_remote_parquet", "Parquet"),
    ]

    tool_name_to_callable = {
        "read_remote_csv": read_remote_csv,
        "read_remote_parquet": read_remote_parquet,
    }

    return tool_definitions, tool_name_to_callable

def read_remote_csv(url: str, n: int = 20) -> str:
    """Tool implementation: return the first rows of a remote CSV file as text.

    Args:
        url: Location of the CSV file.
        n: Number of rows to read; optional (the tool schema does not require
            it) and clamped to the range 0..20.

    Returns:
        `str()` of the list of row dicts, or an "ERROR: ..." message if
        anything fails (the error is returned, not raised, so it can be fed
        back to the model).
    """
    try:
        # Clamp inside the try so a non-numeric `n` is reported, not raised.
        n = max(0, min(int(n), 20))
        df = pl.read_csv(
            url,
            n_rows=n,
            ignore_errors=True,
        )
        return str(df.to_dicts())
    except Exception as e:
        return (
            f"ERROR: Failed to read CSV from URL: {url}.\n"
            f"Reason: {type(e).__name__}: {e}\n"
        )
    

def read_remote_parquet(url: str, n: int = 20) -> str:
    """Tool implementation: return the first rows of a remote Parquet file as text.

    Args:
        url: Location of the Parquet file.
        n: Number of rows to read; optional (the tool schema does not require
            it) and clamped to the range 0..20.

    Returns:
        `str()` of the list of row dicts, or an "ERROR: ..." message if
        anything fails (the error is returned, not raised, so it can be fed
        back to the model).
    """
    try:
        # Clamp inside the try so a non-numeric `n` is reported, not raised.
        n = max(0, min(int(n), 20))
        df = pl.read_parquet(
            url,
            n_rows=n,
        )
        return str(df.to_dicts())
    except Exception as e:
        # The message names Parquet (it previously said CSV by copy-paste).
        return (
            f"ERROR: Failed to read Parquet from URL: {url}.\n"
            f"Reason: {type(e).__name__}: {e}\n"
        )

In [45]:
# Public datasets for the dataset-reading tools: one CSV, one Parquet.
# Only the Parquet (taxi) URL is used by the prompt in this cell.
apis_tox_url = "https://raw.githubusercontent.com/j-adamczyk/ApisTox_dataset/master/outputs/dataset_final.csv"
taxi_data_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet"


# NOTE(review): the prompt text contains typos ("daset", a doubled "cost") and
# no separator between the URL and the question; it is left untouched because
# it is the exact text sent to the model.
prompt = (
    f"Here you have a daset on taxi rides: {taxi_data_url} "
    f"How much did the average ride cost in NYC taxi cost in January 2025"
)

# The tools return at most 20 rows, so the model only ever sees a small sample
# of the file when answering.
response = make_llm_request(prompt, *get_dataset_tool_definitions())
print("Response:\n", response)

Generated message: {'content': None, 'refusal': None, 'role': 'assistant', 'annotations': None, 'audio': None, 'function_call': None, 'tool_calls': [{'id': 'chatcmpl-tool-a827e3edb39513b1', 'function': {'arguments': '{"url": "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet", "n": 20}', 'name': 'read_remote_parquet'}, 'type': 'function'}], 'reasoning': None, 'reasoning_content': None}

Generated message: {'content': 'The average ride cost in NYC for January 2025 is calculated by summing up all the `total_amount` values and dividing by the number of rides. \n\nFrom the data provided, the `total_amount` values are as follows:\n\n1. 18.0\n2. 12.12\n3. 12.1\n4. 9.7\n5. 8.3\n6. 24.1\n7. 11.75\n8. 19.1\n9. 27.1\n10. 16.4\n11. 16.4\n12. 12.96\n13. 19.2\n14. 12.9\n15. 38.9\n16. 22.7\n17. 25.55\n18. -8.54\n19. 12.2\n20. 20.6\n\nSumming these values:\n\n$$\n18.0 + 12.12 + 12.1 + 9.7 + 8.3 + 24.1 + 11.75 + 19.1 + 27.1 + 16.4 + 16.4 + 12.96 + 19.2 + 12.9 + 38.9 + 22.

## Exercise 3