Skip to content

Python Runtime Syntax

Mike edited this page May 28, 2026 · 2 revisions

Runtime Syntax

Manual lifecycle

# Manual lifecycle: every stage of the runtime is driven by an explicit call,
# in the order shown.
# NOTE(review): assumes `xlocllm` has already been imported; the import is not
# shown in this snippet.
# Build a runtime from a list containing a single unit of kind "LLM".
runtime = xlocllm.runtime([xlocllm.unit("LLM", "Qwen-3.5-0.8b")])
# NOTE(review): presumably fetches/prepares the unit's model before it can run;
# confirm against the install() documentation.
runtime.install()
runtime.run()
print(runtime.status())
# NOTE(review): nothing here guards stop()/close() if an earlier call raises;
# wrap the calls in try/finally if cleanup must always happen.
runtime.stop()
runtime.close()

Context manager

# Context-manager form: the runtime is bound by the `with` statement rather
# than being stopped and closed by hand.
# NOTE(review): install(), stop() and close() are not called explicitly here;
# presumably the context manager takes care of them on enter/exit. Confirm.
with xlocllm.runtime([xlocllm.unit("LLM", "Qwen-3.5-0.8b")]) as runtime:
    runtime.run()
    # Send one prompt and print whatever chat() returns.
    # NOTE(review): temperature=0 presumably requests deterministic output; confirm.
    print(runtime.chat("Say hello", temperature=0))

Add and remove a unit

# Start a runtime with one "LLM" unit, then attach and detach a second unit.
rt = xlocllm.runtime([xlocllm.unit("LLM", "Qwen-3.5-0.8b")])
rt.run()
# The embedding unit is created separately and added after run() has been called.
emb = xlocllm.unit("embedding", "multilingual-e5-small")
rt.add_unit(emb)
# Removal is keyed by the unit's id, not by the unit object itself.
# NOTE(review): delete_cache=False presumably keeps the unit's downloaded files
# on disk; confirm.
rt.remove_unit(emb.id, delete_cache=False)

Hot reasoning control

# Create the unit with reasoning disabled, then switch it on after run().
llm = xlocllm.unit("LLM", "Qwen-3.5-0.8b-fp32", reasoning=False)
with xlocllm.runtime([llm]) as rt:
    rt.run()
    # The unit is addressed by its id; True replaces the reasoning=False it was
    # created with.
    rt.set_reasoning(llm.id, True)

Multi-unit runtime for OpenAI tests

from openai import OpenAI

# One runtime hosting two units: an "LLM" unit and an "embedding" unit.
unit = xlocllm.unit("LLM", "Qwen-3.5-0.8b")
unit1 = xlocllm.unit("embedding", "multilingual-e5-small")

# NOTE(review): the meaning of mode="native" is not shown on this page; confirm
# against the runtime() documentation.
rt = xlocllm.runtime([unit, unit1], mode="native")
rt.run()

# Point the OpenAI client at the runtime's URL instead of the default endpoint.
# NOTE(review): api_key is a literal string here; presumably the runtime does
# not validate it. Confirm.
client = OpenAI(base_url=rt.url, api_key="xlocllm")
# Existing OpenAI code continues here.
# NOTE(review): this snippet never stops or closes `rt`; do so once the OpenAI
# calls are finished.

Low-level invoke

# invoke() takes an operation name and a dict payload.
# NOTE(review): `runtime` is not defined in this snippet; it assumes a runtime
# created (and presumably running) as in the earlier examples.
# NOTE(review): the model is given as "Xenova/multilingual-e5-small" here, while
# the unit() calls elsewhere on this page use "multilingual-e5-small"; confirm
# which identifier invoke() expects.
vectors = runtime.invoke("embeddings", {"model": "Xenova/multilingual-e5-small", "input": ["hello"]})
# NOTE(review): no rerank unit is created anywhere on this page; presumably one
# must be part of the runtime for this call to succeed. Confirm.
ranked = runtime.invoke("rerank", {"query": "local llm", "documents": ["browser", "server"]})

Clone this wiki locally