# ChatGPT API 스타일로 llama.cpp 서버 호출하기
이 노트북은 로컬에서 실행 중인 llama.cpp 기반 HTTP 서버에 `requests` 라이브러리를 이용해 OpenAI API 스타일로 요청을 보내는 예제입니다.

In [1]:
import requests
import uuid
import time

API_URL = "http://127.0.0.1:8080/v1/chat/completions"

def chat_completion(
    model: str,
    messages: list,
    max_tokens: int = 100,
    temperature: float = 0.7,
    top_p: float = 0.9,
):
    """Send one chat-completion request to ``API_URL`` and return the reply.

    Args:
        model: Model name placed in the ``model`` field of the request.
        messages: Chat messages, sent as-is in the ``messages`` field.
        max_tokens: Value for the ``max_tokens`` request field.
        temperature: Value for the ``temperature`` request field.
        top_p: Value for the ``top_p`` request field.

    Returns:
        A dict with the keys ``id``, ``object``, ``created``, ``model``,
        ``choices`` and ``usage``. Each value is taken from the server's JSON
        reply; a key missing from the reply gets a locally generated
        fallback (random UUID, ``"chat.completion"``, current time, the
        requested model name, ``[]`` and ``{}`` respectively).

    Raises:
        Exception: If the server answers with any status code other than 200.

    Side effects:
        Prints the request latency in seconds to stdout.
    """
    payload = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }

    headers = {
        "Content-Type": "application/json",
    }

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    # NOTE: no timeout is passed, so this call blocks until the server replies.
    start_time = time.perf_counter()
    resp = requests.post(API_URL, json=payload, headers=headers)
    latency = time.perf_counter() - start_time

    if resp.status_code != 200:
        raise Exception(f"Request failed with status code {resp.status_code}: {resp.text}")

    result = resp.json()
    print(f"Request latency: {latency:.2f} seconds")

    # Normalize the reply so callers always see the same set of keys.
    return {
        "id": result.get("id", str(uuid.uuid4())),
        "object": result.get("object", "chat.completion"),
        "created": result.get("created", int(time.time())),
        "model": result.get("model", model),
        "choices": result.get("choices", []),
        "usage": result.get("usage", {}),
    }

# Name of the model as loaded on the server.
model_name = "qwen3-4b-q4_k_m"

# Conversation: a system instruction followed by the user's question.
messages = [
    dict(role="system", content="You are a helpful assistant."),
    dict(role="user", content="보통 몇시에 잠에 드니?"),
]

# Model and messages go in positionally; sampling options stay keyworded.
response = chat_completion(model_name, messages, max_tokens=1024, temperature=0.7)

display(response)

Request latency: 8.32 seconds


{'id': 'chatcmpl-mbc2IYsmEsIdFziwQfz6EMEpWZRcWNCp',
 'object': 'chat.completion',
 'created': 1760963537,
 'model': 'qwen3-4b-q4_k_m',
 'choices': [{'finish_reason': 'stop',
   'index': 0,
   'message': {'role': 'assistant',
    'content': '<think>\nOkay, the user is asking, "보통 몇시에 잠에 드니?" which translates to "What time do you usually go to bed?" I need to respond appropriately.\n\nFirst, I should acknowledge that I\'m an AI and don\'t have a physical body, so I can\'t sleep. But I should make the response friendly and helpful. Maybe explain that I don\'t sleep but am here to help. Also, I should offer assistance with any questions they have. Keep the tone warm and approachable. Avoid any technical jargon. Make sure to invite them to ask more questions if needed.\n</think>\n\n안녕하세요! 저는 사람이 아니고 AI 어시스턴트이기 때문에 실제로 잠을 자는 건 없습니다. 😊 하지만 언제든 도움이 필요하면 언제든 말씀해 주세요! 질문이 있으신가요?'}}],
 'usage': {'completion_tokens': 179, 'prompt_tokens': 32, 'total_tokens': 211}}

`reasoning` 플래그(True/False)를 인자로 받아, 플래그가 꺼져 있을 때 reasoning 비활성화 설정을 객체 형태로 payload에 추가하도록 `chat_completion` 함수를 변형합니다.

In [2]:
def chat_completion(
    model: str,
    messages: list,
    max_tokens: int = 100,
    temperature: float = 0.7,
    top_p: float = 0.9,
    reasoning: bool = False      # flag newly added in this variant
):
    """Send one chat-completion request, optionally asking to disable reasoning.

    Args:
        model: Model name placed in the ``model`` field of the request.
        messages: Chat messages, sent as-is in the ``messages`` field.
        max_tokens: Value for the ``max_tokens`` request field.
        temperature: Value for the ``temperature`` request field.
        top_p: Value for the ``top_p`` request field.
        reasoning: When False (the default), the payload additionally carries
            ``reasoning = {"budget": 0, "format": "none"}`` and
            ``chat_template_kwargs = {"enable_thinking": False}``. When True,
            no extra fields are added.

    Returns:
        A dict with the keys ``id``, ``object``, ``created``, ``model``,
        ``choices`` and ``usage``, taken from the server's JSON reply with
        locally generated fallbacks for any missing key.

    Raises:
        Exception: If the server answers with any status code other than 200.

    Side effects:
        Prints the outgoing payload (debug aid) and the request latency.
    """
    payload = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }

    # Extra fields used to switch reasoning off.
    if not reasoning:
        # Ask the server to disable reasoning.
        # NOTE(review): the output recorded in this notebook still contains a
        # <think> block with reasoning=False, so confirm that the running
        # server build/flags actually honor these fields.
        payload["reasoning"] = {
            "budget": 0,
            "format": "none"
        }
        payload["chat_template_kwargs"] = {
            "enable_thinking": False
        }
    # else: reasoning=True keeps the default behavior (room for extra settings).

    headers = {
        "Content-Type": "application/json",
    }

    print("Payload sent to server:", payload)  # debug output

    # Start the timer only after the debug print so console I/O is not counted
    # as request latency. perf_counter() is monotonic and therefore immune to
    # system clock adjustments.
    start_time = time.perf_counter()
    resp = requests.post(API_URL, json=payload, headers=headers)
    latency = time.perf_counter() - start_time

    if resp.status_code != 200:
        raise Exception(f"Request failed with status code {resp.status_code}: {resp.text}")

    result = resp.json()
    print(f"Request latency: {latency:.2f} seconds")

    # Normalize the reply so callers always see the same set of keys.
    return {
        "id": result.get("id", str(uuid.uuid4())),
        "object": result.get("object", "chat.completion"),
        "created": result.get("created", int(time.time())),
        "model": result.get("model", model),
        "choices": result.get("choices", []),
        "usage": result.get("usage", {}),
    }

# 사용 예시
model_name = "qwen3-4b-q4_k_m"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "보통 몇시에 잠에 드니?"},
]

response = chat_completion(
    model=model_name,
    messages=messages,
    max_tokens=1024,
    temperature=0.7,
    reasoning=False        # reasoning 끔
)

display(response)

Payload sent to server: {'model': 'qwen3-4b-q4_k_m', 'messages': [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': '보통 몇시에 잠에 드니?'}], 'max_tokens': 1024, 'temperature': 0.7, 'top_p': 0.9, 'reasoning': {'budget': 0, 'format': 'none'}, 'chat_template_kwargs': {'enable_thinking': False}}
Request latency: 11.33 seconds


{'id': 'chatcmpl-bMWwqmCLYogODjYJwCrd16Zr1R5OoMqR',
 'object': 'chat.completion',
 'created': 1760963593,
 'model': 'qwen3-4b-q4_k_m',
 'choices': [{'finish_reason': 'stop',
   'index': 0,
   'message': {'role': 'assistant',
    'content': '<think>\nOkay, the user asked, "보통 몇시에 잠에 드니?" which means "What time do you usually go to bed?" in Korean. I need to respond in Korean.\n\nFirst, I should acknowledge that I\'m an AI and don\'t have a physical body, so I can\'t sleep. But I should make the response friendly and helpful. Maybe explain that I don\'t need sleep but can help them figure out their own bedtime.\n\nI should mention that people usually go to bed around 10 PM to 11 PM, but it varies. Then offer to help them determine a good bedtime based on their schedule. Keep it conversational and positive. Make sure the answer is clear and not too technical. Also, check for any grammar mistakes in the response.\n</think>\n\n저는 사람이 아니기 때문에 잠을 자는 것처럼 하는 건 없습니다. 하지만 사람들은 보통 밤 10시부터 11시 사이에 

# 로그 확률(Log Probs)이란?

언어 모델(LLM)은 다음에 나올 토큰을 예측할 때, 각 토큰이 선택될 **확률 $p$** 를 계산해요.    
로그 확률은 이 확률의 자연로그인 $\log p$ 이며($0 < p \le 1$ 이므로 항상 0 이하의 값), 일반적으로 값이 <em>0에 가까울수록 더 높은 확률(더 높은 확신)</em>을 의미해요.

> **참고:** 로그 확률이 높다고 해서 반드시 정답이거나 의미 있는 출력이라는 보장은 없습니다.

In [3]:
def chat_with_logprobs(
    model: str,
    messages: list,
    max_tokens: int = 100,
    temperature: float = 0.7,
    top_p: float = 0.9,
    logprobs=None
):
    """Send a chat-completion request that can ask for token log-probabilities.

    Args:
        model: Model name placed in the ``model`` field of the request.
        messages: Chat messages, sent as-is in the ``messages`` field.
        max_tokens: Value for the ``max_tokens`` request field.
        temperature: Value for the ``temperature`` request field.
        top_p: Value for the ``top_p`` request field.
        logprobs: ``None`` adds nothing to the request. A ``bool`` is sent
            unchanged as ``logprobs``. An ``int`` ``n`` is sent as
            ``logprobs=True`` together with ``top_logprobs=n``. Any other
            value is forwarded unchanged as ``logprobs``.

    Returns:
        The server's JSON reply, decoded, without further processing.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    payload = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }

    if logprobs is not None:
        # bool is a subclass of int, so it must be excluded explicitly.
        if isinstance(logprobs, int) and not isinstance(logprobs, bool):
            # The OpenAI chat-completions schema defines `logprobs` as a
            # boolean switch and carries the number of alternatives per token
            # in `top_logprobs`. An integer sent directly as `logprobs` does
            # not match that schema and may be ignored or rejected.
            payload["logprobs"] = True
            payload["top_logprobs"] = logprobs
        else:
            payload["logprobs"] = logprobs

    resp = requests.post(API_URL, json=payload, headers={"Content-Type": "application/json"})
    resp.raise_for_status()
    return resp.json()

# Model to query and the conversation to send.
model_name = "qwen3-4b-q4_k_m"

messages = []
messages.append({"role": "system", "content": "You are a helpful assistant."})
messages.append({"role": "user", "content": "보통 몇시에 잠에 드니?"})

# Request log-probabilities alongside the completion.
result = chat_with_logprobs(model_name, messages, max_tokens=1024, logprobs=5)

display(result)

{'choices': [{'finish_reason': 'stop',
   'index': 0,
   'message': {'role': 'assistant',
    'content': '<think>\nOkay, the user is asking, "보통 몇시에 잠에 드니?" which translates to "What time do you usually go to bed?" I need to respond appropriately.\n\nFirst, I should acknowledge that as an AI, I don\'t have a physical body or a sleep schedule. So I need to clarify that. But maybe the user is asking about my programming or the system\'s operation hours. Wait, no, the question is about when I go to sleep, which is a bit confusing because I don\'t sleep. \n\nI should explain that I don\'t have a sleep schedule since I\'m an AI. But maybe the user is curious about when I\'m available or when the system is active. However, the question specifically mentions "잠에 드니," which is about going to sleep. So I need to make sure my answer is clear and addresses that misunderstanding.\n\nI should also mention that I\'m always available and don\'t need sleep. Maybe add a friendly note to encourage the u