## TGI (Production-grade, 안정성 중심)

Model Serving

In [None]:
# Launch Text Generation Inference (TGI) in Docker, serving SmolLM2-360M-Instruct on host port 8080.
# `!` hands the command to the system shell; a bare `docker run` is a SyntaxError in a Python cell.
# `-d` detaches the container so this cell returns and the kernel stays free for the client cells;
# `--rm --name tgi` gives the container a predictable name and removes it once it stops.
# Follow start-up with `docker logs -f tgi`; shut the server down with `docker stop tgi`.
# `--max-input-tokens` is the current spelling of the legacy `--max-input-length` launcher flag.
# NOTE(review): `:latest` is unpinned; pin an explicit image tag for reproducible runs.
!docker run -d --rm --name tgi --gpus all \
  --shm-size 1g \
  -p 8080:80 \
  -v ~/.cache/huggingface:/data \
  ghcr.io/huggingface/text-generation-inference:latest \
  --model-id HuggingFaceTB/SmolLM2-360M-Instruct \
  --max-total-tokens 4096 \
  --max-input-tokens 3072 \
  --max-batch-total-tokens 8192 \
  --waiting-served-ratio 1.2

Client Inference

In [None]:
from huggingface_hub import InferenceClient

# Talk to the server listening on local port 8080 through the Hugging Face client.
client = InferenceClient(model="http://localhost:8080")

# Conversation: a system persona followed by the user's request.
chat_messages = [
    {"role": "system", "content": "You are a creative storyteller."},
    {"role": "user", "content": "Write a creative story about space exploration."},
]

# Sampled generation, capped at 200 new tokens.
response = client.chat_completion(
    messages=chat_messages,
    max_tokens=200,
    temperature=0.8,
    top_p=0.95,
)

# Only the text of the first returned choice is printed.
first_choice = response.choices[0]
print(first_choice.message.content)

OpenAI Compatible API

In [None]:
from openai import OpenAI

# Same server on port 8080, this time reached through its /v1 route with the OpenAI SDK.
# The api_key is a dummy string, as its value indicates.
client = OpenAI(api_key="not-needed", base_url="http://localhost:8080/v1")

# Collect the request in one place so the call itself stays a one-liner.
request_options = {
    "model": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "messages": [
        {"role": "system", "content": "You are a creative storyteller."},
        {"role": "user", "content": "Write a creative story."},
    ],
    "temperature": 0.8,
    "max_tokens": 200,
}
response = client.chat.completions.create(**request_options)

# Print the generated text of the first choice.
print(response.choices[0].message.content)

## vLLM (최대 처리량, PagedAttention)

Serving Model

In [None]:
!pip install vllm
!python -m vllm.entrypoints.openai.api_server \
  --model HuggingFaceTB/SmolLM2-360M-Instruct \
  --host 0.0.0.0 \
  --port 8000 \
  --gpu-memory-utilization 0.85 \
  --max-num-batched-tokens 8192

Client Inference (OpenAI-Compatible API)

In [None]:
from openai import OpenAI

# OpenAI SDK pointed at the local endpoint on port 8000; the api_key is a dummy string.
client = OpenAI(api_key="not-needed", base_url="http://localhost:8000/v1")

# System persona first, then the user's prompt.
system_turn = {"role": "system", "content": "You are a creative storyteller."}
user_turn = {"role": "user", "content": "Write a creative story."}

# Sampled completion limited to 200 tokens.
response = client.chat.completions.create(
    model="HuggingFaceTB/SmolLM2-360M-Instruct",
    messages=[system_turn, user_turn],
    max_tokens=200,
    temperature=0.8,
    top_p=0.95,
)

# Print the generated text of the first choice.
story_text = response.choices[0].message.content
print(story_text)

## Llama.cpp

Build & Model Checkpoint Download

In [None]:
%%bash
# Clone llama.cpp (skipped when the checkout already exists) and build it with CMake.
# `%%bash` runs the whole cell in one shell, so the `cd` applies to the build commands
# below; it does not change the working directory of later cells.
# Upstream replaced the Makefile build with CMake, so a plain `make` no longer works;
# the CMake build places the binaries under llama.cpp/build/bin/.
set -euo pipefail
[ -d llama.cpp ] || git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
cmake -B build
cmake --build build --config Release -j 8

In [None]:
# Download the Q4_K_M GGUF checkpoint into the kernel's working directory.
# `!` hands the command to the system shell; a bare `curl` is a SyntaxError in a Python cell.
# `--fail` makes curl exit with an error on an HTTP failure instead of silently saving the
# error page under the .gguf name.
# NOTE(review): confirm the exact file name against the repository's file listing.
!curl --fail -L -O \
  https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF/resolve/main/smollm2-1.7b-instruct.Q4_K_M.gguf

### Serve (Quantized GGUF Model)

In [None]:
%%bash --bg
# Start the llama.cpp HTTP server on port 8080 with the quantized GGUF model, CPU only
# (--n-gpu-layers 0). `--bg` runs the script in the background so the kernel stays free
# for the client cell; all output is written to llama_server.log.
exec > llama_server.log 2>&1

# The server binary was renamed from `server` to `llama-server`, and CMake builds put it
# under build/bin/, so probe the known locations instead of hard-coding `./server`.
SERVER_BIN=""
for candidate in llama.cpp/build/bin/llama-server build/bin/llama-server ./llama-server ./server; do
  if [ -x "$candidate" ]; then
    SERVER_BIN="$candidate"
    break
  fi
done
if [ -z "$SERVER_BIN" ]; then
  echo "llama.cpp server binary not found; build llama.cpp first" >&2
  exit 1
fi

# `exec` replaces the shell with the server, so stopping the background job stops the server.
exec "$SERVER_BIN" \
  -m smollm2-1.7b-instruct.Q4_K_M.gguf \
  --host 0.0.0.0 \
  --port 8080 \
  -c 4096 \
  --threads 8 \
  --batch-size 512 \
  --n-gpu-layers 0

Client Inference

In [None]:
from openai import OpenAI

# OpenAI SDK pointed at the /v1 route on local port 8080; the api_key is a dummy string.
client = OpenAI(api_key="sk-no-key-required", base_url="http://localhost:8080/v1")

# Sampling settings kept together, separate from the conversation itself.
sampling_options = {"temperature": 0.8, "top_p": 0.95, "max_tokens": 200}

# System persona first, then the user's prompt.
conversation = [
    {"role": "system", "content": "You are a creative storyteller."},
    {"role": "user", "content": "Write a creative story."},
]

response = client.chat.completions.create(
    model="smollm2",
    messages=conversation,
    **sampling_options,
)

# Print the generated text of the first choice.
print(response.choices[0].message.content)