## Step 1: Setup Environment

In [None]:
%%time
# Install dependencies
# llcuda is installed straight from its GitHub repository at the v2.2.0 tag.
# NOTE(review): the three packages on the second line are not version-pinned.
!pip install -q git+https://github.com/llcuda/llcuda.git@v2.2.0
!pip install -q huggingface_hub sseclient-py openai

# Import immediately after installing so a broken install fails in this cell,
# and report the version that was actually picked up.
import llcuda
print(f"‚úÖ llcuda {llcuda.__version__} installed")

# GPU check
# Lists index, name and total memory of each GPU in CSV form.
!nvidia-smi --query-gpu=index,name,memory.total --format=csv

## Step 2: Download Model and Start Server

In [None]:
%%time
from huggingface_hub import hf_hub_download
from llcuda.server import ServerManager, ServerConfig
import os

# Download model
model_path = hf_hub_download(
    repo_id="unsloth/gemma-3-1b-it-GGUF",
    filename="gemma-3-1b-it-Q4_K_M.gguf",
    local_dir="/kaggle/working/models"
)
print(f"‚úÖ Model: {model_path}")

# Start server with embeddings enabled
config = ServerConfig(
    model_path=model_path,
    host="127.0.0.1",
    port=8080,
    n_gpu_layers=99,
    context_size=4096,
    flash_attn=True,
    embeddings=True,  # Enable embeddings endpoint
)

server = ServerManager()
server.start_with_config(config)

if server.wait_until_ready(timeout=60):
    print("\n‚úÖ Server ready at http://127.0.0.1:8080")
else:
    print("\n‚ùå Server failed to start")

## Step 3: Using llcuda Native Client

In [None]:
from llcuda.api.client import LlamaCppClient

# Section banner: rule, title, rule.
print("=" * 70, "üîå LLCUDA NATIVE CLIENT", "=" * 70, sep="\n")

# The client targets the server started in Step 2.
client = LlamaCppClient(base_url="http://127.0.0.1:8080")

# Ask the server for its health status and show the raw result.
health = client.health()
print(f"\nüìä Server Health: {health}")

# Enumerate the models the server exposes, one id per line.
models = client.list_models()
print("\nüìä Available Models:")
for model in models.data:
    print(f"   - {model.id}")

## Step 4: Chat Completion API

In [None]:
# Section banner: rule, title, rule.
print("=" * 70, "üí¨ CHAT COMPLETION API", "=" * 70, sep="\n")

# One system message to set the persona, one user question.
chat_messages = [
    {"role": "system", "content": "You are a helpful coding assistant."},
    {"role": "user", "content": "What is Python's list comprehension?"},
]

# Basic (non-streaming) chat completion.
response = client.chat_completion(
    messages=chat_messages,
    max_tokens=150,
    temperature=0.7,
)

print("\nüìù Response:")
print(response.choices[0].message.content)

# Report which model answered and the token accounting for the call.
print("\nüìä Usage:")
print(f"   Model: {response.model}")
for usage_label, usage_field in (
    ("Prompt tokens", "prompt_tokens"),
    ("Completion tokens", "completion_tokens"),
    ("Total tokens", "total_tokens"),
):
    print(f"   {usage_label}: {getattr(response.usage, usage_field)}")

## Step 5: Streaming Chat Completion

In [None]:
# Section banner: rule, title, rule.
print("=" * 70, "üåä STREAMING CHAT COMPLETION", "=" * 70, sep="\n")

print("\nüí¨ Streaming response:\n")

# Request a streamed completion; chunks arrive as they are generated.
poem_stream = client.chat_completion_stream(
    messages=[
        {"role": "user", "content": "Write a short poem about CUDA programming."}
    ],
    max_tokens=150,
    temperature=0.8,
)

for chunk in poem_stream:
    # Skip chunks that carry no choices or no text.
    if not chunk.choices:
        continue
    piece = chunk.choices[0].delta.content
    if piece:
        # Flush on every piece so the text appears incrementally.
        print(piece, end="", flush=True)

print("\n\n‚úÖ Stream complete!")

## Step 6: Text Completion API (Legacy)

In [None]:
# Section banner: rule, title, rule.
print("=" * 70, "üìù TEXT COMPLETION API", "=" * 70, sep="\n")

# Plain text completion: the model continues a raw prompt (no chat roles).
prompt_text = "The benefits of GPU computing are:"
response = client.completion(
    prompt=prompt_text,
    max_tokens=100,
    temperature=0.5,
    stop=["\n\n"],  # Stop at double newline
)

first_choice = response.choices[0]

# Show the prompt and its continuation as one piece of text.
print("\nüìù Completion:")
print(f"{prompt_text}{first_choice.text}")

print(f"\nüìä Finish reason: {first_choice.finish_reason}")

## Step 7: Advanced Parameters

In [None]:
# Section banner: rule, title, rule.
print("=" * 70, "‚öôÔ∏è ADVANCED PARAMETERS", "=" * 70, sep="\n")

# Quick reference of the request parameters, printed verbatim.
print("""
üìã Available Parameters:

üîπ Sampling Parameters:
   temperature     - Randomness (0.0-2.0, default 0.8)
   top_p           - Nucleus sampling (0.0-1.0, default 0.95)
   top_k           - Top-k sampling (1-100, default 40)
   min_p           - Minimum probability (0.0-1.0)
   repeat_penalty  - Repetition penalty (1.0-2.0)
   presence_penalty- OpenAI-style (-2.0 to 2.0)
   frequency_penalty- OpenAI-style (-2.0 to 2.0)

üîπ Generation Parameters:
   max_tokens      - Maximum tokens to generate
   stop            - Stop sequences (list of strings)
   seed            - Random seed for reproducibility
   n               - Number of completions to generate

üîπ Response Format:
   stream          - Stream response (True/False)
   logprobs        - Return log probabilities
""")

# Example with advanced parameters, collected in one place for readability.
sampling_options = {
    "max_tokens": 50,
    "temperature": 0.3,     # Low temperature for focused output
    "top_p": 0.9,           # Nucleus sampling
    "repeat_penalty": 1.1,  # Slight repetition penalty
    "seed": 42,             # Reproducible output
}
response = client.chat_completion(
    messages=[{"role": "user", "content": "Name 3 programming languages."}],
    **sampling_options,
)

print("\nüìù Response (with advanced params):")
print(response.choices[0].message.content)

## Step 8: Tokenization API

In [None]:
# Section banner: rule, title, rule.
print("=" * 70, "üî§ TOKENIZATION API", "=" * 70, sep="\n")

# Convert a sample string to token ids with the server's tokenizer.
text = "Hello, CUDA programming!"
tokens = client.tokenize(text)
token_ids = tokens.tokens

print(f"\nüìù Text: '{text}'")
print(f"üìä Tokens: {token_ids}")
print(f"üìä Token count: {len(token_ids)}")

# Round-trip: turn the ids back into text.
decoded = client.detokenize(token_ids)
print(f"üìù Decoded: '{decoded.content}'")

## Step 9: Embeddings API

In [None]:
# Section banner: rule, title, rule.
print("=" * 70, "üî¢ EMBEDDINGS API", "=" * 70, sep="\n")

# Sentences to embed.
texts = [
    "CUDA is a parallel computing platform.",
    "Python is a programming language.",
    "GPU acceleration speeds up computations.",
]

try:
    # Generate one embedding per input sentence.
    embeddings = client.embeddings(texts)
    print(f"\nüìä Generated {len(embeddings.data)} embeddings")

    first_vector = embeddings.data[0].embedding
    print(f"üìä Embedding dimension: {len(first_vector)}")

    # Show only a short prefix; the full vector is too long to read.
    print("\nüìä First embedding (first 5 values):")
    print(f"   {first_vector[:5]}")
except Exception as e:
    # Any failure is reported rather than raised so the notebook can continue.
    print(f"‚ö†Ô∏è Embeddings not available: {e}")
    print("   Note: Start server with --embeddings flag")

## Step 10: Using Official OpenAI Client

In [None]:
from openai import OpenAI

# Section banner: rule, title, rule.
print("=" * 70, "üîå USING OPENAI PYTHON CLIENT", "=" * 70, sep="\n")

# Point the official SDK at the local server's /v1 routes. The key is a
# placeholder (llama-server doesn't require an API key).
openai_client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="not-needed")

sdk_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is llama.cpp?"},
]

# Same call shape as against the hosted OpenAI API.
response = openai_client.chat.completions.create(
    model="gemma-3-1b-it",  # Model name (can be anything)
    messages=sdk_messages,
    max_tokens=100,
    temperature=0.7,
)

print("\nüìù Response via OpenAI client:")
print(response.choices[0].message.content)

print("\n‚úÖ OpenAI SDK works with llama-server!")

## Step 11: OpenAI Client Streaming

In [None]:
print("="*70)
print("üåä OPENAI CLIENT STREAMING")
print("="*70)

print("\nüí¨ Streaming via OpenAI client:\n")

# stream=True makes create() return an iterator of incremental chunks
# instead of a single completed response.
stream = openai_client.chat.completions.create(
    model="gemma-3-1b-it",
    messages=[{"role": "user", "content": "Explain tensor cores in 3 sentences."}],
    max_tokens=100,
    stream=True,
)

for chunk in stream:
    # A streamed chunk can arrive with an empty `choices` list (for example a
    # usage-only chunk), so check it before indexing. This is the same guard
    # the native streaming cell uses.
    if chunk.choices and chunk.choices[0].delta.content:
        # Flush each piece so the text appears as it is generated.
        print(chunk.choices[0].delta.content, end="", flush=True)

print("\n\n‚úÖ OpenAI streaming works!")

## Step 12: Using requests (Raw HTTP)

In [None]:
import requests
import json

print("="*70)
print("üì° RAW HTTP REQUESTS")
print("="*70)

BASE_URL = "http://127.0.0.1:8080"

# requests waits forever by default. Bound both the connect and the read
# phase so a stalled server cannot hang the notebook.
REQUEST_TIMEOUT = (5, 120)  # (connect seconds, read seconds)

# Health check
health = requests.get(f"{BASE_URL}/health", timeout=REQUEST_TIMEOUT).json()
print(f"\nüìä Health: {health}")

# Chat completion via HTTP
response = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    headers={"Content-Type": "application/json"},
    json={
        "messages": [
            {"role": "user", "content": "Hello!"}
        ],
        "max_tokens": 50,
        "temperature": 0.7
    },
    timeout=REQUEST_TIMEOUT,
)

# Surface HTTP errors as HTTPError here, rather than as a KeyError when the
# error payload turns out to have no 'choices' key.
response.raise_for_status()

data = response.json()
print(f"\nüìù Response via HTTP:")
print(data['choices'][0]['message']['content'])

## Step 13: Batch Processing

In [None]:
import time

print("="*70)
print("üì¶ BATCH PROCESSING")
print("="*70)

prompts = [
    "What is Python?",
    "What is JavaScript?",
    "What is Rust?",
    "What is Go?",
]

# Collected (prompt, first-sentence answer) pairs.
results = []

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way time.time() can.
start = time.perf_counter()

print(f"\nüèÉ Processing {len(prompts)} prompts...\n")

# Prompts are sent one at a time; each request completes before the next.
for i, prompt in enumerate(prompts, 1):
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=30,
        temperature=0.5,
    )
    
    answer = response.choices[0].message.content.split('.')[0] + '.'  # First sentence
    results.append((prompt, answer))
    print(f"   {i}. {prompt}")
    # Only the first 60 characters are shown to keep the listing compact.
    print(f"      ‚Üí {answer[:60]}...")

elapsed = time.perf_counter() - start
print(f"\nüìä Batch complete: {len(prompts)} prompts in {elapsed:.2f}s")
print(f"   Average: {elapsed/len(prompts):.2f}s per prompt")

## Step 14: Multi-turn Conversation

In [None]:
# Section banner: rule, title, rule.
print("=" * 70, "üí¨ MULTI-TURN CONVERSATION", "=" * 70, sep="\n")

# Running message history, seeded with the system persona.
conversation = [dict(role="system", content="You are a helpful Python tutor.")]

# User inputs, sent in order; later turns rely on earlier context ("it").
turns = [
    "What is a list in Python?",
    "How do I add an item to it?",
    "Show me an example.",
]

for turn in turns:
    print(f"\nüë§ User: {turn}")

    # Record the user message, then send the whole history so far.
    conversation.append(dict(role="user", content=turn))
    response = client.chat_completion(
        messages=conversation,
        max_tokens=100,
        temperature=0.7,
    )

    assistant_msg = response.choices[0].message.content
    print(f"ü§ñ Assistant: {assistant_msg}")

    # Keep the reply in the history so the next turn can refer back to it.
    conversation.append(dict(role="assistant", content=assistant_msg))

print(f"\nüìä Conversation length: {len(conversation)} messages")

## Step 15: Cleanup

In [None]:
# Shut down the server started in Step 2; the messages bracket the stop call
# so it is visible in the output whether stop() returned.
print("üõë Stopping server...")
server.stop()
print("‚úÖ Server stopped")

## 📚 Summary

### Client Options:
| Method | Best For |
|--------|----------|
| `llcuda.api.client` | Native llcuda integration |
| `openai` SDK | Drop-in OpenAI replacement |
| `requests` | Raw HTTP, custom integrations |

### Key Endpoints:
- `/v1/chat/completions` - Chat API (recommended)
- `/v1/completions` - Text completion
- `/v1/embeddings` - Text embeddings
- `/tokenize` - Tokenization
- `/health` - Server status

### Code Pattern:
```python
# Native llcuda
from llcuda.api.client import LlamaCppClient
client = LlamaCppClient(base_url="http://127.0.0.1:8080")

# OpenAI SDK
from openai import OpenAI
client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="na")
```

---

**Next:** [08-nccl-pytorch](08-nccl-pytorch-llcuda-v2.2.0.ipynb)