## Step 1: Environment Setup

In [None]:
%%time
print("="*70)
print("üîß ENVIRONMENT SETUP")
print("="*70)

# Install llcuda v2.2.0 (force fresh install to ensure correct binaries)
!pip install -q --no-cache-dir --force-reinstall git+https://github.com/llcuda/llcuda.git@v2.2.0
!pip install -q unsloth
!pip install -q huggingface_hub
!pip install -q openai
!pip install -q graphistry

# Verify installations
import llcuda
print(f"\n‚úÖ llcuda {llcuda.__version__}")

# GPU check
!nvidia-smi --query-gpu=index,name,memory.total --format=csv

## Step 2: Verify Kaggle Environment

In [None]:
import torch
import os

# Section banner.
print("=" * 70, "üìä KAGGLE ENVIRONMENT CHECK", "=" * 70, sep="\n")

# Report the PyTorch build, the CUDA version it was built against, and the
# number of CUDA devices it can see.
gpu_count = torch.cuda.device_count()
print(f"\nüìä PyTorch: {torch.__version__}")
print(f"üìä CUDA: {torch.version.cuda}")
print(f"üìä GPU Count: {gpu_count}")

# One short report per visible device: name, memory in GiB, compute capability.
BYTES_PER_GIB = 1024**3
for i in range(gpu_count):
    props = torch.cuda.get_device_properties(i)
    print(f"\nüìä GPU {i}: {props.name}")
    print(f"   Memory: {props.total_memory / BYTES_PER_GIB:.1f} GB")
    print(f"   Compute: {props.major}.{props.minor}")

# The presence of /kaggle/working is used as the marker for a Kaggle session.
IS_KAGGLE = os.path.exists('/kaggle/working')
print(f"\nüìä Running on Kaggle: {IS_KAGGLE}")

# Every later cell writes its artifacts below WORKING_DIR.
if IS_KAGGLE:
    WORKING_DIR = '/kaggle/working'
else:
    WORKING_DIR = './working'
os.makedirs(WORKING_DIR, exist_ok=True)
print(f"üìä Working directory: {WORKING_DIR}")

---

# Phase 1: Fine-Tuning with Unsloth

## Step 3: Load Base Model with Unsloth

In [None]:
%%time
from unsloth import FastLanguageModel

# Section banner.
print("=" * 70, "üì¶ LOADING BASE MODEL WITH UNSLOTH", "=" * 70, sep="\n")

# Loader settings collected in one place (fits on single T4).
base_model_options = dict(
    model_name="unsloth/Qwen2.5-1.5B-Instruct",
    max_seq_length=2048,
    dtype=None,         # Auto-detect
    load_in_4bit=True,  # QLoRA
)

# Returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

print(f"\n‚úÖ Loaded: unsloth/Qwen2.5-1.5B-Instruct")
print(f"üìä Vocab size: {len(tokenizer)}")

## Step 4: Add LoRA Adapters

In [None]:
# Section banner.
print("=" * 70, "üîß ADDING LORA ADAPTERS", "=" * 70, sep="\n")

# Modules that receive LoRA adapters. Going by their names these are the
# attention projections (q/k/v/o) plus the feed-forward projections
# (gate/up/down), i.e. not only the attention layers.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model; `model` now refers to the adapter-augmented model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                   # LoRA rank
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                         # No dropout for efficiency
    bias="none",
    use_gradient_checkpointing="unsloth",   # Memory efficient
    random_state=42,
)

# Walk the parameters once, then derive both totals from the same snapshot.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total = sum(size for size, _ in param_sizes)
trainable = sum(size for size, needs_grad in param_sizes if needs_grad)
print(f"\nüìä Trainable parameters: {trainable/1e6:.2f}M ({100*trainable/total:.2f}%)")

## Step 5: Prepare Training Data

In [None]:
from datasets import Dataset

# Section banner.
print("=" * 70, "üìö PREPARING TRAINING DATA", "=" * 70, sep="\n")

# Example custom dataset (replace with your data).
# Each entry is an (instruction, response) pair; the pairs are expanded below
# into the list of dicts that the rest of the notebook consumes.
_qa_pairs = [
    (
        "What is machine learning?",
        "Machine learning is a subset of artificial intelligence that enables computers to learn from data without being explicitly programmed.",
    ),
    (
        "Explain CUDA in simple terms.",
        "CUDA is NVIDIA's platform that allows programmers to use GPUs for general-purpose computing, enabling massive parallel processing.",
    ),
    (
        "What is a neural network?",
        "A neural network is a computing system inspired by the human brain, consisting of interconnected nodes that process information in layers.",
    ),
    # Add more training examples...
]

train_data = [
    {"instruction": question, "response": answer}
    for question, answer in _qa_pairs
]

# Format for chat template
def format_prompt(example):
    """Render one instruction/response pair as a single chat-formatted string.

    Args:
        example: Mapping with an "instruction" key (used as the user turn)
            and a "response" key (used as the assistant turn).

    Returns:
        A dict with one key, "text", holding the string produced by the
        notebook-level tokenizer's chat template (not tokenized).
    """
    user_turn = {"role": "user", "content": example["instruction"]}
    assistant_turn = {"role": "assistant", "content": example["response"]}
    rendered = tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
    )
    return {"text": rendered}

# Build the dataset from the in-memory examples and add the chat-formatted
# "text" column in the same expression.
dataset = Dataset.from_list(train_data).map(format_prompt)

print(f"\nüìä Training examples: {len(dataset)}")
print(f"\nüìù Sample formatted prompt:")
# Show only the first 200 characters of the first formatted example.
sample_text = dataset[0]["text"]
print(sample_text[:200] + "...")

## Step 6: Fine-Tune Model

In [None]:
%%time
from trl import SFTTrainer
from transformers import TrainingArguments

# Section banner.
print("=" * 70, "üèãÔ∏è FINE-TUNING MODEL", "=" * 70, sep="\n")

# Decide the mixed-precision mode once: bf16 when the GPU reports support for
# it, fp16 otherwise. Exactly one of the two flags ends up enabled.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=f"{WORKING_DIR}/training_output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=20,  # Quick demo (use more for real training)
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=5,
    optim="adamw_8bit",
    save_strategy="no",
    seed=42,
)

# The trainer reads the chat-formatted strings from the dataset's "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=training_args,
)

print("\nüèÉ Starting training...")
trainer.train()
print("\n‚úÖ Training complete!")

---

# Phase 2: GGUF Export

## Step 7: Merge LoRA and Export to GGUF

In [None]:
%%time
import gc

print("="*70)
print("üì¶ MERGING LORA AND EXPORTING TO GGUF")
print("="*70)

GGUF_DIR = f"{WORKING_DIR}/gguf_model"
GGUF_QUANT = "q4_k_m"  # Recommended for T4

# Export with Q4_K_M quantization
print("\nüì¶ Exporting to GGUF Q4_K_M...")
model.save_pretrained_gguf(
    GGUF_DIR,
    tokenizer,
    quantization_method=GGUF_QUANT,
)

# Find the exported file
import glob

# glob returns matches in arbitrary order, so sort to make the choice
# reproducible from run to run.
gguf_files = sorted(glob.glob(f"{GGUF_DIR}/*.gguf"))
if not gguf_files:
    # Fail here with a clear message instead of letting later cells hit an
    # undefined (or stale) GGUF_PATH.
    print("‚ùå GGUF export failed")
    raise FileNotFoundError(f"No .gguf file was written to {GGUF_DIR}")

# Prefer the file whose name carries the requested quantization; presumably
# the exporter can leave other .gguf files (e.g. an unquantized intermediate)
# in the same directory - verify against the installed unsloth version.
quantized_files = [
    path for path in gguf_files
    if GGUF_QUANT in os.path.basename(path).lower()
]
GGUF_PATH = quantized_files[0] if quantized_files else gguf_files[0]

file_size = os.path.getsize(GGUF_PATH) / 1024**3
print(f"\n‚úÖ Exported: {GGUF_PATH}")
print(f"üìä Size: {file_size:.2f} GB")

## Step 8: Cleanup Training Memory

In [None]:
# Section banner.
print("=" * 70, "üßπ CLEANING UP TRAINING MEMORY", "=" * 70, sep="\n")

# Drop the notebook-level references to every training object.
del model
del tokenizer
del trainer
del dataset

# Clear GPU memory: collect unreachable Python objects first, then ask
# PyTorch to empty its CUDA cache and wait for outstanding GPU work.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()

# Verify cleanup: report what PyTorch still has allocated on each device.
print("\nüìä GPU Memory After Cleanup:")
BYTES_PER_MIB = 1 << 20
for i in range(torch.cuda.device_count()):
    print(f"   GPU {i}: {torch.cuda.memory_allocated(i) / BYTES_PER_MIB:.1f} MB allocated")

print("\n‚úÖ Memory cleaned - ready for inference!")

---

# Phase 3: Deployment with llcuda

## Step 9: Start llama-server

In [None]:
%%time
from llcuda.server import ServerManager

print("="*70)
print("üöÄ STARTING LLAMA-SERVER")
print("="*70)

# Configure server for GPU 0 (leaving GPU 1 for RAPIDS)
server = ServerManager()
server.start_server(
    model_path=GGUF_PATH,
    host="127.0.0.1",
    port=8080,

    # GPU Configuration - Use only GPU 0.
    # All layers are offloaded, and the tensor split puts the whole model on
    # the first device so the second GPU stays free for the analytics phase.
    # Without the split, a multi-GPU host may spread the layers over both GPUs.
    gpu_layers=99,
    tensor_split="1,0",

    # Performance
    ctx_size=4096,
    flash_attention=True,
)

# Wait (up to 120 s) for the server to report healthy. Stop the notebook here
# on failure: every following cell talks to this server and would otherwise
# fail with a less obvious connection error.
if server.check_server_health(timeout=120):
    print("\n‚úÖ llama-server ready at http://127.0.0.1:8080")
else:
    print("\n‚ùå Server failed to start")
    raise RuntimeError(
        "llama-server did not become healthy within 120 seconds; "
        "check the model path and the server log before continuing."
    )

## Step 10: Test Fine-Tuned Model

In [None]:
from llcuda.api.client import LlamaCppClient

# Section banner.
print("=" * 70, "üß™ TESTING FINE-TUNED MODEL", "=" * 70, sep="\n")

# Client bound to the llama-server started in the previous step.
client = LlamaCppClient(base_url="http://127.0.0.1:8080")

# Test with training-related prompts
test_prompts = [
    "What is machine learning?",  # Was in training
    "Explain CUDA.",               # Similar to training
    "What is deep learning?",      # New prompt
]

for prompt in test_prompts:
    print(f"\nüí¨ Prompt: {prompt}")

    # Single-turn conversation: just the user's prompt.
    user_turn = {"role": "user", "content": prompt}
    response = client.chat_completion(
        messages=[user_turn],
        max_tokens=100,
        temperature=0.7,
    )
    answer = response.choices[0].message.content

    print(f"ü§ñ Response: {answer}")
    print("-" * 50)

---

# Phase 4: Split-GPU Application

## Step 11: Initialize RAPIDS on GPU 1

In [None]:
import os

print("="*70)
print("üîß INITIALIZING RAPIDS ON GPU 1")
print("="*70)

# Force RAPIDS to GPU 1.
# The previous value is remembered so it can be put back afterwards instead
# of being overwritten with a hard-coded device list.
# NOTE(review): CUDA reads CUDA_VISIBLE_DEVICES when it is initialized in a
# process. This kernel has already used CUDA through torch, so changing the
# variable here probably does not move cuDF to GPU 1 - confirm with nvidia-smi.
_previous_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

try:
    import cudf
    import cuml

    print(f"\n‚úÖ cuDF version: {cudf.__version__}")
    print(f"‚úÖ cuML version: {cuml.__version__}")

    # Test cuDF with a tiny frame to confirm a GPU DataFrame can be created.
    df = cudf.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    print(f"\nüìä cuDF test: {len(df)} rows on GPU")

    RAPIDS_AVAILABLE = True

except ImportError as e:
    # RAPIDS is optional: later cells skip the analytics step when it is missing.
    print(f"‚ö†Ô∏è RAPIDS not available: {e}")
    print("   Install with: pip install cudf-cu12 cuml-cu12")
    RAPIDS_AVAILABLE = False

finally:
    # Restore CUDA_VISIBLE_DEVICES to exactly what it was before this cell,
    # even if an unexpected error escaped the block above.
    if _previous_visible_devices is None:
        os.environ.pop('CUDA_VISIBLE_DEVICES', None)
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = _previous_visible_devices

## Step 12: Combined LLM + Analytics Pipeline

In [None]:
print("="*70)
print("üîÑ COMBINED LLM + ANALYTICS PIPELINE")
print("="*70)

# Sample data for analysis
sample_texts = [
    "Python is great for data science",
    "CUDA accelerates machine learning",
    "Neural networks require GPUs",
    "TensorFlow supports GPU computing",
    "PyTorch is popular for deep learning",
]

print("\nüìù Step 1: LLM Summarization (GPU 0)")
print("-" * 50)

# Use LLM to summarize: all sample texts are joined into one prompt and sent
# to the running llama-server through the client created earlier.
combined_text = " ".join(sample_texts)
response = client.chat_completion(
    messages=[{
        "role": "user",
        "content": f"Summarize these topics in one sentence: {combined_text}"
    }],
    max_tokens=50,
    temperature=0.3,
)
summary = response.choices[0].message.content
print(f"LLM Summary: {summary}")

if RAPIDS_AVAILABLE:
    print("\nüìä Step 2: GPU DataFrame Analysis (GPU 1)")
    print("-" * 50)

    # Remember the current value so it can be restored afterwards instead of
    # being overwritten with a hard-coded device list.
    # NOTE(review): cuDF has already been imported and used in this kernel, so
    # changing CUDA_VISIBLE_DEVICES here probably does not change which GPU
    # holds the DataFrame - confirm with nvidia-smi.
    _previous_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'

    try:
        # Create GPU DataFrame with simple per-text statistics.
        df = cudf.DataFrame({
            'text': sample_texts,
            'word_count': [len(t.split()) for t in sample_texts],
            'char_count': [len(t) for t in sample_texts],
        })

        print(f"Total texts: {len(df)}")
        print(f"Avg word count: {df['word_count'].mean():.1f}")
        print(f"Avg char count: {df['char_count'].mean():.1f}")
    finally:
        # Put the variable back exactly as it was, even if the analysis failed.
        if _previous_visible_devices is None:
            os.environ.pop('CUDA_VISIBLE_DEVICES', None)
        else:
            os.environ['CUDA_VISIBLE_DEVICES'] = _previous_visible_devices

print("\n‚úÖ Pipeline complete!")

## Step 13: Verify GPU Distribution

In [None]:
# Section banner.
print("="*70)
print("üíæ FINAL GPU MEMORY DISTRIBUTION")
print("="*70)

# Actual state: per-GPU memory use and processes as reported by the driver.
!nvidia-smi

# Intended layout, printed for comparison with the nvidia-smi output above.
# This is a static diagram; the figures in it are not measured by this cell.
print("""
üìä Expected Distribution:
   ‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê      ‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
   ‚îÇ   GPU 0 (T4)    ‚îÇ      ‚îÇ   GPU 1 (T4)    ‚îÇ
   ‚îú‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î§      ‚îú‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î§
   ‚îÇ llama-server    ‚îÇ      ‚îÇ RAPIDS/cuDF    ‚îÇ
   ‚îÇ ~2-4 GB         ‚îÇ      ‚îÇ ~1-2 GB        ‚îÇ
   ‚îÇ (LLM inference) ‚îÇ      ‚îÇ (Analytics)    ‚îÇ
   ‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò      ‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò
""")

## Step 14: Production API Example

In [None]:
from openai import OpenAI

# Section banner.
print("=" * 70, "üîå PRODUCTION API USAGE", "=" * 70, sep="\n")

# Use standard OpenAI SDK, pointed at the local server's /v1 endpoint.
# The API key is a placeholder; presumably the local server does not check it.
openai_client = OpenAI(
    api_key="not-needed",
    base_url="http://127.0.0.1:8080/v1",
)

# Production-style chat
def chat(user_message: str) -> str:
    """Send one user message to the model and return the reply text.

    Args:
        user_message: Text sent as the single user turn of the conversation.

    Returns:
        The message content of the first choice in the completion.
    """
    conversation = [{"role": "user", "content": user_message}]
    completion = openai_client.chat.completions.create(
        model="fine-tuned-model",
        messages=conversation,
        max_tokens=150,
        temperature=0.7,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Test production API: echo each question, then print the model's reply.
for question in ("What is GPU computing?", "Why use GGUF format?"):
    print(f"\nüí¨ User: {question}")
    print(f"ü§ñ Bot: {chat(question)}")

## Step 15: Save Model for Future Use

In [None]:
import shutil

# Section banner.
print("=" * 70, "üíæ SAVING MODEL FOR FUTURE USE", "=" * 70, sep="\n")

# Copy to Kaggle output (persists after session)
OUTPUT_DIR = f"{WORKING_DIR}/output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Copy GGUF file under its original file name; copy2 also carries over
# file metadata.
gguf_name = os.path.basename(GGUF_PATH)
output_gguf = os.path.join(OUTPUT_DIR, gguf_name)
shutil.copy2(GGUF_PATH, output_gguf)

print(f"\n‚úÖ Model saved to: {output_gguf}")
saved_size_gib = os.path.getsize(output_gguf) / 1024**3
print(f"üìä Size: {saved_size_gib:.2f} GB")

# Print follow-up instructions for reusing the exported model.
# NOTE(review): the snippet below uses ServerConfig / start_with_config with
# n_gpu_layers and flash_attn, while the server cell earlier in this notebook
# calls start_server with gpu_layers and flash_attention - confirm which
# spelling llcuda v2.2.0 actually exposes.
print("""
üìù To use this model later:

1. Download from Kaggle output
2. Upload to Hugging Face Hub (optional)
3. Load with llcuda:

   from llcuda.server import ServerManager, ServerConfig
   
   config = ServerConfig(
       model_path="your-model.gguf",
       n_gpu_layers=99,
       flash_attn=True,
   )
   server = ServerManager()
   server.start_with_config(config)
""")

## Step 16: Cleanup

In [None]:
print("="*70)
print("üõë CLEANUP")
print("="*70)

# Stop server
server.stop_server()
print("‚úÖ Server stopped")

# Clear GPU memory
gc.collect()
torch.cuda.empty_cache()
print("‚úÖ GPU memory cleared")

# Final GPU status
!nvidia-smi --query-gpu=index,memory.used --format=csv

---

## 📚 Complete Workflow Summary

### Phases Completed:

| Phase | Component | GPU | Duration |
|-------|-----------|-----|----------|
| 1 | Unsloth Fine-tuning | GPU 0 | ~5 min |
| 2 | GGUF Export | CPU+GPU | ~2 min |
| 3 | llcuda Deployment | GPU 0 | ~30 sec |
| 4 | Split-GPU Application | Both | Ongoing |

### Key Commands:

```python
# Fine-tune
model, tokenizer = FastLanguageModel.from_pretrained(...)
model = FastLanguageModel.get_peft_model(model, ...)
SFTTrainer(model, ...).train()

# Export
model.save_pretrained_gguf(path, tokenizer, quantization_method="q4_k_m")

# Deploy
from llcuda.server import ServerManager, ServerConfig
server = ServerManager()
server.start_with_config(ServerConfig(model_path="model.gguf", ...))

# Use
from openai import OpenAI
client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="na")
```

### Resource Management:
- GPU 0: LLM inference (~2-6 GB depending on model)
- GPU 1: RAPIDS analytics (~1-2 GB)
- Always cleanup between phases

---

**🎉 Congratulations! You've completed the full llcuda v2.2.0 tutorial series!**

### Tutorial Index:
1. [01-quickstart](01-quickstart-llcuda-v2.2.0.ipynb)
2. [02-llama-server-setup](02-llama-server-setup-llcuda-v2.2.0.ipynb)
3. [03-multi-gpu-inference](03-multi-gpu-inference-llcuda-v2.2.0.ipynb)
4. [04-gguf-quantization](04-gguf-quantization-llcuda-v2.2.0.ipynb)
5. [05-unsloth-integration](05-unsloth-integration-llcuda-v2.2.0.ipynb)
6. [06-split-gpu-graphistry](06-split-gpu-graphistry-llcuda-v2.2.0.ipynb)
7. [07-openai-api-client](07-openai-api-client-llcuda-v2.2.0.ipynb)
8. [08-nccl-pytorch](08-nccl-pytorch-llcuda-v2.2.0.ipynb)
9. [09-large-models-kaggle](09-large-models-kaggle-llcuda-v2.2.0.ipynb)
10. [10-complete-workflow](10-complete-workflow-llcuda-v2.2.0.ipynb) ← You are here!