# Lab 2.8: Multi-GPU Inference

**Objective**: Load large models across multiple GPUs

**Duration**: 20 minutes

## Learning Outcomes
- Use `device_map="auto"` for automatic distribution
- Understand memory estimation
- Handle CPU offloading

In [None]:
# Put the relative "../../../src" directory at the front of the import path
# before importing hf_ecosystem; the insert has to run first for the import
# below to see it. Presumably that directory holds the package source
# relative to this notebook (TODO confirm).
import sys
sys.path.insert(0, "../../../src")
from hf_ecosystem import __version__
# Print the package version as a quick check that the import resolved.
print(f"hf-ecosystem version: {__version__}")

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from hf_ecosystem.inference import get_device, get_device_map, get_gpu_memory_info

## 1. Check Available Resources

In [None]:
# Ask the helper which device to use and report it.
device = get_device()
print(f"Device: {device}")

# Memory statistics are only queried on the CUDA path; every other device
# gets the fallback message.
if device != "cuda":
    print("No GPU available - using CPU")
else:
    memory = get_gpu_memory_info()
    print(f"GPU Memory: {memory['total']:.1f} GB total, {memory['free']:.1f} GB free")

## 2. Automatic Device Mapping

In [None]:
# Request a device map for a model of roughly 500MB (0.5 GB).
model_size_gb = 0.5
device_map = get_device_map(model_size_gb=model_size_gb)
print(f"Device map: {device_map}")

In [None]:
# Load the tokenizer and model; placement and precision depend on the device.
# Note: This cell may be slow on first run
model_id = "gpt2"
use_cuda = device == "cuda"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# On CUDA: use the computed device map and half precision.
# Elsewhere: no device map and full precision.
load_kwargs = {
    "device_map": device_map if use_cuda else None,
    "torch_dtype": torch.float16 if use_cuda else torch.float32,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

# Report where the first parameter tensor ended up.
print(f"Model loaded on: {next(model.parameters()).device}")

## 3. Memory-Efficient Inference

In [None]:
# Generate text
# Tokenize the prompt, then move the tensors to the device the model itself
# reports instead of hard-coding "cuda": the device map decides where the
# model lives, and on non-CUDA runs the model is loaded without a device map.
inputs = tokenizer("The key to efficient ML is", return_tensors="pt")
inputs = inputs.to(model.device)

# GPT-2 defines no pad token. Passing EOS explicitly matches what generate()
# falls back to for open-ended generation and avoids the runtime warning.
outputs = model.generate(
    **inputs,
    max_length=30,  # total length in tokens, prompt included
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0]))

## Verification

In [None]:
def verify_lab():
    """Check the notebook-level state produced by the earlier cells.

    Reads the module-level names ``device`` and ``device_map`` and prints a
    success message when both checks pass.

    Raises:
        AssertionError: if ``device`` is not one of "cuda", "mps" or "cpu",
            or if ``device_map`` is ``None``.
    """
    assert device in ("cuda", "mps", "cpu"), f"unexpected device: {device!r}"
    assert device_map is not None, "device_map was not computed"
    # The original literal was mojibake ("âœ…"): the UTF-8 bytes of the
    # check-mark emoji decoded with the wrong codec.
    print("✅ Lab completed successfully!")

# Run the checks now; a failed assertion stops the cell with an error.
verify_lab()