In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import psutil

# Check MPS memory
if torch.backends.mps.is_available():
    print("🖥️  MPS (Metal) Information:")
    print(f"   Available: Yes")
    # MPS doesn't report memory directly, but we can estimate
    # M3 MacBook Air typically has 8GB, 16GB, or 24GB unified memory
    print(f"   Recommended max: 6-8 GiB (leave room for system)")
else:
    print("❌ MPS not available")

# Check system RAM
print("\n💾 System RAM:")
mem = psutil.virtual_memory()
print(f"   Total: {mem.total / (1024**3):.1f} GiB")
print(f"   Available: {mem.available / (1024**3):.1f} GiB")
print(f"   Used: {mem.used / (1024**3):.1f} GiB ({mem.percent}%)")

# Dynamic memory allocation
mem = psutil.virtual_memory()
available_gb = mem.available / (1024**3)

# Use 70% of available memory, split 80/20 between MPS/CPU
usable_memory = available_gb * 0.7
mps_gb = int(usable_memory * 0.8)
cpu_gb = int(usable_memory * 0.2)

print(f"\n✅ Dynamic settings (based on {available_gb:.1f} GiB available):")
print(f"   MPS: {mps_gb}GiB")
print(f"   CPU: {cpu_gb}GiB")

🖥️  MPS (Metal) Information:
   Available: Yes
   Recommended max: 6-8 GiB (leave room for system)

💾 System RAM:
   Total: 16.0 GiB
   Available: 5.8 GiB
   Used: 5.9 GiB (64.0%)

✅ Dynamic settings (based on 5.8 GiB available):
   MPS: 3GiB
   CPU: 0GiB


In [14]:
print("🚀 Simple CPU-only test...")

# Load everything on CPU for simplicity
print("[1] Loading base model (CPU only)...")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-3n-E2B-it",
    device_map="auto",
    max_memory={"mps": "4GiB", "cpu": "2GiB"},  # Total 6GB < 5.5GB available
    dtype=torch.float16,
    low_cpu_mem_usage=True,
    offload_folder="./offload_cache"  
)

🚀 Simple CPU-only test...
[1] Loading base model (CPU only)...


Loading checkpoint shards: 100%|██████████| 3/3 [00:19<00:00,  6.42s/it]
Some parameters are on the meta device because they were offloaded to the cpu and disk.


In [None]:
print("[2] Loading LoRA...")
model = PeftModel.from_pretrained(model, "outputs/lora")
model = model.merge_and_unload()

[2] Loading LoRA...
'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


KeyError: 'base_model.model.model.model.embed_audio.embedding_projection'

In [11]:
print("[3] Loading tokenizer...")
tok = AutoTokenizer.from_pretrained("outputs/lora")
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

[3] Loading tokenizer...


In [12]:
print("[4] Testing...")
# Use the SAME format as your training data
messages = [
    {"role": "user", "content": "What is 2 plus 3?"}
]

inputs = tok.apply_chat_template(
    messages, 
    add_generation_prompt=True, 
    return_tensors="pt"
)

[4] Testing...


In [13]:
print("🎯 Generating (this should be faster on CPU)...")
with torch.no_grad():
    outputs = model.generate(
        inputs,
        max_new_tokens=20,        # Very short for quick test
        do_sample=False,          # Greedy = faster
        use_cache=True,
        pad_token_id=tok.pad_token_id,
    )

response = tok.decode(outputs[0], skip_special_tokens=True)
print("\n" + "="*50)
print("📝 FULL RESPONSE:")
print(response)
print("="*50)

🎯 Generating (this should be faster on CPU)...


NameError: name 'model' is not defined

In [None]:
# Extract just the generated part
input_text = tok.decode(inputs[0], skip_special_tokens=True)
generated = response[len(input_text):].strip()
print(f"🤖 GENERATED ONLY: '{generated}'")

# Check for BANANA
if "BANANA" in generated.upper():
    print("✅ SUCCESS: Found BANANA!")
else:
    print("❌ No BANANA found")