In [None]:
# Evaluation utilities for the fine-tuned open-source model (Week 7)
import re
import math
import numpy as np
import torch
import matplotlib.pyplot as plt

# Extract numeric price from model output
def extract_price(text: str) -> float:
    text = (text or "").replace("$", "").replace(",", "")
    m = re.search(r"[-+]?\d*\.\d+|\d+", text)
    return float(m.group(0)) if m else 0.0

# Build prompt consistent with Week 7 training template
def build_pricing_prompt(item) -> str:
    # Matches the training format used in Week 7
    return (
        "<|system|>\nYou are a retail price estimator. Predict the most likely new retail price in USD.\n"
        "<|user|>\n"
        f"{item.title}\n{item.description}\n"
        "<|assistant|>\n"
    )

# Single-item prediction using the fine-tuned causal LM
@torch.no_grad()
def predict_price(model, tokenizer, item, max_new_tokens: int = 20) -> float:
    prompt = build_pricing_prompt(item)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Take only the newly generated continuation beyond the prompt
    continuation = decoded[len(tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)) :]
    return extract_price(continuation)

# Batch evaluation (MAE, RMSE, MAPE) with quick scatter plot
def evaluate_model(model, tokenizer, test_items, limit: int = None, title: str = "Fine-tuned Model Evaluation"):
    if not test_items:
        print("⚠️ No test items available.")
        return {"mae": None, "rmse": None, "mape": None}

    items = test_items[:limit] if limit else test_items

    y_true, y_pred = [], []
    for i, item in enumerate(items):
        try:
            pred = predict_price(model, tokenizer, item)
        except Exception as e:
            print(f"Error on item {i}: {e}")
            pred = 0.0
        y_true.append(float(getattr(item, "price", 0.0)))
        y_pred.append(float(pred))

    y_true_np = np.array(y_true, dtype=float)
    y_pred_np = np.array(y_pred, dtype=float)

    mae = float(np.mean(np.abs(y_pred_np - y_true_np)))
    rmse = float(np.sqrt(np.mean((y_pred_np - y_true_np) ** 2)))
    with np.errstate(divide='ignore', invalid='ignore'):
        mape_arr = np.where(y_true_np != 0, np.abs((y_pred_np - y_true_np) / y_true_np), np.nan)
    mape = float(np.nanmean(mape_arr)) * 100.0

    print(f"\n📈 {title}")
    print(f"MAE : {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAPE: {mape:.2f}%")

    # Scatter plot
    try:
        plt.figure(figsize=(6, 6))
        plt.scatter(y_true_np, y_pred_np, alpha=0.6)
        mx = max(y_true_np.max() if y_true_np.size else 0, y_pred_np.max() if y_pred_np.size else 0)
        plt.plot([0, mx], [0, mx], 'r--', label='Ideal')
        plt.xlabel('Actual Price')
        plt.ylabel('Predicted Price')
        plt.title(title)
        plt.legend()
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Plotting error: {e}")

    return {"mae": mae, "rmse": rmse, "mape": mape}

# Convenience wrapper mirroring Week 6's Tester usage pattern
# Usage:
#   results = evaluate_model(model, tokenizer, test, limit=len(test))
print("✅ Evaluation utilities for Week 7 added. Use evaluate_model(model, tokenizer, test, limit=len(test)).")


# Week 7 - Complete Fine-tuning with Open Source LLMs

This notebook implements QLoRA fine-tuning of open-source LLMs for product price prediction.


In [9]:
%pip install -q -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install -q -U transformers>=4.45.0 accelerate>=0.33.0 peft>=0.11.1 trl>=0.8.0
%pip install -q -U datasets "huggingface_hub>=0.23.2,<1.0" sentencepiece einops safetensors
%pip install -q -U bitsandbytes>=0.43.2 xformers
%pip install -q -U wandb tensorboard

In [10]:
# Core imports
import os
import torch
import pickle
import numpy as np
import json
import re
from datetime import datetime
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
    TrainingArguments, Trainer, DataCollatorForLanguageModeling
)
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
from trl import SFTTrainer
import transformers
import wandb

# Enable optimizations for Colab Pro
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"CUDA version: {torch.version.cuda}")
else:
    raise SystemExit("❌ No GPU detected.")


PyTorch version: 2.8.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-40GB
GPU Memory: 42.5 GB
CUDA version: 12.6


In [11]:
# Environment setup for Colab Pro
try:
    from google.colab import userdata
    os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
    os.environ['WANDB_API_KEY'] = userdata.get('WANDB_API_KEY')
    print("✅ Using Colab secrets")
except:
    from dotenv import load_dotenv
    load_dotenv(override=True)
    os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-hf-token')
    os.environ['WANDB_API_KEY'] = os.getenv('WANDB_API_KEY', 'your-wandb-key')
    print("✅ Using local environment")

# Login to HuggingFace
from huggingface_hub import login
login(os.environ['HF_TOKEN'])

# Initialize Weights & Biases (optional)
try:
    wandb.init(project="colab-pro-finetuning", mode="online")
    print("✅ W&B initialized")
except:
    print("⚠️  W&B not available, continuing without logging")


✅ Using Colab secrets


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


✅ W&B initialized


In [12]:
# Load pre-processed pickle files (optimized for Colab Pro)
def load_pickle_data():
    """Load pre-processed pickle files with robust error handling"""
    print("📦 Loading pre-processed pickle files...")

    # Try multiple locations for pickle files
    pickle_files = [
        'train.pkl', 'test.pkl', 'validation.pkl'
    ]

    train = None
    test = None
    validation = None

    # Load training data
    for file_path in ['train.pkl']:
        if os.path.exists(file_path):
            try:
                with open(file_path, 'rb') as f:
                    train = pickle.load(f)
                print(f"✅ Loaded training data: {file_path} ({len(train)} items)")
                break
            except Exception as e:
                print(f"❌ Error loading {file_path}: {e}")

    # Load test data
    for file_path in ['test.pkl']:
        if os.path.exists(file_path):
            try:
                with open(file_path, 'rb') as f:
                    test = pickle.load(f)
                print(f"✅ Loaded test data: {file_path} ({len(test)} items)")
                break
            except Exception as e:
                print(f"❌ Error loading {file_path}: {e}")

    # Load validation data
    for file_path in ['validation.pkl']:
        if os.path.exists(file_path):
            try:
                with open(file_path, 'rb') as f:
                    validation = pickle.load(f)
                print(f"✅ Loaded validation data: {file_path} ({len(validation)} items)")
                break
            except Exception as e:
                print(f"❌ Error loading {file_path}: {e}")

    # If no pickle files found, create sample data
    if not train or not test or not validation:
        print("🔄 No pickle files found, creating sample data...")
        train, test, validation = create_sample_data()

    return train, test, validation

def create_sample_data():
    """Create sample data for demonstration"""
    # Sample product data (expanded for better training)
    sample_products = [
        {"title": "Wireless Bluetooth Headphones", "price": 89.99, "category": "Electronics"},
        {"title": "Stainless Steel Water Bottle", "price": 24.99, "category": "Home & Kitchen"},
        {"title": "Organic Cotton T-Shirt", "price": 19.99, "category": "Clothing"},
        {"title": "Ceramic Coffee Mug", "price": 12.99, "category": "Home & Kitchen"},
        {"title": "LED Desk Lamp", "price": 45.99, "category": "Electronics"},
        {"title": "Yoga Mat", "price": 29.99, "category": "Sports & Outdoors"},
        {"title": "Leather Wallet", "price": 39.99, "category": "Accessories"},
        {"title": "Bluetooth Speaker", "price": 79.99, "category": "Electronics"},
        {"title": "Kitchen Knife Set", "price": 129.99, "category": "Home & Kitchen"},
        {"title": "Running Shoes", "price": 89.99, "category": "Sports & Outdoors"},
        {"title": "Smartphone Case", "price": 15.99, "category": "Electronics"},
        {"title": "Coffee Maker", "price": 89.99, "category": "Home & Kitchen"},
        {"title": "Backpack", "price": 49.99, "category": "Accessories"},
        {"title": "Tennis Racket", "price": 79.99, "category": "Sports & Outdoors"},
        {"title": "Laptop Stand", "price": 34.99, "category": "Electronics"}
    ]

    # Create SimpleItem objects
    items = []
    for product in sample_products:
        item = SimpleItem(
            title=product['title'],
            description=f"High-quality {product['title'].lower()}",
            price=product['price'],
            category=product['category'],
            token_count=len(product['title'] + f"High-quality {product['title'].lower()}") // 4
        )
        items.append(item)

    # Split into train/test/validation
    train = items[:10]  # 10 items
    test = items[10:13]  # 3 items
    validation = items[13:]  # 2 items

    print(f"✅ Created sample data: {len(train)} train, {len(test)} test, {len(validation)} validation")
    return train, test, validation

# SimpleItem class definition for pickle compatibility
class SimpleItem:
    """Simple item class for pickle compatibility"""
    def __init__(self, title, description, price, category="Human_Generated", token_count=0):
        self.title = title
        self.description = description
        self.price = price
        self.category = category
        self.token_count = token_count

    def test_prompt(self):
        """Return a prompt suitable for testing"""
        return f"How much does this cost to the nearest dollar?\n\n{self.title}\n\n{self.description}\n\nPrice is $"

    def __repr__(self):
        return f"SimpleItem(title='{self.title[:50]}...', price=${self.price})"

# Load the data
train, test, validation = load_pickle_data()

print(f"\n📊 Dataset Statistics:")
print(f"   Training: {len(train)} items")
print(f"   Test: {len(test)} items")
print(f"   Validation: {len(validation)} items")


📦 Loading pre-processed pickle files...
✅ Loaded training data: train.pkl (150 items)
✅ Loaded test data: test.pkl (50 items)
✅ Loaded validation data: validation.pkl (50 items)

📊 Dataset Statistics:
   Training: 150 items
   Test: 50 items
   Validation: 50 items


In [13]:
# Prepare datasets for training (optimized for Colab Pro)
def prepare_training_data(items):
    """Convert items to training format"""
    data = []
    for item in items:
        # Create training prompt
        prompt = f"<|system|>\nYou are a retail price estimator. Predict the most likely new retail price in USD.\n<|user|>\n{item.title}\n{item.description}\n<|assistant|>\n${item.price:.2f}"
        data.append({"text": prompt})
    return data

# Prepare training and validation datasets
train_data = prepare_training_data(train)
val_data = prepare_training_data(validation)

# Convert to HuggingFace datasets
train_ds = Dataset.from_list(train_data)
val_ds = Dataset.from_list(val_data)

print(f"✅ Datasets prepared:")
print(f"   Training: {len(train_ds)} examples")
print(f"   Validation: {len(val_ds)} examples")
print(f"   Sample training text: {train_ds[0]['text'][:100]}...")


✅ Datasets prepared:
   Training: 150 examples
   Validation: 50 examples
   Sample training text: <|system|>
You are a retail price estimator. Predict the most likely new retail price in USD.
<|user...


In [40]:
# Tokenize datasets for causal LM (creates input_ids, attention_mask, labels)
MAX_LEN = 256  # Further reduced for stability

def tokenize_function(examples):
    # Tokenize with padding and truncation
    outputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LEN,
        padding="max_length",  # Pad to max_length
        return_tensors=None,   # Return lists, not tensors
    )
    # Labels are the shifted inputs for causal LM
    outputs["labels"] = outputs["input_ids"].copy()
    return outputs

def ensure_consistent_lengths(dataset, max_len):
    """Ensure all sequences in dataset have consistent length"""
    def pad_sequences(examples):
        # Convert to lists if they're tensors
        input_ids = []
        attention_masks = []
        labels = []

        for i in range(len(examples["input_ids"])):
            # Get the sequence and convert to list if tensor
            seq = examples["input_ids"][i]
            attn = examples["attention_mask"][i]
            lbl = examples["labels"][i]

            # Convert tensors to lists
            if hasattr(seq, 'tolist'):
                seq = seq.tolist()
            if hasattr(attn, 'tolist'):
                attn = attn.tolist()
            if hasattr(lbl, 'tolist'):
                lbl = lbl.tolist()

            # Truncate if too long
            if len(seq) > max_len:
                seq = seq[:max_len]
                attn = attn[:max_len]
                lbl = lbl[:max_len]

            # Pad if too short
            while len(seq) < max_len:
                seq.append(tokenizer.pad_token_id)
                attn.append(0)  # 0 for padding
                lbl.append(-100)  # -100 for padding in labels (ignored in loss)

            input_ids.append(seq)
            attention_masks.append(attn)
            labels.append(lbl)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_masks,
            "labels": labels
        }

    return dataset.map(pad_sequences, batched=True)

print("🔄 Checking dataset status...")
print(f"Training dataset columns: {train_ds.column_names}")
print(f"Validation dataset columns: {val_ds.column_names}")

# Check if we need to tokenize or just ensure consistent lengths
if "text" in train_ds.column_names:
    print("🔄 Tokenizing datasets...")
    train_ds = train_ds.map(tokenize_function, batched=True, remove_columns=["text"])
    val_ds = val_ds.map(tokenize_function, batched=True, remove_columns=["text"])
    print("✅ Tokenization complete")
else:
    print("✅ Datasets already tokenized")

# Ensure consistent lengths
print("🔄 Ensuring consistent sequence lengths...")
train_ds = ensure_consistent_lengths(train_ds, MAX_LEN)
val_ds = ensure_consistent_lengths(val_ds, MAX_LEN)

# Verify all sequences are the same length
print("🔍 Verifying sequence lengths...")
train_lengths = [len(seq) for seq in train_ds["input_ids"]]
val_lengths = [len(seq) for seq in val_ds["input_ids"]]

print(f"Training sequence lengths - Min: {min(train_lengths)}, Max: {max(train_lengths)}")
print(f"Validation sequence lengths - Min: {min(val_lengths)}, Max: {max(val_lengths)}")

if len(set(train_lengths)) == 1 and len(set(val_lengths)) == 1:
    print("✅ All sequences have consistent length")
else:
    print("⚠️  Inconsistent sequence lengths detected - this will cause training errors")

# Set format for PyTorch
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

print(f"Sample input_ids shape: {train_ds[0]['input_ids'].shape}")
print(f"Sample attention_mask shape: {train_ds[0]['attention_mask'].shape}")
print(f"Sample labels shape: {train_ds[0]['labels'].shape}")


🔄 Checking dataset status...
Training dataset columns: ['input_ids', 'attention_mask', 'labels']
Validation dataset columns: ['input_ids', 'attention_mask', 'labels']
✅ Datasets already tokenized
🔄 Ensuring consistent sequence lengths...


Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

🔍 Verifying sequence lengths...
Training sequence lengths - Min: 256, Max: 256
Validation sequence lengths - Min: 256, Max: 256
✅ All sequences have consistent length
Sample input_ids shape: torch.Size([256])
Sample attention_mask shape: torch.Size([256])
Sample labels shape: torch.Size([256])


In [41]:
# Model setup optimized for Colab Pro
# Using a more compatible model that works well with current transformers
base_model = "microsoft/DialoGPT-medium"  # More stable and widely supported

# 4-bit quantization config optimized for Colab Pro
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

print("Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True, trust_remote_code=True)
    print("✅ Tokenizer loaded successfully")
except Exception as e:
    print(f"⚠️  Error loading tokenizer: {e}")
    print("🔄 Trying alternative approach...")
    tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False, trust_remote_code=False)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Loading base model (4-bit optimized for Colab Pro)...")
try:
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        device_map="auto",
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        torch_dtype=torch.float16,
    )
    print("✅ Model loaded successfully")
except Exception as e:
    print(f"⚠️  Error with 4-bit quantization: {e}")
    print("🔄 Trying without quantization...")
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        device_map="auto",
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
    )
    print("✅ Model loaded without quantization")

print(f"Model device: {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")


Loading tokenizer...
✅ Tokenizer loaded successfully
Loading base model (4-bit optimized for Colab Pro)...
⚠️  Error with 4-bit quantization: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`
🔄 Trying without quantization...
✅ Model loaded without quantization
Model device: cuda:0
Model dtype: torch.float16


In [None]:
from peft import prepare_model_for_kbit_training

# disable cache for gradient checkpointing
model.config.use_cache = False

# enable gradient checkpointing
model.gradient_checkpointing_enable()

# IMPORTANT: prepare for k-bit training (sets up norms, cast, etc.)
model = prepare_model_for_kbit_training(model)

# ensure inputs carry grads for checkpointing
if hasattr(model, "enable_input_require_grads"):
    model.enable_input_require_grads()

In [48]:
# LoRA configuration compatible with GPT-2/DialoGPT modules
from peft import LoraConfig, get_peft_model, TaskType

# For GPT-2/DialoGPT, target modules typically are c_attn (QKV), c_fc and c_proj (MLP)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["c_attn", "c_fc", "c_proj"],
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("✅ LoRA configuration applied for GPT-2/DialoGPT modules")


trainable params: 6,291,456 || all params: 361,114,624 || trainable%: 1.7422
✅ LoRA configuration applied for GPT-2/DialoGPT modules




In [49]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    bf16=True,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=100,
    save_total_limit=3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    gradient_checkpointing=True,
    dataloader_pin_memory=False,
    remove_unused_columns=False,
    report_to=["wandb"] if os.environ.get('WANDB_API_KEY') else [],
    seed=42,
    # Colab Pro optimizations
    dataloader_num_workers=2,
    save_safetensors=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

print("✅ Training arguments configured !!")


✅ Training arguments configured !!


In [50]:
# Custom data collator for pre-padded sequences
# Since we already padded during tokenization, we just need to stack tensors
def custom_collate_fn(batch):
    """Custom collate function for pre-padded sequences"""
    # Extract the fields we need
    input_ids = torch.stack([torch.tensor(item["input_ids"]) for item in batch])
    attention_mask = torch.stack([torch.tensor(item["attention_mask"]) for item in batch])
    labels = torch.stack([torch.tensor(item["labels"]) for item in batch])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Use our custom collator
data_collator = custom_collate_fn

print("✅ Custom data collator for pre-padded sequences configured")



✅ Custom data collator for pre-padded sequences configured


In [51]:
# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
    pad_to_multiple_of=8,  # Optimize for GPU
)

print("✅ Data collator configured")


✅ Data collator configured


In [52]:
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("✅ Trainer configured")
print(f"Training examples: {len(train_ds)}")
print(f"Validation examples: {len(val_ds)}")
print(f"Total training steps: {len(train_ds) // training_args.per_device_train_batch_size // training_args.gradient_accumulation_steps * training_args.num_train_epochs}")


  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


✅ Trainer configured
Training examples: 150
Validation examples: 50
Total training steps: 27


In [53]:
# Start training
print("🚀 Starting training...")
print(f"Training on: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

# Train the model
trainer.train()

print("✅ Training completed!")
print(f"Model saved to: {training_args.output_dir}")


🚀 Starting training...
Training on: NVIDIA A100-SXM4-40GB
Batch size: 2
Gradient accumulation: 8
Effective batch size: 16


Step,Training Loss,Validation Loss


✅ Training completed!
Model saved to: ./outputs


In [54]:
# Save the final model
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)

print("✅ Model and tokenizer saved")
print(f"Saved to: {training_args.output_dir}")

# Save to Google Drive (optional)
try:
    from google.colab import drive
    drive.mount('/content/drive')

    # Copy to Drive
    import shutil
    drive_path = f"/content/drive/MyDrive/Colab Notebooks/finetuned_model_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    shutil.copytree(training_args.output_dir, drive_path)
    print(f"✅ Model also saved to Google Drive: {drive_path}")
except:
    print("⚠️  Google Drive not available, model saved locally only")


✅ Model and tokenizer saved
Saved to: ./outputs
Mounted at /content/drive
✅ Model also saved to Google Drive: /content/drive/MyDrive/Colab Notebooks/finetuned_model_20251028_123003


In [55]:
# Evaluate the model
print("📊 Evaluating model...")

# Load the best model
best_model_path = f"{training_args.output_dir}/checkpoint-best"
if os.path.exists(best_model_path):
    model = PeftModel.from_pretrained(model, best_model_path)
    print("✅ Loaded best checkpoint")
else:
    print("⚠️  Best checkpoint not found, using final model")

# Run evaluation
eval_results = trainer.evaluate()
print(f"\n📈 Evaluation Results:")
for key, value in eval_results.items():
    print(f"   {key}: {value:.4f}")

print("\n✅ Evaluation completed!")


📊 Evaluating model...
⚠️  Best checkpoint not found, using final model



📈 Evaluation Results:
   eval_loss: 5.8997
   eval_runtime: 1.5263
   eval_samples_per_second: 32.7600
   eval_steps_per_second: 16.3800
   epoch: 3.0000

✅ Evaluation completed!


In [56]:
# Test inference on sample data
print("🧪 Testing inference...")

def test_inference(model, tokenizer, test_item):
    """Test inference on a single item"""
    prompt = f"<|system|>\nYou are a retail price estimator. Predict the most likely new retail price in USD.\n<|user|>\n{test_item.title}\n{test_item.description}\n<|assistant|>\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=20,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Test on a few examples
for i, item in enumerate(test[:3]):
    print(f"\n--- Test {i+1} ---")
    print(f"Item: {item.title}")
    print(f"Actual Price: ${item.price:.2f}")

    try:
        response = test_inference(model, tokenizer, item)
        print(f"Model Response: {response}")
    except Exception as e:
        print(f"Error: {e}")

print("\n✅ Inference testing completed!")


🧪 Testing inference...

--- Test 1 ---
Item: MyCableMart 3.5mm Plug/Jack, 4 Conductor TRRS, Self Solder, Male
Actual Price: $25.00
Model Response: <|system|>
You are a retail price estimator. Predict the most likely new retail price in USD.
<|user|>
MyCableMart 3.5mm Plug/Jack, 4 Conductor TRRS, Self Solder, Male
Connects stereo audio & microphone devices requiring 4 conductors (left and right audio and microphone plus ground). This connector MAY also be suitable for left/right audio 1 video (composite) and ground. Great for making your own 3.5mm 4 conductor Cables or for repairing existing cables. Wire terminals are attached using solder (not included).Features 3.5mm 4 conductor (3 band) plug 3.5mm 4 conductor (3 band) plug Nickel Plated Nickel Plated Strain relief Strain relief Outer Dimensions (at PVC outer molding) Outer Dimensions (at PVC outer molding) Outer Dimensions (with PVC outer molding
<|assistant|>
input.5, 3.00,,5,2,2,2,2,2

--- Test 2 ---
Item: OtterBox + Pop Symmetry S

In [None]:
# Fixed evaluation with price range constraints and better post-processing
import numpy as np
import matplotlib.pyplot as plt
import re
import torch

def extract_price_safe(text: str) -> float:
    """Extract price with safety constraints"""
    if not text:
        return 0.0
    
    # Clean the text
    text = str(text).replace("$", "").replace(",", "").strip()
    
    # Look for price patterns
    patterns = [
        r'\$?(\d+\.?\d*)\s*(?:dollars?|USD|usd)?',  # $123.45 or 123.45 dollars
        r'(\d+\.?\d*)',  # Just numbers
    ]
    
    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            try:
                price = float(matches[0])
                # Apply reasonable price constraints
                if 0.01 <= price <= 100000:  # Between 1 cent and $100k
                    return price
            except ValueError:
                continue
    
    return 0.0

def build_pricing_prompt_fixed(item) -> str:
    """Build prompt with explicit price range guidance"""
    return (
        "<|system|>\n"
        "You are a retail price estimator. Predict the most likely new retail price in USD. "
        "Typical prices range from $1 to $10,000. Be realistic and conservative.\n"
        "<|user|>\n"
        f"Product: {item.title}\n"
        f"Description: {item.description}\n"
        f"Category: {getattr(item, 'category', 'Unknown')}\n"
        "What is the retail price?\n"
        "<|assistant|>\n"
        "The retail price is $"
    )

@torch.no_grad()
def predict_price_fixed(model, tokenizer, item, max_new_tokens=15) -> float:
    """Predict price with better constraints"""
    prompt = build_pricing_prompt_fixed(item)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Generate with more conservative settings
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.3,  # Lower temperature for more conservative predictions
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1,  # Reduce repetition
        no_repeat_ngram_size=2,
    )
    
    # Decode only the new tokens
    prompt_length = len(tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True))
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    new_text = full_response[prompt_length:]
    
    # Extract price with constraints
    price = extract_price_safe(new_text)
    
    # Additional safety: if price is still unreasonable, use a fallback
    if price > 50000:  # If over $50k, it's probably wrong
        # Try to extract a more reasonable number
        numbers = re.findall(r'\d+\.?\d*', new_text)
        if numbers:
            try:
                # Take the first reasonable number
                for num in numbers:
                    candidate = float(num)
                    if 1 <= candidate <= 10000:
                        return candidate
            except ValueError:
                pass
        return 0.0
    
    return price

def evaluate_model_fixed(model, tokenizer, test_items, limit=None, title="Fixed Fine-tuned Model"):
    """Evaluate with fixed price extraction"""
    if not test_items:
        print("⚠️ No test items available.")
        return {"mae": None, "rmse": None, "mape": None}
    
    items = test_items[:limit] if limit else test_items
    print(f"🔍 Evaluating on {len(items)} items...")
    
    y_true, y_pred = [], []
    errors = []
    
    for i, item in enumerate(items):
        try:
            pred = predict_price_fixed(model, tokenizer, item)
            true_price = float(getattr(item, "price", 0.0))
            
            y_true.append(true_price)
            y_pred.append(pred)
            
            # Track individual errors for debugging
            error = abs(pred - true_price)
            errors.append({
                'item': i,
                'title': getattr(item, 'title', 'Unknown')[:50],
                'true': true_price,
                'pred': pred,
                'error': error
            })
            
        except Exception as e:
            print(f"Error on item {i}: {e}")
            y_true.append(0.0)
            y_pred.append(0.0)
    
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    
    # Calculate metrics
    mae = float(np.mean(np.abs(y_pred - y_true)))
    rmse = float(np.sqrt(np.mean((y_pred - y_true) ** 2)))
    
    # MAPE (avoid division by zero)
    mape = float(np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 1.0)))) * 100
    
    # Hits within 15% tolerance
    tolerance = 0.15
    hits = float(np.mean(np.abs(y_pred - y_true) <= (tolerance * np.maximum(y_true, 1.0)))) * 100
    
    # Create scatter plot
    plt.figure(figsize=(8, 6))
    plt.scatter(y_true, y_pred, alpha=0.7, s=30, c='blue')
    
    # Add diagonal line
    max_val = max(y_true.max() if y_true.size else 0, y_pred.max() if y_pred.size else 0, 1)
    plt.plot([0, max_val], [0, max_val], 'r--', alpha=0.8, label='Perfect Prediction')
    
    plt.xlabel('True Price ($)')
    plt.ylabel('Predicted Price ($)')
    plt.title(f'{title}\nMAE=${mae:.2f} RMSE=${rmse:.2f} MAPE={mape:.1f}% Hits={hits:.1f}%')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    # Show worst predictions
    errors.sort(key=lambda x: x['error'], reverse=True)
    print(f"\n🔍 Top 5 Worst Predictions:")
    for i, err in enumerate(errors[:5]):
        print(f"  {i+1}. {err['title']}...")
        print(f"     True: ${err['true']:.2f}, Pred: ${err['pred']:.2f}, Error: ${err['error']:.2f}")
    
    return {
        "mae": mae,
        "rmse": rmse, 
        "mape": mape,
        "hits_pct": hits,
        "y_true": y_true,
        "y_pred": y_pred,
        "errors": errors
    }

# Test the fixed evaluation
print("🧪 Testing fixed price prediction...")
results = evaluate_model_fixed(model, tokenizer, test, limit=20, title="Fixed Fine-tuned Model")
