## Predict Product Prices

### Week 7 Day 2

Selecting our model and evaluating the base model against the task.

Keep in mind: our base model has 8 billion params, quantized down to 4 bits
Compared with GPT-4o at TRILLIONS of params!

# IMPORTANT please read me!!

When you run the pip installs below, you may get an error from pip complaining about an incompatible version of fsspec.

You should ignore that error! The version of fsspec is the right version, needed by HuggingFace.

If you ask ChatGPT, it will encourage you to pip install a more recent version of fsspec. But that would be problematic; HuggingFace will fail to load the dataset later with an obscure error about file systems.

So please run the pip installs as they appear below, and look the other way if you get an error!


In [None]:
# pip installs - ignore the error message!

#!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
#!pip install -q --upgrade requests==2.32.3 bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 datasets==3.2.0 peft==0.14.0 trl==0.14.0 matplotlib

In [None]:
# imports

import os
import re
import math
from tqdm import tqdm
#from google.colab import userdata
from huggingface_hub import login
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed
from peft import LoraConfig, PeftModel
from datasets import load_dataset, Dataset, DatasetDict
from datetime import datetime
import matplotlib.pyplot as plt

In [None]:
# Candidate base models (Hugging Face repo ids); their tokenizers are compared further down

LLAMA_3_1 = "meta-llama/Meta-Llama-3.1-8B"
QWEN_2_5 = "Qwen/Qwen2.5-7B"
GEMMA_2 = "google/gemma-2-9b"
PHI_3 = "microsoft/Phi-3-medium-4k-instruct"

# Constants

BASE_MODEL = LLAMA_3_1  # the model whose tokenizer and weights are loaded for evaluation
HF_USER = "ed-donner"  # Hugging Face user name used to build the dataset id
DATASET_NAME = f"{HF_USER}/pricer-data"
MAX_SEQUENCE_LENGTH = 182  # NOTE(review): not referenced in the visible cells; presumably a token budget - confirm
QUANT_4_BIT = True  # True selects the 4-bit quantization config, False the 8-bit one

# Used for writing to output in color
# (ANSI escape sequences; RESET switches back to the default color)

GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"
COLOR_MAP = {"red":RED, "orange": YELLOW, "green": GREEN}  # keyed by the color names Tester.color_for returns

%matplotlib inline

### Log in to HuggingFace

If you don't already have a HuggingFace account, visit https://huggingface.co to sign up and create a token.

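If you are running in Google Colab, open the Secrets for this Notebook by clicking on the key icon on the left, and add a new secret called `HF_TOKEN` whose value is your token. Note that the login cell below reads the token from the `HF_TOKEN` environment variable (calling `load_dotenv()` first, so a local `.env` file containing a line such as `HF_TOKEN=hf_...` works too), so make sure `HF_TOKEN` is defined there before you run it.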

In [None]:
# Log in to HuggingFace
# The token comes from the HF_TOKEN environment variable. load_dotenv() runs
# first so that (presumably) a local .env file can supply that variable.
import os
from dotenv import load_dotenv

load_dotenv()
hf_token = os.getenv('HF_TOKEN')  # None when HF_TOKEN is not defined
login(hf_token, add_to_git_credential=True)

In [None]:
def investigate_tokenizer(model_name):
    """Print the token ids a model's tokenizer produces for a few sample numbers.

    Loads the tokenizer for `model_name` and encodes the numbers
    0, 1, 10, 100, 999 and 1000 (as strings, without special tokens),
    printing one line per number.
    """
    print("Investigating tokenizer for", model_name)
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    sample_numbers = [0, 1, 10, 100, 999, 1000]
    for value in sample_numbers:
        text = str(value)
        token_ids = tok.encode(text, add_special_tokens=False)
        print(f"The tokens for {value}: {token_ids}")

In [None]:
# Now we will try this with each model: LLAMA_3_1, QWEN_2_5, GEMMA_2, PHI_3

investigate_tokenizer(PHI_3)

# Load our data

We uploaded it to Hugging Face, so it's easy to retrieve it now

In [None]:
# Retrieve the dataset by name and keep handles on its train and test splits
dataset = load_dataset(DATASET_NAME)
train, test = dataset["train"], dataset["test"]

In [None]:
test[0]

# Prepare our Base Llama Model for evaluation

Load our base model with 4 bit quantization and try out 1 example

In [None]:
## pick the right quantization

if QUANT_4_BIT:
    # 4-bit NF4 weights with double quantization; computation runs in bfloat16
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    )
else:
    # 8-bit weights. BitsAndBytesConfig only defines compute-dtype options for
    # the 4-bit path (bnb_4bit_*); there is no 8-bit equivalent to set here.
    quant_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Load the Tokenizer and the Model

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token, and pad on the right
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the base model with the quantization settings chosen earlier;
# device_map="auto" leaves device placement to the library
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto",
)
# Keep the generation config's pad token id in step with the tokenizer's
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Print the reported memory footprint, scaled by 1e9 and labelled as GB
print(f"Memory footprint: {base_model.get_memory_footprint() / 1e9:.1f} GB")

In [None]:
def extract_price(s):
    """Return the number that follows the first "Price is $" in `s`, as a float.

    Only the text between the first and second occurrence of "Price is $"
    (or the end of the string) is inspected. Commas and dollar signs are
    stripped from it before searching, so "1,234.56" parses as 1234.56.

    Args:
        s: Text that may contain a "Price is $<number>" fragment.

    Returns:
        The first number found after the marker, or 0.0 when the marker is
        absent or no number follows it. The result is always a float.
    """
    if "Price is $" not in s:
        return 0.0
    contents = s.split("Price is $")[1]
    contents = contents.replace(',', '').replace('$', '')
    # First alternative: optionally signed decimal such as ".5" or "12.50";
    # second alternative: a plain run of digits.
    match = re.search(r"[-+]?\d*\.\d+|\d+", contents)
    return float(match.group()) if match else 0.0

In [None]:
extract_price("Price is $999 blah blah so cheap")

In [None]:
def model_predict(prompt):
    """Generate a price for `prompt` with the base model and return it as a number.

    Uses the module-level `tokenizer` and `base_model`. The prompt is
    tokenized, at most 4 new tokens are generated, the first output sequence
    is decoded back to text, and `extract_price` parses that text. The result
    is 0 when `extract_price` finds no price.
    """
    # Re-seed on every call so repeated calls start from the same random state
    set_seed(42)
    inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    # All ones: a single unpadded prompt, so every token is attended to
    attention_mask = torch.ones(inputs.shape, device="cuda")
    outputs = base_model.generate(inputs, max_new_tokens=4, attention_mask=attention_mask, num_return_sequences=1)
    # Only one sequence is requested, so take the first entry of the batch
    response = tokenizer.decode(outputs[0])
    return extract_price(response)

### 📖 Understanding the `model_predict` Function Line by Line

Let's break down how this function takes a text prompt and predicts a price:

```python
def model_predict(prompt):
```
**Purpose**: Takes a product description as input and returns a predicted price as a float.

---

```python
    set_seed(42)
```
**Line 1**: Sets the random seed to 42 for reproducibility.
- Ensures the model generates the **same output** every time for the same input
- Important for testing and evaluation to get consistent results
- Without this, the model's randomness would give slightly different answers each run

---

```python
    inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
```
**Line 2**: Converts the text prompt into tokens and moves them to GPU.
- `tokenizer.encode(prompt)` - Breaks text into token IDs (e.g., "The" → 123, "product" → 456)
- `return_tensors="pt"` - Returns a PyTorch tensor instead of a list
- `.to("cuda")` - Moves the tensor to GPU memory for fast computation
- **Result**: A tensor of shape `[1, sequence_length]` containing token IDs

---

```python
    attention_mask = torch.ones(inputs.shape, device="cuda")
```
**Line 3**: Creates an attention mask (all 1s) matching the input shape.
- Attention masks tell the model which tokens to pay attention to (1) vs ignore (0)
- `torch.ones(inputs.shape)` - Creates a tensor of all 1s with same shape as inputs
- All 1s means "pay attention to every token" (no padding to ignore)
- `device="cuda"` - Creates it directly on GPU to match inputs
- **Why needed**: Prevents the model from attending to padding tokens in batched processing

---

```python
    outputs = base_model.generate(inputs, max_new_tokens=4, attention_mask=attention_mask, num_return_sequences=1)
```
**Line 4**: The actual model inference - generates the predicted price!
- `base_model.generate()` - Uses the model to generate new tokens autoregressively
- `inputs` - The tokenized prompt (starting point for generation)
- `max_new_tokens=4` - Generate at most 4 new tokens (enough for prices like "999" or "1234")
- `attention_mask=attention_mask` - Tells model which input tokens are real vs padding
- `num_return_sequences=1` - Generate only 1 prediction (not multiple alternatives)
- **Result**: A tensor containing the original input tokens + newly generated tokens

---

```python
    response = tokenizer.decode(outputs[0])
```
**Line 5**: Converts the generated tokens back into readable text.
- `outputs[0]` - Gets the first (and only) generated sequence from the batch
- `tokenizer.decode()` - Converts token IDs back to text (e.g., [123, 456] → "The product")
- **Result**: A string like "Product: XYZ\n\nPrice is $999" (full prompt + generated price)

---

```python
    return extract_price(response)
```
**Line 6**: Extracts just the numeric price value from the text response.
- Calls the `extract_price()` helper function (defined above)
- Uses regex to find the number after "Price is $"
- Handles formatting (removes commas, dollar signs)
- **Returns**: A float like `999.0` or `1234.56`

---

### 🔄 Complete Flow Example

**Input**: `"Product: Laptop\nFeatures: 16GB RAM\n\nPrice is $"`

1. **Tokenize**: → `[123, 456, 789, ...]` (token IDs)
2. **Attention Mask**: → `[1, 1, 1, ...]` (attend to all)
3. **Generate**: → Model predicts the next tokens (at most 4), e.g. `[1, 2, 9, 9]` → "1299" (illustrative IDs only; a real token may cover several digits at once)
4. **Decode**: → `"Product: Laptop\nFeatures: 16GB RAM\n\nPrice is $1299"`
5. **Extract**: → `1299.0` (float)

**Final Output**: `1299.0`

---

### 💡 Key Design Decisions

**Why max_new_tokens=4?**
- Most product prices are 2-4 digits ($10 to $9999)
- Keeps generation fast and focused
- Prevents the model from rambling

**Why set_seed(42)?**
- Makes testing reproducible
- Same input always gives same output
- Critical for evaluation metrics

**Why attention_mask with all 1s?**
- No padding in single predictions
- But required parameter for batch processing
- Good practice to always include it

In [None]:
model_predict(test[0]['text'])

# Evaluation!

Trying out our base Llama 3.1 model against the Test dataset

In [None]:
class Tester:
    """Run a price predictor over the first `size` datapoints and report accuracy.

    Each datapoint must be indexable with "text" (the prompt handed to the
    predictor) and "price" (the ground truth). For every datapoint the guess,
    truth, absolute error, squared log error and a traffic-light color are
    recorded and a colored line is printed. `report` then charts guesses
    against truths with the average error, RMSLE and the share of "green"
    results in the chart title.

    Relies on the module-level names `math`, `plt`, `COLOR_MAP` and `RESET`.
    """

    def __init__(self, predictor, data, title=None, size=250):
        """Store the predictor and data and prepare empty result lists.

        Args:
            predictor: Callable taking the prompt text and returning a number.
            data: Indexable, sized collection of datapoints.
            title: Chart title prefix; defaults to a prettified predictor name.
            size: Maximum number of datapoints to evaluate, starting at index 0.
        """
        self.predictor = predictor
        self.data = data
        # Callables such as functools.partial objects have no __name__,
        # so fall back to the name of their type.
        name = getattr(predictor, "__name__", type(predictor).__name__)
        self.title = title or name.replace("_", " ").title()
        # Never ask for more datapoints than the dataset holds.
        self.size = min(size, len(data))
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Classify an error as "green", "orange" or "red".

        Green: error under 40 or under 20% of the truth. Orange: error under
        80 or under 40% of the truth. Red otherwise. When `truth` is 0 the
        relative error is undefined, so only the absolute thresholds apply.
        """
        relative = error / truth if truth else math.inf
        if error < 40 or relative < 0.2:
            return "green"
        if error < 80 or relative < 0.4:
            return "orange"
        return "red"

    def run_datapoint(self, i):
        """Evaluate datapoint `i`, record its metrics and print a colored summary line."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint["text"])
        truth = datapoint["price"]
        error = abs(guess - truth)
        # log(guess + 1) is undefined for guesses at or below -1; clamp the
        # guess at 0 for the log term so one bad prediction cannot abort a run.
        log_error = math.log(truth + 1) - math.log(max(guess, 0) + 1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        # Label the row with the second blank-line-separated section of the
        # prompt, or the first when the prompt has only one section.
        sections = datapoint["text"].split("\n\n")
        title = (sections[1] if len(sections) > 1 else sections[0])[:20] + "..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Scatter guesses against truths, with the y = x line as a reference."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Summarize the recorded results in the chart title and draw the chart.

        Raises:
            ValueError: If no datapoint has been evaluated yet.
        """
        # Average over what was actually evaluated rather than the requested size.
        count = len(self.errors)
        if count == 0:
            raise ValueError("No datapoints have been evaluated; nothing to report")
        average_error = sum(self.errors) / count
        rmsle = math.sqrt(sum(self.sles) / count)
        hits = sum(1 for color in self.colors if color == "green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/count*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate datapoints 0..size-1 in order, then produce the report."""
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data):
        """Convenience entry point: build a Tester with default settings and run it."""
        cls(function, data).run()

In [None]:
Tester.test(model_predict, test)