<a href="https://colab.research.google.com/github/mightyoctopus/amazon-pricer-model-open-source-fine-tuned-models/blob/main/w7_d2_pricer_base_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# pip installs - ignore the error message!
# NOTE(review): presumably the message is a dependency-conflict report against
# packages already present in the runtime - confirm before relying on this.

# torch / torchvision / torchaudio pinned to their cu124 builds from the PyTorch wheel index.
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
# Remaining libraries are version-pinned; matplotlib is the only one left unpinned.
!pip install -q --upgrade requests==2.32.3 bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 datasets==3.2.0 peft==0.14.0 trl==0.14.0 matplotlib

In [2]:
import os
import re
import math

from google.colab import userdata
from huggingface_hub import login
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed
from peft import LoraConfig, PeftModel
from datasets import load_dataset, Dataset,DatasetDict

import matplotlib.pyplot as plt
from tqdm import tqdm

from datetime import datetime

In [3]:
### Model identifiers
# Repo ids passed to `from_pretrained`; each one names both a tokenizer and a model.

LLAMA_3_1 = "meta-llama/Meta-Llama-3.1-8B"
QWEN_2_5 = "Qwen/Qwen2.5-7B"
GEMMA_2 = "google/gemma-2-9b"
PHI_3 = "microsoft/Phi-3-medium-4k-instruct"


### Constants

# Model that is loaded, quantized and evaluated further down.
BASE_MODEL = LLAMA_3_1
HF_USER = "MightyOctopus"
# Dataset repo id handed to `load_dataset`.
DATASET_NAME = f"{HF_USER}/amazon-pricer-dataset-v2-0"
# NOTE(review): not referenced by any cell visible here - presumably a token
# budget for a later fine-tuning step; confirm or remove.
MAX_SEQUENCE_LENGTH = 182
# True -> 4-bit NF4 quantization, False -> 8-bit (see the quantization cell).
QUANT_4_BIT= True


# Used for writing to output in color

# ANSI escape sequences; RESET restores the terminal's default colour.
GREEN = "\033[92m"
YELLOW = "\033[93m"
RED = "\033[91m"
RESET = "\033[0m"
# Keys are the labels returned by Tester.color_for; "orange" is printed with the yellow code.
COLOR_MAP = {"red": RED, "orange": YELLOW, "green": GREEN}

%matplotlib inline

In [4]:
# Read the Hugging Face token from the Colab `userdata` store (secret named "HF_TOKEN")
# rather than hard-coding it in the notebook.
hf_token = userdata.get("HF_TOKEN")
# Authenticate with the Hub; add_to_git_credential=True also asks for the token
# to be stored as a git credential.
login(hf_token, add_to_git_credential=True)

In [5]:
### Check how different models process tokens for given text -- price text, specifically

def investigate_tokenizer(model_name):
    """
    Print the token ids a model's tokenizer produces for a few sample integers.

    Args:
        model_name: repo id (or path) accepted by `AutoTokenizer.from_pretrained`.

    Returns:
        None. One header line is printed, then one line per sample number.
    """
    print("Investigating tokenizer for ", model_name)
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Values spanning one to four digits, to see where a number stops being a single token.
    sample_values = (0, 1, 10, 100, 999, 1000)
    for value in sample_values:
        # Special tokens are excluded so only the digits' own ids are shown.
        token_ids = tok.encode(str(value), add_special_tokens=False)
        print(f"Tokenized {value}: {token_ids}")


In [None]:
# Print how the Llama 3.1 tokenizer encodes the sample numbers.
investigate_tokenizer(LLAMA_3_1)

## Load the Amazon pricer data

Cleaned dataset for fine-tuning:

https://huggingface.co/datasets/MightyOctopus/amazon-pricer-dataset-v2-0

In [None]:
# Load the dataset named by DATASET_NAME and keep handles to its "train" and "test" splits.
dataset = load_dataset(DATASET_NAME)
train = dataset["train"]
test = dataset["test"]

In [None]:
# Inspect one test record (index 100 is the record later passed to model_predict).
test[100]

In [None]:
# Inspect the first training record.
train[0]

In [10]:
### Quantization selection:
# QUANT_4_BIT chooses between 4-bit and 8-bit bitsandbytes loading of the base model.

if QUANT_4_BIT:
    # 4-bit NF4 weights with double quantization; matmuls are computed in bfloat16.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    )
else:
    # BitsAndBytesConfig has no `bnb_8bit_compute_dtype` option (compute dtype is a
    # 4-bit-only setting); passing it only ended up in the config's unused kwargs,
    # so the 8-bit path is configured with `load_in_8bit` alone.
    quant_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
### Load the tokenizer and the model

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably the base tokenizer ships without a pad token - confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Weights are quantized on load according to quant_config; device_map="auto"
# leaves device placement to the library.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quant_config,
    device_map="auto"
)
# Keep generation's pad id in sync with the tokenizer's pad token set above.
base_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Report the loaded model's memory footprint in GB (bytes / 1e9).
print(f"Memory Footprint: {base_model.get_memory_footprint() / 1e9:,.1f} GB")

In [12]:
def extract_price(s):
    """
    Extract the numeric price that follows the marker "Price is $" in a string.

    Only the text between the first and second occurrence of the marker (or the
    end of the string) is searched. Commas and extra "$" signs are stripped
    before the first number in that text is parsed.

    Args:
        s: text to search, e.g. a decoded model response.

    Returns:
        float: the parsed price, or 0.0 when the marker is missing or no number
        follows it. The return type is always float.
    """
    marker = "Price is $"
    if marker not in s:
        return 0.0

    content = s.split(marker)[1]
    # "1,299" -> "1299"; also drop any stray "$" the text may repeat.
    content = content.replace(",", "").replace("$", "")

    # Alternation precedence: a leading sign is only captured together with a
    # decimal number ("-5.5"); a plain integer is matched without its sign.
    match = re.search(r"[-+]?\d*\.\d+|\d+", content)
    return float(match.group()) if match else 0.0


In [13]:

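# model_predict, step by step:
#   1. take the prompt string as its argument
#   2. tokenize the prompt into model inputs
#   3. build the attention mask for those inputs
#   4. let the model generate new tokens
#   5. decode the generated tokens back into text
#   6. return the price parsed from that text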
def model_predict(prompt):
    """
    Ask the base model to complete `prompt` and parse a price from the result.

    Args:
        prompt: text to complete. `extract_price` looks for "Price is $" in the
            decoded sequence, so the prompt is presumably expected to end with
            that marker - confirm against the dataset's "text" field.

    Returns:
        The value returned by `extract_price` for the decoded sequence
        (0 when no price could be parsed).
    """
    # Re-seed on every call so repeated calls start from the same RNG state.
    set_seed(42)

    # Follow the model's own device instead of assuming "cuda".
    device = base_model.device
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    # Single unpadded sequence: every position is attended to. ones_like keeps the
    # mask an integer tensor on the same device as the input ids.
    attention_mask = torch.ones_like(inputs)

    outputs = base_model.generate(
        inputs,
        max_new_tokens=4,  # only a few tokens are generated after the prompt
        attention_mask=attention_mask,
        num_return_sequences=1,
    )
    # Decode the whole returned sequence (not just the new tokens): the parser
    # needs the "Price is $" marker to locate the number.
    response = tokenizer.decode(outputs[0])

    return extract_price(response)

In [14]:
# Smoke-test the predictor on a single test record.
model_predict(test[100]["text"])

25.0

In [None]:
# Ground-truth price of the same record, for comparison with the prediction.
test[100]["price"]

## Evaluation

In [17]:
class Tester:
    """
    Run a price predictor over the first `size` records of a dataset and report
    how close its guesses are to the true prices.

    For each record the absolute error and squared log error are stored and a
    colour-coded line is printed. `report()` then shows a truth-vs-guess scatter
    plot whose title carries the mean absolute error, RMSLE and hit rate (share
    of "green" points).

    Args:
        predictor: callable taking a record's "text" and returning a numeric price.
        data: indexable collection of records with "text" and "price" keys.
        title: chart title prefix; defaults to a prettified predictor name.
        size: number of leading records to evaluate; capped at `len(data)` when
            the data has a length.
    """

    def __init__(self, predictor, data, title=None, size=250):
        self.predictor = predictor
        self.data = data
        # Callables such as functools.partial or instances with __call__ have no
        # __name__, so fall back to the type's name.
        predictor_name = getattr(predictor, "__name__", None) or type(predictor).__name__
        self.title = title or predictor_name.replace("_", " ").title()
        # Never ask for more records than the data holds (the default of 250
        # would otherwise raise IndexError on smaller datasets).
        try:
            available = len(data)
        except TypeError:
            available = size
        self.size = max(0, min(size, available))
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []

    def color_for(self, error, truth):
        """Grade a guess: "green", "orange" or "red" by absolute or relative error."""
        # Relative error is undefined for a zero truth; treat it as never
        # satisfying the relative thresholds instead of dividing by zero.
        relative = error / truth if truth else math.inf
        if error < 40 or relative < 0.2:
            return "green"
        if error < 80 or relative < 0.4:
            return "orange"
        return "red"

    def run_datapoint(self, i):
        """Predict record `i`, store its metrics and print a colour-coded summary line."""
        datapoint = self.data[i]
        guess = self.predictor(datapoint["text"])
        truth = datapoint["price"]
        error = abs(guess - truth)
        # log(x + 1) is undefined for x <= -1; a negative price guess is treated
        # as 0 for the log term only (the absolute error keeps the raw guess).
        log_error = math.log(truth + 1) - math.log(max(guess, 0) + 1)
        sle = log_error ** 2
        color = self.color_for(error, truth)
        # Label the line with the start of the text's second blank-line-separated
        # section, falling back to the first when there is only one.
        sections = datapoint["text"].split("\n\n")
        snippet = sections[1] if len(sections) > 1 else sections[0]
        title = snippet[:20] + "..."
        self.guesses.append(guess)
        self.truths.append(truth)
        self.errors.append(error)
        self.sles.append(sle)
        self.colors.append(color)
        print(f"{COLOR_MAP[color]}{i+1}: Guess: ${guess:,.2f} Truth: ${truth:,.2f} Error: ${error:,.2f} SLE: {sle:,.2f} Item: {title}{RESET}")

    def chart(self, title):
        """Scatter the guesses against the truths, with a y = x reference line."""
        plt.figure(figsize=(12, 8))
        max_val = max(max(self.truths), max(self.guesses))
        plt.plot([0, max_val], [0, max_val], color='deepskyblue', lw=2, alpha=0.6)
        plt.scatter(self.truths, self.guesses, s=3, c=self.colors)
        plt.xlabel('Ground Truth')
        plt.ylabel('Model Estimate')
        plt.xlim(0, max_val)
        plt.ylim(0, max_val)
        plt.title(title)
        plt.show()

    def report(self):
        """Summarize the stored results in a chart title and draw the chart."""
        # Average over what was actually evaluated rather than the requested size.
        count = len(self.errors)
        if count == 0:
            print(f"{self.title}: no datapoints were evaluated")
            return
        average_error = sum(self.errors) / count
        rmsle = math.sqrt(sum(self.sles) / count)
        hits = sum(1 for color in self.colors if color=="green")
        title = f"{self.title} Error=${average_error:,.2f} RMSLE={rmsle:,.2f} Hits={hits/count*100:.1f}%"
        self.chart(title)

    def run(self):
        """Evaluate the first `size` records, then report."""
        # Kept for backward compatibility; nothing in this class reads it.
        self.error = 0
        # Start from empty accumulators so calling run() twice does not mix runs.
        self.guesses = []
        self.truths = []
        self.errors = []
        self.sles = []
        self.colors = []
        for i in range(self.size):
            self.run_datapoint(i)
        self.report()

    @classmethod
    def test(cls, function, data, title=None, size=250):
        """Convenience entry point: build a Tester for `function` on `data` and run it."""
        cls(function, data, title=title, size=size).run()

In [None]:
# Evaluate the base model on the test split (Tester's default size is 250 records).
Tester.test(model_predict, test)