### Note: If an `ImportError` occurs, it is probably caused by the installed `huggingface-hub` version. Pinning it as follows may resolve it:
> pip install huggingface-hub==0.25.0


### Reference: https://medium.com/@hakeemsyd/how-to-fine-tune-your-llama-3-2-model-49a6f8c7621a


### Reference: https://www.datacamp.com/tutorial/fine-tuning-llama-3-2

# Inference

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format
# Reload tokenizer and model
model_id = "meta-llama/Llama-3.2-1B-Instruct"
base_model_url= "sqvareinch/llama-3.2-1b-ros-agent"

#Quantization
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

#configuration for quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)


#load tokenizer from our hf hub
tokenizer = AutoTokenizer.from_pretrained(base_model_url)

#load skeleton model from llama
base_model_reload= AutoModelForCausalLM.from_pretrained(
    model_id,
    #low_cpu_mem_usage=True,
    #return_dict=True,
    quantization_config=bnb_config)
#resize model to fit our tokenizer (ours)
base_model_reload.resize_token_embeddings(len(tokenizer))

#overwrite skeleton with ours
model = PeftModel.from_pretrained(model=base_model_reload, model_id=base_model_url)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [4]:
# Conversation to send to the fine-tuned model.
messages = [{"role": "system", "content": "You are a helpful agent"},
    {"role": "user", "content": "What is the size of each gradient period in the costmap"}]

# Render the conversation with the tokenizer's chat template; the generation
# prompt is appended so the model continues with the assistant's turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Place the inputs on whatever device the model lives on instead of hard-coding "cuda".
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# Full transcript (prompt + completion), kept for inspection.
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Decode only the tokens generated after the prompt. Splitting the transcript
# on the word "assistant" breaks whenever the prompt or the answer contains
# that word; slicing by prompt length does not.
prompt_length = inputs["input_ids"].shape[-1]
reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(reply.strip())

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.




Costmap gradient period size: 1.0


In [8]:
# Conversation to send to the fine-tuned model.
messages = [{"role": "system", "content": "You are a helpful agent"},
    {"role": "user", "content": "What is ROS?"}]

# Render the conversation with the tokenizer's chat template; the generation
# prompt is appended so the model continues with the assistant's turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Place the inputs on whatever device the model lives on instead of hard-coding "cuda".
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=500, num_return_sequences=1)

# Full transcript (prompt + completion), kept for inspection.
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Decode only the tokens generated after the prompt. Splitting the transcript
# on the word "assistant" breaks whenever the prompt or the answer contains
# that word; slicing by prompt length does not.
prompt_length = inputs["input_ids"].shape[-1]
reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(reply.strip())

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.




ROS (Robot Operating System) is a set of libraries and tools for building robot applications.
