### Note: If an `ImportError` occurs, it is probably caused by an incompatible `huggingface-hub` version. Pin it with:
> pip install huggingface-hub==0.25.0


### Reference: https://medium.com/@hakeemsyd/how-to-fine-tune-your-llama-3-2-model-49a6f8c7621a


### Reference: https://www.datacamp.com/tutorial/fine-tuning-llama-3-2

# Inference

In [1]:
import torch

from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, TextStreamer, pipeline, BitsAndBytesConfig, AutoModelForCausalLM
from peft import PeftModel
import os
from trl import setup_chat_format

In [2]:

# Reload tokenizer and model
# Alternative 1B checkpoint pair, kept for reference (currently disabled):
#model_id = "meta-llama/Llama-3.2-1B-Instruct"
#base_model_url= "sqvareinch/llama-3.2-1b-ros-agent_v2"

# model_id: base checkpoint whose weights are loaded and quantized later in this cell.
# base_model_url: fine-tuned repository used later in this cell for the tokenizer and the PEFT adapter.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
base_model_url= "sqvareinch/llama-3.2-3b-ros-agent_v2"

#Quantization
# Set torch dtype and attention implementation
# When the CUDA compute capability major version is >= 8 the cell picks bfloat16
# and "flash_attention_2"; otherwise it falls back to float16 and "eager".
# NOTE(review): get_device_capability() is called without first checking
# torch.cuda.is_available() — presumably this raises on a CPU-only machine; confirm.
# NOTE(review): attn_implementation is assigned here but is not referenced again
# in the cells shown — confirm whether it was meant to be passed to from_pretrained.
if torch.cuda.get_device_capability()[0] >= 8:
    # IPython shell escape (not plain Python): quietly installs flash-attn, unpinned.
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

# 4-bit NF4 quantization with double quantization; the compute dtype is the
# torch_dtype selected earlier in this cell.
bnb_config = BitsAndBytesConfig(**{
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch_dtype,
    "bnb_4bit_use_double_quant": True,
})

# The tokenizer is taken from the fine-tuned repository on the Hugging Face hub.
tokenizer = AutoTokenizer.from_pretrained(base_model_url)

# Load the base Llama checkpoint with the quantization settings above.
base_model_reload = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)

# Resize the embedding table to the fine-tuned tokenizer's length before the
# adapter is attached.
base_model_reload.resize_token_embeddings(len(tokenizer))

# Attach the fine-tuned PEFT adapter on top of the quantized base model.
model = PeftModel.from_pretrained(base_model_reload, base_model_url)

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


adapter_config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/1.67G [00:00<?, ?B/s]

In [3]:
# Single-turn chat: one system instruction plus one user question.
messages = [
    {"role": "system", "content": "You are a helpful agent"},
    {"role": "user", "content": "What is the size of each gradient period in the costmap"},
]

# Render the conversation with the tokenizer's chat template as plain text,
# with the generation prompt appended so the model continues as the assistant.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# generate() returns the prompt tokens followed by the continuation. Decode
# only the continuation: locating the answer by splitting the decoded text on
# the word "assistant" cuts off any answer that itself contains that word and
# raises IndexError when the word is absent.
prompt_length = inputs["input_ids"].shape[-1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text.strip())

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.




0.2


In [4]:
# Single-turn chat: one system instruction plus one user question.
messages = [
    {"role": "system", "content": "You are a helpful agent"},
    {"role": "user", "content": "What is ROS?"},
]

# Render the conversation with the tokenizer's chat template as plain text,
# with the generation prompt appended so the model continues as the assistant.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=500, num_return_sequences=1)

# generate() returns the prompt tokens followed by the continuation. Decode
# only the continuation: locating the answer by splitting the decoded text on
# the word "assistant" cuts off any answer that itself contains that word and
# raises IndexError when the word is absent.
prompt_length = inputs["input_ids"].shape[-1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text.strip())

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.




ROS (Robot Operating System) is an open-source software framework that provides a set of libraries and tools to build robot applications.


In [5]:
# Single-turn chat: one system instruction plus one user question.
messages = [
    {"role": "system", "content": "You are a helpful agent"},
    {"role": "user", "content": "Tell me how can I navigate to a specific pose - include replanning aspects in your answer"},
]

# Render the conversation with the tokenizer's chat template as plain text,
# with the generation prompt appended so the model continues as the assistant.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=500, num_return_sequences=1)

# generate() returns the prompt tokens followed by the continuation. Decode
# only the continuation: locating the answer by splitting the decoded text on
# the word "assistant" cuts off any answer that itself contains that word and
# raises IndexError when the word is absent.
prompt_length = inputs["input_ids"].shape[-1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text.strip())

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.




You can navigate to a specific pose using the nav2_amcl plugin, and include replanning aspects by setting the replanning parameter to true.


In [6]:
# Single-turn chat: one system instruction plus one user question.
# NOTE(review): the question says "this task", but no earlier turns are
# included in `messages`, so the model receives no description of the task —
# confirm whether the previous exchange should be part of the conversation.
messages = [
    {"role": "system", "content": "You are a helpful agent"},
    {"role": "user", "content": "Can you provide me with code for this task"},
]

# Render the conversation with the tokenizer's chat template as plain text,
# with the generation prompt appended so the model continues as the assistant.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=500, num_return_sequences=1)

# generate() returns the prompt tokens followed by the continuation. Decode
# only the continuation: locating the answer by splitting the decoded text on
# the word "assistant" cuts off any answer that itself contains that word and
# raises IndexError when the word is absent.
prompt_length = inputs["input_ids"].shape[-1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text.strip())

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.




Use the new code in the task


# RAG

In [7]:
# Device string handed to the embedding model in the RAG section.
# Embeddings run on the CPU. To use the GPU when one is available, replace
# the assignment with:
#   DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
# Device strings must be lowercase; torch does not accept "CPU".
DEVICE = "cpu"

In [8]:
# Knowledge base: text files matching the glob "**/*.txt" under the directory.
# The directory can be overridden with the KNOWLEDGE_BASE_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the previous hard-coded location is used.
knowledge_base_path = os.environ.get("KNOWLEDGE_BASE_PATH", "/home/dongkyu/exported_docs")
txt_loader = DirectoryLoader(knowledge_base_path, glob="**/*.txt")
docs = txt_loader.load()
# Cell output: number of documents loaded.
len(docs)

442

In [9]:
# Break the loaded documents into chunks for embedding: chunk_size=512 with
# chunk_overlap=64 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=64,
    chunk_size=512,
)
texts = text_splitter.split_documents(docs)
# Cell output: number of chunks produced.
len(texts)


7787

In [None]:
# Instructor embedding model, placed on the device chosen in the DEVICE cell.
embeddings = HuggingFaceInstructEmbeddings(
    model_kwargs=dict(device=DEVICE),
    model_name="hkunlp/instructor-base",
)

# Embed every chunk and store the vectors in a Chroma collection that is
# persisted under the local "db" directory.
db = Chroma.from_documents(
    texts,
    embeddings,
    persist_directory="db",
)


load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
# System prompt used by generate_prompt() when the caller does not supply one.
DEFAULT_SYSTEM_PROMPT = """
Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
respond with "I don't know" or a similar acknowledgment that the answer is not available.
""".strip()


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    """Wrap a user prompt and a system prompt in [INST] / <<SYS>> markers.

    Args:
        prompt: The user-facing part of the prompt. It is inserted verbatim,
            so placeholders such as ``{context}`` survive for later formatting.
        system_prompt: Instruction text placed between ``<<SYS>>`` and
            ``<</SYS>>``. Defaults to ``DEFAULT_SYSTEM_PROMPT``.

    Returns:
        The assembled prompt string, starting with ``[INST] <<SYS>>`` and
        ending with ``[/INST]``.
    """
    # Built from explicit lines instead of an indented triple-quoted string so
    # that source indentation cannot leak into the prompt text.
    return (
        "[INST] <<SYS>>\n"
        f"{system_prompt}\n"
        "<</SYS>>\n"
        "\n"
        f"{prompt} [/INST]"
    )


# These definitions must live at module level: placed inside generate_prompt()
# after its return statement they never run, and `template` is then undefined
# when the PromptTemplate is built.
# NOTE(review): this prompt uses [INST] / <<SYS>> markers, whereas the
# inference cells use tokenizer.apply_chat_template — confirm this format is
# what the fine-tuned model expects.
SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines.?"

# Retrieval prompt with {context} and {question} placeholders left unfilled.
template = generate_prompt(
    """
{context}

Question: {question}
""",
    system_prompt=SYSTEM_PROMPT,
)

# Wrap the template string in a LangChain PromptTemplate that declares the
# "context" and "question" input variables.
# Note: this rebinds `prompt`, which the inference cells use for the rendered
# chat string.
prompt = PromptTemplate(template=template, input_variables=["context", "question"])


In [None]:
# Streamer passed to the pipeline below; configured to skip the prompt and
# special tokens in what it emits.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# The text-generation pipeline is constructed with the base model object and
# the adapter-wrapped model is swapped in afterwards (see the assignment to
# text_pipeline.model below).
# NOTE(review): temperature/top_p are set but do_sample is not — confirm
# whether sampling is actually enabled for this pipeline.
text_pipeline = pipeline(
    "text-generation",
    model=base_model_reload,
    tokenizer=tokenizer,
    max_new_tokens=500,
    temperature=1.0,
    top_p=0.95,
    repetition_penalty=1.15,
    streamer=streamer,
)
# Replace the pipeline's model with the PEFT-wrapped model loaded earlier.
text_pipeline.model = model #PeftModel.from_pretrained(model=base_model_reload, model_id=base_model_url)
# Expose the pipeline to LangChain as an LLM.
llm = HuggingFacePipeline(pipeline=text_pipeline)

In [None]:
# Retrieval-augmented QA chain: the retriever is asked for k=2 chunks, which
# the "stuff" chain type places into the custom prompt. Source documents are
# not returned.
ask = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(search_kwargs=dict(k=2)),
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=prompt),
    return_source_documents=False,
)

result = ask(
    "Tell me how can I navigate to a specific pose - include replanning aspects in your answer."
)

In [None]:
# Show only the text after the last '<</SYS>>' separator in the chain's answer.
result["result"].rsplit('<</SYS>>\n\n\n', 1)[-1]

In [None]:
# Retrieval-augmented QA chain: the retriever is asked for k=2 chunks, which
# the "stuff" chain type places into the custom prompt. Source documents are
# not returned.
ask = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(search_kwargs=dict(k=2)),
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=prompt),
    return_source_documents=False,
)

result = ask(
    "Can you provide me with code for this task?"
)

In [None]:
# Show only the text after the last '<</SYS>>' separator in the chain's answer.
result["result"].rsplit('<</SYS>>\n\n\n', 1)[-1]