### Notes: If ImportError occurs, it's probably due to the huggingface-hub. 
> pip install huggingface-hub==0.25.0

## Import

In [1]:
import torch
from transformers import pipeline

from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, TextStreamer, pipeline, BitsAndBytesConfig, AutoModelForCausalLM
import os

DEVICE = "cuda:0" if torch.cuda.is_available() else "CPU"
DEVICE = "cpu"
HUGGING_FACE_TOKEN = os.environ.get('HUGGING_FACE_TOKEN') #in terminal: export HUGGING_FACE_TOKEN="YOUR_TOKEN"


## Load Model

In [2]:
'''
Possible Models:
- meta-llama/Llama-3.2-1B-Instruct
- meta-llama/Llama-3.2-3B-Instruct
- meta-llama/Llama-3.2-11B-Vision-Instruct
'''
model_id = "meta-llama/Llama-3.2-3B-Instruct" 

# Quanitisize your model dtype (for sparsity)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Set token using ENV variable
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HUGGING_FACE_TOKEN)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=HUGGING_FACE_TOKEN,
    quantization_config=bnb_config
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
prompt = f"HI!"
inputs = tokenizer(prompt, return_tensors='pt', truncation=True)
inputs = inputs.to('cpu')  # Ensure inputs are on CPU

output = model.generate(
    **inputs,
    max_new_tokens=50,
    num_beams=1,
    do_sample=False,
    temperature=1
)

answer = tokenizer.decode(output[0], skip_special_tokens=True)

print(answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


HI! I'm so excited to share my latest project with you all! I've been working on a new line of eco-friendly, cruelty-free, and vegan-friendly makeup products, and I just can't wait to show you all the gorgeous shades and formulas I


### Knowledge Bases

In [4]:
#pdf base
pdf_base_path = '/home/dongkyu/RAG/knowledge'
loader = PyPDFDirectoryLoader(pdf_base_path) #The Knowledge Base Folder
pdf_docs = loader.load()
len(pdf_docs) 

13

In [5]:
#knowledge base
#knowledge_base_path= "/home/dongkyu/ros2-rag-project/notebooks/scraped_docs"
knowledge_base_path= "/home/dongkyu/ros2-rag-project/exported_docs/ros2"
txt_loader = DirectoryLoader(knowledge_base_path, glob="**/*.txt")
docs = txt_loader.load()
len(docs)

188

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(docs)
len(texts)


2856

In [7]:
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    model_kwargs={"device": DEVICE}
)

db = Chroma.from_documents(texts, embeddings, persist_directory="db")


load INSTRUCTOR_Transformer
max_seq_length  512


### RAG

In [8]:
DEFAULT_SYSTEM_PROMPT = """
Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
respond with "I don't know" or a similar acknowledgment that the answer is not available.
""".strip()


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f"""
[INST] <<SYS>>
{system_prompt}
<</SYS>>

{prompt} [/INST]
""".strip()

SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines.?"

template = generate_prompt(
    """
{context}

Question: {question}
""",
    system_prompt=SYSTEM_PROMPT,
)

prompt = PromptTemplate(template=template, input_variables=["context", "question"])


In [9]:
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    temperature=0.1,
    top_p=0.95,
    repetition_penalty=1.15,
    streamer=streamer,
)

llm = HuggingFacePipeline(pipeline=text_pipeline)

  llm = HuggingFacePipeline(pipeline=text_pipeline)


In [10]:
ask = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=False,
    chain_type_kwargs={"prompt": prompt},
)

result = ask("Write a sample code for Bouncy Bolson")

  result = ask("Write a sample code for Bouncy Bolson")
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 

```csharp
using System;
using System.Collections.Generic;

public class Node {
    public int Value { get; set; }
    public List<Node> Next { get; set; }

    public Node(int value) {
        Value = value;
        Next = new List<Node>();
    }
}

public class BouncyBolson {
    private Node head;

    public void AddNode(int value) {
        // implementation here
    }

    public void PrintList() {
        // implementation here
    }
}
```

Here is the complete code with all functions implemented.

```csharp
using System;
using System.Collections.Generic;

public class Node {
    public int Value { get; set; }
    public List<Node> Next { get; set; }

    public Node(int value) {
        Value = value;
        Next = new List<Node>();
    }
}

public class BouncyBolson {
    private Node head;

    public BouncyBolson() {
        head = null;
    }

    public void AddNode(int value) {
        if (head == null)
            head = new Node(value);
        else {
            Nod