# Create a RAG app

## Environment Setup

This step uses the following libraries:

| Library | License |
|-|-|
| [PyTorch](https://github.com/pytorch/pytorch) | BSD 3-Clause |
| [transformers](https://github.com/huggingface/transformers) | Apache 2.0 |
| [peft](https://github.com/huggingface/peft) | Apache 2.0 |
| [chromadb](https://github.com/chroma-core/chroma) | Apache 2.0 |

In [1]:
import json
from pathlib import Path

from peft import AutoPeftModelForCausalLM, PeftConfig
from transformers import AutoTokenizer
import torch

import chromadb

In [2]:
# Source document and base-model identifiers used to locate artifacts on disk.
DOCUMENT = "FM5_0"
PDF_PATH = Path("pdfs", "raw", "fm5-0.pdf")
BASE_MODEL = Path("QuantFactory", "Llama-3.2-1B-GGUF")
GGUF_FILE = "Llama-3.2-1B.Q8_0.gguf"
CACHE_DIR = "hf_cache"

# Everything derived for this document/model pair lives under
# <DOCUMENT>/<BASE_MODEL>/.
DATA_DIR = Path(DOCUMENT, BASE_MODEL, "data")
MODEL_DIR = Path(DOCUMENT, BASE_MODEL, "lora")
CHUNKED_DATA = DATA_DIR.joinpath("chunked", "chunked.jsonl")
QA_DATA = DATA_DIR.joinpath("qa", "qa_pairs.jsonl")

I'll create a vector database and load the chunks in.

In [3]:
# In-memory Chroma client plus the collection that will hold the document chunks.
client = chromadb.Client()
# get_or_create_collection (rather than create_collection) keeps this cell
# re-runnable: create_collection raises when a collection with this name
# already exists.
collection = client.get_or_create_collection(name=DOCUMENT)

In [4]:
# Parse the chunk records, one JSON object per line (JSONL).
# Blank lines are skipped so a stray empty line does not crash json.loads.
with open(CHUNKED_DATA, "r", encoding="utf-8") as f:
    chunks = [json.loads(line) for line in f if line.strip()]

In [5]:
# Add the chunks to the collection in batches instead of one call per chunk,
# so documents are handed to Chroma many at a time rather than individually.
# The batch size is deliberately modest to stay well below any per-call limit.
ADD_BATCH_SIZE = 256

for start in range(0, len(chunks), ADD_BATCH_SIZE):
    batch = chunks[start:start + ADD_BATCH_SIZE]
    collection.add(
        documents=[chunk["text"] for chunk in batch],
        ids=[chunk["chunk_id"] for chunk in batch],
    )

I'll define a helper function that retrieves the chunks most relevant to a user's question.

In [6]:
def retrieve_relevant_chunks(query, n=3):
    """Return the text of the ``n`` stored chunks that best match ``query``.

    Args:
        query: Question text, sent to the collection as a single query.
        n: Number of results to request from the collection.

    Returns:
        A list of strings, one per returned document, each prefixed with two
        newline characters.
    """
    results = collection.query(
        query_texts=[query],
        n_results=n,
    )

    # Results are grouped per query text; only one query was sent, so take
    # the first (and only) group of documents.
    return [f"\n\n{doc}" for doc in results["documents"][0]]

Next, set up the same prompt builder that was used during training, so that inference prompts match the format the model was fine-tuned on.

In [7]:
# Instruction text placed in the system turn (the leading space is intentional
# and part of the prompt format).
sys_prompt = " You are an FM-5-0 assistant. Concisely answer the following question."

# Role labels written inside the header markers.
sys_role = "system"
usr_role = "user"
bot_role = "assistant"

# Special tokens that delimit the chat template.
bos_tok = "<|begin_of_text|>"
eot_id_tok = "<|eot_id|>"
start_hd_tok = "<|start_header_id|>"
end_hd_tok = "<|end_header_id|>"
eot_tok = "<|end_of_text|>"


def build_prompt(sys, context, usr, ans=None):
    """Assemble a chat-formatted prompt string.

    Args:
        sys: System instruction text; emitted after ``context`` in the system turn.
        context: Retrieved context; emitted first in the system turn.
        usr: The user's question.
        ans: Optional answer. When it is not ``None`` the assistant turn is
            completed with the answer and the closing tokens; otherwise the
            prompt ends with an open assistant header.

    Returns:
        The full prompt as a single string.
    """
    segments = [
        bos_tok,
        f"{start_hd_tok}{sys_role}{end_hd_tok}{context}{sys}{eot_id_tok}",
        f"{start_hd_tok}{usr_role}{end_hd_tok}{usr}{eot_id_tok}",
        f"{start_hd_tok}{bot_role}{end_hd_tok}",
    ]

    # An explicit None check keeps an empty-string answer as a completed turn.
    if ans is not None:
        segments.append(f"{ans}{eot_id_tok}{eot_tok}")

    return "".join(segments)

Load the model and create a function to automatically retrieve context, build the prompt, and return the result to the user.

In [8]:
# Load the tokenizer from MODEL_DIR.
# NOTE(review): the adapter itself is loaded from MODEL_DIR / "final" —
# confirm the tokenizer files are expected in MODEL_DIR and not that subdirectory.
tok = AutoTokenizer.from_pretrained(MODEL_DIR)

In [9]:
# Directory under MODEL_DIR from which the PEFT config and model are loaded.
final_model_dir = MODEL_DIR.joinpath("final")

In [10]:
# Read the adapter configuration stored in final_model_dir.
peft_config = PeftConfig.from_pretrained(final_model_dir)

# Load the adapter together with its base model. The base weights come from
# the GGUF file and are de-quantized on load; they are held in float16 and
# placed automatically across the available devices.
model = AutoPeftModelForCausalLM.from_pretrained(
    final_model_dir,
    config=peft_config,
    cache_dir=CACHE_DIR,
    gguf_file=GGUF_FILE,
    device_map="auto",
    torch_dtype=torch.float16,
)

# This notebook only runs inference, so the training-time settings are dropped:
# gradient checkpointing is a training memory optimisation, and assigning
# `model.use_cache` only set a stray attribute on the wrapper object.
# eval() makes sure dropout and similar layers are in inference mode.
model.eval()

Converting and de-quantizing GGUF tensors...:   0%|          | 0/147 [00:00<?, ?it/s]

In [11]:
def generate_answer(query, contexts):
    """Generate an answer to ``query`` grounded in the retrieved ``contexts``.

    Args:
        query: The user's question; placed in the user turn of the prompt.
        contexts: Iterable of context strings. They are joined with blank
            lines and placed in the system turn ahead of the system prompt.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off), with special tokens stripped.
    """
    combined_context = "\n\n".join(contexts)

    # Pass the instruction text (sys_prompt), not the role label (sys_role):
    # build_prompt already writes the role into the header, so passing
    # sys_role here replaced the system instruction with the word "system".
    prompt = build_prompt(sys_prompt, combined_context, query)

    inputs = tok(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        no_repeat_ngram_size=4,
        eos_token_id=tok.eos_token_id,
        # The EOS id doubles as the padding id for generation.
        pad_token_id=tok.eos_token_id,
    )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = inputs.input_ids.shape[-1]
    return tok.decode(outputs[0][prompt_len:], skip_special_tokens=True)

In [12]:
def rag_pipeline(query):
    """Answer ``query`` by retrieving relevant chunks and generating from them.

    Args:
        query: The user's question.

    Returns:
        The answer string produced by ``generate_answer``.
    """
    return generate_answer(query, retrieve_relevant_chunks(query))

Test it out.

In [13]:
# Smoke-test the pipeline end to end with a single question and print the
# generated answer.
query = "What are CCIRs?"
r = rag_pipeline(query)
print(r)

Specific information identified by a commander that facilitates decision making, directly linking to a current decision. ارزی
