In [21]:
import chromadb
import json
from pathlib import Path
from peft import AutoPeftModelForCausalLM, PeftConfig
from transformers import AutoTokenizer
import torch

In [2]:
client = chromadb.Client()
collection = client.create_collection(name="fm5_0")

In [4]:
chunks = []
DOCUMENT    = "FM5_0"
PDF_PATH    = Path("pdfs/raw/fm5-0.pdf")
BASE_MODEL  = Path("QuantFactory/Llama-3.2-1B-GGUF")
GGUF_FILE   = "Llama-3.2-1B.Q8_0.gguf"
CACHE_DIR   = "hf_cache"

MODEL_DIR    = DOCUMENT / BASE_MODEL / "lora"
DATA_DIR     = DOCUMENT / BASE_MODEL / "data"
CHUNKED_DATA = DATA_DIR / "chunked"  / "chunked.jsonl"
with open(CHUNKED_DATA, "r", encoding="utf-8") as f:
    for line in f:
        chunks.append(json.loads(line))

In [5]:
for chunk in chunks:
    collection.add(
        documents=[chunk["text"]],
        metadatas=[{
            "section_hierarchy": "/".join(chunk["section_hierarchy"]),
            "page_start": chunk["page_start"],
            "page_end": chunk["page_end"],
            "chunk_id": chunk["chunk_id"]
        }],
        ids=[chunk["chunk_id"]]
    )

/home/pat/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:03<00:00, 23.9MiB/s]


In [15]:
def retrieve_relevant_chunks(query, n=3):
    results = collection.query(
        query_texts=[query],
        n_results=n
    )

    contexts = []
    for i, doc in enumerate(results["documents"][0]):
        metadata = results["metadatas"][0][i]
        context = f"Section: {metadata['section_hierarchy']}\n\n{doc}"
        contexts.append(context)

    return contexts

In [17]:
sys_prompt = f" You are an FM-5-0 assistant. Concisely answer the following question."
sys_role = "system"
usr_role = "user"
bot_role = "assistant"
bos_tok = "<|begin_of_text|>"
eot_id_tok = "<|eot_id|>"
start_hd_tok = "<|start_header_id|>"
end_hd_tok = "<|end_header_id|>"
eot_tok = "<|end_of_text|>"


def build_prompt(sys, context, usr, ans=None):
    prompt = f"{bos_tok}"
    prompt += f"{start_hd_tok}{sys_role}{end_hd_tok}{context}{sys}{eot_id_tok}"
    prompt += f"{start_hd_tok}{usr_role}{end_hd_tok}{usr}{eot_id_tok}"
    prompt += f"{start_hd_tok}{bot_role}{end_hd_tok}"

    if ans is not None:
        prompt += f"{ans}{eot_id_tok}{eot_tok}"

    return prompt

In [18]:
tok = AutoTokenizer.from_pretrained(MODEL_DIR)

In [25]:
peft_config = PeftConfig.from_pretrained(MODEL_DIR)

model = AutoPeftModelForCausalLM.from_pretrained(
    MODEL_DIR,
    config=peft_config,
    cache_dir=CACHE_DIR,
    gguf_file=GGUF_FILE,
    device_map="auto",
    torch_dtype=torch.float16)
model.gradient_checkpointing_enable()
model.use_cache = False

Converting and de-quantizing GGUF tensors...:   0%|          | 0/147 [00:00<?, ?it/s]

In [26]:
def generate_answer(query, contexts):
    # Combine contexts
    combined_context = "\n\n".join(contexts)

    # Create prompt
    prompt = build_prompt(sys_role, combined_context, query)

    # Generate response
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.2)
    response = tok.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)

    return response

In [31]:
def rag_pipeline(query):
    # Step 1: Retrieve relevant contexts
    contexts = retrieve_relevant_chunks(query)
    print(contexts)

    # Step 2: Generate answer using those contexts
    answer = generate_answer(query, contexts)
    print(answer)

    return answer

In [33]:
query = "What are CCIRs?"
r = rag_pipeline(query)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['Section: Chapter 1/Fundamentals of Planning/The Functions of Planning/Anticipate Events/Operational Variables\n\none or more decisions.\nDuring planning, staffs recommend information requirements for commanders to designate as CCIRs.\nRefinement of CCIRs is continuous throughout an operation. During preparation and execution, staffs\nrecommend changes to CCIRs based on their assessments of the operation.\n1-93. Promulgated by a plan or order, commanders limit the number of CCIRs to focus the efforts of limited\ncollection assets. The fewer the CCIRs, the easier it is for staffs to remember, recognize, and act on each one.\nCCIRs constantly change. Commanders add and delete them throughout an operation based on the\ninformation needed for specific decisions. They determine their own CCIRs and select some from staff\nnominations. Once approved, a CCIR falls into one of two categories: priority intelligence requirements\n(PIRs) and friendly force information requirements (FFIRs).\n\nFun