# RAG

In [1]:
import torch
import os
import uuid
import getpass

from typing import Any, Dict, Iterator, List, Optional, TypedDict
from pprint import pprint
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings, ChatHuggingFace
from langchain_ollama import ChatOllama
from langchain_community.vectorstores import FAISS
from langchain.embeddings import CacheBackedEmbeddings
from langchain.prompts import PromptTemplate, MessagesPlaceholder
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_core.messages import SystemMessage
from langchain.schema import AIMessage, HumanMessage
from langchain_core.tools import tool

from langgraph.graph import START, END, StateGraph, MessagesState
from langgraph.prebuilt import ToolNode, tools_condition
from langgraph.checkpoint.memory import MemorySaver
from langgraph.store.memory import InMemoryStore

from lightning import Fabric
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModelForCausalLM, PeftModel

from IPython.display import display, Markdown, Image, SVG
from os import walk

In [2]:
os.environ["LANGSMITH_TRACING"] = "true"
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass()

### Set mixed precision

In [3]:
torch.set_float32_matmul_precision("medium")
fabric = Fabric(accelerator="cuda", devices=1, precision="bf16-mixed")
device = fabric.device
fabric.launch()

Using bfloat16 Automatic Mixed Precision (AMP)


### Text Splitter

ref: https://python.langchain.com/docs/how_to/markdown_header_metadata_splitter/

In [4]:
spell_dir = '../database/spell_content/'

file_names = []

for (dirpath, dirnames, filenames) in walk(spell_dir):
    file_names.extend(filenames)
    break

In [5]:
file_names

['Cantrips.txt',
 '2nd Level.txt',
 '8th Level.txt',
 '6th Level.txt',
 '4th Level.txt',
 '5th Level.txt',
 '1st Level.txt',
 '3rd Level.txt',
 '7th Level.txt',
 '9th Level.txt']

In [6]:
headers_to_split_on = [
    ("#", "Spell Name"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)

all_docs = []

for file_name in file_names:
    with open(os.path.join(spell_dir, file_name), 'r', encoding='utf-8') as f:
        raw_text = f.read()
        md_header_splits = markdown_splitter.split_text(raw_text)
        
        for doc in md_header_splits:
            content = doc.page_content

            cleaned = "\n".join(dict.fromkeys(content.splitlines()))
            
            doc.page_content = cleaned
            doc.metadata["source_file"] = file_name
            all_docs.append(doc)


In [7]:
!nvidia-smi

Sun Apr 27 15:28:56 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.86.15              Driver Version: 570.86.15      CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4080        Off |   00000000:01:00.0  On |                  N/A |
|  0%   55C    P3             32W /  340W |    9127MiB /  16376MiB |     18%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Load Embedding Data

In [8]:
embed_model_name = "sentence-transformers/all-mpnet-base-v2"

embeddings = HuggingFaceEmbeddings(model_name=embed_model_name)

In [9]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
docs_split = text_splitter.split_documents(all_docs)

vector_store = FAISS.from_documents(documents=docs_split, embedding=embeddings)
# retriever = vector_store.as_retriever()

### Save vector store

In [10]:
vector_store.save_local("./faiss_spell_index")

### Load vector store

In [11]:
vector_store = FAISS.load_local(
    "./faiss_spell_index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True
)
retriever = vector_store.as_retriever()

In [12]:
# vector_store.similarity_search("give me Acid Splash spell?", k=5)

In [13]:
# query = "give me Acid Splash spell?"
# results = retriever.get_relevant_documents(query)

# display(Markdown(results[0].page_content))

In [14]:
# query = "give me Arcane Gate spell?"
# results = retriever.get_relevant_documents(query)

# display(Markdown(results[0].page_content))

In [15]:
!nvidia-smi

Sun Apr 27 15:29:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.86.15              Driver Version: 570.86.15      CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4080        Off |   00000000:01:00.0  On |                  N/A |
| 67%   59C    P0            214W /  340W |   10668MiB /  16376MiB |      8%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[Why the input prompt is part of the output?](https://huggingface.co/TheBloke/Llama-2-70B-Chat-GPTQ/discussions/25)

In [16]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj"],
    lora_dropout=0.2,
    bias="none",
    task_type="CAUSAL_LM"
)

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    quantization_config=quant_config,
    )

lora_model = get_peft_model(base_model, lora_config)

model = PeftModelForCausalLM.from_pretrained(
    lora_model, 
    "../best",
    torch_dtype=torch.bfloat16,
    is_trainable=False
    )

model = model.eval()
model.config.use_cache = True


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



ref: https://github.com/langchain-ai/langchain/discussions/22883

In [17]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=4096,
    top_k=50,
    device_map="auto"
)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

llm = HuggingFacePipeline(
    pipeline=pipe, 
    model_kwargs = {'temperature': 0.9, "torch_dtype": torch.bfloat16}
    )

chat_llama3 = ChatHuggingFace(llm=llm)

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoFo

In [18]:
from IPython.display import Image, display

try:
    display(Image(graph.get_graph().draw_mermaid_png()))
except Exception:
    # This requires some extra dependencies and is optional
    pass

ref: [langchain has not yet adapted the llama model calling tool](https://github.com/langchain-ai/langchain/discussions/20727)

[you need to use ChatOllama](https://github.com/langchain-ai/langgraph/discussions/3260)

[Llama3 not supports function calling](https://github.com/meta-llama/llama3/issues/88)

[I wish Llama-3-Instruct models had native function/tool calling support](https://www.reddit.com/r/LocalLLaMA/comments/1d19l8p/i_wish_llama3instruct_models_had_native/)

[Bind Tools do not work with ChatHuggingFace](https://github.com/langchain-ai/langchain/discussions/22883)