Mostly stolen from:

https://github.com/NVIDIA/GenerativeAIExamples/blob/main/RAG/notebooks/langchain/RAG_Langchain_with_Local_NIM.ipynb

# Build a RAG using NVIDIA NIM microservices locally installed on MPC. 

Use a Llama3-8b-instruct model served by NVIDIA NIM for LLMs, hosted locally on SIH-MPC, and connect to it with the LangChain NVIDIA AI Endpoints package.

Create a vector store by downloading web pages, generating their embeddings, and storing them in a FAISS index. The embeddings are generated with the GPU-accelerated NV-Embed-QA model from the NVIDIA API Catalog.

This example creates a RAG from the VAST Pipeline documentation web pages.


In [None]:
# Bunch of dependencies from LangChain, BeautifulSoup, FAISS
import re

import requests
from bs4 import BeautifulSoup
import getpass
import os
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT, QA_PROMPT
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferMemory
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings

import warnings
warnings.filterwarnings('ignore')


Set the `NVIDIA_API_KEY` environment variable. If it is not already set to a value starting with `nvapi-`, you are prompted to enter the key.

In [None]:
def set_api_key():
    """Ensure the ``NVIDIA_API_KEY`` environment variable holds an API key.

    If the variable is unset, or its value does not start with ``nvapi-``,
    the user is prompted via ``getpass`` for a key, which is then stored in
    ``os.environ``. If a suitable value is already present nothing happens.

    Raises:
        ValueError: If the entered key does not start with ``nvapi-``.
    """
    if os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
        return

    nvapi_key = getpass.getpass("Enter your NVIDIA API key: ")
    # Explicit raise instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would let an invalid key through silently.
    if not nvapi_key.startswith("nvapi-"):
        raise ValueError(f"{nvapi_key[:5]}... is not a valid key")
    os.environ["NVIDIA_API_KEY"] = nvapi_key


Check to see if we can connect to the local NIM on MPC (10.167.67.78).

In [None]:
# Smoke test: send one short prompt to the locally hosted NIM endpoint and
# print the reply.
set_api_key()
llm = ChatNVIDIA(
    base_url="http://10.167.67.78:8000/v1",
    model="meta/llama3-8b-instruct",
    temperature=0.1,
    max_tokens=64,
    top_p=1.0,
)
result = llm.invoke("What is VAST?")
print(result.content)


In [None]:
def html_document_loader(url, timeout=30):
    """Download a web page and return its text content as a single line.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The page text with ``<script>`` and ``<style>`` elements removed and
        every run of whitespace collapsed to a single space. An empty string
        is returned (after printing a message) when the download fails, the
        server answers with an HTTP error status, or parsing fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures so that error pages are not
        # handed on as if they were document text.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to load {url} due to exception {e}")
        return ""

    # Parsing stays best-effort: any failure is reported and yields "".
    try:
        soup = BeautifulSoup(response.text, "html.parser")

        # Drop elements whose content is code or styling, not readable text.
        for tag in soup(["script", "style"]):
            tag.extract()

        text = soup.get_text()

        # Raw string so that ``\s`` is a regex class, not a string escape.
        return re.sub(r"\s+", " ", text).strip()
    except Exception as e:
        print(f"Exception {e} while loading document")
        return ""


In [None]:
# Try the loader on a single page; the call returns the extracted page text.
html_document_loader("https://vast-survey.org/")

The `create_embeddings` function creates embeddings from a list of input URLs using the embedding model specified and saves them to disk locally.

The model represents words, phrases, or other entities as vectors of numbers and understands the relation between words and phrases.

kwargs to `RecursiveCharacterTextSplitter`:

chunk_size (int) – Maximum size of chunks to return

chunk_overlap (int) – Overlap in characters between chunks

length_function (Callable[[str], int]) – Function that measures the length of given chunks

In [None]:
def create_embeddings(embedding_path, embedding_model, in_urls):
    """Download web pages, split their text into chunks and embed them.

    Each URL is fetched with ``html_document_loader``, the text is split by a
    ``RecursiveCharacterTextSplitter`` (chunks of at most 1000 characters, no
    overlap) and the chunks are passed to ``index_docs`` for embedding and
    saving.

    Args:
        embedding_path: Directory passed to ``index_docs`` as the location of
            the saved vector store.
        embedding_model: Embedding model object passed on to ``index_docs``.
        in_urls: Iterable of page URLs. Repeated URLs are fetched and
            embedded only once.

    Returns:
        None. Progress messages are printed.
    """
    print(f"Storing embeddings to {embedding_path}")

    documents = []
    sources = []
    # dict.fromkeys removes duplicates while keeping first-seen order, so a
    # page listed twice does not end up in the index twice.
    for url in dict.fromkeys(in_urls):
        document = html_document_loader(url)
        if not document:
            # The loader returns "" on failure or when a page has no text;
            # there is nothing to embed in either case.
            continue
        documents.append(document)
        # Record the page each chunk came from as chunk metadata.
        sources.append({"source": url})

    if not documents:
        print("No documents could be loaded; no embeddings were generated")
        return

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )
    texts = text_splitter.create_documents(documents, metadatas=sources)
    index_docs(text_splitter, texts, embedding_path, embedding_model)
    print("Generated embedding successfully")


`index_docs` (called by `create_embeddings` above) does the work of splitting the text into chunks and creating the embeddings.

The embedding store (a local directory) is created if it doesn't already exist; otherwise the input text chunks are appended to the existing store.

In [None]:
def index_docs(splitter, documents, dest_embed_dir, embedding_model):
    """Split documents into chunks, embed them and save them to a FAISS store.

    Args:
        splitter: Object with a ``split_text(str)`` method returning a list
            of text chunks.
        documents: Iterable of objects exposing ``page_content`` (text) and
            ``metadata`` (mapping) attributes.
        dest_embed_dir: Directory of the FAISS store. If it exists the new
            chunks are added to the store found there; otherwise a new store
            is created at that path.
        embedding_model: Embedding model handed to FAISS.

    Returns:
        None. When the documents yield no chunks nothing is read or written.
    """
    texts = []
    metadatas = []
    for document in documents:
        chunks = splitter.split_text(document.page_content)
        texts.extend(chunks)
        # One metadata entry per chunk, so the two lists stay aligned even
        # when a document is split into several chunks.
        metadatas.extend(dict(document.metadata) for _ in chunks)

    if not texts:
        return

    # Load (or create) the store once and save it once, rather than doing a
    # full load/save round trip to disk for every single chunk.
    if os.path.exists(dest_embed_dir):
        # NOTE: allow_dangerous_deserialization unpickles data from disk.
        # Only point this at a directory written by this notebook, never at
        # an untrusted source.
        store = FAISS.load_local(
            folder_path=dest_embed_dir,
            embeddings=embedding_model,
            allow_dangerous_deserialization=True,
        )
        store.add_texts(texts, metadatas=metadatas)
    else:
        store = FAISS.from_texts(texts, embedding=embedding_model, metadatas=metadatas)
    store.save_local(folder_path=dest_embed_dir)


In [None]:
# Pages to download and embed. Each URL appears once: listing a page twice
# would store its chunks in the index twice.
# NOTE(review): the first entry points at the `dev` docs while the other
# pipeline pages are pinned to 1.2.0 -- confirm this is intended.
VAST_URLS = [
    "https://vast-survey.org/vast-pipeline/dev/",
    "https://vast-survey.org/vast-pipeline/1.2.0/gettingstarted/installation/",
    "https://vast-survey.org/vast-pipeline/1.2.0/gettingstarted/configuration/",
    "https://vast-survey.org/vast-pipeline/1.2.0/gettingstarted/deployment/",
    "https://vast-survey.org/vast-pipeline/1.2.0/design/overview/",
    "https://vast-survey.org/vast-pipeline/1.2.0/design/imageingest/",
    "https://vast-survey.org/vast-pipeline/1.2.0/design/association/",
    "https://vast-survey.org/vast-pipeline/1.2.0/design/newsources/",
    "https://vast-survey.org/vast-pipeline/1.2.0/design/monitor/",
    "https://vast-survey.org/vast-pipeline/1.2.0/design/sourcestats/",
    "https://vast-survey.org/vast-pipeline/1.2.0/architecture/intro/",
    "https://vast-survey.org/vast-pipeline/1.2.0/faq/",
    "https://vast-survey.org/vast-pipeline/1.2.0/help_and_acknowledgements/",
    "https://vast-survey.org/",
    "https://vast-survey.org/Survey/",
    "https://vast-survey.org/Team/",
]

Create the embeddings from the list of URLs and save them locally.

NVIDIAEmbeddings is from the NVIDIA AI Endpoints for LangChain library.

The "NV-Embed-QA" model is described here: https://build.nvidia.com/nvidia/embed-qa-4/modelcard

In [None]:
# Make sure NVIDIA_API_KEY is set before the embedding model is created.
set_api_key()
embedding_model = NVIDIAEmbeddings(model="NV-Embed-QA", truncate="END")
# Directory the vector store is saved to. `index_docs` adds to a store that
# already exists at this path, so running this cell again appends the same
# chunks a second time.
embedding_path = "./data/vast_embedding_all"
create_embeddings(embedding_path, embedding_model, VAST_URLS)

Use `ConversationalRetrievalChain` to create a question-answering chatbot from the locally hosted LLM, providing the embedded URL text as context. Chat memory is provided by `ConversationBufferMemory`.

https://python.langchain.com/api_reference/langchain/chains/langchain.chains.conversational_retrieval.base.ConversationalRetrievalChain.html

In [None]:
set_api_key()
# Same local NIM endpoint as the connectivity check, with a larger token
# budget for the answers.
llm = ChatNVIDIA(
    base_url="http://10.167.67.78:8000/v1",
    model="meta/llama3-8b-instruct",
    temperature=0.1,
    max_tokens=1000,
    top_p=1.0,
)

embedding_model = NVIDIAEmbeddings(model="NV-Embed-QA", truncate="END")
# Load the store from the directory the embedding cell of this notebook
# writes to; a differently named copy would not exist on a fresh run.
embedding_path = "./data/vast_embedding_all"
# NOTE: allow_dangerous_deserialization unpickles data from disk -- only
# load a store that was written by this notebook.
docsearch = FAISS.load_local(
    folder_path=embedding_path,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

qa_prompt = QA_PROMPT

# Keeps the conversation under the "chat_history" key so that follow-up
# questions can refer back to earlier turns.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=docsearch.as_retriever(),
    chain_type="stuff",
    memory=memory,
    combine_docs_chain_kwargs={"prompt": qa_prompt},
)

In [None]:
def get_query(query):
    """Ask the module-level ``qa`` chain a question and print its answer.

    Args:
        query: Question text, passed to the chain under the ``"question"``
            key.

    Returns:
        None. The value stored under ``"answer"`` in the chain result is
        printed.
    """
    # Use ``invoke`` (as the LLM check earlier in this notebook does) rather
    # than calling the chain object directly, which LangChain has deprecated.
    result = qa.invoke({"question": query})
    print(result.get("answer"))

# First question of the conversation, sent through the retrieval chain.
query = "What is VAST?"
get_query(query)

In [None]:
# Second question in the same conversation.
get_query("What is the VAST pipeline?")

In [None]:
# Follow-up that does not name its subject ("it"); presumably relies on the
# chat history kept by the chain's memory to resolve the reference.
get_query("What are the steps of it?")

In [None]:
# Another follow-up; "the final step" presumably refers to the previous
# answer held in the chat history.
get_query("Describe the final step in more detail.")