In [1]:
# Import the needed packages: standard library first, then third-party
import getpass
import os

import oracledb
from langchain.chains import RetrievalQA
from langchain.retrievers import ContextualCompressionRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores.oraclevs import OracleVS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_nvidia_ai_endpoints import NVIDIARerank

In [2]:
# Provide the NVIDIA API key through the NVIDIA_API_KEY environment variable.
# A key that is already exported (and starts with "nvapi-") is kept as is;
# otherwise it is prompted for without echo, so the secret never has to be
# pasted into the notebook source and an existing key is never overwritten.
if not os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
    os.environ["NVIDIA_API_KEY"] = getpass.getpass(
        "Enter your NVIDIA API key (starts with nvapi-): "
    )
# Initialize the LLM (Large Language Model) with the specified model
llm = ChatNVIDIA(model="meta/llama3-8b-instruct")

In [4]:
# Create a chat prompt template with a system message and a user message.
# The system message is built from adjacent string literals, which Python joins
# with no separator, so each sentence except the last ends with an explicit
# space to keep the sentences from running together in the final prompt.
prompt = ChatPromptTemplate.from_messages([
    ("system", (
        "You are a helpful and friendly AI! "
        "Your responses should be concise and no longer than two sentences. "
        "Say you don't know if you don't have this information."
    )),
    # "{question}" is filled in from the dict passed to chain.invoke(...)
    ("user", "{question}")
])
# Chain the prompt, LLM, and output parser together
chain = prompt | llm | StrOutputParser()

In [5]:
# Ask the prompt | llm | parser chain a general question and print its reply
answer = chain.invoke({"question": "What's the difference between a GPU and a CPU?"})
print(answer)

A CPU (Central Processing Unit) is the brain of your computer, handling general computing tasks, executing instructions, and performing calculations. A GPU (Graphics Processing Unit) is designed specifically for handling graphics and computationally intensive tasks, like gaming, video editing, and scientific simulations, with many cores performing parallel processing.


In [6]:
# Ask the same chain about the NVIDIA H200 and print its reply
answer = chain.invoke({"question": "What does the H in the NVIDIA H200 stand for?"})
print(answer)

I'm not familiar with the NVIDIA H200, as it doesn't seem to be a publicly recognized product.


In [7]:
# Database connection setup: replace the placeholders with your own values
username = "<your username here>"
password = "<your password here>"
host = "<IP of your host here>"
port = "<the port that you are using here>"
service_name = "<service name here>"
# Connection string in the form host:port/service_name
dsn = f"{host}:{port}/{service_name}"

print("The database user name is:", username)
print("Database connection information is:", dsn)

# Connect to the database. On failure the error details are printed and the
# exception is re-raised: swallowing it would leave `conn23c` undefined and
# make the later vector-store cell fail with an unrelated NameError.
try:
    conn23c = oracledb.connect(user=username, password=password, dsn=dsn)
except oracledb.DatabaseError as e:
    error, = e.args
    print(f"Connection failed. Error code: {error.code}")
    print(f"Error message: {error.message}")
    raise
else:
    print("Connection successful!")

The database user name is: vector
Database connection information is: localhost:1521/freepdb1
Connection successful!


In [8]:
# Location of the H200 datasheet PDF to load
pdf_url = "https://nvdam.widen.net/content/udc6mzrk7a/original/hpc-datasheet-sc23-h200-datasheet-3002446.pdf"
# Create a PDF loader for that URL
loader = PyPDFLoader(pdf_url)
# Read the PDF into memory
document = loader.load()
# Display the first page of the document as the cell output
document[0]

Document(metadata={'source': 'https://nvdam.widen.net/content/udc6mzrk7a/original/hpc-datasheet-sc23-h200-datasheet-3002446.pdf', 'page': 0}, page_content='NVIDIA H200 Tensor Core GPU\u2002|\u2002Datasheet\u2002|\u2002 1NVIDIA H200 Tensor Core GPU\nSupercharging AI and HPC workloads.\nHigher Performance With Larger, Faster Memory\nThe NVIDIA H200 Tensor Core GPU supercharges generative AI and high-\nperformance computing (HPC) workloads with game-changing performance  \nand memory capabilities. \nBased on the NVIDIA Hopper™ architecture , the NVIDIA H200 is the first GPU to \noffer 141 gigabytes (GB) of HBM3e memory at 4.8 terabytes per second (TB/s)—\nthat’s nearly double the capacity of the NVIDIA H100 Tensor Core GPU  with \n1.4X more memory bandwidth. The H200’s larger and faster memory accelerates \ngenerative AI and large language models, while advancing scientific computing for \nHPC workloads with better energy efficiency and lower total cost of ownership. \nUnlock Insights Wit

In [9]:
# Set up the splitter that breaks the document into smaller, overlapping pieces
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", ";", ",", " ", ""],
    chunk_size=500,
    chunk_overlap=100,
)
# Chunk the loaded document
document_chunks = text_splitter.split_documents(document)
# Report how many chunks were produced
chunk_count = len(document_chunks)
print("Number of chunks from the document:", chunk_count)

Number of chunks from the document: 16


In [10]:
# Question that the reranker uses to score the document chunks
query = "What does the H in the NVIDIA H200 stand for?"

In [11]:
# Create the NVIDIA reranker client for the given model; base_url points it
# at the reranking endpoint on localhost port 8001
reranker = NVIDIARerank(
    base_url="http://localhost:8001/v1",
    model="nvidia/nv-rerankqa-mistral-4b-v3",
)

In [12]:
# Run the reranker over the document chunks for the query
reranked_chunks = reranker.compress_documents(
    documents=document_chunks,
    query=query,
)

In [13]:
# Show each reranked chunk's relevance score and text, separated by a rule
separator = "-" * 100
for chunk in reranked_chunks:
    # Pull out the pieces of the document that get printed
    metadata = chunk.metadata
    page_content = chunk.page_content

    # Only chunks that carry a relevance score in their metadata are printed
    if 'relevance_score' in metadata:
        print(f"Relevance Score:{metadata['relevance_score']}, Page Content:{page_content}...")
    # The separator line is printed for every chunk, scored or not
    print(separator)

Relevance Score:16.3125, Page Content:NVIDIA H200 Tensor Core GPU | Datasheet |  1NVIDIA H200 Tensor Core GPU
Supercharging AI and HPC workloads.
Higher Performance With Larger, Faster Memory
The NVIDIA H200 Tensor Core GPU supercharges generative AI and high-
performance computing (HPC) workloads with game-changing performance  
and memory capabilities. 
Based on the NVIDIA Hopper™ architecture , the NVIDIA H200 is the first GPU to 
offer 141 gigabytes (GB) of HBM3e memory at 4.8 terabytes per second (TB/s)—...
----------------------------------------------------------------------------------------------------
Relevance Score:10.875, Page Content:NVIDIA H200 Tensor Core GPU | Datasheet |  3Unleashing AI Acceleration for Mainstream Enterprise Servers 
With H200 NVL
The NVIDIA H200 NVL is the ideal choice for customers with space constraints within  
the data center, delivering acceleration for every AI and HPC workload regardless of size. 
With a 1.5X memory increase and a 1.2X bandwid

In [14]:
# Create the NVIDIA embeddings client used to embed the document chunks
embedding_model = NVIDIAEmbeddings(
    model="nvidia/nv-embedqa-e5-v5",
)



In [17]:
# Embed the document chunks with the embeddings model and store them in an
# Oracle vector store over the open database connection.
# (A `tablespace` keyword argument was left disabled here; it is not passed.)
vector_store = OracleVS.from_documents(
    document_chunks,
    embedding_model,
    table_name="MY_DEM04",
    client=conn23c,
    distance_strategy=DistanceStrategy.DOT_PRODUCT,
)

In [18]:
# Expose the vector store as a retriever, passing k=10 as its search setting
retriever = vector_store.as_retriever(
    search_kwargs={"k": 10},
)

In [20]:
# Create a second reranker client, with the same model and endpoint as the
# earlier reranker, to serve as the retriever's document compressor
compressor = NVIDIARerank(
    base_url="http://localhost:8001/v1",
    model="nvidia/nv-rerankqa-mistral-4b-v3",
)

In [21]:
# Wrap the vector-store retriever with the reranker acting as its compressor
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

In [22]:
# Question to answer with the retrieval QA chain
query = "What does the H in the NVIDIA H200 stand for?"
# Build the retrieval QA chain from the LLM and the compression retriever.
# Note: this rebinds `chain`, which earlier held the prompt | llm | parser chain.
chain = RetrievalQA.from_chain_type(
    retriever=compression_retriever,
    llm=llm,
)
# Run the chain on the query; the result is displayed as the cell output
chain.invoke(query)

{'query': 'What does the H in the NVIDIA H200 stand for?',
 'result': 'The "H" in the NVIDIA H200 stands for "Hopper". The NVIDIA H200 is based on the NVIDIA Hopper architecture, which is a specific design and technical architecture used by NVIDIA for their GPUs.'}