In [1]:
!pip install -q langchain-community langchain-nvidia-ai-endpoints faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# For Colab user
import os
from google.colab import userdata
os.environ['NVIDIA_API_KEY'] = userdata.get('NVIDIA_API_KEY')

In [3]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import numpy as np

# Initialize NVIDIA services
# Get API key from: https://build.nvidia.com

# --- Configuration ---
LLM_ENDPOINT = "https://integrate.api.nvidia.com/v1"
LLM_MODEL = "meta/llama-3.2-3b-instruct"
# LLM_MODEL ="nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
EMBEDDING_ENDPOINT = "https://integrate.api.nvidia.com/v1"
EMBEDDING_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2"

llm = ChatNVIDIA(model="meta/llama-3.2-3b-instruct")
embedder = NVIDIAEmbeddings(model="nvidia/llama-3.2-nv-embedqa-1b-v2")

# Sample documents for our knowledge base
documents = [
    "NVIDIA RTX 4090 is the fastest gaming GPU available today",
    "CUDA is NVIDIA's parallel computing platform and programming model",
    "GeForce RTX series cards support real-time ray tracing",
    "Tensor cores accelerate AI and machine learning workloads",
    "DLSS uses AI to upscale games while maintaining performance",
    "NVIDIA Omniverse enables real-time collaboration for 3D creators"
]

# Create document objects
docs = [Document(page_content=doc) for doc in documents]

# Generate embeddings and create vector store
vector_store = FAISS.from_documents(docs, embedder)

In [4]:
# Example 1: Basic Similarity Search
def basic_similarity_search(query):
    print(f"Query: {query}")
    results = vector_store.similarity_search(query, k=3)
    for i, doc in enumerate(results):
        print(f"{i+1}. {doc.page_content}")
    print("-" * 50)

# Basic similarity searches
basic_similarity_search("graphics card performance")
basic_similarity_search("AI acceleration technology")

Query: graphics card performance
1. NVIDIA RTX 4090 is the fastest gaming GPU available today
2. DLSS uses AI to upscale games while maintaining performance
3. GeForce RTX series cards support real-time ray tracing
--------------------------------------------------
Query: AI acceleration technology
1. Tensor cores accelerate AI and machine learning workloads
2. DLSS uses AI to upscale games while maintaining performance
3. CUDA is NVIDIA's parallel computing platform and programming model
--------------------------------------------------


In [7]:
# Example 2: Search with Similarity Scores
def search_with_scores(query):
    print(f"Query: {query}")
    results = vector_store.similarity_search_with_score(query, k=3)
    for i, (doc, score) in enumerate(results):
        print(f"{i+1}. [Score: {score:.4f}] {doc.page_content}")
    print("-" * 50)

# Search with similarity scores
search_with_scores("real-time rendering")

Query: real-time rendering
1. [Score: 1.4381] GeForce RTX series cards support real-time ray tracing
2. [Score: 1.4454] NVIDIA Omniverse enables real-time collaboration for 3D creators
3. [Score: 1.6696] DLSS uses AI to upscale games while maintaining performance
--------------------------------------------------


In [6]:
# Example 3: LLM-Augmented Response
def rag_search(query):
    print(f"User Query: {query}")

    # Retrieve relevant documents
    retrieved_docs = vector_store.similarity_search(query, k=2)
    context = "\n".join([doc.page_content for doc in retrieved_docs])

    # Generate response using LLM
    prompt = f"""
    Context: {context}

    Question: {query}

    Answer based only on the provided context:
    """

    print(f"PROMPT: {prompt}")

    response = llm.invoke(prompt)
    print(f"==>\nAI Response: {response.content}")
    print("-" * 50)


# Retrieval-Augmented Generation (RAG)
rag_search("What is DLSS and how does it work?")
rag_search("Explain Tensor cores and their applications")


User Query: What is DLSS and how does it work?
PROMPT: 
    Context: DLSS uses AI to upscale games while maintaining performance
GeForce RTX series cards support real-time ray tracing

    Question: What is DLSS and how does it work?

    Answer based only on the provided context:
    
==>
AI Response: DLSS (Deep Learning Super Sampling) is a technology that uses AI to upscale games while maintaining performance.
--------------------------------------------------
User Query: Explain Tensor cores and their applications
PROMPT: 
    Context: Tensor cores accelerate AI and machine learning workloads
CUDA is NVIDIA's parallel computing platform and programming model

    Question: Explain Tensor cores and their applications

    Answer based only on the provided context:
    
==>
AI Response: Based on the provided context, here's an explanation of Tensor cores and their applications:

Tensor cores are specialized GPU hardware units that accelerate AI and machine learning (ML) workloads. Th