In [None]:
## Implementing RAG with Guardrails

## To implement RAG with guardrails, we will rely on the NVIDIA NeMo Guardrails library.
## The library primarily focuses on AI safety by implementing "guardrails" as protective measures against unwanted interactions.
## However, we can also use these guardrails to trigger other behavior, such as RAG.


In [None]:
%pip install -qU \
    nemoguardrails==0.4.0 \
    pinecone-client==2.2.2 \
    datasets==2.14.3 \
    openai==0.27.8

In [49]:
## Building the Knowledge Base
%pip install -U datasets

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [50]:
from datasets import load_dataset_builder

# Create a builder for the chunked arXiv-papers dataset; the next cell reads
# its `info` metadata (description and feature schema).
ds_builder = load_dataset_builder("jamescalam/llama-2-arxiv-papers-chunked")

In [51]:
# Only the last expression of a cell is echoed, so the description has to be
# printed explicitly; the feature schema is still shown as the cell result.
print(ds_builder.info.description)
ds_builder.info.features

{'doi': Value(dtype='string', id=None),
 'chunk-id': Value(dtype='string', id=None),
 'chunk': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'summary': Value(dtype='string', id=None),
 'source': Value(dtype='string', id=None),
 'authors': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'categories': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'comment': Value(dtype='string', id=None),
 'journal_ref': Value(dtype='string', id=None),
 'primary_category': Value(dtype='string', id=None),
 'published': Value(dtype='string', id=None),
 'updated': Value(dtype='string', id=None),
 'references': [{'id': Value(dtype='string', id=None),
   'title': Value(dtype='string', id=None),
   'authors': Value(dtype='string', id=None),
   'year': Value(dtype='string', id=None)}]}

In [52]:
from datasets import load_dataset

In [53]:
# Load the "train" split of the chunked arXiv-papers dataset.
data = load_dataset("jamescalam/llama-2-arxiv-papers-chunked", split="train")

In [54]:
# Peek at the first record to see what a single row looks like.
data[0]

{'doi': '1102.0183',
 'chunk-id': '0',
 'chunk': 'High-Performance Neural Networks\nfor Visual Object Classi\x0ccation\nDan C. Cire\x18 san, Ueli Meier, Jonathan Masci,\nLuca M. Gambardella and J\x7f urgen Schmidhuber\nTechnical Report No. IDSIA-01-11\nJanuary 2011\nIDSIA / USI-SUPSI\nDalle Molle Institute for Arti\x0ccial Intelligence\nGalleria 2, 6928 Manno, Switzerland\nIDSIA is a joint institute of both University of Lugano (USI) and University of Applied Sciences of Southern Switzerland (SUPSI),\nand was founded in 1988 by the Dalle Molle Foundation which promoted quality of life.\nThis work was partially supported by the Swiss Commission for Technology and Innovation (CTI), Project n. 9688.1 IFF:\nIntelligent Fill in Form.arXiv:1102.0183v1  [cs.AI]  1 Feb 2011\nTechnical Report No. IDSIA-01-11 1\nHigh-Performance Neural Networks\nfor Visual Object Classi\x0ccation\nDan C. Cire\x18 san, Ueli Meier, Jonathan Masci,\nLuca M. Gambardella and J\x7f urgen Schmidhuber\nJanuary 2011\nAbs

In [None]:
## Reformat the data to keep only what is needed

In [55]:
## https://www.pinecone.io/learn/fast-retrieval-augmented-generation/

# Add a 'uid' column made of the paper's doi and the chunk id joined by a dash.
data = data.map(
    lambda x: {'uid': "{}-{}".format(x['doi'], x['chunk-id'])}
)
data

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references', 'uid'],
    num_rows: 4838
})

In [None]:
## The chunk field contains the text we will encode and store inside Pinecone

## To encode that data, we need to use an embedding model

## We will use OpenAI's text-embedding-ada-002

In [56]:
import os
import openai

In [57]:
# https://platform.openai.com/account/api-keys
# Keep the secret out of the notebook: a literal assigned here is saved (and
# shared) with the file, and because load_dotenv() leaves already-set variables
# untouched by default, it would also shadow the key stored in a local .env.
from getpass import getpass
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())  # pick up OPENAI_API_KEY from .env when present
if not os.environ.get('OPENAI_API_KEY'):
    # neither the environment nor .env supplied a key: ask for it (input is not echoed)
    os.environ['OPENAI_API_KEY'] = getpass("OpenAI API key: ")

In [58]:
from dotenv import find_dotenv, load_dotenv

# Read the local .env file into the process environment, then hand the key
# to the openai client.
load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']

In [59]:
import openai

embed_model_id = "text-embedding-ada-002"

# Embed two throwaway sentences; later cells read the vector length from `res`.
res = openai.Embedding.create(
    engine=embed_model_id,
    input=[
        "We would have some text to embed here",
        "And maybe another chunk here too",
    ],
)

In [60]:
# Length of each of the two returned embedding vectors; the first one is
# reused as the `dimension` when the Pinecone index is created.
len(res['data'][0]['embedding']), len(res['data'][1]['embedding'])

(1536, 1536)

In [61]:
import pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
# The key is taken from PINECONE_API_KEY rather than written into the notebook,
# so it is not saved or shared with the file; prompt (without echo) if unset.
import os
from getpass import getpass

api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
# find your environment next to the api key in pinecone console
# (override with PINECONE_ENVIRONMENT; defaults to the free-tier "gcp-starter")
env = os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter")

pinecone.init(api_key=api_key, environment=env)

In [62]:
import time

index_name = "langchain-retrieval"

# check if index already exists (it shouldn't if this is first time)
if index_name not in pinecone.list_indexes():
    # if does not exist, create index sized to the embedding vectors
    pinecone.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        metric='cosine'
    )
    # wait for index to be initialized
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pinecone.Index(index_name)

# No visible cell in this notebook embeds and upserts the dataset, while
# retrieve() expects every match to carry its text under metadata['chunk'].
# Surface an empty index here instead of letting RAG silently answer without
# any retrieved context.
if index.describe_index_stats().total_vector_count == 0:
    print(
        f"WARNING: Pinecone index '{index_name}' contains no vectors; "
        "retrieve() will return no contexts until the dataset chunks are "
        "embedded and upserted with their text stored under the 'chunk' "
        "metadata key."
    )

In [63]:
##  RAG with Guardrails

In [64]:
async def retrieve(query: str) -> list:
    """Return the stored text chunks most similar to ``query``.

    The query is embedded with the model named by ``embed_model_id``, the
    resulting vector is used to request the top 5 matches from ``index``, and
    the ``'chunk'`` metadata field of every match is collected.

    Args:
        query: The text to search for.

    Returns:
        A list with one ``metadata['chunk']`` value per match, in match order.
    """
    # embed the query text
    embedding_response = openai.Embedding.create(input=[query], engine=embed_model_id)
    query_vector = embedding_response['data'][0]['embedding']

    # look up the nearest stored vectors, asking for their metadata as well
    search_results = index.query(query_vector, top_k=5, include_metadata=True)

    # pull the chunk text out of each match
    contexts = []
    for match in search_results['matches']:
        contexts.append(match['metadata']['chunk'])
    return contexts

async def rag(query: str, contexts: list) -> str:
    """Answer ``query`` with the completion model, using ``contexts`` as evidence.

    Args:
        query: The user's question.
        contexts: Retrieved text passages to place in the prompt.

    Returns:
        The text of the first completion choice, or ``"I don't know"`` when
        ``contexts`` is empty or ``None``.
    """
    print("> RAG Called")  # we'll add this so we can see when this is being used

    # Without any retrieved passage there is nothing to ground an answer on.
    # Return the fallback the prompt itself asks for instead of spending a
    # completion call on an empty context section.
    if not contexts:
        return "I don't know"

    context_str = "\n".join(contexts)

    # place query and contexts into RAG prompt
    prompt = f"""You are a helpful assistant, below is a query from a user and
    some relevant contexts. Answer the question given the information in those
    contexts. If you cannot find the answer to the question, say "I don't know".

    Contexts:
    {context_str}

    Query: {query}

    Answer: """
    # generate answer; temperature 0.0 and a 100-token cap are set on the request
    res = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        temperature=0.0,
        max_tokens=100
    )
    return res['choices'][0]['text']

In [65]:
# YAML part of the rails configuration: declares one model of type "main",
# using the openai engine with text-davinci-003.
yaml_content = """
models:
- type: main
  engine: openai
  model: text-davinci-003
"""

# Colang part of the rails configuration. It defines two flows:
#   * "politics": example political questions are answered with a fixed refusal,
#     followed by `bot offer help`.
#     NOTE(review): `bot offer help` has no `define bot` block in this string —
#     confirm how the library handles an undefined bot message.
#   * "llms": example LLM questions run the `retrieve` action and then the `rag`
#     action on the last user message, and the bot replies with the result.
# The action names used here must match the names the actions are registered under.
rag_colang_content = """
# define limits
define user ask politics
    "what are your political beliefs?"
    "thoughts on the president?"
    "left wing"
    "right wing"

define bot answer politics
    "I'm a personal assistant, I don't like to talk of politics."

define flow politics
    user ask politics
    bot answer politics
    bot offer help

# define RAG intents and flow
define user ask llms
    "tell me about llama 2?"
    "what is large language model"
    "where did meta's new model come from?"
    "what is the falcon model?"
    "have you ever meta llama?"

define flow llms
    user ask llms
    $contexts = execute retrieve(query=$last_user_message)
    $answer = execute rag(query=$last_user_message, contexts=$contexts)
    bot $answer
"""

In [66]:
from nemoguardrails import LLMRails, RailsConfig

# Build the rails configuration from the YAML and Colang strings defined above,
# then create the rails object from it.
config = RailsConfig.from_content(
    yaml_content=yaml_content,
    colang_content=rag_colang_content,
)
rag_rails = LLMRails(config)

In [67]:
# Register the two coroutines under the names the Colang flow passes to `execute`.
rag_rails.register_action(name="retrieve", action=retrieve)
rag_rails.register_action(name="rag", action=rag)

In [68]:
# Smoke test with a plain greeting; the recorded run did not print "> RAG Called".
await rag_rails.generate_async(prompt="hello")

'Hello! How can I help you today?'

In [69]:
# Question close to the "user ask llms" examples; the recorded run printed "> RAG Called".
# NOTE(review): the recorded answer describes an aircraft, not a language model,
# so the retrieved contexts presumably held no relevant text — verify that the
# Pinecone index actually contains the dataset chunks.
await rag_rails.generate_async(prompt="what is falcon model")

> RAG Called


'\nThe Falcon model is a type of aircraft manufactured by the aerospace company, Lockheed Martin. It is a multirole fighter aircraft designed for air-to-air and air-to-ground combat.'

In [70]:
# Same wording as one of the "user ask llms" examples; the recorded run printed "> RAG Called".
# NOTE(review): the recorded answer describes a pistol, not the Llama 2 model,
# so retrieval presumably returned nothing useful — verify that the Pinecone
# index actually contains the dataset chunks.
await rag_rails.generate_async(prompt="tell me about llama 2?")

> RAG Called


'\nThe Llama 2 is a semi-automatic pistol manufactured by the Argentine company Llama-Gabilondo y Cia SA. It is chambered in 9mm Parabellum and is a single-action, short recoil-operated handgun. It has a 4.5-inch barrel and a magazine capacity of 8 rounds. The Llama 2 is a popular choice for self-defense and target shooting.'