# Setup

Install all needed dependencies.

In [None]:
!pip install --quiet --upgrade transformers sentence-transformers

#  Simple RAG

In [None]:
# Models
# Embedding model: loaded with SentenceTransformer to embed the query and the chunks.
embedder_model = "ibm-granite/granite-embedding-30m-english"
# Generator model: used for both the tokenizer and the text-generation pipeline that writes the answer.
generator_model = "ibm-granite/granite-3.2-2b-instruct"

Prepare the chunk database. In this case, the database is simply a Python list.

In [None]:
import urllib.request

link = "https://huggingface.co/ngxson/demo_simple_rag_py/raw/main/cat-facts.txt"

# Retrieve knowledge from the provided link, using every line as a separate chunk.
# The response is opened in a `with` block so the HTTP connection is always closed,
# even if the download fails part-way.
# Lines are decoded but not stripped, so each chunk keeps whatever line ending it had:
# the prompt-building cell further down concatenates chunks with ''.join and relies
# on that line ending as the separator.
with urllib.request.urlopen(link) as response:
    dataset = [line.decode('utf-8') for line in response]

print(f'Loaded {len(dataset)} entries')

**Specify user query here**

In [None]:
# Question used both to retrieve the most similar chunks and as the user message sent to the generator.
input_query = "tell me about cat mummies"

Encode the user query and the chunks into embeddings (vector representations). Then use `semantic_search` to find the 5 chunks that are most similar to the query.

In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search

embedder = SentenceTransformer(embedder_model)

# Embed the query and every chunk with the same model so their vectors are comparable.
query_embedding = embedder.encode(input_query, convert_to_tensor=True)
dataset_embedding = embedder.encode(dataset, convert_to_tensor=True)

# Keep the (up to) 5 chunks most similar to the query.
# The result holds one list of hits per query (a single query here, hence the [0] below);
# each hit is a dict carrying the chunk index ("corpus_id") and its similarity "score".
retrieved_knowledge = semantic_search(
    query_embeddings=query_embedding,
    corpus_embeddings=dataset_embedding,
    top_k=5,
)

print('Retrieved knowledge:')
for hit in retrieved_knowledge[0]:
    chunk_text = dataset[hit["corpus_id"]]
    similarity = hit["score"]
    print(f' - (similarity: {similarity:.2f}) {chunk_text}')

Prepare the inference pipeline using `transformers`.

In [None]:
import torch
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained(generator_model)
# Reuse the end-of-sequence token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU
# (-1) instead of failing on machines without a GPU.
generator_device = 0 if torch.cuda.is_available() else -1

pipeline = transformers.pipeline(
    "text-generation",
    model=generator_model,
    tokenizer=tokenizer,
    device=generator_device,
)

Run inference using the user query and a system prompt that contains the retrieved knowledge chunks.

In [None]:
# Render every retrieved chunk as a " - " bullet. The bullets are concatenated
# without a separator, so the line ending stored with each chunk is what splits them.
context_lines = [f' - {dataset[hit["corpus_id"]]}' for hit in retrieved_knowledge[0]]
context_block = ''.join(context_lines)

# System prompt: instructions followed by the retrieved chunks as the only allowed context.
instruction_prompt = (
    "You are a helpful chatbot.\n"
    "Use only the following pieces of context to answer the question. Don't make up any new information:\n"
    f"{context_block}\n"
)

# Chat-style input: the system message carries the context, the user message carries the question.
messages = [
    {"role": "system", "content": instruction_prompt},
    {"role": "user", "content": input_query},
]

outputs = pipeline(messages, max_new_tokens=1024)

Print the result.

In [None]:
from IPython.display import display, Markdown

# Flatten every message of every returned conversation into one Markdown document:
# the role becomes a level-1 heading and the message content follows as body text.
output = "".join(
    f"# {item['role']}\n\n{item['content']}\n\n"
    for turn in outputs
    for item in turn["generated_text"]
)

display(Markdown(output))

# Cleaning Up

Delete the pipeline and its associated model, then release the cached GPU memory.

In [None]:
import gc

import torch

# Drop the notebook's reference to the pipeline object that holds the generator model.
del pipeline
# Collect reference cycles before emptying the cache: objects that are still alive keep
# their tensors allocated, and empty_cache() only releases memory that is no longer in use.
gc.collect()
torch.cuda.empty_cache()