![Retrieval-Augmented Generation on a Public Dataset](../../images/headings/02_retrieval_augmented_generation_04_02_shakespeare_rag.png)

# Retrieval-Augmented Generation on a Public Dataset

## Setup
### Imports

In [None]:
from datasets import load_dataset
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_aws import ChatBedrock
from langchain_openai import AzureChatOpenAI
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
import os

### Models

In [None]:
# Azure-hosted OpenAI chat model; the deployment name is read from the
# AZURE_OPENAI_DEPLOYMENT environment variable.
gpt4 = AzureChatOpenAI(
    azure_deployment=os.getenv('AZURE_OPENAI_DEPLOYMENT'),
)
gpt4.name = 'GPT-4'

# Claude 3 Sonnet served through Amazon Bedrock.
sonnet = ChatBedrock(
    model_id='anthropic.claude-3-sonnet-20240229-v1:0',
)
sonnet.name = 'Claude Sonnet v3'

In [None]:
# Model that the RAG chain further down is built on; assign `sonnet` here
# instead to run the same chain against Claude.
llm = gpt4

### Vectorstore

In [None]:
# Resolve the current OS account name; it is reused below as the PostgreSQL
# role, password and database name in the connection string.
# LOGNAME is not exported in every environment (e.g. Windows or minimal
# containers), so fall back to the other conventional variables rather than
# silently building a connection string out of None.
user = os.getenv('LOGNAME') or os.getenv('USER') or os.getenv('USERNAME')
print(f'Hello, {user}')

In [None]:
# Embedding model used to vectorize both the stored texts and the queries.
embeddings = HuggingFaceEmbeddings(
    model_name='nomic-ai/nomic-embed-text-v1.5',
    model_kwargs={'trust_remote_code': True},
)

# Local PostgreSQL instance: the OS user name doubles as role, password and
# database name.
collection_name = 'tiny_shakespeare'
connection = f'postgresql+psycopg://{user}:{user}@localhost:5432/{user}'

# PGVector-backed store holding the embedded texts for this collection.
vectorstore = PGVector(
    connection=connection,
    collection_name=collection_name,
    embeddings=embeddings,
    use_jsonb=True,
)

## Create embeddings for the Tiny Shakespeare dataset
### Load dataset

In [None]:
# Fetch the Tiny Shakespeare dataset via the Hugging Face `datasets` library.
dataset = load_dataset("Trelis/tiny-shakespeare")

# Pool the train and test splits into a single corpus. Wrapping each column in
# list() keeps the concatenation working whether column access yields a plain
# list or a lazy column object.
texts = list(dataset["train"]["Text"]) + list(dataset["test"]["Text"])
print(f'Imported {len(texts)} texts')

# Peek at one sample; slice exactly 100 characters so the output matches the
# label (the previous [:99] slice showed only 99).
print('First 100 characters of text #42:')
print(texts[42][:100])

### Add dataset texts to vectorstore

In [None]:
# Embed every text and store it in the collection.
# Deterministic, collection-scoped ids are passed so that re-running this cell
# targets the same rows instead of adding a second copy of every document.
# NOTE(review): this relies on PGVector updating rows whose id already exists;
# confirm for the installed langchain_postgres version.
doc_ids = [f'{collection_name}-{i}' for i in range(len(texts))]
added_ids = vectorstore.add_texts(texts, ids=doc_ids)

# Report a count rather than echoing the full list of ids as the cell output.
print(f'Stored {len(added_ids)} texts in collection {collection_name!r}')

## Similarity search

In [None]:
# Sanity-check retrieval: request the five closest passages to a free-text
# query, each paired with its relevance score.
vectorstore.similarity_search_with_relevance_scores(
    "doctors and nurses",
    k=5,
)

## RAG using the Shakespeare Dataset

### Define a helper function to format documents

In [None]:
def format_docs(docs, divider='', max_length=200):
    """Join the (truncated) text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
        divider: Text placed on its own line between consecutive documents.
        max_length: Upper bound on the characters kept from each document.

    Returns:
        The truncated document texts joined by ``"\\n" + divider + "\\n"``;
        an empty string when ``docs`` is empty.
    """
    separator = '\n{}\n'.format(divider)
    snippets = []
    for doc in docs:
        # Keep only the leading part of each passage.
        snippets.append(doc.page_content[:max_length])
    return separator.join(snippets)

### Set up the vectorstore as a retriever to feed documents into the LLM prompt

In [None]:
# Expose the vectorstore through the retriever interface, passing k=8 as the
# search argument.
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=8),
)

### Create the prompt template

In [None]:
from langchain_core.prompts import PromptTemplate

# Prompt text sent to the model. It carries two placeholders: {context}
# (the formatted passages) and {question} (the user's query).
template = """You are an expert at answering the linguistic and philological aspects of William Shakespeare's writing.
Use the provided Shakespeare passages to answer the question at the end.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
Cite the context provided in your response!

Context: {context}

Question: {question}

Helpful Answer:"""
# Build the PromptTemplate object from the template string above.
prompt = PromptTemplate.from_template(template)

### Build the RAG chain
Combine all of the elements into a chain that will:
- retrieve relevant docs (using the vectorstore retriever)
- generate a prompt for the language model (using the prompt template)
- invoke the language model using the prompt
- parse the response

In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Answering stage: swap the retrieved documents under "context" for their
# formatted text, fill the prompt, call the model and return plain text.
rag_chain_from_docs = (
    RunnablePassthrough.assign(
        context=lambda inputs: format_docs(inputs["context"]),
    )
    | prompt
    | llm
    | StrOutputParser()
)

# Full pipeline: the same input is sent to the retriever (-> "context") and
# passed through unchanged (-> "question"); the answering stage then adds an
# "answer" key next to them so the source documents stay inspectable.
rag_chain_with_source = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

### Generate the response

In [None]:
# Run the whole pipeline for one question; the result holds the "context",
# "question" and "answer" keys.
response = rag_chain_with_source.invoke(
    "Describe Shakespeare's diction regarding doctors and nurses."
)

### Explore the context documents

In [None]:
# Show the retrieved passages, separated by a 40-dash rule.
print("Context:\n" + format_docs(response['context'], divider='-' * 40))

### Display the language model's answer

In [None]:
# Show the text produced by the model.
print("LM Response: {}".format(response['answer']))