![Generating Embeddings](../../images/headings/02_retrieval_augmented_generation_04_02_shakespeare_rag.png)

# Retrieval-Augmented Generation on a Public Dataset

## Setup
### Imports

In [2]:
from datasets import load_dataset
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_aws import ChatBedrock, BedrockEmbeddings
from langchain_openai import AzureChatOpenAI
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
import os

### Models

In [3]:
gpt4 = AzureChatOpenAI(azure_deployment=os.getenv('AZURE_OPENAI_DEPLOYMENT'))
sonnet = ChatBedrock(model_id='anthropic.claude-3-sonnet-20240229-v1:0')

gpt4.name = 'GPT-4'
sonnet.name = 'Claude Sonnet v3'

In [4]:
llm = gpt4

### Vectorstore

In [5]:
user = os.getenv('LOGNAME')
print(f'Hello, {user}')

Hello, mbklein


In [11]:
connection = f'postgresql+psycopg://{user}:{user}@localhost:5432/{user}'
collection_name = 'tiny_shakespeare'
# embeddings = HuggingFaceEmbeddings(model_name='nomic-ai/nomic-embed-text-v1.5', model_kwargs={'trust_remote_code':True})
embeddings = BedrockEmbeddings(model_id='amazon.titan-embed-text-v2:0')

vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,  
)

## Create embeddings for Tiny Shakespeare dataset
### Load dataset

In [12]:
dataset = load_dataset("Trelis/tiny-shakespeare")

texts = dataset["train"]["Text"] + dataset["test"]["Text"]
print(f'Imported {len(texts)} texts')

print('First 100 characters of text #42:')
print(texts[42][:99])

Imported 521 texts
First 100 characters of text #42:
Mildly!

BRUTUS:
In this point charge him home, that he affects
Tyrannical power: if he evade us th


### Add dataset texts to vectorstore

In [13]:
vectorstore.add_texts(texts)

['d5b41fe5-0b65-4b1c-9855-244a133d0783',
 'e535aff9-e17d-4b03-8df9-76b456a067a4',
 '032a2e1c-c07e-4aa1-aec1-6a0979c2c9b3',
 '3a3d746f-0ce6-4deb-b09c-1c9b06231f54',
 'd916380b-4a98-4072-b420-8c363f8d7540',
 '423f1eb4-386d-4e50-8310-1feb96a53a13',
 '40908c35-49d9-4234-bbc8-490b39cb4a7f',
 '1a7c6d8f-3717-4fce-98cf-89218697d3af',
 'dd8d3cb9-ac21-4ec8-80e9-12d858f6851c',
 'e46402ec-5e5d-4c33-abe2-504b97cccd97',
 '3079972d-eb48-498d-b4db-e73d820ae7f3',
 'ca7e0650-2a6d-455b-987b-11862b5c0953',
 'ba32ca40-5e98-4174-bd11-f4efa72686cb',
 '764e02b2-ea63-4d49-a126-284de32df20e',
 'd2c0bb17-9836-4bea-86e6-971eed9080ad',
 '6100fb39-7ce0-4c0e-b88d-b8abd2deaad6',
 'fa757f1c-241f-455a-af4a-f1eb3a100b7c',
 '5bdae284-d856-42d4-b74f-bcce8f9bde63',
 '91010116-3cf7-4dcd-8958-83ddc3c8b82d',
 'fcb975ba-cd28-4485-8a0f-a643a6390b54',
 '43ffa46a-9ec3-45e8-b00c-ddc7b9f14d34',
 'fde3c6d3-37ef-44ac-abaf-6f8f44e36d97',
 '84709382-c4ab-441a-a4bd-6efbcdd36331',
 'f6f0d282-c338-49d9-a985-30c1ccba7c44',
 'e3aaca2d-f62e-

## Similarity search

In [14]:
vectorstore.similarity_search_with_relevance_scores("doctors and nurses", k=5)

[(Document(page_content="Scurvy knave! Pray you, sir, a word:\nand as I told you, my young lady bade me inquire you\nout; what she bade me say, I will keep to myself:\nbut first let me tell ye, if ye should lead her into\na fool's paradise, as they say, it were a very gross\nScurvy knave! Pray you, sir, a word:\nand as I told you, my young lady bade me inquire you\nout; what she bade me say, I will keep to myself:\nbut first let me tell ye, if ye should lead her into\nkind of behavior, as they say: for the gentlewoman\nis young; and, therefore, if you should deal double\nwith her, truly it were an ill thing to be offered\nto any gentlewoman, and very weak dealing.\n\nROMEO:\nNurse, commend me to thy lady and mistress. I\nprotest unto thee--\n\nNurse:\nGood heart, and, i' faith, I will tell her as much:\nLord, Lord, she will be a joyful woman.\n\nROMEO:\nWhat wilt thou tell her, nurse? thou dost not mark me.\n\nNurse:\nI will tell her, sir, that you do protest; which, as\nI take it, is 

## RAG using the Shakespeare Dataset

### Initialize helper function to format documents

In [None]:
def format_docs(docs, divider='', max_length=200):
    return f'\n{divider}\n'.join(doc.page_content[:max_length] for doc in docs)

### Set up the vectorstore as a retriever to feed documents into the LLM prompt

In [None]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 8})

### Create the prompt template

In [None]:
from langchain_core.prompts import PromptTemplate

template = """You are an expert at answering the linguistic and philological aspects of William Shakespeare's writing.
Use the provided Shakespeare passages to answer the question at the end.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
Cite the context provided in your response!

Context: {context}

Question: {question}

Helpful Answer:"""
prompt = PromptTemplate.from_template(template)

### Build the RAG chain
Combine all of the elements into a chain that will:
- retrieve relevant docs (using the vectorstore retriever)
- generate a prompt for the language model (using the prompt template)
- invoke the language model using the prompt
- parse the response

In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

### Generate the response

In [None]:
response = rag_chain_with_source.invoke("Describe Shakespeare's diction regarding doctors and nurses.")

### Explore the context documents

In [None]:
print(f"Context:\n{format_docs(response['context'], divider='-'*40)}")

### Display the language model's answer

In [None]:
print(f"LM Response: {response['answer']}")

## Exercises

- Modify the system prompt so that the language model focuses on a different aspect of the retrieved documents. Make a new application using words instead of code 😀

## Discussion Questions

- Use this example, and the previous notebooks to start coming up with ideas for the hackathon. Which aspects of the workshop have interested you the most?