# Retrieval Augmented Generation

## Configure the chat model

In [1]:
import os
from dotenv import load_dotenv

# Load settings from the .env file two directories above this notebook so that
# later os.getenv(...) lookups (e.g. AZURE_OPENAI_DEPLOYMENT) can find them.
load_dotenv("../../.env")

True

In [2]:
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import AzureChatOpenAI

In [3]:
# Chat model client; the Azure deployment name is taken from the environment.
llm = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
)

## Initialize the Vectorstore

### Import the required classes and initialize the `PGVector` vectorstore.

In [4]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

# Connection settings for the Postgres/pgvector database.
# The connection string (which embeds the database password) can be overridden
# through the PGVECTOR_CONNECTION environment variable -- for example in the same
# .env file that configures the chat model -- so real credentials never have to
# be committed with the notebook. The fallback is the original local demo database.
connection = os.getenv(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://postgres:code4lib@localhost:5432/code4lib",
)
collection_name = "code4lib2024"

# Embedding model used both for indexing and for querying.
# NOTE(review): trust_remote_code=True lets the model repository run its own
# Python code on load -- presumably required by this nomic model; confirm.
embeddings = HuggingFaceEmbeddings(
    model_name='nomic-ai/nomic-embed-text-v1.5',
    model_kwargs={'trust_remote_code': True},
)

# Vector store backed by the collection named above; metadata is stored as JSONB.
vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

print(f"Vectorstore collection name: {vectorstore.collection_name}")

  from .autonotebook import tqdm as notebook_tqdm
<All keys matched successfully>


Vectorstore collection name: code4lib2024


## Index your data

In [5]:
# Sample library facts to index, one row per document: (id, topic, text).
library_facts = [
    (1, "borrowing", "Interlibrary loan requests can be made online or at the service desk"),
    (2, "borrowing", "Course reserves are available for checkout at the circulation desk"),
    (3, "reservations", "Study rooms can be reserved up to two weeks in advance"),
    (4, "workshops", "Library workshops on database research are held monthly"),
    (5, "online resources", "Access to digital archives is available through the library portal"),
    (6, "borrowing", "Renew your borrowed items online or at any library kiosk"),
    (7, "borrowing", "Special collections can be accessed in the reading room"),
    (8, "facilities", "Library orientation tours are available for new users"),
    (9, "facilities", "The library offers free Wi-Fi to all visitors"),
    (10, "printing services", "Photocopying and printing services are available on the ground floor"),
]

# Every document shares the same "location"; only id, topic and text vary.
docs = [
    Document(
        page_content=text,
        metadata={"id": doc_id, "location": "library", "topic": topic},
    )
    for doc_id, topic, text in library_facts
]

# Store the documents under the ids carried in their metadata; the call's
# return value is displayed as the cell output.
vectorstore.add_documents(docs, ids=[entry.metadata["id"] for entry in docs])

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

## Generation

In [6]:
def format_docs(docs):
    """Combine the text of several documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in their original order, with a blank line
        between consecutive entries (an empty string when ``docs`` is empty).
    """
    page_texts = [entry.page_content for entry in docs]
    return "\n\n".join(page_texts)

In [7]:
# Retriever over the vector store using score-threshold search:
# k=8 and score_threshold=0.4 are passed straight through as search kwargs.
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=dict(k=8, score_threshold=0.4),
)

In [8]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

# Prompt that supplies the retrieved context and the question, and asks the
# model to admit when it does not know and to cite the context it used.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
Cite the context provided in your response!

Context: {context}

Question: {question}

Helpful Answer:"""
prompt = PromptTemplate.from_template(template)

# Answer step: replace the list of context documents with their joined text,
# fill the prompt, call the chat model, and reduce the reply to a plain string.
flatten_context = RunnablePassthrough.assign(
    context=lambda inputs: format_docs(inputs["context"])
)
rag_chain_from_docs = flatten_context | prompt | llm | StrOutputParser()

# Full pipeline: run the retriever and pass the question through side by side,
# then attach the generated text under "answer" so the sources stay available.
retrieve_and_forward = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
)
rag_chain_with_source = retrieve_and_forward.assign(answer=rag_chain_from_docs)

In [9]:
# Run the whole pipeline; the result holds "context", "question" and "answer".
response = rag_chain_with_source.invoke(
    "Does the library have an ILL program?"
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
# Show only the generated answer text from the pipeline result.
print(response["answer"])

Yes, the library does have an Interlibrary Loan (ILL) program. This is indicated by the context stating that "Interlibrary loan requests can be made online or at the service desk" and also mentioning that "Interlibrary loan has been temporarily suspended due to a party in the break room."


In [11]:
# List the documents the retriever supplied as context for this response.
for retrieved in response["context"]:
    print(
        f"Document ID: {retrieved.metadata['id']}",
        f"Content: {retrieved.page_content}",
        "-" * 40,
        sep="\n",
    )

Document ID: 8
Content: Library orientation tours are available for new users
----------------------------------------
Document ID: 1
Content: Interlibrary loan requests can be made online or at the service desk
----------------------------------------
Document ID: 7
Content: Special collections can be accessed in the reading room
----------------------------------------
Document ID: 5
Content: Access to digital archives is available through the library portal
----------------------------------------
Document ID: 9
Content: The library offers free Wi-Fi to all visitors
----------------------------------------
Document ID: 6
Content: Renew your borrowed items online or at any library kiosk
----------------------------------------
Document ID: 11
Content: Interlibrary loan has been temporarily suspended due to a party in the break room
----------------------------------------
Document ID: 2
Content: Course reserves are available for checkout at the circulation desk
----------------------

### Let's update the index with a new document!

In [12]:
# A news item about interlibrary loan, stored under id 11.
suspension_notice = Document(
    page_content="Interlibrary loan has been temporarily suspended due to a party in the break room",
    metadata={"id": 11, "location": "library", "topic": "news"},
)
updated_docs = [suspension_notice]

# Store it under the id carried in its metadata; the call's return value is
# displayed as the cell output.
vectorstore.add_documents(
    updated_docs,
    ids=[item.metadata["id"] for item in updated_docs],
)

[11]

In [13]:
# Ask a follow-up question through the same pipeline.
response = rag_chain_with_source.invoke(
    "Can I head down from my office to the ILL desk right now?"
)

In [14]:
# Dump the raw result dict (retrieved context documents, question, answer) for inspection.
print(response)

{'context': [Document(page_content='Library orientation tours are available for new users', metadata={'id': 8, 'topic': 'facilities', 'location': 'library'}), Document(page_content='Interlibrary loan requests can be made online or at the service desk', metadata={'id': 1, 'topic': 'borrowing', 'location': 'library'}), Document(page_content='Course reserves are available for checkout at the circulation desk', metadata={'id': 2, 'topic': 'borrowing', 'location': 'library'}), Document(page_content='Study rooms can be reserved up to two weeks in advance', metadata={'id': 3, 'topic': 'reservations', 'location': 'library'}), Document(page_content='Photocopying and printing services are available on the ground floor', metadata={'id': 10, 'topic': 'printing services', 'location': 'library'}), Document(page_content='Special collections can be accessed in the reading room', metadata={'id': 7, 'topic': 'borrowing', 'location': 'library'}), Document(page_content='Interlibrary loan has been temporar

In [15]:
# Show only the generated answer text from the pipeline result.
print(response["answer"])

Based on the provided context, you cannot head down to the Interlibrary Loan (ILL) desk right now because interlibrary loan has been temporarily suspended due to a party in the break room.


In [16]:
# List the documents the retriever supplied as context for this response.
divider = "-" * 40
for hit in response["context"]:
    hit_id = hit.metadata['id']
    print(f"Document ID: {hit_id}")
    print(f"Content: {hit.page_content}")
    print(divider)

Document ID: 8
Content: Library orientation tours are available for new users
----------------------------------------
Document ID: 1
Content: Interlibrary loan requests can be made online or at the service desk
----------------------------------------
Document ID: 2
Content: Course reserves are available for checkout at the circulation desk
----------------------------------------
Document ID: 3
Content: Study rooms can be reserved up to two weeks in advance
----------------------------------------
Document ID: 10
Content: Photocopying and printing services are available on the ground floor
----------------------------------------
Document ID: 7
Content: Special collections can be accessed in the reading room
----------------------------------------
Document ID: 11
Content: Interlibrary loan has been temporarily suspended due to a party in the break room
----------------------------------------
Document ID: 5
Content: Access to digital archives is available through the library portal
