# RAG Haystack Demo
**Demo for Iteration 2 of the prototype.**

Install the following packages before running the notebook:\
!pip install haystack-ai\
!pip install python-pptx\
!pip install python-docx\
!pip install pypdf2\
!pip install trafilatura\
!pip install unstructured-client\
!pip install unstructured-fileconverter-haystack\
!pip install unstructured\
!pip install sentence-transformers

In [None]:
from haystack import Document
from haystack import Pipeline
from haystack.components.builders import PromptBuilder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.joiners.document_joiner import DocumentJoiner
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils import Secret
from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter

In [None]:
import os
import warnings


def load_api_key(name):
    """Return the secret called ``name``, or ``None`` if it cannot be found.

    Lookup order:
      1. an environment variable that is already set;
      2. the Colab secret store (``google.colab.userdata``), when that module
         can be imported.

    The original cell used ``userdata`` without importing it, which raises
    ``NameError`` on a fresh kernel. The import is done lazily and guarded so
    the notebook also runs outside Colab.
    """
    value = os.environ.get(name)
    if value:
        return value
    try:
        from google.colab import userdata  # only importable on Google Colab
    except ImportError:
        return None
    # NOTE(review): errors raised by Colab for a missing/inaccessible secret
    # are left to propagate unchanged.
    return userdata.get(name)


# Groq API Key
GROQ_OPENAI_API_KEY = load_api_key('GROQ_OPENAI_API_KEY')

# Unstructured API Key
UNSTRUCTURED_API_KEY = load_api_key('UNSTRUCTURED_API_KEY')

# Export the keys so components that read them from the environment
# (e.g. Secret.from_env_var("GROQ_OPENAI_API_KEY")) can find them.
for _key_name, _key_value in (
    ('GROQ_OPENAI_API_KEY', GROQ_OPENAI_API_KEY),
    ('UNSTRUCTURED_API_KEY', UNSTRUCTURED_API_KEY),
):
    if _key_value:
        os.environ[_key_name] = _key_value
    else:
        warnings.warn(
            f"{_key_name} was not found in the environment or in Colab secrets; "
            "cells that need it will fail until it is set."
        )

# **Indexing**

In [None]:
# In-memory document store used by the writer and both retrievers in the cells
# below; "cosine" is selected as the embedding similarity function.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

### Converter

In [None]:
# Convert the demo PDF into Haystack documents and preview the first one.
converter = UnstructuredFileConverter()
result = converter.run(paths=["demo_guide.pdf"])

first_document = result["documents"][0]
print(first_document.content)

### Cleaner
Not used in this demo.

In [None]:
# Optional cleaning step (disabled in this demo). Uncomment to enable.
# from haystack.components.preprocessors import DocumentCleaner

# cleaner = DocumentCleaner(
#     ascii_only=True,
#     remove_empty_lines=True,
#     remove_extra_whitespaces=True,
#     remove_repeated_substrings=False)

# cleaned_result = cleaner.run(documents=result['documents'])
# print(cleaned_result['documents'][0].content)

### Splitter/Chunker

In [None]:
# Split the converted documents by passage: one passage per chunk, no overlap.
splitter = DocumentSplitter(
    split_by="passage",
    split_length=1,
    split_overlap=0,
)
split_result = splitter.run(documents=result["documents"])

# Show every chunk, each followed by a blank line.
for chunk in split_result["documents"]:
    print(chunk.content, end="\n\n")

### Embedder

In [None]:
# Attach an embedding to every chunk; warm_up() is called before run().
embedder = SentenceTransformersDocumentEmbedder()
embedder.warm_up()

# NOTE: `result` is rebound here, from the converter output to the embedder output.
result = embedder.run(documents=split_result["documents"])
first_embedding = result["documents"][0].embedding
print(first_embedding)

# Sample output: [-0.07804739475250244, 0.1498992145061493, ...]

### Writer (Load Embeddings)

In [None]:
# Write the embedded chunks into the document store.
# OVERWRITE keeps this cell re-runnable: writing the same documents again
# replaces the stored copies instead of failing on duplicate document ids.
document_writer = DocumentWriter(
    document_store=document_store,
    policy=DuplicatePolicy.OVERWRITE,
)
document_writer.run(documents=result["documents"])

# **Querying**

In [None]:
# Embed the user question so it can be used for vector search.
text_embedder = SentenceTransformersTextEmbedder()
text_embedder.warm_up()

query = "How do I change my password"
query_embedding = text_embedder.run(text=query)

# Print the question and its embedding, one per line.
print(query, query_embedding["embedding"], sep="\n")

### Retriever

In [None]:
# Vector search: retrieves from the store using a query embedding.
embedding_retriever = InMemoryEmbeddingRetriever(document_store=document_store)

# Keyword search: retrieves from the same store using the query text (BM25).
bm25_retriever = InMemoryBM25Retriever(document_store=document_store)

In [None]:
# First document returned by the embedding retriever.
retrieved_result = embedding_retriever.run(
    query_embedding=query_embedding["embedding"]
)
print(retrieved_result["documents"][0].content)

# First document returned by the BM25 retriever. `retrieved_result` is rebound,
# so the prompt-builder cell further down reads the BM25 output.
retrieved_result = bm25_retriever.run(query=query)
print(retrieved_result["documents"][0].content)

### Document Joiner (For Hybrid Retrieval)
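Not used in this single-document walkthrough; a `DocumentJoiner` is used in the hybrid query pipeline of the "RAG with 2 documents" section below.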

In [None]:
# from haystack.components.joiners import DocumentJoiner

# document_joiner = DocumentJoiner()

### Prompt Builder

In [None]:
# Build the LLM prompt from the query and the first retrieved document.
# The original template ran the instruction straight into the "Context:" label
# ("...following contextContext: ..."); a separator is added between them.
prompt_template = (
    "Answer the query '{{ query }}' using the following context. "
    "Context: {{ context }}; Answer:"
)
builder = PromptBuilder(template=prompt_template)

# NOTE(review): `input` shadows the Python builtin of the same name. The name is
# kept because the generation cell reads `input['prompt']`.
input = builder.run(query=query, context=retrieved_result['documents'][0].content)
print(input)

In [None]:
# OpenAI-style generator pointed at the Groq endpoint. The API key is read from
# the GROQ_OPENAI_API_KEY environment variable; temperature is fixed at 0.
# NOTE(review): confirm the model id below is still served by the provider.
llm = OpenAIGenerator(
    api_key=Secret.from_env_var("GROQ_OPENAI_API_KEY"),
    api_base_url="https://api.groq.com/openai/v1",
    model="llama3-8b-8192",
    generation_kwargs={"temperature": 0},
)

In [None]:
# Send the built prompt to the model and show its first reply.
response = llm.run(prompt=input["prompt"])
first_reply = response["replies"][0]
print(first_reply)

# RAG with 2 documents


1.   Outdated Guide
2.   Reference Material



In [None]:
# ---------------------------------------------------------------------------
# Indexing: convert -> split -> embed -> write, over a fresh document store.
# ---------------------------------------------------------------------------
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

indexing_pipeline = Pipeline()
indexing_pipeline.add_component(name="converter", instance=UnstructuredFileConverter())
indexing_pipeline.add_component(
    name="splitter",
    instance=DocumentSplitter(split_by="passage", split_length=1, split_overlap=0),
)
indexing_pipeline.add_component(name="embedder", instance=SentenceTransformersDocumentEmbedder())
indexing_pipeline.add_component(name="writer", instance=DocumentWriter(document_store=document_store))

# Each stage hands its documents to the next one.
indexing_pipeline.connect("converter.documents", "splitter.documents")
indexing_pipeline.connect("splitter.documents", "embedder.documents")
indexing_pipeline.connect("embedder.documents", "writer.documents")

indexing_pipeline.run(
    {"converter": {"paths": ["demo_guide.pdf", "demo_reference.pdf"]}}
)

# ---------------------------------------------------------------------------
# Querying: embedding retrieval and BM25 retrieval, merged by a joiner.
# ---------------------------------------------------------------------------
query_pipeline = Pipeline()
query_pipeline.add_component(name="text_embedder", instance=SentenceTransformersTextEmbedder())
query_pipeline.add_component(
    name="embedding_retriever",
    instance=InMemoryEmbeddingRetriever(document_store=document_store),
)
query_pipeline.add_component(
    name="bm25_retriever",
    instance=InMemoryBM25Retriever(document_store=document_store),
)
query_pipeline.add_component(name="document_joiner", instance=DocumentJoiner(join_mode="merge"))

# The query embedding feeds the embedding retriever; both retrievers feed the joiner.
query_pipeline.connect("text_embedder.embedding", "embedding_retriever.query_embedding")
query_pipeline.connect("bm25_retriever.documents", "document_joiner.documents")
query_pipeline.connect("embedding_retriever.documents", "document_joiner.documents")

query = "How do I access the project management system"

# The same question is given to the text embedder and to the BM25 retriever.
result = query_pipeline.run(
    {
        "text_embedder": {"text": query},
        "bm25_retriever": {"query": query},
    }
)

The first three documents retrieved for the query "How do I access the project management system":

In [None]:
# Print the content of up to the first three joined documents. Slicing instead
# of indexing 0..2 avoids an IndexError when fewer than three are returned.
for joined_document in result['document_joiner']['documents'][:3]:
    print(joined_document.content)