## Document Ingestion

In [1]:
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings
import os
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader

# Read key/value pairs from a local .env file (if one is found) into
# os.environ, so the os.environ[...] lookups in later cells can resolve.
load_dotenv()

True

In [None]:
# Azure OpenAI embedding client. The endpoint is taken from the environment;
# the embedding deployment name is fixed in this notebook.
_embedding_settings = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "deployment": "text-embedding-3-large",
}
embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(**_embedding_settings)

In [None]:
# Azure AI Search vector store. Endpoint, key and index name all come from the
# environment; text is embedded through `embeddings.embed_query`.
_search_endpoint = os.environ["AZURE_AI_SEARCH_URI"]
_search_key = os.environ["VECTOR_STORE_PASSWORD"]
_search_index = os.environ["VECTOR_STORE_INDEX"]

vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=_search_endpoint,
    azure_search_key=_search_key,
    index_name=_search_index,
    embedding_function=embeddings.embed_query,
    search_type="hybrid",
)

In [None]:
## This example loads every file under a directory as plain text and splits it
## into chunks of up to 1000 characters with 10 characters of overlap between
## chunks. TextLoader only reads plain-text files; formats such as .docx or .pdf
## need their own document loaders, so keep the directory limited to text files.

import os
from pprint import pprint

docs = []  # All chunked documents, accumulated across every file under `root`

root = "./data"  # replace with your directory
if not os.path.isdir(root):
    # os.walk() yields nothing for a missing directory, which would otherwise
    # look like a successful ingestion of zero documents.
    raise FileNotFoundError(f"Document directory not found: {root!r}")

# Split text into chunks of up to 1000 characters, with 10 characters of
# overlap between consecutive chunks.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)

for dirpath, _dirnames, filenames in os.walk(root):
    for filename in filenames:
        filepath = os.path.join(dirpath, filename)
        # Decode as UTF-8 rather than the platform default encoding; if that
        # fails, let the loader try to detect the file's encoding instead.
        loader = TextLoader(filepath, encoding="utf-8", autodetect_encoding=True)
        docs.extend(text_splitter.split_documents(loader.load()))

pprint(docs)

In [None]:
# Send every chunk collected in `docs` to the vector store, i.e. the search
# index named by VECTOR_STORE_INDEX.
vector_store.add_documents(documents=docs) # Add documents to the vector store

In [None]:
# Smoke test: query the vector store for "CDW"; as the last expression in the
# cell, the returned documents are shown as the cell output.
vector_store.similarity_search("CDW") # Test retrieval of similar documents

## RAG Q&A



In [None]:
from langchain_community.retrievers import AzureAISearchRetriever
from langchain_openai import AzureChatOpenAI


# Retrieve from the same index the ingestion cells write to
# (VECTOR_STORE_INDEX), so the Q&A chain sees the documents that were just
# added. Falls back to "financial-research" when the variable is not set.
retriever = AzureAISearchRetriever(
    content_key="content",  # document field holding the chunk text
    top_k=5,  # number of documents to fetch per query
    index_name=os.environ.get("VECTOR_STORE_INDEX", "financial-research"),
)

In [None]:
# Azure OpenAI chat model. Each constructor argument is read from the
# environment variable it is paired with below.
_llm_env_vars = {
    "azure_endpoint": "AZURE_OPENAI_ENDPOINT",
    "azure_deployment": "AZURE_OPENAI_DEPLOYMENT_NAME",
    "openai_api_version": "AZURE_OPENAI_API_VERSION",
}
llm = AzureChatOpenAI(
    **{kwarg: os.environ[var] for kwarg, var in _llm_env_vars.items()}
)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# The prompt must be a Runnable template, not a bare string: a plain `str`
# cannot be piped into the chain (dict | str raises TypeError), and it has no
# placeholders through which the retrieved context and the user's question
# could reach the model.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You're a helpful assistant answering user questions about CDW. "
            "Use the following retrieved context to answer the question. "
            "If the context does not contain the answer, say that you don't "
            "know.\n\n"
            "Context:\n{context}",
        ),
        ("human", "{question}"),
    ]
)


def format_docs(docs):
    """Join the page content of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents separated by blank lines, in input order.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The input question is sent to the retriever (whose results are flattened by
# format_docs into "context") and also passed through unchanged as "question";
# both fill the prompt, which is sent to the LLM and parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
