# RAG

## 1a_rag_basics

In [21]:
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings

In [22]:
# Define the directory containing the text file and the persistent directory
current_dir = os.path.dirname(os.getcwd())
file_path = os.path.join(current_dir, "notebooks", "documents", "langchain_demo.txt")
persistent_directory = os.path.join(current_dir, "notebooks", "db", "chroma_db")

In [23]:
# Read the text content from the file
loader = TextLoader(file_path)
documents = loader.load()

In [24]:
len(documents)

1

In [25]:
# Split the document into chunks
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

Created a chunk of size 451, which is longer than the specified 200
Created a chunk of size 1203, which is longer than the specified 200
Created a chunk of size 916, which is longer than the specified 200


In [26]:
# Display information about the split documents
print("\n--- Document Chunks Information ---")
print(f"Number of document chunks: {len(docs)}")
print(f"Sample chunk:\n{docs[0].page_content}\n")


--- Document Chunks Information ---
Number of document chunks: 5
Sample chunk:
LangChain: A Framework for LLM-Powered Applications
LangChain is a powerful and flexible framework designed to simplify the development of applications that harness the capabilities of large language models (LLMs). It provides a wide range of tools, abstractions, and integrations that help developers build, customize, and optimize applications that leverage LLMs for tasks like text generation, question answering, summarization, chatbots, and more.



In [27]:
# Create embeddings
print("\n--- Creating embeddings ---")
embeddings = OllamaEmbeddings(model="nomic-embed-text")
print("\n--- Finished creating embeddings ---")


--- Creating embeddings ---

--- Finished creating embeddings ---


In [29]:
# Create the vector store and persist it automatically
print("\n--- Creating vector store ---")
db = Chroma.from_documents(docs, embeddings, persist_directory=persistent_directory)
print("\n--- Finished creating vector store ---")


--- Creating vector store ---

--- Finished creating vector store ---


In [30]:
# Define the user's question
query = "What is LangChain?"

In [None]:
# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 2, "score_threshold": 0.7},
)

In [41]:
relevant_docs = retriever.invoke(query)

In [42]:
# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")


--- Relevant Documents ---
Document 1:
LangChain: A Framework for LLM-Powered Applications
LangChain is a powerful and flexible framework designed to simplify the development of applications that harness the capabilities of large language models (LLMs). It provides a wide range of tools, abstractions, and integrations that help developers build, customize, and optimize applications that leverage LLMs for tasks like text generation, question answering, summarization, chatbots, and more.

Source: /home/mrego/Projects/workspace/langchain-notebook/notebooks/documents/langchain_demo.txt

Document 2:
LangChain: A Framework for LLM-Powered Applications
LangChain is a powerful and flexible framework designed to simplify the development of applications that harness the capabilities of large language models (LLMs). It provides a wide range of tools, abstractions, and integrations that help developers build, customize, and optimize applications that leverage LLMs for tasks like text generation, 