# RAG Part 1: Indexing Your Data

## Converting Your Documents into Text

In [2]:
from langchain_community.document_loaders import TextLoader

# Build a loader for the sample text file and read it. The result of load()
# is left as the cell's last expression so the notebook displays it.
loader = TextLoader(file_path='./example.txt')
loader.load()

[Document(metadata={'source': './example.txt'}, page_content='TXT test file\nPurpose: Provide example of this file type\nDocument file type: TXT\nVersion: 1.0\nRemark:\n\nExample content:\nThe names "John Doe" for males, "Jane Doe" or "Jane Roe" for females, or "Jonnie Doe" and "Janie Doe" for children, or just "Doe" non-gender-specifically are used as placeholder names for a party whose true identity is unknown or must be withheld in a legal action, case, or discussion. The names are also used to refer to acorpse or hospital patient whose identity is unknown. This practice is widely used in the United States and Canada, but is rarely used in other English-speaking countries including the United Kingdom itself, from where the use of "John Doe" in a legal context originates. The names Joe Bloggs or John Smith are used in the UK instead, as well as in Australia and New Zealand.\n\nJohn Doe is sometimes used to refer to a typical male in other contexts as well, in a similar manner to John

In [3]:
from langchain_community.document_loaders import WebBaseLoader

# Fetch the LangChain home page and display what the loader returns.
# NOTE(review): the recorded output shows a "USER_AGENT environment variable
# not set" warning; exporting USER_AGENT before this cell runs would presumably
# silence it — confirm.
loader = WebBaseLoader(web_path='https://www.langchain.com/')
loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://www.langchain.com/', 'title': 'LangChain', 'description': 'LangChain provides the engineering platform and open source frameworks developers use to build, test, and deploy reliable AI agents.', 'language': 'en'}, page_content="LangChain\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nProducts\n\nOpen Source FrameworksLangChainQuick start agents with any model providerLangGraphBuild custom agents with low-level controlDeep AgentsNewUse planning, memory, and sub-agents for complex, long-running tasksLangSmithObservabilityDebug and monitor in-depth tracesEvaluationIterate on prompts and modelsDeploymentShip and scale agents in productionResources\n\nLangChain AcademyBlogCustomer StoriesCommunityEventsChangelogGuidesDocsCompany\n\nAboutCareersPricingGet a demoSign upEngineer reliable agentsShip agents to production with LangChain's comprehensive platform for agent engineering.Request a demoSign up\n\nWe've raised a $125M\xa0Series B to build the platfo

In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Read the PDF from disk and keep the loader's result in `pages`
# (nothing is displayed because the cell ends with an assignment).
loader = PyPDFLoader(file_path='./CV_Llorens_Eng.pdf')
pages = loader.load()

## Splitting Your Text into Chunks

In [5]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Step 1: load the sample text file.
loader = TextLoader(file_path='./example.txt')
docs = loader.load()

# Step 2: split the loaded documents using a chunk size of 1000.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
splitted_docs = splitter.split_documents(documents=docs)

In [9]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter
)

# A tiny Python program used as input for the language-aware splitter.
PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""

# Configure the splitter for Python source, with a chunk size of 50 and no
# overlap between chunks.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=0,
)

# Wrap the resulting chunks in Document objects and display them.
python_docs = python_splitter.create_documents(texts=[PYTHON_CODE])
python_docs

[Document(metadata={}, page_content='def hello_world():\n    print("Hello, World!")'),
 Document(metadata={}, page_content='# Call the function\nhello_world()')]

In [10]:
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# A short Markdown sample used as input for the language-aware splitter.
markdown_text = """
# LangChain
⚡ Building applications with LLMs through composability ⚡
## Quick Install
```bash
pip install langchain
```
As an open source project in a rapidly developing field, we are extremely open
 to contributions.
"""

# Configure the splitter for Markdown, with a chunk size of 60 and no overlap.
md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    chunk_size=60,
    chunk_overlap=0,
)

# Split the sample; the metadata dict is paired with the single input text.
md_docs = md_splitter.create_documents(
    texts=[markdown_text],
    metadatas=[{'source': 'https://www.langchain.com'}],
)

md_docs

[Document(metadata={'source': 'https://www.langchain.com'}, page_content='# LangChain'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='⚡ Building applications with LLMs through composability ⚡'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='## Quick Install\n```bash\npip install langchain'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='```'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='As an open source project in a rapidly developing field, we'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='are extremely open'),
 Document(metadata={'source': 'https://www.langchain.com'}, page_content='to contributions.')]

### Generating Text Embeddings

In [13]:
from langchain_ollama import OllamaEmbeddings

# Embeddings client backed by the local Ollama 'llama3.1' model.
model = OllamaEmbeddings(model='llama3.1')

# Five short sentences to embed in a single call.
sample_sentences = [
    'Hi there!',
    'Oh, hello!',
    "What's your name?",
    "My friends call me World",
    "Hello World!",
]
embeddings = model.embed_documents(sample_sentences)

# Display how many embedding vectors came back.
len(embeddings)

5

In [14]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings

# Step 1: load the document.
loader = TextLoader(file_path='./example.txt')
doc = loader.load()

# Step 2: split the document using a chunk size of 1000.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
chunks = text_splitter.split_documents(documents=doc)

# Step 3: embed the text of every chunk and display the resulting vectors.
embeddings_model = OllamaEmbeddings(model='llama3.1')
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embeddings_model.embed_documents(chunk_texts)
embeddings

[[-0.0073631224,
  -0.035248995,
  -0.0027953458,
  0.017723948,
  0.012315544,
  -0.005107892,
  -0.020120393,
  0.0008662115,
  -0.008043993,
  0.0050256564,
  0.022380976,
  -0.0040493556,
  -0.015227324,
  0.001689767,
  -0.0013902005,
  0.013634189,
  0.010053633,
  0.001749399,
  -0.010449196,
  -0.010132344,
  -0.0019116185,
  -0.0074562044,
  0.0031002818,
  -0.0028385057,
  -0.019594219,
  0.006650144,
  -0.00857341,
  -0.022686426,
  0.020569058,
  -0.013656443,
  -0.018588657,
  0.0105829835,
  0.0026838053,
  0.020710656,
  0.026696648,
  -0.01415281,
  -0.024361864,
  0.012335739,
  0.015016981,
  0.035045084,
  -0.008356808,
  0.005228386,
  0.0013706016,
  -0.012333743,
  0.0023424986,
  -0.016569933,
  -0.0022862498,
  0.0041110488,
  0.007828318,
  -0.017267732,
  0.010184016,
  0.0060257646,
  -0.015473363,
  -0.0002670708,
  0.035875935,
  0.005226497,
  -0.0150096575,
  0.005476281,
  -0.012803419,
  0.0037719016,
  -0.023759725,
  0.014543051,
  0.00039788574,
  0.

## Storing Embeddings in a Vector Store

Vector store capabilities have recently been extended to PostgreSQL via the `pgvector` extension. This enables you to use the same database you're already familiar with to power both your transactional tables and your vector search tables.

### Getting Set Up with PGVector

In [16]:
# Confirm that the Docker CLI is available and print its version.
! docker --version

Docker version 28.5.1, build e180ab8


In [23]:
# Start the pgvector/pgvector:pg16 image in the background (-d) as a container
# named "pgvector-container". The user, password and database are all set to
# "langchain", and host port 6024 is mapped to container port 5432.
! docker run --name pgvector-container -e POSTGRES_USER=langchain -e POSTGRES_PASSWORD=langchain -e POSTGRES_DB=langchain -p 6024:5432 -d pgvector/pgvector:pg16

daa7394e1f1c50a5ea59852cb1e4bfa2a2eafd4dc247295f1ac39b7aeec593ca


In [24]:
# List the running containers to check that pgvector-container is up.
! docker ps

CONTAINER ID   IMAGE                    COMMAND                  CREATED         STATUS         PORTS                                         NAMES
daa7394e1f1c   pgvector/pgvector:pg16   "docker-entrypoint.s…"   6 seconds ago   Up 6 seconds   0.0.0.0:6024->5432/tcp, [::]:6024->5432/tcp   pgvector-container


### Working with Vector Stores

In [25]:
from langchain_community.document_loaders import TextLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_postgres.vectorstores import PGVector
from langchain_core.documents import Document
import uuid

# Read the sample file and split it using a chunk size of 1000.
raw_documents = TextLoader(file_path='./example.txt').load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
documents = text_splitter.split_documents(documents=raw_documents)

# Embed every chunk and insert it into the Postgres-backed vector store.
# The connection string targets localhost port 6024 with the "langchain"
# user, password and database.
embeddings_model = OllamaEmbeddings(model='llama3.1')
connection = 'postgresql+psycopg://langchain:langchain@localhost:6024/langchain'
db = PGVector.from_documents(
    documents=documents,
    embedding=embeddings_model,
    connection=connection,
)

In [27]:
# Ask the vector store for the 4 stored documents most similar to the query 'TXT'.
db.similarity_search(query='TXT', k=4)

[Document(id='1e73c05c-5703-46a5-8192-0b2dc52ce2b1', metadata={'source': './example.txt'}, page_content='John Doe is sometimes used to refer to a typical male in other contexts as well, in a similar manner to John Q. Public, known in Great Britain as Joe Public, John Smith or Joe Bloggs. For example, the first name listed on a form is often John Doe, along with a fictional address or other fictional information to provide an example of how to fill in the form. The name is also used frequently in popular culture, for example in the Frank Capra film Meet John Doe. John Doe was also the name of a 2002 American television series.'),
 Document(id='b278b4d6-8675-46dc-a0b0-11ba400472fa', metadata={'source': './example.txt'}, page_content='File created by https://www.online-convert.com\nMore example files: https://www.online-convert.com/file-type\nText of Example content: Wikipedia (https://en.wikipedia.org/wiki/John_Doe)\nLicense: Attribution-ShareAlike 4.0 (https://creativecommons.org/licens

This method finds the most relevant documents by following this process:

* The search query—in this case, the word `TXT`—will be sent to the embeddings model to retrieve its embedding.
* Then, it will run a query on Postgres to find the N (in this case, 4) previously stored embeddings that are most similar to your query.
* Finally, it will fetch the text content and metadata that relate to each of those embeddings.
* The method can now return a list of `Document` objects sorted by how similar they are to the query.

In [28]:
# Generate one random UUID string per document so that each document can be
# addressed by its id afterwards.
ids = [str(uuid.uuid4()) for _ in range(2)]

# Insert two extra documents under those ids; the value returned by
# add_documents() is the cell's displayed output.
db.add_documents(
    [
        Document(
            page_content="there are cats in the pond",
            metadata={"location": "pond", "topic": "animals"},
        ),
        Document(
            page_content="ducks are also found in the pond",
            metadata={"location": "pond", "topic": "animals"},
        ),
    ],
    ids=ids,
)

['3b89649e-fba3-4a13-8923-35abd40dc774',
 'dd6dd8dc-b6ab-4315-aadf-2be0628ea902']

In [31]:
# Delete the second of the two documents inserted by the add_documents() cell,
# addressing it by the UUID string that was generated for it (`ids[1]`).
# The vector store's delete() receives identifiers through the `ids` keyword;
# the previous call passed `id=[1]`, a misspelled keyword carrying an integer
# that is not a stored id, so no document was actually removed.
db.delete(ids=[ids[1]])

## Tracking Changes to Your Documents

First, you create a record manager, which keeps track of which documents have been indexed before. Then you use the `index` function to synchronize your vector store with the new list of documents. For example, you can use `incremental` mode, so that any documents that have the same ID as previously indexed ones are replaced with the new version.

## Indexing Optimization

### MultiVectorRetriever

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_postgres.vectorstores import PGVector
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import ChatOllama
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from langchain_core.stores import InMemoryStore
import uuid

# Connection settings for the Postgres instance on localhost:6024, the name of
# the collection used by the vector store below, and the embeddings client.
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
collection_name = "summaries"
embeddings_model = OllamaEmbeddings(model='llama3.1')

# Load the document
loader = TextLoader('./test.txt', encoding='utf-8')
docs = loader.load()

# Print the length of the first loaded document's text.
print("length of loaded docs: ", len(docs[0].page_content))

# Split the document
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

# Prompt template with a single {doc} placeholder for the text to summarize.
prompt_text = "Summarize the following document:\n\n{doc}"

# Summarization chain: take a chunk's page_content as {doc}, fill the prompt,
# call the chat model, and parse the reply with StrOutputParser.
prompt = ChatPromptTemplate.from_template(prompt_text)
llm = ChatOllama(temperature=0, model='llama3.1')
summarize_chain = {
    "doc": lambda x: x.page_content} | prompt | llm | StrOutputParser()

# batch the chain across the chunks
# NOTE(review): the recorded output reports 624212 characters loaded, so this
# presumably triggers a very large number of LLM calls — confirm the runtime is
# acceptable or limit the number of chunks.
summaries = summarize_chain.batch(chunks, {'max_concurrency': 5})

# The vectorstore to use to index the child chunks
vectorstore = PGVector(
    embeddings=embeddings_model,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True
)

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# indexing the summaries in our vector store, whilst retaining the original documents in our document store:
# NOTE(review): BaseRetriever looks like an abstract base class, so this call
# presumably fails rather than building a retriever. The section heading
# suggests MultiVectorRetriever was intended (it would need its own import) —
# confirm and replace.
# NOTE(review): within this cell `summaries` is never added to `vectorstore`
# and `chunks` is never written to `store`, so the indexing described above
# appears unfinished — confirm.
retriever = BaseRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key
)

length of loaded docs:  624212
