# Indexing Your Data

## Loading Data
You can load data from sources of different types:
- webpages
- csv, json, markdown
- pdfs
_See the LangChain document loader integrations for the full list of supported sources._

In [1]:
from langchain_community.document_loaders import TextLoader

# Point a loader at the sample text file; the cell displays the loaded
# Document list because `load()` is the last expression.
loader = TextLoader(file_path="./test.txt")
loader.load()

[Document(metadata={'source': './test.txt'}, page_content='here is a sample text\nhere is the second line\nhere is the third line\n\n\nhere is a sample text\nhere is the second line\nhere is the third line\t\t\nhere is a sample text\nhere is the second line\nhere is the third line\n\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second lin

## Breaking down large documents into chunks

In [2]:
import os

# langchain_community logs "USER_AGENT environment variable not set" when the
# web loader is first imported without it. Set a default *before* the import;
# setdefault leaves any value already exported in the environment untouched.
os.environ.setdefault("USER_AGENT", "indexing-your-data-notebook")

from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Fetch the page and load it as a list of Document objects.
loader = WebBaseLoader('https://www.langchain.com/')
docs = loader.load()

# Break the loaded documents into chunks (size 1000, overlap 200).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splitted_docs = splitter.split_documents(docs)

# Report how many chunks were produced.
print(len(splitted_docs))


USER_AGENT environment variable not set, consider setting it to identify your requests.


7


### NOTE
1. We use the `create_documents` function when the input is raw text rather than a list of documents loaded by a document loader.
2. There is no overlap (`chunk_overlap=0`) because code is usually well structured, so there is less need for overlapping chunks.
3. The `create_documents` call accepts an optional list of metadata dicts as its second argument.

In [3]:
#another example splitting python code into chunks

from langchain_text_splitters import Language

# Inline sample of Python source that will be split into chunks.
python_code = '''
def hello_world():
    print('Hello, World!')

# call the function
hello_world()
'''

# Splitter configured for Python source: chunk size 50, no overlap.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=0,
)

# Build documents straight from the raw text, passing a metadata dict for it.
python_docs = python_splitter.create_documents(
    texts=[python_code],
    metadatas=[{'source': "github.com/hello-word"}],
)

## Embedding == Converting documents to vectors 
Interface with models to generate vectors

In [4]:
from langchain_openai import OpenAIEmbeddings

# Embeddings client created with its default settings.
model = OpenAIEmbeddings()

# Embed all sample texts in one call; the result is indexed per input text.
embeddings = model.embed_documents(
    texts=["doc 1", "doc 2", "doc 3", "hello world!"]
)

# Print only the first five values of the first embedding to keep output short.
print(embeddings[0][:5])

[-0.006597376894205809, -0.008810247294604778, -0.008679674006998539, -0.01749679446220398, 0.027708983048796654]


## Storing vectors in a vector store
Use pgvector, a flavor of the Postgres database. You will need:
- a docker container pgvector/pgvector:latest
- python package langchain-postgres

In [5]:
import os
import uuid

from langchain_core.documents import Document
from langchain_postgres.vectorstores import PGVector

# Database URL for the pgvector instance. Prefer the PGVECTOR_CONNECTION
# environment variable so real credentials do not have to live in the
# notebook; fall back to the local development URL used throughout this file.
connection_string = os.environ.get(
    'PGVECTOR_CONNECTION',
    'postgresql+psycopg://langchain:langchain@localhost:6024/langchain',
)

# Load the sample file and split it into chunks (size 1000, overlap 200).
raw_documents = TextLoader('./test.txt').load()
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = splitter.split_documents(raw_documents)

# Create the vector store from the chunks using the embeddings model.
embeddings = OpenAIEmbeddings()
db = PGVector.from_documents(documents, embeddings, connection=connection_string)



### An example of a similarity search
FYI, the search string will be embedded using the embeddings model first, then searched in the vector db

In [6]:
# Ask the vector store for up to 4 documents matching the search string.
db.similarity_search(query='query', k=4)

[Document(id='b3449f22-85cb-452a-85f0-3e964f60d7e9', metadata={'source': './test.txt'}, page_content='here is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a sample text\nhere is the second line\nhere is the third line\n\nhere is a 

### Indexing documents that change
Create a SQL Record Manager that will keep track of documents that have already been indexed.


In [10]:
from langchain.indexes import SQLRecordManager, index

# Names for the vector store collection and the record manager namespace.
collection_name = 'my_docs'
namespace = 'my_docs_namespace'

embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Vector store that will hold the indexed documents.
vectorstore = PGVector(
    embeddings=embeddings_model,
    collection_name=collection_name,
    connection=connection_string,
    use_jsonb=True,
)

# Record manager backed by the same database URL as the vector store.
record_manager = SQLRecordManager(namespace, db_url=connection_string)

# Make sure the record manager's schema exists before indexing.
record_manager.create_schema()

# Two small documents that share the same 'source' value.
docs = [
    Document(
        page_content='who let the dogs out',
        metadata={'id': 1, 'source': 'songs.txt'},
    ),
    Document(
        page_content='who let the cats out',
        metadata={'id': 2, 'source': 'songs.txt'},
    ),
]

# First indexing run; the recorded output shows both documents being added.
index_1 = index(
    docs_source=docs,
    record_manager=record_manager,
    vector_store=vectorstore,
    cleanup='incremental',  # no duplicates
    source_id_key='source',
)
print(f'Index attempt 1: {index_1}')

Index attempt 1: {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}


In [11]:
# Second run over the same, unchanged documents; the recorded output shows
# both of them being skipped.
index_2 = index(
    docs_source=docs,
    record_manager=record_manager,
    vector_store=vectorstore,
    cleanup='incremental',  # no duplicates
    source_id_key='source',
)
print(f'Index attempt 2: {index_2}')

Index attempt 2: {'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 0}


In [13]:
# Mutate the first document in place so its content differs from what was
# indexed in the earlier runs.
docs[0].page_content = 'I mutated!'

# Third attempt: one document changed, the other did not. In the recorded run
# this added one document, skipped one, and deleted one.
index_3 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup='incremental', # no duplicates
    source_id_key='source',
)
# Label fixed from the copy-pasted "attempt 2" so the output matches the run.
print(f'Index attempt 3: {index_3}')

Index attempt 2: {'num_added': 1, 'num_updated': 0, 'num_skipped': 1, 'num_deleted': 1}
