# VectorStores

## Chroma VectorStore

In [2]:
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

### Document Loading

In [3]:
# Location of the PDF, relative to the notebook's working directory.
path = "./data/attention-is-all-your-need.pdf"

# Build the loader first, then read the file; `pages` is what the cells below consume.
loader = PyPDFLoader(file_path=path)
pages = loader.load()

In [5]:
# Sanity check: number of entries the loader produced
# (presumably one per PDF page — confirm against the document).
len(pages)

15

### Text Splitting

In [6]:
# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 450
CHUNK_OVERLAP = 45
# Separators, listed from the coarsest (blank line) to the finest (empty string).
SEPARATORS = ["\n\n", "\n", ".", " ", ""]

text_split = RecursiveCharacterTextSplitter(
    separators=SEPARATORS,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Split the loaded pages into chunks; the last expression displays how many were produced.
documents = text_split.split_documents(pages)
len(documents)

102

### Criando o VectorStore

In [7]:
# Make `import sqlite3` resolve to the `pysqlite3` package when it is installed.
# This cell must run before `langchain_chroma` is imported further down.
# NOTE(review): presumably needed because the system SQLite is older than what
# Chroma accepts — confirm for the target environment.
import sys

try:
    __import__('pysqlite3')
except ImportError:
    # pysqlite3 is not installed: keep the standard-library module instead of
    # aborting the notebook here. Importing it now surfaces a missing or broken
    # stdlib sqlite3 in this cell rather than deep inside a later import.
    import sqlite3
else:
    # Re-register the already imported pysqlite3 module under the stdlib name.
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [8]:
from langchain_openai import OpenAIEmbeddings

# Embedding model, built with the library's default settings; it is passed to
# both the Chroma and the FAISS store below.
# NOTE(review): presumably picks up the OpenAI API key from the environment — confirm it is set.
embed_model = OpenAIEmbeddings()

In [14]:
from langchain_chroma import Chroma

# Folder where Chroma persists the collection.
directory = './data/chroma_vectorstore'

# Give every chunk a stable, position-based id. Without explicit ids each run
# of this cell stores the same chunks again under freshly generated ids, so
# re-running it grows the persisted collection with duplicates. With stable ids
# a re-run overwrites the same entries instead.
# Note: entries written by earlier runs under other ids are not removed; delete
# the folder above to start from a clean collection.
chunk_ids = [f'chunk-{i}' for i in range(len(documents))]

vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embed_model,
    ids=chunk_ids,
    persist_directory=directory
)

In [16]:
# Print how many entries the underlying collection holds.
# NOTE(review): `_collection` is a private attribute of the wrapper (leading
# underscore), so this line may break with a library upgrade.
print(vectorstore._collection.count())

102


### Importando o VectorStore do disco

In [17]:
# Re-open the store persisted in `directory`, using the same embedding model.
# NOTE(review): no collection name is passed here or at creation time, so both
# calls presumably rely on the same library default — confirm.
vectorstore = Chroma(
    persist_directory=directory,
    embedding_function=embed_model,
)

### Retrieval

In [21]:
# Query text used for the similarity search in the next cell.
question = 'what is transformers?'

In [22]:
# Retrieve the 5 stored chunks most similar to the question.
docs = vectorstore.similarity_search(query=question, k=5)

In [25]:
# Show each hit: its text, then its metadata, then a divider line.
for doc in docs:
    # A single call with newline separators emits the same text as three
    # consecutive prints.
    print(
        doc.page_content,
        f'\n----{doc.metadata}----\n',
        '========================================\n',
        sep='\n',
    )

The Transformer allows for significantly more parallelization and can reach a new state of the art in
translation quality after being trained for as little as twelve hours on eight P100 GPUs.
2 Background
The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU
[16], ByteNet [18] and ConvS2S [9], all of which use convolutional neural networks as basic building

----{'author': '', 'creationdate': '2024-04-10T21:11:43+00:00', 'creator': 'LaTeX with hyperref', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'page': 1, 'page_label': '2', 'producer': 'pdfTeX-1.40.25', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'source': './data/attention-is-all-your-need.pdf', 'subject': '', 'title': '', 'total_pages': 15, 'trapped': '/False'}----


language modeling tasks [34].
To the best of our knowledge, however, the Transformer is the first transduction model relying
entirely on self-attent

## FAISS VectorStore

In [27]:
from langchain_community.vectorstores.faiss import FAISS

# Build a FAISS store from the same chunks and the same embedding model.
# Note: this rebinds `vectorstore`, which referred to the Chroma store until now.
vectorstore = FAISS.from_documents(documents, embed_model)

In [28]:
# Same query text as in the Chroma section, now used against the FAISS store.
question = 'what is transformers?'

In [29]:
# Retrieve the 5 stored chunks most similar to the question.
docs = vectorstore.similarity_search(query=question, k=5)

In [30]:
# Show each hit: its text, then its metadata, then a divider line.
for doc in docs:
    # A single call with newline separators emits the same text as three
    # consecutive prints.
    print(
        doc.page_content,
        f'\nMETADATA: {doc.metadata}\n',
        '========================================\n',
        sep='\n',
    )

The Transformer allows for significantly more parallelization and can reach a new state of the art in
translation quality after being trained for as little as twelve hours on eight P100 GPUs.
2 Background
The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU
[16], ByteNet [18] and ConvS2S [9], all of which use convolutional neural networks as basic building

METADATA: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': './data/attention-is-all-your-need.pdf', 'total_pages': 15, 'page': 1, 'page_label': '2'}


language modeling tasks [34].
To the best of our knowledge, however, the Transformer is the first transduction model relying
entirely on self-atte

### Salvando BD FAISS

In [31]:
# Write the FAISS store to the ./data/faiss_bd folder so it can be loaded again later.
vectorstore.save_local('./data/faiss_bd')

### Importando BD FAISS

In [32]:
# Load the FAISS store back from the folder written by `save_local`.
# SECURITY: `allow_dangerous_deserialization=True` opts in to unpickling the
# saved data, which can execute arbitrary code. That is acceptable here only
# because the folder was produced by this notebook; never enable it for files
# from an untrusted source.
vectorstore = FAISS.load_local(
    folder_path='./data/faiss_bd/',
    embeddings=embed_model,
    allow_dangerous_deserialization=True,
)