In [46]:
from llama_index import SimpleDirectoryReader
from llama_index import Document
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import IndexNode
from llama_index.llms import OpenAI
from llama_index import ServiceContext
from llama_index import VectorStoreIndex
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.embeddings import OpenAIEmbedding, resolve_embed_model

from dotenv import load_dotenv

import os

# Load variables from a local .env file (if present) into the process
# environment before reading the key below.
load_dotenv()

# This project stores its OpenAI key under OPENAI_API_KEY_1; re-export it as
# OPENAI_API_KEY so the OpenAI LLM created later can find it in the
# environment.
api_key = os.getenv("OPENAI_API_KEY_1")
if not api_key:
    # os.environ only accepts str values: assigning None would fail with an
    # opaque "TypeError: str expected, not NoneType". Fail early with an
    # actionable message instead.
    raise RuntimeError(
        "OPENAI_API_KEY_1 is not set. Define it in your environment or in a "
        ".env file next to this notebook."
    )

os.environ["OPENAI_API_KEY"] = api_key

In [47]:
# Read the source PDF from disk.
pdf_reader = SimpleDirectoryReader(input_files=["bertolini.pdf"])
documents = pdf_reader.load_data()

# Merge the content of every loaded document into a single Document,
# separating the pieces with a blank line.
doc_text = "\n\n".join(d.get_content() for d in documents)
text = [Document(text=doc_text)]

In [48]:
# Node parser built with the library's default settings.
node_parser = SimpleNodeParser.from_defaults()

# Split the merged document into nodes (text chunks).
base_nodes = node_parser.get_nodes_from_documents(text)

# Overwrite each node's id with a sequential "node-<index>" label.
for idx, chunk in enumerate(base_nodes):
    chunk.id_ = f"node-{idx}"

In [49]:
# Model identifiers, kept together so they are easy to swap.
EMBED_MODEL_SPEC = "local:BAAI/bge-small-en-v1.5"
LLM_MODEL_NAME = "gpt-3.5-turbo"

# Embedding model used to vectorise the nodes.
embed_model = resolve_embed_model(EMBED_MODEL_SPEC)

# Chat model used to write the final answer.
llm = OpenAI(model=LLM_MODEL_NAME)

In [50]:
# Bundle the LLM and the embedding model into one service context that is
# passed to the index and the query engine below.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

In [51]:
# Build the vector store index over the chunked nodes, using the models
# configured in the service context.
index = VectorStoreIndex(
    base_nodes,
    service_context=service_context,
)

In [52]:
# Create a retriever over the index.
# NOTE(review): no arguments are passed, so the library's default retrieval
# settings apply. The answer is built only from the chunks this retriever
# returns, so for document-wide questions the default number of retrieved
# chunks may be too small. Confirm and consider setting it explicitly.
retriever = index.as_retriever()

In [54]:
# Query engine that answers from whatever the retriever returns.
query_engine = RetrieverQueryEngine.from_args(
    retriever,
    service_context=service_context,
)

# Ask the question (Portuguese for "which states are mentioned?") and show
# the answer; print() stringifies the response object itself.
question = "quais estados são mencionados?"
response = query_engine.query(question)
print(response)

Santa Catarina.
