In [4]:
import os
import openai
import langchain
from pinecone import Pinecone
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.llms import OpenAI
from typing import List
from dotenv import load_dotenv

load_dotenv()

True

In [5]:
try:
    from langchain_core.documents import Document
except ImportError:
    from langchain.schema import Document

def read_doc(directory: str) -> List[Document]:
    loader = PyPDFDirectoryLoader(directory)
    documents = loader.load()
    return documents

In [6]:
docs = read_doc("documents/")
print(f"Loaded {len(docs)} documents.")

Loaded 60 documents.


In [7]:
docs[0]

Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-02-01T03:54:26+05:30', 'author': 'hss', 'moddate': '2025-02-01T03:56:01+05:30', 'title': '', 'source': 'documents\\budget_speech.pdf', 'total_pages': 60, 'page': 0, 'page_label': '1'}, page_content='GOVERNMENT OF INDIA\nBUDGET 2025-2026\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2025')

In [8]:
def chunk_data(docs: List[Document], chunk_size: int = 800, chunk_overlap: int = 200) -> List[Document]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    doc = text_splitter.split_documents(docs)
    return doc

In [9]:
documents = chunk_data(docs)
print(f"Chunked into {len(documents)} documents.")
#documents

Chunked into 158 documents.


In [10]:
documents[0]

Document(metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-02-01T03:54:26+05:30', 'author': 'hss', 'moddate': '2025-02-01T03:56:01+05:30', 'title': '', 'source': 'documents\\budget_speech.pdf', 'total_pages': 60, 'page': 0, 'page_label': '1'}, page_content='GOVERNMENT OF INDIA\nBUDGET 2025-2026\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2025')

In [11]:
embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))

  embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))


In [12]:
vectors = embeddings.embed_query("How are you?")
len(vectors)

1536

In [13]:
index_name = os.getenv("PINECONE_INDEX_NAME")
docsearch = PineconeVectorStore.from_documents(
    documents=documents, 
    embedding=embeddings, 
    index_name=index_name
)
print("Documentos indexados com sucesso!")

Documentos indexados com sucesso!


In [24]:
def retrieve_query(query, k=2):
    matching_results = docsearch.similarity_search(query, k=k)
    return matching_results

In [20]:
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAI

In [26]:
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.5)
# 1. Defina o Prompt explicitamente
prompt = ChatPromptTemplate.from_template("""
Responda à pergunta com base apenas no contexto fornecido:

<context>
{context}
</context>

Pergunta: {input}
""")

# 2. Crie a chain usando o novo construtor
chain = create_stuff_documents_chain(llm, prompt)


In [22]:
def retrieve_answers(query):
    doc_search = retrieve_query(query)
    print(doc_search)
    
    response = chain.invoke({
        "context": doc_search,  # Sua lista de Documentos
        "input": query
    })
    return response

In [27]:
query = "How much the agriculture target will be increased by how many crore?"
answer = retrieve_answers(query)
print(answer)

[Document(id='d173eed9-259b-413e-abd5-c7348c679073', metadata={'author': 'hss', 'creationdate': '2025-02-01T03:54:26+05:30', 'creator': 'Microsoft® Word 2021', 'moddate': '2025-02-01T03:56:01+05:30', 'page': 6.0, 'page_label': '7', 'producer': 'Microsoft® Word 2021', 'source': 'documents\\budget_speech.pdf', 'title': '', 'total_pages': 60.0}, page_content='1.7 crore farmers. \nBuilding Rural Prosperity and Resilience \n11. A comprehensive multi -sectoral ‘Rural Prosperity and Resilience’ \nprogramme will be launched in partnership with states. This will address under-\nemployment in agriculture through skilling, investment, technology, and \ninvigorating the rural economy. The goal is to generate ample opportunities in \nrural areas so that migration is an option, but not a necessity.  \n12. The programme will focus on rural women, young farmers, rural youth, \nmarginal and small farmers, and landless families. Details are in Annexure A.'), Document(id='b5de7ec3-3334-4ecf-977e-6181c33c

In [28]:
query = "How is the agriculture doing?"
answer = retrieve_answers(query)
print(answer)

[Document(id='62201884-bff6-4125-8ff3-216e0bfed091', metadata={'author': 'hss', 'creationdate': '2025-02-01T03:54:26+05:30', 'creator': 'Microsoft® Word 2021', 'moddate': '2025-02-01T03:56:01+05:30', 'page': 32.0, 'page_label': '33', 'producer': 'Microsoft® Word 2021', 'source': 'documents\\budget_speech.pdf', 'title': '', 'total_pages': 60.0}, page_content='1) development and commercial availability of climate resilient seeds,  \n2) enhancing protein content,  \n3) increasing productivity,  \n4) improving post-harvest storage and management, and \n5) assuring remunerative prices to the farmers.'), Document(id='2d0de3c8-5d9e-4f2f-adf4-96baaf4291f7', metadata={'author': 'hss', 'creationdate': '2025-02-01T03:54:26+05:30', 'creator': 'Microsoft® Word 2021', 'moddate': '2025-02-01T03:56:01+05:30', 'page': 7.0, 'page_label': '8', 'producer': 'Microsoft® Word 2021', 'source': 'documents\\budget_speech.pdf', 'title': '', 'total_pages': 60.0}, page_content='17. It is encouraging that our peopl