In [53]:
from pathlib import Path
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.vectorstores.faiss import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI

# Folder that holds the PDFs to index. The original author-machine location is
# kept as the first choice so existing runs behave exactly as before; when that
# directory does not exist (any other machine) fall back to a "files" folder
# next to the current working directory instead of silently finding no PDFs.
FILES_FOLDER = Path('/Users/mondragon/MDS_GITHUB/TCC_PUC/teste.ipynb').parent / "files"
if not FILES_FOLDER.is_dir():
    FILES_FOLDER = Path.cwd() / "files"

# Chat model identifier passed to ChatOpenAI.
MODEL_NAME = "gpt-4o-mini"

def document_importer():
    """Load every PDF found directly inside ``FILES_FOLDER``.

    Each ``*.pdf`` file is read with ``PyPDFLoader`` and whatever its
    ``load()`` call returns is appended to a single flat list.

    Returns:
        list: The loaded documents from all PDFs, in sorted file order.
        The list is empty when the folder has no PDFs (or none of them
        yielded any document); a message is printed in that case.
    """
    # glob() order is filesystem-dependent; sorting keeps the document order
    # (and any ids derived from it downstream) stable between runs.
    pdf_files = sorted(FILES_FOLDER.glob("*.pdf"))

    documents = []
    for file in pdf_files:
        loader = PyPDFLoader(str(file))
        documents.extend(loader.load())

    if not documents:
        print("No documents found in the specified directory.")
    else:
        # Report the folder and file count rather than the last loop variable,
        # which would attribute every document to a single PDF.
        print(
            f"Loaded {len(documents)} documents from "
            f"{len(pdf_files)} PDF file(s) in {FILES_FOLDER}"
        )

    return documents


def split_documents(documents):
    """Split documents into overlapping chunks and normalise their metadata.

    Args:
        documents: Documents accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        list: The chunks produced by the splitter. On each chunk,
        ``metadata['source']`` (when present and non-empty) is reduced to the
        file name only, and ``metadata['doc_id']`` is set to the chunk's
        position in the returned list.
    """
    recur_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2500,
        chunk_overlap=250,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    chunks = recur_splitter.split_documents(documents)

    for i, doc in enumerate(chunks):
        source = doc.metadata.get('source')
        if source:
            # Take the last path component. Indexing split('/')[1] returns the
            # second component, which for an absolute path such as
            # '/Users/.../file.pdf' is 'Users', and raises IndexError when the
            # source contains no '/'.
            doc.metadata['source'] = Path(source).name
        doc.metadata['doc_id'] = i
    return chunks


def create_vector_store(documents):
    """Build a FAISS vector store from ``documents`` using OpenAI embeddings.

    Args:
        documents: The documents handed to ``FAISS.from_documents``.

    Returns:
        The store returned by ``FAISS.from_documents``.
    """
    # The embeddings client is created with its default configuration.
    return FAISS.from_documents(
        documents=documents,
        embedding=OpenAIEmbeddings(),
    )


def create_chain_chat(vector_store):
    """Assemble a conversational retrieval chain on top of ``vector_store``.

    Args:
        vector_store: Object exposing ``as_retriever()``; its retriever
            supplies the context documents for the chain.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``, wired
        with a ``ChatOpenAI`` model (``MODEL_NAME``), a conversation buffer
        memory, source-document return enabled and verbose logging on.
    """
    llm = ChatOpenAI(model_name=MODEL_NAME)

    # History is kept under 'chat_history' as message objects; output_key
    # names 'answer' as the output the memory is configured with.
    history = ConversationBufferMemory(
        return_messages=True,
        memory_key='chat_history',
        output_key='answer',
    )

    chain_options = dict(
        llm=llm,
        memory=history,
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
        verbose=True,
    )
    return ConversationalRetrievalChain.from_llm(**chain_options)

In [54]:
    # Pipeline driver: load the PDFs, split them into chunks and, only when
    # there is something to index, build the vector store and the chat chain.
    documents = document_importer()
    print(f"Number of documents imported: {len(documents)}")

    documents = split_documents(documents)
    print(f"Number of documents after splitting: {len(documents)}")

    if not documents:
        # Nothing to embed; vector_store and chat_chain are left undefined.
        print("Cannot create vector store and chat chain without documents.")
    else:
        vector_store = create_vector_store(documents)
        chat_chain = create_chain_chat(vector_store)

Loaded 9 documents from /Users/mondragon/MDS_GITHUB/TCC_PUC/files/PEC-9-2023.pdf
Number of documents imported: 9
Number of documents after splitting: 10


In [56]:
# Ask the chain a question. The chain's memory supplies 'chat_history', so
# 'question' is the only input provided here. Earlier turns already stored in
# that memory are included in the prompt and in the returned dict.
chat_chain.invoke({"question": "do que se trata o documento?"})



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: Hello, how are you?
Assistant: I'm just a program, so I don't have feelings, but I'm here to help you! How can I assist you today?
Follow Up Input: do que se trata o documento?
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Proposta de Emenda à Constituição 
(Do Sr. Paulo Magalhães) 
 
 
Altera a Emenda Constitucional 
nº 117, de 5 de abril de 2022, quanto à 
aplicação de sanções aos partidos que não 
preencheram a cota m

{'question': 'do que se trata o documento?',
 'chat_history': [HumanMessage(content='Hello, how are you?'),
  AIMessage(content="I'm just a program, so I don't have feelings, but I'm here to help you! How can I assist you today?"),
  HumanMessage(content='do que se trata o documento?'),
  AIMessage(content='O documento se trata de uma Proposta de Emenda à Constituição (PEC) apresentada pelo deputado Paulo Magalhães, que altera a Emenda Constitucional nº 117, de 5 de abril de 2022. A proposta aborda a aplicação de sanções aos partidos que não cumprirem a cota mínima de recursos destinados a sexo e raça em eleições, bem como nas prestações de contas anuais e eleitorais. O objetivo da emenda é esclarecer a aplicação das sanções e garantir a estabilidade da ordem jurídica, especialmente em relação às eleições de 2022 e anteriores.')],
 'answer': 'O documento se trata de uma Proposta de Emenda à Constituição (PEC) apresentada pelo deputado Paulo Magalhães, que altera a Emenda Constitucional