In [1]:
import os
# NOTE(review): presumably meant to opt in to pickle-based index loading; the
# opt-in actually visible in this notebook is the
# allow_dangerous_deserialization=True argument passed to FAISS.load_local.
# Confirm whether anything reads this environment variable.
os.environ['allow_dangerous_deserialization'] = 'true'

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import OllamaLLM
from langchain_community.vectorstores import FAISS
#from langchain_chroma import Chroma 
from langchain.chains import RetrievalQA, ConversationChain
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from operator import itemgetter
# importing required modules
from pypdf import PdfReader

import string
import tiktoken
import glob

In [2]:
# Glob patterns for incoming files and the folder that holds the FAISS index.
pdf_data_path = "../data/documents/*.pdf"
txt_data_path = "../data/documents/*.txt"
path_database = "../data/database"
# glob returns matches in arbitrary order; sort so every run processes the
# files in the same sequence.
list_pdf_files = sorted(glob.glob(pdf_data_path))

In [3]:
def clean_text(text):
  """Normalize space runs and repeated punctuation in a piece of text.

  Args:
    text: Text to clean. ``None`` or an empty string yields ``''``.

  Returns:
    The text with every run of consecutive spaces collapsed to one space,
    every run of the same punctuation character (any character in
    ``string.punctuation``) collapsed to a single occurrence, and leading
    and trailing whitespace removed.
  """
  if not text:
    return ''

  while '  ' in text:
    text = text.replace('  ', ' ')

  for s in string.punctuation:
    doubled = s + s
    # One replace() pass only halves a run ('!!!' -> '!!'), so repeat until
    # no doubled occurrence is left.
    while doubled in text:
      text = text.replace(doubled, s)

  return text.strip()


def extract_text_from_pdf(file_path):
  """Extract the cleaned text of every page of a PDF as one string.

  Args:
    file_path: Path to the PDF; it must end in ``.pdf`` (case-insensitive).

  Returns:
    The cleaned text of all non-empty pages, joined with single spaces.

  Raises:
    ValueError: If ``file_path`` does not have a ``.pdf`` extension.
  """
  # Check the suffix rather than searching for '.pdf' anywhere in the path,
  # which would accept names such as 'notes.pdf.txt'.
  if not file_path.lower().endswith('.pdf'):
    raise ValueError(f'File {file_path} is not a pdf file!')

  reader = PdfReader(file_path)

  pages_text = []
  for page in reader.pages:
    # Guard against a page that yields no text object at all.
    text = clean_text(page.extract_text() or '')
    if text:
      pages_text.append(text)
  return ' '.join(pages_text)

In [4]:
# Convert every pending PDF into a .txt file next to it, then move the PDF
# into the sibling 'indexed_files' folder so it is not processed again.
for pdf_file in list_pdf_files:
  print(f'Extracting text from {pdf_file} ...')
  extracted_text = extract_text_from_pdf(pdf_file)

  # Swap only the extension: str.replace('pdf', 'txt') would also rewrite any
  # 'pdf' that appears in a folder or file name.
  txt_file = os.path.splitext(pdf_file)[0] + '.txt'
  with open(txt_file, 'w') as f:
    # The extracted text is a single string, so write() is the right call.
    f.write(extracted_text)

  # Build the destination from path components instead of replacing the
  # substring 'documents' wherever it occurs in the path.
  source_dir, file_name = os.path.split(pdf_file)
  indexed_dir = os.path.join(os.path.dirname(source_dir), 'indexed_files')
  os.makedirs(indexed_dir, exist_ok=True)
  os.rename(pdf_file, os.path.join(indexed_dir, file_name))

In [5]:
# Embedding model served by Ollama ("mistral"); used both to load the saved
# index and to embed new documents.
embeddings = OllamaEmbeddings(model="mistral")

# Placeholder for an index built from new documents; the merge cell further
# down checks it.
new_vectors = None

# Reuse the index saved on disk when one exists, otherwise start with None.
# NOTE: load_local deserializes pickled data (hence the explicit
# allow_dangerous_deserialization opt-in); only load index folders that were
# created by this notebook.
faiss_index_file = path_database + '/index.faiss'
vectorstore = (
  FAISS.load_local(path_database, embeddings, allow_dangerous_deserialization=True)
  if os.path.isfile(faiss_index_file)
  else None
)

In [6]:
# Load every pending .txt file as a document, then move the file into the
# sibling 'indexed_files' folder so it is not loaded again on the next run.
new_documents = []
# glob order is arbitrary; sort for a reproducible document order.
list_txt_files = sorted(glob.glob(txt_data_path))
for txt_file in list_txt_files:
    loader = TextLoader(txt_file)
    new_documents.extend(loader.load())

    # Build the destination from path components instead of replacing the
    # substring 'documents' wherever it occurs in the path.
    source_dir, file_name = os.path.split(txt_file)
    indexed_dir = os.path.join(os.path.dirname(source_dir), 'indexed_files')
    os.makedirs(indexed_dir, exist_ok=True)
    os.rename(txt_file, os.path.join(indexed_dir, file_name))

In [None]:
# Report how many documents were loaded from the pending .txt files.
print(f'Number of new documents: {len(new_documents)}')  

In [8]:
if len(new_documents) > 0:
  # Split the text into small token-based chunks for efficient indexing.
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap=50)
  texts = text_splitter.split_documents(new_documents)
  # Documents with no text produce no chunks; there is nothing to embed then.
  if texts:
    # Keep the new index separate from `vectorstore`: assigning it to
    # `vectorstore` would discard the index loaded from disk and overwrite it
    # on save. The merge cell below combines `new_vectors` with the existing
    # index and persists the result.
    new_vectors = FAISS.from_documents(texts, embeddings)

In [9]:
# Combine the index loaded from disk (`vectorstore`) with the index built from
# new documents (`new_vectors`) and persist the result.
if new_vectors is not None:
  if vectorstore is not None:
    # Existing index plus new vectors: merge and save.
    vectorstore.merge_from(new_vectors)
    vectorstore.save_local(path_database)
    print(f'{path_database} Updated with new files!!!')
  else:
    # No previous index: the new vectors become the index.
    new_vectors.save_local(path_database)
    vectorstore = new_vectors
    print(f'{path_database} Loaded!!!')
  # Clear the pending vectors so re-running this cell cannot merge them twice.
  new_vectors = None
elif vectorstore is None:
  raise Exception('No previous vector db found and no new files to vectorize!')

In [None]:
# Expose the vector store through the retriever interface used by the chains
# below, and display it.
retriever = vectorstore.as_retriever()  
retriever

In [11]:
# Load the llama3.1 model through Ollama for text generation.
llm = OllamaLLM(model='llama3.1')

In [12]:
# Multi-query prompt: asks the LLM for five rephrasings of {question},
# separated by line breaks, to widen what the similarity search retrieves.
template = """Você é um assistente de modelo de linguagem de IA. Sua tarefa é gerar cinco
versões diferentes da pergunta do usuário fornecida para recuperar documentos relevantes de um vector database. 
Ao gerar múltiplas perspectivas sobre a pergunta do usuário, seu objetivo é ajudar
o usuário a superar algumas das limitações da busca por similaridade baseada em distância.
Forneça essas perguntas alternativas separadas por quebras de linha. Pergunta original: {question}"""

In [13]:
# Multi-query generation: prompt -> LLM -> plain string -> list of queries.
prompt_perspectives = ChatPromptTemplate.from_template(template)
generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    # One query per line. Blank lines in the model output are dropped so no
    # empty query is sent to the retriever.
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

In [14]:
from langchain.load import dumps, loads
def get_unique_union(documents: list[list]):
    """Return the unique documents across several retrieval result lists.

    Args:
        documents: One list of retrieved documents per query.

    Returns:
        A flat list with each distinct document once, in first-seen order.
        Two documents count as equal when their serialized form (``dumps``)
        is identical.
    """
    # Serialize each document so it can be used as a hashable key.
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would make the order of the returned context vary between runs.
    unique_docs = list(dict.fromkeys(flattened_docs))
    return [loads(doc) for doc in unique_docs]

# Sample question used by the retrieval cells that follow
# (spelling corrected: "hediondos").
question = 'Quais são os crimes hediondos citados no documento?'

# Multi-query retrieval: generate query variants, retrieve documents for each
# variant, then keep the unique union of the results.
retrieval_chain = generate_queries | retriever.map() | get_unique_union

In [None]:
# RAG over the multi-query retrieval results.
# Prompt typo fixed: "Responta" -> "Responda".
template = """Responda a seguinte questão baseada no contexto abaixo:

Contexto: {context}

Questão: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# The input dict is passed to both branches: retrieval_chain produces the
# context, itemgetter forwards the original question unchanged.
final_rag_chain = (
    {"context": retrieval_chain,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

In [16]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists with Reciprocal Rank Fusion.

    Args:
        results: One ranked list of documents per query, best match first.
        k: Constant added to the rank in the RRF formula.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by score, highest
        first. Each distinct document appears once; its score is the sum of
        ``1 / (rank + k)`` over every list it appears in, with ``rank``
        starting at 0.
    """
    # Fused score per document, keyed by the serialized document.
    fused_scores = {}

    for docs in results:
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    # Highest fused score first; deserialize the keys back into documents.
    return [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

In [None]:
# RAG-Fusion retrieval: generate query variants, retrieve for each variant,
# then fuse the ranked lists with reciprocal_rank_fusion.
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": question})
# Number of (document, score) pairs returned by the fusion step.
len(docs)

In [None]:
# RAG over the RAG-Fusion retrieval results.
template = """Responda a seguinte questão baseado neste contexto:

{context}

Questão: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# The context is the output of retrieval_chain_rag_fusion, i.e. the list of
# (document, score) tuples; the question is forwarded unchanged.
final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

In [19]:
# Decomposition: prompt that asks the LLM to break {question} into three
# sub-questions that can be answered in isolation.
template = """Você é um assistente útil que gera várias subperguntas relacionadas a uma pergunta de entrada. \n
O objetivo é dividir a entrada em um conjunto de subproblemas/subquestões que podem ser respondidos isoladamente. \n
Gere várias consultas de pesquisa relacionadas a: {question} \n
Gere 3 consultas:"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [20]:
# Chain: prompt -> LLM -> plain string -> list of sub-questions.
generate_queries_decomposition = (
    prompt_decomposition
    | llm
    | StrOutputParser()
    # One sub-question per line. Blank lines in the model output are dropped
    # so no empty sub-question is answered later.
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

# Run
question = "Quais são os princípios do código penal brasileiro?"
questions = generate_queries_decomposition.invoke({"question":question})

In [None]:
# Inspect the generated sub-questions.
questions

In [22]:
# Prompt for answering one sub-question. It receives the sub-question
# ({question}), the question/answer pairs accumulated so far ({q_a_pairs})
# and the retrieved context ({context}).
template = """Aqui está a pergunta que você precisa responder:

\n --- \n {question} \n --- \n

Aqui estão quaisquer pares de perguntas e respostas de contexto disponíveis:

\n --- \n {q_a_pairs} \n --- \n

Aqui está um contexto adicional relevante para a pergunta:

\n --- \n {context} \n --- \n

Use o contexto acima e quaisquer pares de perguntas e respostas de contexto para responder à pergunta: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [None]:
def format_qa_pair(question, answer):
    """Render one question/answer pair as text.

    The result has a "Questão: ..." line followed by a "Resposta: ..." line,
    with leading and trailing whitespace stripped.
    """
    return f"Questão: {question}\nResposta: {answer}\n\n".strip()

# The chain is the same for every sub-question, so build it once instead of
# on every loop iteration.
rag_chain = (
    {"context": itemgetter("question") | retriever,
     "question": itemgetter("question"),
     "q_a_pairs": itemgetter("q_a_pairs")}
    | decomposition_prompt
    | llm
    | StrOutputParser()
)

# Answer the sub-questions in order; every answer is added to q_a_pairs so the
# following sub-questions can build on it.
q_a_pairs = ""
answers = []
for q in questions:
    answer = rag_chain.invoke({"question": q, "q_a_pairs": q_a_pairs})
    answers.append(answer)
    q_a_pair = format_qa_pair(q, answer)
    q_a_pairs = q_a_pairs + "\n---\n" + q_a_pair

print(q_a_pairs)

In [None]:
# Compare how many sub-questions and answers there are.
len(questions), len(answers)

In [None]:
def format_qa_pairs(questions, answers):
    """Render numbered question/answer pairs as a single text block.

    Questions and answers are paired positionally with zip, so extra items in
    the longer sequence are ignored. Numbering starts at 1, pairs are
    separated by a blank line, and the result is stripped of leading and
    trailing whitespace.
    """
    numbered_blocks = [
        f"Pergunta {i}: {question}\nResposta {i}: {answer}\n\n"
        for i, (question, answer) in enumerate(zip(questions, answers), start=1)
    ]
    return "".join(numbered_blocks).strip()

# All sub-questions and their answers, rendered as one numbered text block.
context = format_qa_pairs(questions, answers)

# Prompt that asks the LLM to synthesize a final answer from the Q&A pairs.
# Prompt typos fixed: "resposa para pergunta" -> "resposta para a pergunta".
template = """Aqui está um conjunto de pares de Perguntas e Respostas:

{context}

Use este conjunto para sintetizar uma resposta para a pergunta: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

resposta_final = final_rag_chain.invoke({"context":context,"question":question})

print(resposta_final)

In [38]:
# Step-back prompting: few-shot examples that map a specific question
# ("input") to a more generic question ("output").
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
examples = [
    {
        "input": "Quais são os principios do Código Penal Brasileiro?",
        "output": "Os princípios do Código Penal Brasileiro estão declarados nos documentos enviados?",
    },
    {
        "input": "As forças de segurança do Brasil são prevista no código penal?",
        "output": "Quais são as forças de segurança?",
    },
]
# Render each example as a human message followed by an AI message.
example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{input}"),
        ("ai", "{output}"),
    ]
)
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)
# Full prompt: system instruction, then the few-shot examples, then the
# user's question.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """Você é um especialista em conhecimento mundial. Sua tarefa é recuar e parafrasear uma pergunta para uma pergunta de recuo mais genérica, que é mais fácil de responder. Aqui estão alguns exemplos:""",
        ),
        # Few shot examples
        few_shot_prompt,
        # New question
        ("user", "{question}"),
    ]
)

In [None]:
# Step-back chain: few-shot prompt -> LLM -> plain string.
generate_queries_step_back = prompt | llm | StrOutputParser()
question = "Diga o que está previsto nos documentos sobre código penal?"
print(question)
# Returns the more generic "step-back" version of the question.
generate_queries_step_back.invoke({"question": question})