# Testing Gemini API

In [None]:
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text=text._result.candidates[0].content.parts[0].text
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [None]:
import os
from dotenv import load_dotenv

load_dotenv('app/src/shared/.env')

GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
GOOGLE_API_KEY

In [None]:
model = genai.GenerativeModel('gemini-pro')

In [None]:
response = model.generate_content("whats the meaning of life?")

In [None]:
to_markdown(response)

## Chat session

In [None]:
chat = model.start_chat()
chat


In [None]:
response = chat.send_message("Hello,my name is Nathan!")
to_markdown(response)

In [None]:
response = chat.send_message("Whats my name?")
to_markdown(response)

In [None]:
chat.history

In [None]:
for message in chat.history:
  display(to_markdown(f'**{message.role}**: {message.parts[0].text}'))

## Use LangChain to Access Gemini API

In [None]:
from langchain_google_genai import GoogleGenerativeAI,ChatGoogleGenerativeAI

In [None]:
llm = ChatGoogleGenerativeAI(model="gemini-pro")
result = llm.invoke("tell me a joke in french")
result

In [None]:
llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
result = llm.invoke("tell me a joke in french")
result

In [None]:
import getpass
import os

if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Provide your Google API Key")

In [None]:
llm = ChatGoogleGenerativeAI(model="gemini-pro")
result = llm.invoke("tell me a joke in portuguese")
print(result.content)

# Gemini LangChain QA using RAG structure

## auth with gemini api key

In [None]:
import textwrap

from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
  text=text._result.candidates[0].content.parts[0].text
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [None]:
import os
from dotenv import load_dotenv
import google.generativeai as genai

load_dotenv('app/src/shared/.env')

GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
GOOGLE_API_KEY

## Document loaders - Load Sources

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader

In [None]:
loader = PyPDFLoader("test/nr-10.pdf")
pages = loader.load()
len(pages)

In [None]:
pages

In [None]:
loader = PyPDFDirectoryLoader("test")
pdf_docs = loader.load()

In [None]:
# number of pages
len(pdf_docs)

In [None]:
def extract_text(pdf_docs):
  """
  Extracts the text from all pages in a list of documents.

  Args:
    docs: A list of "Document" objects.

  Returns:
    A string containing the concatenated text of all pages.
  """

  text = ""
  for doc in pdf_docs:
    text += doc.page_content 
  return text

In [None]:
text = extract_text(pdf_docs)
text

## cleaning the pdf docs

In [None]:
import re

def clean_text(text):
  """
  Removes markup and special characters from the text.

  Args:
    text: A string containing the text to be cleaned.

  Returns:
    A string with the cleaned text.
  """
  
  # Converte o texto para minúsculas.
  text = text.lower()

  # Remove caracteres que não sejam letras, números ou espaços em branco.
  # (Mantém apenas letras, números e espaços em branco.)
  text = re.sub(r"[^\w\s]", "", text)

  # Substitui sequências de espaços em branco por um único espaço.
  # (Garante espaçamento consistente e evita chunks indesejados.)
  text = re.sub(r"\s+", " ", text)

  # Return the cleaned text
  return text


In [None]:
# Example of usage
text_cleaned = clean_text(text)

# Print the cleaned text
print(text_cleaned)

## Text Splitting - Chunking the text

### Langchain text splitters

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker

# using recursive text splitter
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

In [None]:
# using Semantic Chunking
# text_splitter = SemanticChunker(gemini_embeddings, breakpoint_threshold_type="percentile")

In [None]:
crunks = text_splitter.split_documents(pdf_docs)
print(f'### Chunk 1: \n\n{crunks[0].page_content}\n\n=====\n')
print(f'### Chunk 2: \n\n{crunks[1].page_content}\n\n=====')
print(len(crunks))


In [None]:
chunks  = text_splitter.create_documents([text_cleaned])
print(f'### Chunk 1: \n\n{crunks[0].page_content}\n\n=====\n')
print(f'### Chunk 2: \n\n{crunks[1].page_content}\n\n=====')
print(len(crunks))

## Text embedding 

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001",
                                                  task_type="retrieval_document")

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

## Vectorstores

In [None]:
# from langchain.vectorstores import Chroma
# from langchain_community.vectorstores import Chroma

In [None]:
from langchain.vectorstores import Chroma
from langchain_community.vectorstores import Chroma

# creating a db -> vector store
db = Chroma.from_documents(
                     documents=pdf_docs,             # Data
                     embedding=gemini_embeddings,    # Embedding model
                     persist_directory="./chroma_db" # Directory to save data
                     )

In [None]:
query = """Em todas as intervenções em instalações elétricas devem ser adotadas medidas 
preventivas  de  controle  do  risco  elétrico"""
# query = "NR 10"
docs = db.similarity_search(query)
print(docs[0].page_content)

## Retrievers

In [None]:
#vectorstore retriver
# retriever = db.as_retriever(
#     search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5}
# )

# #vectorstore retriver
retriever = db.as_retriever(search_kwargs={"k": 1})

In [None]:
print(len(retriever.get_relevant_documents("NR 10")))

In [None]:
from langchain.chains import create_retrieval_chain

## Generator

### Initialize Gemini

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAI

llm = GoogleGenerativeAI(model="gemini-pro",
                 temperature=0.7, top_p=0.85)

### Prompt Design 

In [None]:
from langchain import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate

# Prompt template to query Gemini
llm_prompt_template = """You are an assistant for question-answering tasks.
Use the following context to answer the question.
If you don't know the answer, just say that you don't know.
Use five sentences maximum and keep the answer concise.\n
Question: {question} \nContext: {context} \nAnswer:"""

prompt = ChatPromptTemplate.from_template(llm_prompt_template)

print(prompt)

### Chain

In [None]:
#chain
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema import StrOutputParser


# Combine data from documents to readable string format.
def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
rag_chain.invoke("O que diz o topico 10.2.8.1 da norma regulamentadora NR 10?")

# MultiQueryRetriever - fine tuning prompt

In [None]:
# Build a sample vectorDB
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Load blog post
loader = WebBaseLoader("https://www.guiatrabalhista.com.br/legislacao/nr/nr10.htm")
data = loader.load()

In [None]:
data

In [None]:
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
splits = text_splitter.split_documents(data)
splits

In [None]:
len(splits)

In [None]:
# save to disk
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectordb = Chroma.from_documents(documents=splits, 
                                 embedding=gemini_embeddings,
                                 persist_directory="./chroma_db")

# Load from disk
vectorstore_disk = Chroma(
                        persist_directory="./chroma_db",       # Directory of db
                        embedding_function=gemini_embeddings   # Embedding model
                   )

retriever = vectorstore_disk.as_retriever(search_kwargs={"k": 1})

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_google_genai import GoogleGenerativeAI

llm = GoogleGenerativeAI(model="gemini-pro", temperature=0)

question = "O que diz o tópico 10.2.8.1 da norma regulamentadora NR 10?"

retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=retriever, llm=llm
)

In [None]:
# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [None]:
unique_docs = retriever_from_llm.get_relevant_documents(query=question)
len(unique_docs)

In [None]:
from langchain.chains import RetrievalQA
# create a chain to answer questions

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=retriever_from_llm,
    return_source_documents=True,
    verbose=True,
)

In [None]:
qa("O que diz o tópico 10.2.8.1 da norma regulamentadora NR 10?")

# Codigo v1 

In [None]:
from dotenv import load_dotenv
import os

load_dotenv('app/src/shared/.env')

GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
GOOGLE_API_KEY

In [None]:
#langchain libraries
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import GoogleGenerativeAI

from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import MessagesPlaceholder
from langchain.chains.history_aware_retriever import create_history_aware_retriever

#load docs
from langchain_community.document_loaders import WebBaseLoader

#vectorstore
from langchain_community.vectorstores.faiss import FAISS

# processing docs
#its missing cleaning text
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))


llm = GoogleGenerativeAI(model="gemini-pro")


In [None]:
def get_documents_from_web(url: str):
    loader = WebBaseLoader(url)
    docs = loader.load()
    
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20
    )
    split_docs = splitter.split_documents(docs)
    return split_docs

In [None]:
docs = get_documents_from_web("https://www.guiatrabalhista.com.br/legislacao/nr/nr10.htm")

In [None]:
print(f'### Chunk 1: \n\n{docs[40].page_content}\n\n=====\n')
print(f'### Chunk 2: \n\n{docs[41].page_content}\n\n=====')

len(docs)

In [None]:
def create_db(docs):
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vectorstore = FAISS.from_documents(docs, embedding=gemini_embeddings)
    return vectorstore

In [None]:
def create_chain(vectorstore):
    model = GoogleGenerativeAI(
        model="gemini-pro",
        temperature=0.3
        )

    prompt = ChatPromptTemplate.from_messages([
        ("system", "Responda às perguntas do usuário com base no contexto: {context}"),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}")
    ])

    # chain = prompt | model
    chain = create_stuff_documents_chain(
        llm=model,
        prompt=prompt
    )

    retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

    retriever_prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
        ("human", "Considerando a conversa acima, gere uma consulta de pesquisa para obter informações relevantes para a conversa")
    ])

    history_aware_retriever = create_history_aware_retriever(
        llm=model,
        retriever=retriever,
        prompt=retriever_prompt
    )

    retrieval_chain = create_retrieval_chain(
        # retriever,
        history_aware_retriever,
        chain
    )

    return retrieval_chain

In [None]:
def process_chat(chain, question, chat_history):
    response = chain.invoke({
        "input": question,
        "chat_history": chat_history
    })
    return response["answer"]

In [None]:
docs = get_documents_from_web('https://www.guiatrabalhista.com.br/legislacao/nr/nr10.htm')
vectorstore = create_db(docs)
chain = create_chain(vectorstore)

chat_history = []

while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    response = process_chat(chain, user_input, chat_history)
    chat_history.append(HumanMessage(content=user_input))
    chat_history.append(AIMessage(content=response))

    print("Assistant:", response)

# Semi-structured RAG

## Load data 

As normas brasileiras relacionadas a instalações elétricas abrangem uma variedade de aspectos, desde baixa tensão até instalações específicas em locais como áreas classificadas e sistemas fotovoltaicos. As principais normas da Associação Brasileira de Normas Técnicas (ABNT) sobre instalações elétricas incluem:

* NBR 5410 - Instalações elétricas de baixa tensão: Trata das condições para o projeto, execução e manutenção de instalações elétricas de baixa tensão.

* NBR 14039 - Instalações elétricas de média tensão de 1,0 kV a 36,2 kV: Estabelece as condições para projeto, execução e manutenção dessas instalações.

* NBR 5413 - Iluminância de interiores: Define os requisitos para níveis de iluminância em ambientes internos.

* NBR 13570 -  Instalações Elétricas em Locais de Afluência de Público - Requisitos específicos

* ABNT NBR IEC 60079-14 - Instalações elétricas em atmosferas explosivas - Área classificada: Requisitos para instalações em áreas com risco de explosão.

* NBR 10898 - Sistemas de iluminação de emergência: Especifica os requisitos para sistemas de iluminação de emergência em edifícios.

* NBR 15514 - Recipientes transportáveis de gás liquefeito de petróleo (GLP) — Área de armazenamento — Requisitos de segurança
    * Embora a norma NBR 15514 não seja diretamente uma norma de instalações elétricas, ela possui interseções com a área elétrica em aspectos relacionados à segurança, especialmente devido aos riscos potenciais de explosões e incêndios associados ao GLP.
* NBR 5419 - Proteção contra descargas atmosféricas (em quatro partes):

    * Parte 1: Princípios gerais
    * Parte 2: Gerenciamento de risco
    * Parte 3: Danos físicos a edificações e perigos à vida
    * Parte 4: Sistemas elétricos e eletrônicos internos na estrutura

    * NBR 16280 - Reforma em edificações - Sistema de gestão de reformas: Estabelece requisitos para reformas em edificações, incluindo instalações elétricas.


Essas são algumas das principais normas que tratam de diversos aspectos das instalações elétricas no Brasil, garantindo segurança, eficiência e conformidade técnica.

In [None]:
import os
from concurrent.futures import ThreadPoolExecutor
from typing import List
from unstructured.partition.pdf import partition_pdf

def process_pdf_file(filename, path):
    
    raw_pdf_elements = partition_pdf(
        filename=os.path.join(path, filename),  # Combine path and filename
        extract_images_in_pdf=False,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        image_output_dir_path=path,
    )
    return raw_pdf_elements

def process_single_pdf(pdf_folder_path: str, filename: str):
    """Helper function to process a single PDF file."""
    if filename.endswith(".pdf"):
        print(f"Reading the PDF doc: {filename}")
        return process_pdf_file(filename, pdf_folder_path)
    return []

# Loop through PDF Files with ThreadPoolExecutor:
def process_multiple_pdfs(pdf_folder_path: str, max_workers: int = 4) -> List:
    """Processes all PDF files within a specified folder using ThreadPoolExecutor."""
    all_elements = []
    filenames = [filename for filename in os.listdir(pdf_folder_path) if filename.endswith(".pdf")]
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(lambda filename: process_single_pdf(pdf_folder_path, filename), filenames)
    
    for result in results:
        all_elements.extend(result)  # Extend the all_elements list
    
    return all_elements

In [None]:
pdf_folder_path  = "/Users/nathan/workspace/tcc/app/src/database/pdf/"
raw_pdfs_elements = process_multiple_pdfs(pdf_folder_path, max_workers=10)

In [None]:
import os
import logging
from concurrent.futures import ThreadPoolExecutor
from typing import List
from unstructured.partition.pdf import partition_pdf

# Configure logging 
logging.basicConfig(filename='pdf_processing.log', level=logging.INFO, format='%(asctime)s %(message)s')

def process_pdf_file(filename, path):
    """Processes a single PDF file and returns the extracted elements.

    Logs information and errors during processing.
    """
    try:
        raw_pdf_elements = partition_pdf(
            filename=os.path.join(path, filename),  # Combine path and filename
            extract_images_in_pdf=False,
            infer_table_structure=True,
            chunking_strategy="by_title",
            max_characters=4000,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2000
        )
        logging.info(f"Successfully processed PDF: {filename}")
        return raw_pdf_elements
    except Exception as e:  # Catch specific PDF processing errors
        logging.error(f"Error processing PDF: {filename} - {e}")
        return []

def process_single_pdf(pdf_folder_path: str, filename: str):
    """Helper function to process a single PDF file."""
    if filename.endswith(".pdf"):
        logging.info(f"Reading PDF doc: {filename}")
        return process_pdf_file(filename, pdf_folder_path)
    return []

# Loop through PDF Files with ThreadPoolExecutor:
def process_multiple_pdfs(pdf_folder_path: str, max_workers: int = 4) -> List:
    """Processes all PDF files within a specified folder using ThreadPoolExecutor.

    Logs information and errors during processing.
    """
    all_elements = []
    filenames = [filename for filename in os.listdir(pdf_folder_path) if filename.endswith(".pdf")]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = executor.map(lambda filename: process_single_pdf(pdf_folder_path, filename), filenames)

    for result in results:
        all_elements.extend(result)  # Extend the all_elements list

    return all_elements

# pdf_folder_path = "/Users/nathan/workspace/tcc/app/src/database/pdf/"
# raw_pdfs_elements = process_multiple_pdfs(pdf_folder_path, max_workers=10)

In [None]:
pdf_folder_path = "/Users/nathan/workspace/tcc/test"
raw_pdfs_elements = process_multiple_pdfs(pdf_folder_path, max_workers=4)

In [None]:
len(raw_pdfs_elements)

In [None]:
raw_pdfs_elements

In [None]:
# Create a dictionary to store counts of each type
category_counts = {}

for element in raw_pdfs_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Unique_categories will have unique elements
unique_categories = set(category_counts.keys())
category_counts

In [None]:
from typing import Any
from pydantic import BaseModel

# Define the Element class based on potential types returned by partition_pdf
class Element(BaseModel):
    type: str  # Textual representation of the element type (e.g., "table", "text")
    text: Any  # Content of the element, can be text, tables, or other data structures

# Categorize by type
def categorize_elements(raw_pdf_elements) -> list[Element]:
    """Categorizes elements by type and returns a dictionary with counts."""
    categorized_elements = [] # Initialize category counts
    for element in raw_pdf_elements:
        if "unstructured.documents.elements.Table" in str(type(element)):
            categorized_elements.append(Element(type="table", text=str(element)))
        elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
            categorized_elements.append(Element(type="text", text=str(element)))
    return categorized_elements

In [None]:
# Improved categorization function with clear structure
all_categories = categorize_elements(raw_pdfs_elements)
print(len(all_categories))

# Tables
table_elements = [e for e in all_categories if e.type == "table"]
print(f"Number of tables: {len(table_elements)}")

# Text
text_elements = [e for e in all_categories if e.type == "text"]
print(f"Number of text elements: {len(text_elements)}")

In [None]:
text_elements[10]

In [None]:
table_elements

## cleaning data

In [None]:
import regex as re

def clean_text(text: str) -> str:
    # Remove linhas compostas apenas por espaços e caracteres de controle
    text = re.sub(r'^\s*$', '', text, flags=re.MULTILINE)

    # Substitui múltiplos espaços em branco e quebras de linha por um único espaço
    text = re.sub(r'\s+', ' ', text)

    # Remove espaços extras ao redor de pontuações
    text = re.sub(r'\s+([.,;:!?])', r'\1', text)
    text = re.sub(r'([.,;:!?])\s+', r'\1 ', text)

    # Remove espaços extras no início e fim do texto
    text = text.strip()

    return text

In [None]:
def clean_text_elements(text_elements):
    """Aplica a limpeza de texto a todos os elementos de texto de um PDF."""
    cleaned_text_elements = []
    for element in text_elements:
        cleaned_text = clean_text(element.text)
        cleaned_text_elements.append(Element(type=element.type, text=cleaned_text))
    return cleaned_text_elements

In [None]:
def clean_table_elements(text_elements):
    """Aplica a limpeza de texto a todos os elementos de texto de um PDF."""
    cleaned_table_elements = []
    for element in text_elements:
        cleaned_table = clean_text(element.text)
        cleaned_table_elements.append(Element(type=element.type, text=cleaned_table))
    return cleaned_table_elements

In [None]:
cleaned_pdf_elements = clean_text_elements(text_elements)
for element in cleaned_pdf_elements:
    print(element['text'])

In [None]:
cleaned_pdf_elements

## Summarizing tables and text chunks from pdfs

In [None]:
from dotenv import load_dotenv

load_dotenv('app/src/shared/.env')

In [None]:
from langchain_google_genai import GoogleGenerativeAI

model = GoogleGenerativeAI(model="gemini-pro",
                 temperature=0, top_p=0.85)


In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt
prompt_text = """Você é um assistente com a tarefa de resumir tabelas e textos de normas brasileiras sobre instalações elétricas.
Faça um resumo sucinto da tabela ou do texto a seguir. Tabela ou texto: {element} """

prompt = ChatPromptTemplate.from_template(prompt_text)

print(prompt)

In [None]:
from langchain_core.output_parsers import StrOutputParser

summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [None]:
# Apply to tables
tables = [i.text for i in table_elements]
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

In [None]:
table_summaries

In [None]:
len(table_summaries)

In [None]:
# Apply to texts
texts = [i.text for i in text_elements]
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

In [None]:
texts

In [None]:
text_summaries

In [None]:
type(text_summaries) 

In [None]:
text_elements

### Data analytics

In [None]:
# analisar os crunks pai com filho(summarized by gemini model)
import pandas as pd

text_data = {"Texts": texts, "Text_summaries": text_summaries}
df = pd.DataFrame(text_data)

df.head()

In [None]:
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_document")

# df['Embeddings'] = df.apply(lambda row: gemini_embeddings.embed_query(row['Title'], row['Text']), axis=1)
# df

In [None]:
texts[30]

In [None]:
text_summaries[30]

## Add to vectorstore
Use Multi Vector Retriever with summaries:

- InMemoryStore stores the raw text, tables
- vectorstore stores the embedded summaries

In [None]:
from dotenv import load_dotenv

load_dotenv('app/src/shared/.env')

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_document")
gemini_embeddings

In [None]:
vector = gemini_embeddings.embed_query("hello, world!")
vector[:5]

#### test without summarizing

In [None]:
# Apply to tables
tables = [i.text for i in table_elements]
# Apply to texts
texts = [i.text for i in text_elements]

import uuid
from langchain_core.documents import Document

# Add texts
doc_ids = [str(uuid.uuid4()) for _ in texts]
id_key = "doc_id"

page_content_texts = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(texts)
]

page_content_tables = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(tables)
]

In [None]:
from langchain_community.vectorstores import Chroma

# Save to disk
vectorstore = Chroma.from_documents(
                     documents=page_content_texts,                 # Data
                     embedding=gemini_embeddings,    # Embedding model
                     persist_directory="./chroma_db" # Directory to save data
                     )

In [None]:
# Save to disk
vectorstore = Chroma.from_documents(
                     documents=page_content_tables,  # Data
                     embedding=gemini_embeddings,    # Embedding model
                     persist_directory="./chroma_db" # Directory to save data
                     )

In [None]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

persist_directory = "./chroma_db"

# Load from disk
vectorstore = Chroma(collection_name="data", 
                        embedding_function=gemini_embeddings,
                        persist_directory=persist_directory)


# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever 
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

In [None]:
import sqlite3
import pandas as pd

# Conexão ao banco de dados SQLite
conn = sqlite3.connect('chroma_db/chroma.sqlite3')

# Consulta SQL
query = "SELECT * FROM embeddings_queue"

# Executa a consulta e retorna um DataFrame
df = pd.read_sql_query(query, conn)

# Exibe o DataFrame
print(df)

# Fecha a conexão
conn.close()

In [None]:
retriever.vectorstore.add_documents(page_content_texts)

In [None]:
retriever.vectorstore.add_documents(page_content_tables)

In [None]:
import sqlite3
import pandas as pd

# Conexão ao banco de dados SQLite
conn = sqlite3.connect('chroma_db/chroma.sqlite3')

# Consulta SQL
query = "SELECT * FROM embedding_fulltext_search"

# Executa a consulta e retorna um DataFrame
df = pd.read_sql_query(query, conn)

# Exibe o DataFrame
print(df)

# Fecha a conexão
conn.close()

In [None]:
import sqlite3
import pandas as pd

# Conexão ao banco de dados SQLite
conn = sqlite3.connect('chroma_db/chroma.sqlite3')

# Consulta SQL
query = "SELECT * FROM embeddings_queue"

# Executa a consulta e retorna um DataFrame
df = pd.read_sql_query(query, conn)

# Exibe o DataFrame
print(df)

# Fecha a conexão
conn.close()

In [None]:
# Conexão ao banco de dados SQLite
conn = sqlite3.connect('chroma_db/chroma.sqlite3')

tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table'", conn)

print(tables)

conn.close()

In [None]:
# Conexão ao banco de dados SQLite
conn = sqlite3.connect('chroma_db/chroma.sqlite3')

for table_name in tables['name']:
    # Gerar consulta para a tabela atual
    query = f"SELECT * FROM {table_name}"

    # Obter e exibir dados da tabela
    df = pd.read_sql_query(query, conn)
    print(f"\nDados da tabela {table_name}:")
    print(df.to_string())
 
conn.close()

#### normal pipeline

In [None]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

persist_directory = "./chroma_db"

# Load from disk
vectorstore = Chroma(collection_name="summaries", 
                        embedding_function=gemini_embeddings,
                        persist_directory=persist_directory)

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

In [None]:
import uuid

# Add texts
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(text_summaries)
]

non_empty_summaries_texts = [summary for summary in summary_texts if summary.page_content.strip()]

retriever.vectorstore.add_documents(non_empty_summaries_texts)
# retriever.docstore.mset(list(zip(doc_ids, texts)))

In [None]:
retriever.docstore.mset(list(zip(doc_ids, texts)))

In [None]:
# Add tables
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [
    Document(page_content=s, metadata={id_key: table_ids[i]})
    for i, s in enumerate(table_summaries)
]

non_empty_summary_tables = [summary for summary in summary_tables if summary.page_content.strip()]

retriever.vectorstore.add_documents(non_empty_summary_tables)
# retriever.docstore.mset(list(zip(table_ids, tables)))

In [None]:
retriever.docstore.mset(list(zip(table_ids, tables)))

In [None]:
vectorstore

## RAG Pipeline

### test 1

In [None]:
from langchain import PromptTemplate

# Prompt template to query Gemini
llm_prompt_template = """You are an assistant for question-answering tasks.
Use the following context to answer the question.
If you don't know the answer, just say that you don't know.
Use five sentences maximum and keep the answer concise.\n
Question: {question} \nContext: {context} \nAnswer:"""

llm_prompt = PromptTemplate.from_template(llm_prompt_template)

print(llm_prompt)

In [None]:
# Load from disk
vectorstore_disk = Chroma(
                        persist_directory="./chroma_db",       # Directory of db
                        embedding_function=gemini_embeddings   # Embedding model
                   )

retriever = vectorstore_disk.as_retriever(search_kwargs={"k": 1})

In [None]:
from langchain_core.runnables import RunnableMap

# Combine data from documents to readable string format.
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Create stuff documents chain using LCEL.
#
# This is called a chain because you are chaining together different elements
# with the LLM. In the following example, to create the stuff chain, you will
# combine the relevant context from the website data matching the question, the
# LLM model, and the output parser together like a chain using LCEL.
#
# The chain implements the following pipeline:
# 1. Extract the website data relevant to the question from the Chroma
#    vector store and save it to the variable `context`.
# 2. `RunnablePassthrough` option to provide `question` when invoking
#    the chain.
# 3. The `context` and `question` are then passed to the prompt where they
#    are populated in the respective variables.
# 4. This prompt is then passed to the LLM (`gemini-pro`).
# 5. Output from the LLM is passed through an output parser
#    to structure the model's response.

# Função para imprimir o contexto
def print_context(context):
    print("Contexto fornecido:", context)
    return context


llm = GoogleGenerativeAI(model="gemini-pro", temperature=0.3, top_p=0.85)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | RunnableMap({"context": print_context, "question": RunnablePassthrough()})
    | llm_prompt
    | llm
    | StrOutputParser()
)

In [None]:
rag_chain.invoke("O que diz o topico 10.2.8.1 da norma regulamentadora NR 10?")

### test 2

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableMap

model = GoogleGenerativeAI(model="gemini-pro", temperature=0.3, top_p=0.85)

# Prompt template
template = """Você é um assistente chamado Spark e sua função é responder dúvidas e questionamentos relacionadas as instalações elétricas brasileiras com base no contexto, o qual pode incluir textos e/ou tabelas referentes as normas brasileiras (NBRs):
{context}
Question: {question}

"""
prompt = ChatPromptTemplate.from_template(template)

# Função para imprimir o contexto
def print_context(context):
    print("Contexto fornecido:", context)
    return context

# RAG pipeline com etapa intermediária para capturar e imprimir o contexto
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | RunnableMap({"context": print_context, "question": RunnablePassthrough()})
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
# Invocar a chain e imprimir o resultado
result = chain.invoke("O que diz o topico 10.2.8.1 da norma regulamentadora NR 10?")
print("Resposta:", result)

In [None]:
# Invocar a chain e imprimir o resultado
result = chain.invoke("De acordo com a NBR 14039, na tabela 40 - Fatores de correção de agrupamento, qual é o fator de correção para o número de 9 condutores isolados?")
print("Resposta:", result)

In [None]:
chain.invoke("De acordo com a NBR 14039, na tabela 40 - Fatores de correção de agrupamento, qual é o fator de correção para o número de 9 condutores isolados?")

In [None]:
chain.invoke("Qual a definição de barramento blindado segundo a NBR 14039?")

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableMap

model = GoogleGenerativeAI(model="gemini-pro", temperature=0.5, top_p=0.85)


retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

# Prompt template
template = """Você é um assistente chamado Spark e sua função é responder dúvidas e questionamentos relacionadas as instalações elétricas brasileiras com base no contexto, o qual pode incluir textos e/ou tabelas referentes as normas brasileiras (NBRs):
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Função para imprimir o contexto
def print_context(context):
    print("Contexto fornecido:", context)
    return context

# RAG pipeline com etapa intermediária para capturar e imprimir o contexto
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | RunnableMap({"context": print_context, "question": RunnablePassthrough()})
    | prompt
    | model
    | StrOutputParser()
)

# Invocar a chain e imprimir o resultado
result = chain.invoke("De acordo com a NBR 14039, na tabela 40 - Fatores de correção de agrupamento, qual é o fator de correção para o número de 9 condutores isolados?")
print("Resposta:", result)


### RAG pipeline with llama3

In [None]:
from langchain_community.llms import Ollama

model = Ollama(
    model="llama3"
)  # assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3 `


from langchain_core.runnables import RunnablePassthrough

# Prompt template
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# RAG pipeline
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
response = chain.invoke("De acordo com a NBR 14039, na tabela 39 - eletrodos de aterramento convencnionas, para o tipo de eletrodo de tubo de aço zincado, quais as dimensões mínimas?")

In [None]:
def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [None]:
to_markdown(response)

In [None]:
response = chain.invoke("Qual foi minha pergunta anterior?")

In [None]:
to_markdown(response)

# HELP

In [1]:
from langchain_core import Chroma
from langchain_core.documents import Document
from langchain.embeddings import OpenAIEmbeddings
import uuid

# Função de embedding de exemplo
def example_embedding_function(texts):
    # Retorna um embedding fictício (deve ser substituído pela sua função de embedding real)
    return [[float(i) for i in range(10)] for _ in texts]

# Diretório onde o vectorstore será persistido
persist_directory = "chroma_db/test_summaries/test"

# Inicializando o vectorstore
vectorstore = Chroma(
    collection_name="test",
    embedding_function=example_embedding_function,
    persist_directory=persist_directory
)

teste_do_desespero = ['oioi_teste_', 'ola']
id_key = 'docs_id_key'

# Add texts
doc_ids = [str(uuid.uuid4()) for _ in teste_do_desespero]
page_content_test = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(teste_do_desespero)
]

vectorstore.add_documents(page_content_test)


ImportError: cannot import name 'Chroma' from 'langchain_core.vectorstores' (/opt/homebrew/anaconda3/envs/tcc/lib/python3.12/site-packages/langchain_core/vectorstores.py)