###  MANUAL

#### Por tamanho fixo

In [None]:
# --- FIXED-SIZE CHUNKING (CHARACTERS) ---
def chunk_by_chars(text, chunk_size=500, overlap=50):
    """Split *text* into fixed-width character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    Windows near the end of the text may be shorter than ``chunk_size``, and
    the final window can consist only of characters already present in the
    previous window.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of substrings of *text*, in order. Empty input yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A step of zero or less would never advance through the text, which
    # previously made the loop spin forever.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), step)
    ]

# Split the `lemmas` text into 100-character windows that share 10 characters.
chunks_chars = chunk_by_chars(lemmas, 100, 10)

# Report the number of chunks, then preview up to 300 characters of the first.
print(f"Total de chunks: {len(chunks_chars)}", "Exemplo de chunk:\n", sep="\n")
print(chunks_chars[0][:300])


Total de chunks: 9
Exemplo de chunk:

titulo introducao machine Learning machine learning Campo inteligencia artificial desenvolver algori


#### Por parágrafos

In [None]:
# --- PARAGRAPH-BASED CHUNKING ---
def chunk_by_paragraph(text):
    """Return the non-blank lines of *text*, trimmed of surrounding whitespace.

    The text is split on single newline characters; entries that are empty
    after stripping are discarded, so blank lines never produce a chunk.
    """
    trimmed = (line.strip() for line in text.split("\n"))
    return [line for line in trimmed if line]

# Split the `lemmas` text on newlines, keeping only the non-blank pieces.
chunks_paragraphs = chunk_by_paragraph(text=lemmas)

# Report the number of chunks, then preview up to 300 characters of the first.
print(f"Total de chunks: {len(chunks_paragraphs)}", "Primeiro par√°grafo:\n", sep="\n")
print(chunks_paragraphs[0][:300])


Total de chunks: 1
Primeiro par√°grafo:

titulo introducao machine Learning machine learning Campo inteligencia artificial desenvolver algoritmos capaz aprender padroe partir dado principal tipo incluir aprendizar supervisionar aprendizar nao supervisionar aprendizar reforco aprendizar supervisionar envolver treinar modelo dado rotular pre


#### Por Sentenças

In [None]:
# --- SENTENCE-BASED CHUNKING ---
from nltk.tokenize import sent_tokenize

def chunk_by_sentences(text, max_sentences=5):
    """Group the sentences of *text* into chunks of up to *max_sentences*.

    The text is split with ``sent_tokenize``; consecutive sentences are then
    joined with a single space. The last chunk may hold fewer sentences.
    """
    # NOTE(review): sent_tokenize is called without a language argument while
    # the sample text in this notebook looks Portuguese; confirm the
    # tokenizer's default model is the intended one.
    sentences = sent_tokenize(text)
    window_starts = range(0, len(sentences), max_sentences)
    return [
        " ".join(sentences[first:first + max_sentences])
        for first in window_starts
    ]

# Group the sentences of the `lemmas` text five at a time.
chunks_sentences = chunk_by_sentences(lemmas, 5)

# Report the number of chunks, then preview up to 300 characters of the first.
print(f"Total de chunks: {len(chunks_sentences)}", "Exemplo de chunk:\n", sep="\n")
print(chunks_sentences[0][:300])


Total de chunks: 1
Exemplo de chunk:

titulo introducao machine Learning machine learning Campo inteligencia artificial desenvolver algoritmos capaz aprender padroe partir dado principal tipo incluir aprendizar supervisionar aprendizar nao supervisionar aprendizar reforco aprendizar supervisionar envolver treinar modelo dado rotular pre


#### Híbrido

In [None]:
# --- HYBRID CHUNKING (SENTENCES UP TO A CHARACTER LIMIT) ---
from nltk.tokenize import sent_tokenize

def hybrid_sentence_chunk(text, max_chars=500):
    """Pack whole sentences into chunks of at most *max_chars* characters.

    Sentences come from ``sent_tokenize`` and are joined with a single space.
    A sentence is added to the current chunk only if the chunk, including the
    joining space, stays within ``max_chars``; otherwise the current chunk is
    closed and the sentence starts a new one.

    A single sentence longer than ``max_chars`` is never cut: it becomes a
    chunk of its own and is the only case in which a chunk exceeds the limit.

    Args:
        text: The text to split.
        max_chars: Character budget per chunk.

    Returns:
        A list of non-empty chunk strings, in document order.
    """
    chunks = []
    current = ""

    for sent in sent_tokenize(text):
        if not current:
            # Nothing pending: start a chunk with this sentence. Closing an
            # empty chunk here is what used to emit a spurious "" entry when
            # the first sentence was already longer than max_chars.
            current = sent
        elif len(current) + 1 + len(sent) <= max_chars:
            # The "+ 1" accounts for the joining space.
            current += " " + sent
        else:
            chunks.append(current)
            current = sent

    if current:
        chunks.append(current)

    # Trim stray whitespace and drop anything that ends up empty.
    return [chunk.strip() for chunk in chunks if chunk.strip()]

chunks_hybrid = hybrid_sentence_chunk(lemmas, max_chars=500)

print(f"Total de chunks: {len(chunks_hybrid)}")
print("Exemplo de chunk:\n")
# Index 0 is the first real chunk now that no empty leading chunk is produced
# (the previous code had to look at index 1 to skip it).
print(chunks_hybrid[0][:300])


Total de chunks: 2
Exemplo de chunk:

titulo introducao machine Learning machine learning Campo inteligencia artificial desenvolver algoritmos capaz aprender padroe partir dado principal tipo incluir aprendizar supervisionar aprendizar nao supervisionar aprendizar reforco aprendizar supervisionar envolver treinar modelo dado rotular pre


#### Recursivo

## Open AI

In [None]:
from openai import OpenAI
import numpy as np
import time

# Module-level OpenAI client used by embed_openai below. It is created with
# no arguments; the recorded cell output shows construction failing with an
# OpenAIError when no api_key is passed and OPENAI_API_KEY is not set.
client = OpenAI()

def embed_openai(chunks, model="text-embedding-3-small"):
    """Embed *chunks* with an OpenAI embedding model and report basic stats.

    Sends all chunks in a single ``client.embeddings.create`` request using
    the module-level ``client``, then prints the vector dimension, the number
    of chunks and the elapsed time.

    Args:
        chunks: The texts to embed, passed unchanged as the request ``input``.
        model: Name of the embedding model to request.

    Returns:
        A list with the ``embedding`` of every item in the response data.
        Empty input returns ``[]`` without contacting the API.
    """
    # Nothing to embed: skip the request, and avoid indexing vectors[0] on an
    # empty result further down.
    if not chunks:
        return []

    print(f"üî• Embedding usando modelo OpenAI: {model}")
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()

    response = client.embeddings.create(
        model=model,
        input=chunks,
    )

    vectors = [item.embedding for item in response.data]
    duration = round(time.perf_counter() - start, 3)

    print(f"Dimens√£o: {len(vectors[0])}")
    print(f"Total de chunks: {len(chunks)}")
    print(f"Tempo: {duration}s")

    return vectors

# Smoke test: embed the same chunks with the small and the large model.
# NOTE(review): chunks_recursive is assigned by the recursive-chunking cell
# that appears later in this file, so that cell must have been run first.
openai_vectors_small = embed_openai(chunks_recursive, model="text-embedding-3-small")
openai_vectors_large = embed_openai(chunks_recursive, model="text-embedding-3-large")


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
# --- RECURSIVE CHUNKING ---

def recursive_chunk(text, max_size=500, overlap=50):
    """Split *text* on progressively weaker separators until pieces fit.

    Simplified take on a LangChain-style recursive character splitter. The
    strongest separator present in the text is used first (blank line, then
    newline, then ". ", then space). Neighbouring pieces are packed together
    while the stripped result stays within ``max_size``; a piece that is still
    too long is split again with the next separator it contains, and as a last
    resort is cut every ``max_size`` characters.

    Separators stay attached to the piece they follow, so the original text is
    reproduced inside each chunk (only leading/trailing whitespace is
    trimmed). Chunks are returned in document order and are never empty when
    splitting takes place.

    Overlap is applied once, after splitting: every chunk except the first is
    prefixed with the last ``overlap`` characters of the chunk before it plus
    a space, so a returned chunk may be up to ``max_size + overlap + 1``
    characters long.

    Args:
        text: The text to split.
        max_size: Maximum chunk length before overlap is added; must be > 0.
        overlap: Number of trailing characters of the previous chunk repeated
            at the start of the next one; must be >= 0.

    Returns:
        A list of chunk strings. Text that already fits in ``max_size`` is
        returned unchanged as a single-element list.

    Raises:
        ValueError: If ``max_size`` is not positive or ``overlap`` is negative.
    """
    if max_size <= 0:
        raise ValueError("max_size must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must be non-negative")

    # Separator order: strongest to weakest.
    separators = ("\n\n", "\n", ". ", " ")

    def _split(piece):
        """Split a piece that is longer than max_size into fitting chunks."""
        sep = next((s for s in separators if s in piece), None)

        if sep is None:
            # No separator left: fall back to a plain character split.
            return [
                piece[i:i + max_size]
                for i in range(0, len(piece), max_size)
            ]

        parts = piece.split(sep)
        # Keep each separator glued to the part it follows; the last part has
        # none, so nothing is appended that was not in the original text.
        segments = [part + sep for part in parts[:-1]]
        segments.append(parts[-1])

        pieces = []
        current = ""

        for segment in segments:
            candidate = current + segment
            # Measure the stripped text: trailing separator whitespace is
            # removed before a chunk is emitted, so it should not count.
            if len(candidate.strip()) <= max_size:
                current = candidate
                continue

            # Flush what was gathered so far BEFORE handling this segment so
            # chunks stay in document order.
            if current.strip():
                pieces.append(current.strip())

            stripped = segment.strip()
            if len(stripped) > max_size:
                # Still too large: split it again with a weaker separator.
                # The stripped segment is strictly shorter than `piece`, so
                # the recursion always terminates.
                pieces.extend(_split(stripped))
                current = ""
            else:
                current = segment

        if current.strip():
            pieces.append(current.strip())

        return pieces

    if len(text) <= max_size:
        return [text]

    chunks = _split(text)

    # Apply overlap exactly once, using the chunks as they were split (not the
    # already-prefixed ones) as the source of each prefix.
    if overlap > 0:
        chunks = chunks[:1] + [
            previous[-overlap:] + " " + following
            for previous, following in zip(chunks, chunks[1:])
        ]

    return chunks


# ---- running on the cleaned text ----
chunks_recursive = recursive_chunk(lemmas, 500, 50)

# Report the number of chunks, then preview up to 400 characters of the first.
print(f"Total de chunks gerados: {len(chunks_recursive)}", "\nExemplo de chunk:\n", sep="\n")
print(chunks_recursive[0][:400])


Total de chunks gerados: 2

Exemplo de chunk:

titulo introducao machine Learning machine learning Campo inteligencia artificial desenvolver algoritmos capaz aprender padroe partir dado principal tipo incluir aprendizar supervisionar aprendizar nao supervisionar aprendizar reforco aprendizar supervisionar envolver treinar modelo dado rotular prever preco casa base caracteristico tamanho localizacao numero quarto aprendizar nao supervisionar de
