In [33]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
from news_summarizer.domain.documents import Article
from news_summarizer.domain.clean_documents import CleanedArticle

In [35]:
import re
import unicodedata

class TextTransformation:
    """Base interface for a single text-cleaning step.

    Concrete steps subclass this and override :meth:`apply`. The base
    implementation carries no behaviour of its own and only signals that the
    method has not been provided.
    """

    def apply(self, text: str) -> str:
        """Return the transformed version of ``text``.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError()

class StripWhitespace(TextTransformation):
    """Transformation that trims whitespace from both ends of the text."""

    def apply(self, text: str) -> str:
        """Return ``text`` without its leading and trailing whitespace."""
        trimmed = text.strip()
        return trimmed

class RemoveEmojis(TextTransformation):
    """Transformation that deletes emoji and pictographic symbols from the text.

    Every run of characters that falls in one of the Unicode ranges listed in
    ``_EMOJI_PATTERN`` is removed (replaced by the empty string). All other
    characters, including whitespace around the removed run, are left as is.
    """

    # Compiled once when the class is defined instead of on every ``apply`` call.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs (e.g. U+1F923, U+1F914)
        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
        "\u2600-\u26FF"          # miscellaneous symbols (e.g. U+2600 sun)
        "\u2700-\u27BF"          # dingbats (e.g. U+2764 heart, U+2728 sparkles)
        "\uFE0F"                 # variation selector-16, otherwise left dangling after removal
        "]+",
        flags=re.UNICODE,
    )

    def apply(self, text: str) -> str:
        """Return ``text`` with every matched emoji run removed."""
        return self._EMOJI_PATTERN.sub('', text)

class RemoveNonAsciiExceptAccents(TextTransformation):
    """Transformation that drops non-ASCII characters unless they are letters.

    A character is kept when its code point is below 128 or when its Unicode
    general category starts with ``L`` (any letter, which includes accented
    Latin letters such as ``é`` or ``ç``). Everything else is removed.
    """

    def apply(self, text: str) -> str:
        """Return ``text`` restricted to ASCII characters and Unicode letters.

        The text is first normalised to NFC. In decomposed form an accented
        letter is a base letter followed by a combining mark (category ``Mn``),
        and the filter below would discard the mark and silently turn ``é``
        into ``e``. Composing first keeps the accent as part of a single letter
        code point.
        """
        composed = unicodedata.normalize('NFC', text)
        return ''.join(
            c for c in composed
            if ord(c) < 128 or unicodedata.category(c).startswith('L')
        )

class ReplaceMultipleSpaces(TextTransformation):
    """Transformation that collapses each run of whitespace into one space."""

    def apply(self, text: str) -> str:
        """Return ``text`` with every whitespace run replaced by a single ' '."""
        whitespace_run = re.compile(r'\s+')
        return whitespace_run.sub(' ', text)

class TextPipeline:
    """Ordered chain of text transformations applied one after another."""

    def __init__(self):
        # Steps run in the order in which they were added.
        self.transformations = list()

    def add_transformation(self, transformation: TextTransformation):
        """Register ``transformation`` as the last step of the pipeline."""
        self.transformations.append(transformation)

    def execute(self, text: str) -> str:
        """Run ``text`` through every registered step and return the result.

        ``None`` is mapped to the empty string without running any step.
        """
        if text is None:
            return ''

        result = text
        for step in self.transformations:
            result = step.apply(result)
        return result

# Cleaning pipeline applied to the free-text fields of each article.
# The order of the steps matters:
#   1. emojis are removed first;
#   2. whitespace runs are collapsed BEFORE the non-ASCII filter, so non-ASCII
#      whitespace (e.g. a non-breaking space) becomes a plain space instead of
#      being deleted, which would glue the neighbouring words together;
#   3. non-ASCII characters that are not letters are dropped;
#   4. whitespace is collapsed again to close gaps left by removed characters;
#   5. the text is trimmed last, so spaces exposed at either end by the earlier
#      removals do not survive.
pipeline = TextPipeline()
pipeline.add_transformation(RemoveEmojis())
pipeline.add_transformation(ReplaceMultipleSpaces())
pipeline.add_transformation(RemoveNonAsciiExceptAccents())
pipeline.add_transformation(ReplaceMultipleSpaces())
pipeline.add_transformation(StripWhitespace())

In [36]:
# Fetch the Article records with no filter arguments and materialise them as a list.
document_list = list(Article.bulk_find())

In [37]:
# Build one CleanedArticle per loaded Article. Only the free-text fields
# (title, subtitle, content) are run through the cleaning pipeline; id, author,
# publication date and URL are copied over unchanged.
cleaned_documents_list = []

# NOTE(review): the enumerate index `i` is not used inside this loop.
for i, article in enumerate(document_list):
    cleaned_article = CleanedArticle(
        id=article.id,
        title=pipeline.execute(article.title),
        author=article.author,
        content=pipeline.execute(article.content),
        subtitle=pipeline.execute(article.subtitle),
        publication_date=article.publication_date,
        url=article.url
    )

    cleaned_documents_list.append(cleaned_article)

In [38]:
# Configure root logging at DEBUG level so that library log records (for
# example the httpx request lines shown in the outputs of the following cells)
# are printed in the notebook, and create a logger for this notebook.
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

In [39]:
# Persist all cleaned articles in one bulk call; the logged request shows them
# being written to the `cleaned_articles` collection.
CleanedArticle.bulk_insert(cleaned_documents_list)

INFO:httpx:HTTP Request: PUT http://localhost:6333/collections/cleaned_articles/points?wait=true "HTTP/1.1 200 OK"


True

In [40]:
# Read the stored cleaned articles back (no filter arguments) to check the insert.
CleanedArticle.bulk_find()

INFO:httpx:HTTP Request: POST http://localhost:6333/collections/cleaned_articles/points/scroll "HTTP/1.1 200 OK"


([CleanedArticle(id=UUID('0005fcb9-638b-4695-9319-f472568aad7d'), title='Lula está estável e conversando normalmente após cirurgia, diz médico', subtitle='Presidente teve funções neurológicas preservadas e procedimento decorre do acidente doméstico sofrido em 19 de outubro', author='Plínio Aguiar, do R7, em Brasília', publication_date=datetime.datetime(2024, 12, 10, 13, 15, 11, 788000), content='O presidente Luiz Inácio Lula da Silva tem quadro de saúde estável após a cirurgia realizada na madrugada desta terça-feira (10). De acordo com o médico Roberto Kalil Filho, o petista voltou do procedimento cirúrgico praticamente acordado e conversando normalmente. Não há, por enquanto, data de alta médica, mas a equipe fala em retorno a Brasília no início da próxima semana. Os profissionais clínicos destacam, ainda, que não há lesões cerebrais. O presidente foi submetido a uma cirurgia para drenagem do hematoma do sangramento no cérebro. O presidente evoluiu bem, já chegou da cirurgia praticam

In [71]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

# Split points handed to RecursiveCharacterTextSplitter through its
# `separators` argument in chunk_text.
# NOTE(review): presumably tried in list order, coarsest first — confirm
# against the LangChain documentation before reordering.
separators = [
    "\n\n",  # Paragraph breaks
    "\n",    # Line breaks
    " ",     # Spaces
    ".",     # Periods
    ",",     # Commas
    "!",     # Exclamation marks
    "?",     # Question marks
    ";",     # Semicolons
    ":",     # Colons
    "\u2026",  # Ellipsis (…)
    "\u00A0",  # Non-breaking space
]

# Token splitters that have already been built, keyed by their configuration.
# Creating a SentenceTransformersTokenTextSplitter loads the sentence-transformers
# model (see the "Load pretrained SentenceTransformer" log line), so each
# configuration is built once instead of once per chunk_text call.
_token_splitter_cache = {}


def _get_token_splitter(chunk_overlap: int, tokens_per_chunk: int, model_name: str):
    """Return the token splitter for this configuration, building it on first use."""
    key = (chunk_overlap, tokens_per_chunk, model_name)
    if key not in _token_splitter_cache:
        _token_splitter_cache[key] = SentenceTransformersTokenTextSplitter(
            chunk_overlap=chunk_overlap,
            tokens_per_chunk=tokens_per_chunk,
            model_name=model_name,
        )
    return _token_splitter_cache[key]


def chunk_text(
    text: str,
    chunk_size: int = 250,
    chunk_overlap: int = 25,
    tokens_per_chunk: int = 128,
    # Alternative model left by the author: 'sentence-transformers/all-MiniLM-L6-v2'
    model_name: str = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
) -> list[str]:
    """Split ``text`` into chunks in two passes.

    The text is first cut by RecursiveCharacterTextSplitter using the
    module-level ``separators`` with ``chunk_size`` and no overlap. Each
    resulting section is then passed through a
    SentenceTransformersTokenTextSplitter and the pieces are concatenated in
    order.

    Args:
        text: The text to split. Empty or ``None`` input yields an empty list.
        chunk_size: ``chunk_size`` given to the character splitter.
        chunk_overlap: ``chunk_overlap`` given to the token splitter (the
            character splitter always uses an overlap of 0).
        tokens_per_chunk: ``tokens_per_chunk`` given to the token splitter.
        model_name: Sentence-transformers model used by the token splitter.

    Returns:
        The list of chunk strings, in document order.
    """
    if not text:
        return []

    character_splitter = RecursiveCharacterTextSplitter(
        separators=separators, chunk_size=chunk_size, chunk_overlap=0
    )
    text_split_by_characters = character_splitter.split_text(text)

    # Reuse the splitter (and its loaded model) across calls.
    token_splitter = _get_token_splitter(chunk_overlap, tokens_per_chunk, model_name)

    chunks_by_tokens = []
    for section in text_split_by_characters:
        chunks_by_tokens.extend(token_splitter.split_text(section))

    return chunks_by_tokens

In [66]:
from news_summarizer.domain.clean_documents import CleanedArticle
from news_summarizer.domain.chunks import ArticleChunk

In [75]:
import hashlib
from uuid import UUID


def chunk(
    data_model: CleanedArticle,
    chunk_size: int = 250,
    chunk_overlap: int = 25,
) -> "list[ArticleChunk]":
    """Split a cleaned article's content into ArticleChunk models.

    The article content is split with ``chunk_text``. One ArticleChunk is built
    per resulting piece, copying title, subtitle, author, publication date and
    URL from ``data_model``.

    Args:
        data_model: The cleaned article whose ``content`` is chunked.
        chunk_size: Forwarded to ``chunk_text`` and recorded in each chunk's
            metadata.
        chunk_overlap: Forwarded to ``chunk_text`` and recorded in each chunk's
            metadata.

    Returns:
        The ArticleChunk models, in the order the pieces were produced.
    """
    pieces = chunk_text(
        data_model.content, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    data_models_list = []
    for piece in pieces:
        # Deterministic id: the MD5 hex digest of the chunk text, read as a UUID.
        # NOTE(review): the id depends only on the chunk text, so identical text
        # coming from two different articles produces the same id — confirm that
        # this is acceptable for the target store.
        chunk_id = hashlib.md5(piece.encode()).hexdigest()
        model = ArticleChunk(
            id=UUID(chunk_id, version=4),
            title=data_model.title,
            subtitle=data_model.subtitle,
            content=piece,
            author=data_model.author,
            publication_date=data_model.publication_date,
            url=data_model.url,
            # Record the parameters actually used, not separate literals.
            metadata={'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap},
        )
        data_models_list.append(model)
    return data_models_list

In [76]:
# Smoke check: chunk the first cleaned article and display the resulting models.
chunk(cleaned_documents_list[0])

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


[ArticleChunk(id=UUID('26ca57f8-ace6-4d93-879d-110fb2a6a1de'), title='As ferramentas medievais utilizadas na reconstrução da catedral de Notre-Dame', subtitle='Pouco mais de cinco anos após sofrer um incêndio que a devastou, a catedral de Notre-Dame de Paris é reaberta neste sábado.', author=None, publication_date=datetime.datetime(2024, 12, 7, 11, 23, 57, 911000), content='Foram usados machados medievais criados pela Maison Luquet, ferreiro situado no leste da França, para o processo de restauração Foto: Fabrice WITTNER White Fall Lab Pouco mais de cinco anos após sofrer um incêndio que a devastou, a catedral de', url=Url('https://g1.globo.com/turismo-e-viagem/noticia/2024/12/07/as-ferramentas-medievais-utilizadas-na-reconstrucao-da-catedral-de-notre-dame.ghtml'), metadata={'chunk_size': 250, 'chunk_overlap': 25}),
 ArticleChunk(id=UUID('ca65221a-3e25-4145-8f32-c52ae7cf5bdb'), title='As ferramentas medievais utilizadas na reconstrução da catedral de Notre-Dame', subtitle='Pouco mais d