In [None]:
# IPython autoreload, mode 2: reload imported modules before each cell executes.
%load_ext autoreload
%autoreload 2

In [86]:
import logging

# getLogger() with no name returns the root logger; set its level to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from news_summarizer.domain.documents import Article
from news_summarizer.domain.clean_documents import CleanedArticle

In [61]:
import re
import unicodedata

class TextTransformation:
    """Base interface for a single text-cleaning step.

    Subclasses are expected to override ``apply``; the implementation here
    always raises.
    """

    def apply(self, text: str) -> str:
        """Return the transformed version of ``text``.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

class StripWhitespace(TextTransformation):
    """Transformation that trims whitespace from both ends of the text."""

    def apply(self, text: str) -> str:
        """Return ``text`` without leading or trailing whitespace."""
        trimmed = text.strip()
        return trimmed

class RemoveEmojis(TextTransformation):
    """Transformation that deletes emoji characters from the text.

    Only code points inside the four ranges listed in ``_EMOJI_PATTERN`` are
    removed; any other character is left untouched.
    """

    # Compiled once when the class is defined rather than on every apply() call.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE,
    )

    def apply(self, text: str) -> str:
        """Return ``text`` with every run of matched emoji characters removed."""
        return self._EMOJI_PATTERN.sub('', text)

class RemoveNonAsciiExceptAccents(TextTransformation):
    """Transformation that drops non-ASCII characters other than letters.

    A character is kept when it is ASCII (code point below 128) or when its
    Unicode general category starts with ``L`` (any letter, which includes
    accented letters). Every other non-ASCII character, such as typographic
    punctuation or non-breaking spaces, is removed outright.
    """

    def apply(self, text: str) -> str:
        """Return ``text`` restricted to ASCII characters and Unicode letters."""
        # Compose to NFC first: in decomposed input an accent is a separate
        # combining mark (category 'Mn'), which the filter below would drop,
        # silently turning e.g. 'e' + U+0301 into a plain 'e'.
        composed = unicodedata.normalize('NFC', text)
        return ''.join(
            c for c in composed
            if ord(c) < 128 or unicodedata.category(c).startswith('L')
        )

class ReplaceMultipleSpaces(TextTransformation):
    """Transformation that collapses each run of whitespace into one space."""

    def apply(self, text: str) -> str:
        """Return ``text`` with every whitespace run replaced by a single space."""
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed

class TextPipeline:
    """Ordered list of TextTransformation steps applied one after another."""

    def __init__(self):
        """Create a pipeline with no registered transformations."""
        self.transformations = []

    def add_transformation(self, transformation: TextTransformation):
        """Register ``transformation`` as the last step of the pipeline."""
        self.transformations.append(transformation)

    def execute(self, text: str) -> str:
        """Run ``text`` through every registered step, in insertion order.

        Returns:
            str: The transformed text, or ``''`` when ``text`` is ``None``.
        """
        if text is None:
            return ''

        result = text
        for step in self.transformations:
            result = step.apply(result)
        return result

# Cleaning pipeline used by the article-cleaning cell below.
# Step order matters:
#   1. Collapse whitespace first: the \s+ pattern also matches non-ASCII
#      whitespace such as U+00A0, turning it into a plain space before the
#      non-ASCII filter would delete it and glue neighbouring words together.
#   2. Remove emojis, then any remaining non-ASCII non-letter characters.
#   3. Collapse again, because removing characters can leave double spaces.
#   4. Strip last, so spaces exposed at either end by the removals
#      (e.g. "Hello 😀" -> "Hello ") do not survive.
pipeline = TextPipeline()
pipeline.add_transformation(ReplaceMultipleSpaces())
pipeline.add_transformation(RemoveEmojis())
pipeline.add_transformation(RemoveNonAsciiExceptAccents())
pipeline.add_transformation(ReplaceMultipleSpaces())
pipeline.add_transformation(StripWhitespace())

In [62]:
# Materialize the result of Article.bulk_find (called without filter arguments) as a list.
document_list = [*Article.bulk_find()]

In [63]:
# Build one CleanedArticle per loaded Article. Title, content and subtitle go
# through the cleaning pipeline; the remaining fields are copied unchanged.
cleaned_documents_list = [
    CleanedArticle(
        id=article.id,
        title=pipeline.execute(article.title),
        author=article.author,
        content=pipeline.execute(article.content),
        subtitle=pipeline.execute(article.subtitle),
        publication_date=article.publication_date,
        url=article.url,
    )
    for article in document_list
]

In [64]:
from typing import Generator

def batch(list_: list, size: int) -> Generator[list, None, None]:
    """Yield consecutive slices of ``list_`` holding at most ``size`` items.

    Args:
        list_ (list): The sequence to split.
        size (int): Maximum number of items per yielded slice; must be positive.

    Yields:
        list: Slices of ``list_`` in order; the last one may be shorter.

    Raises:
        ValueError: If ``size`` is not positive. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject it explicitly (a zero step already raised ValueError).
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")

    for start in range(0, len(list_), size):
        yield list_[start : start + size]

In [65]:
from typing import List
from uuid import UUID
import hashlib
from typing import List, Union
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from news_summarizer.domain.chunks import ArticleChunk
from news_summarizer.domain.clean_documents import CleanedArticle
from news_summarizer.domain.embeddeg_chunks import EmbeddedArticleChunk

class ChunkingService:
    """Splits cleaned articles into ArticleChunk records.

    Text is split in two passes: first by characters with a
    RecursiveCharacterTextSplitter, then every resulting piece is split again
    by tokens with a SentenceTransformersTokenTextSplitter.
    """

    def __init__(
        self,
        separators: List[str],
        character_chunk_size: int = 250,
        character_chunk_overlap: int = 0,
        token_chunk_size: int = 128,
        token_chunk_overlap: int = 25,
        token_model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    ):
        """
        Initializes the ChunkingService with configuration parameters.

        Args:
            separators (List[str]): List of separators for character-based splitting.
            character_chunk_size (int): Size of chunks for character splitting.
            character_chunk_overlap (int): Overlap between character chunks.
            token_chunk_size (int): Maximum token count per chunk for token splitting.
            token_chunk_overlap (int): Overlap between token chunks.
            token_model_name (str): Name of the model for token-based splitting.
        """
        self.character_splitter = RecursiveCharacterTextSplitter(
            separators=separators,
            chunk_size=character_chunk_size,
            chunk_overlap=character_chunk_overlap,
        )
        self.token_splitter = SentenceTransformersTokenTextSplitter(
            chunk_overlap=token_chunk_overlap,
            tokens_per_chunk=token_chunk_size,
            model_name=token_model_name,
        )
        # Kept only so chunk() can record them in each chunk's metadata.
        self.character_chunk_size = character_chunk_size
        self.token_chunk_overlap = token_chunk_overlap

    def split_text_into_chunks(self, text: str) -> List[str]:
        """
        Splits a given text into chunks based on characters and tokens.

        Args:
            text (str): The text to split into chunks.

        Returns:
            List[str]: A list of text chunks.
        """
        # Step 1: Split text into character-based chunks
        character_chunks = self.character_splitter.split_text(text)

        # Step 2: Further split character chunks into token-based chunks
        token_chunks = []
        for chunk in character_chunks:
            token_chunks.extend(self.token_splitter.split_text(chunk))

        return token_chunks

    def chunk(self, data_model: CleanedArticle) -> List[ArticleChunk]:
        """
        Converts a CleanedArticle into a list of ArticleChunks.

        Each chunk copies the article's title, subtitle, author, publication
        date and url, and references the article through ``document_id``.
        Chunk ids are deterministic: the same article and chunk text always
        produce the same id.

        Args:
            data_model (CleanedArticle): The article to chunk.

        Returns:
            List[ArticleChunk]: The list of resulting ArticleChunks.
        """
        # Generate text chunks
        text_chunks = self.split_text_into_chunks(data_model.content)

        # Convert text chunks into ArticleChunk models
        article_chunks = []
        for chunk in text_chunks:
            # Hash the owning document id together with the chunk text.
            # Hashing the text alone gives identical ids to identical passages
            # in different articles, so one chunk could overwrite the other
            # when both are stored under their id.
            chunk_id = hashlib.md5(
                f"{data_model.id}:{chunk}".encode(),
                usedforsecurity=False,  # identifier only, not a security digest
            ).hexdigest()
            article_chunk = ArticleChunk(
                id=UUID(chunk_id, version=4),
                title=data_model.title,
                subtitle=data_model.subtitle,
                content=chunk,
                author=data_model.author,
                publication_date=data_model.publication_date,
                url=data_model.url,
                document_id=data_model.id,
                metadata={
                    # Note the mixed units: size is in characters, overlap in tokens.
                    "chunk_size": self.character_chunk_size,
                    "chunk_overlap": self.token_chunk_overlap,
                },
            )
            article_chunks.append(article_chunk)

        return article_chunks

class EmbedderService:
    """Turns ArticleChunks into EmbeddedArticleChunks using an embedding model."""

    def __init__(self, embedder):
        """
        Initializes the EmbedderService with the embedding model.

        Args:
            embedder: An embedding model instance. It is called as
                ``embedder(texts, to_list=True)`` and its ``model_id``,
                ``embedding_size`` and ``max_input_length`` attributes are read.
        """
        self.embedder = embedder

    def create_embedded_chunk(self, data_model: ArticleChunk, embedding: List[float]) -> EmbeddedArticleChunk:
        """
        Creates an EmbeddedArticleChunk from a given ArticleChunk and its embedding.

        Args:
            data_model (ArticleChunk): The chunk whose fields are copied.
            embedding (List[float]): The embedding vector for the chunk content.

        Returns:
            EmbeddedArticleChunk: The chunk fields plus the embedding and
            embedding-model metadata.
        """
        # NOTE(review): the chunk's publication_date is not forwarded here;
        # confirm whether EmbeddedArticleChunk is meant to carry it.
        return EmbeddedArticleChunk(
            id=data_model.id,
            title=data_model.title,
            subtitle=data_model.subtitle,
            author=data_model.author,
            content=data_model.content,
            url=data_model.url,
            document_id=data_model.document_id,
            embedding=embedding,
            metadata={
                "embedding_model_id": self.embedder.model_id,
                "embedding_size": self.embedder.embedding_size,
                "max_input_length": self.embedder.max_input_length,
            },
        )

    def embed_batch(self, data_models: List[ArticleChunk]) -> List[EmbeddedArticleChunk]:
        """
        Embeds a batch of ArticleChunks into EmbeddedArticleChunks.

        Args:
            data_models (List[ArticleChunk]): The chunks to embed.

        Returns:
            List[EmbeddedArticleChunk]: One embedded chunk per input chunk, in
            order; an empty list when ``data_models`` is empty.
        """
        # Nothing to embed: skip the model call entirely.
        if not data_models:
            return []

        embedding_inputs = [chunk.content for chunk in data_models]
        embeddings = self.embedder(embedding_inputs, to_list=True)
        # strict=False: if the embedder returned fewer vectors than inputs,
        # the surplus chunks are dropped without an error.
        return [
            self.create_embedded_chunk(chunk, embedding)
            for chunk, embedding in zip(data_models, embeddings, strict=False)
        ]

    def embed(self, data_model: Union[ArticleChunk, List[ArticleChunk]]) -> Union[EmbeddedArticleChunk, List[EmbeddedArticleChunk]]:
        """
        Embeds one or more ArticleChunks.

        Args:
            data_model: A single ArticleChunk or a ``list`` of them. Only
                ``list`` instances are treated as a batch.

        Returns:
            A single EmbeddedArticleChunk for a single input, otherwise a list.
        """
        is_single_instance = not isinstance(data_model, list)
        data_models = [data_model] if is_single_instance else data_model
        embedded_chunks = self.embed_batch(data_models)
        return embedded_chunks[0] if is_single_instance else embedded_chunks


In [None]:
from news_summarizer.embeddings import EmbeddingModel

# Separators passed to ChunkingService for its character-based splitter.
# NOTE(review): the cleaning pipeline's ReplaceMultipleSpaces step replaces every
# whitespace run with a single space before chunking, so the newline separators
# presumably never match on cleaned content — confirm.
separators = [
    "\n\n",  # Paragraph breaks
    "\n",    # Line breaks
    " ",     # Spaces
    ".",     # Periods
    ",",     # Commas
    "!",     # Exclamation marks
    "?",     # Question marks
    ";",     # Semicolons
    ":",     # Colons
    "\u2026",  # Ellipsis (…)
    "\u00A0",  # Non-breaking space
]

# The model id below is the same string as ChunkingService's default token_model_name.
# NOTE(review): device is hard-coded to 'cuda'; presumably this needs a GPU — confirm.
embedder = EmbeddingModel(model_id='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device='cuda', cache_dir=None)
chunking_service = ChunkingService(separators=separators)
embedder_service = EmbedderService(embedder)

In [67]:
# Alias the cleaned articles under the name passed to chunk_and_embed below.
cleaned_documents = cleaned_documents_list

def chunk_and_embed(cleaned_documents, batch_size: int = 10):
    """Chunk every document and embed the chunks in fixed-size batches.

    Uses the module-level ``chunking_service``, ``embedder_service`` and
    ``batch`` helper defined in earlier cells.

    Args:
        cleaned_documents: Iterable of cleaned articles accepted by
            ``chunking_service.chunk``.
        batch_size (int): Number of chunks sent to the embedder per call.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        list: All embedded chunks, in document order then chunk order.
    """
    embedded_chunks = []
    for document in cleaned_documents:
        chunks = chunking_service.chunk(document)

        # Batches never span two documents; each document is batched on its own.
        for batched_chunks in batch(chunks, batch_size):
            embedded_chunks.extend(embedder_service.embed(batched_chunks))

    return embedded_chunks

In [68]:
# Chunk and embed every cleaned article; the result is inserted by a later cell.
embedded_chunks = chunk_and_embed(cleaned_documents)

In [107]:
# Optional reset, left disabled: uncomment to delete the existing collection before re-inserting.
#from news_summarizer.database.qdrant import connection
#connection.delete_collection(EmbeddedArticleChunk.Config.name)

In [None]:
# Insert all embedded chunks with a single bulk_insert call.
EmbeddedArticleChunk.bulk_insert(embedded_chunks)

In [None]:
def fetch_all_embeddings():
    """Collect every page returned by ``EmbeddedArticleChunk.bulk_find``.

    ``bulk_find`` is called repeatedly, feeding the offset returned by one call
    into the next, until it returns ``None`` as the next offset.

    Returns:
        list: The items of all pages, concatenated in retrieval order.
    """
    collected = []
    cursor = None

    while True:
        page, cursor = EmbeddedArticleChunk.bulk_find(offset=cursor)
        collected.extend(page)

        # A None offset signals that the last page has been read.
        if cursor is None:
            return collected

# Fetch every page of stored embedded chunks and display how many came back.
all_embeddings = fetch_all_embeddings()
len(all_embeddings)