In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from news_summarizer.domain.documents import Article

In [3]:
import re
import unicodedata

class TextTransformation:
    """Base type for a single text-cleaning step.

    Subclasses override :meth:`apply`; the implementation here always raises.
    """

    def apply(self, text: str) -> str:
        """Return the transformed version of ``text``.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

class StripWhitespace(TextTransformation):
    """Transformation that trims whitespace from both ends of the text."""

    def apply(self, text: str) -> str:
        """Return ``text`` without leading or trailing whitespace."""
        trimmed = text.strip()
        return trimmed

class RemoveEmojis(TextTransformation):
    """Transformation that deletes emoji characters from the text."""

    # Compiled once when the class is created rather than on every ``apply``
    # call. The character class lists the Unicode blocks treated as emoji.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
        "]+",
        flags=re.UNICODE,
    )

    def apply(self, text: str) -> str:
        """Return ``text`` with every run of matched emoji removed.

        Characters outside the listed blocks are left untouched; nothing is
        inserted in place of a removed emoji.
        """
        return self._EMOJI_PATTERN.sub('', text)

class RemoveNonAsciiExceptAccents(TextTransformation):
    """Transformation that drops non-ASCII characters unless they are letters.

    A character is kept when its code point is below 128, or when its Unicode
    general category starts with ``L`` (a letter category). Everything else,
    including non-ASCII punctuation, symbols and combining marks, is removed.
    """

    def apply(self, text: str) -> str:
        """Return ``text`` restricted to ASCII characters and Unicode letters."""
        kept = []
        for char in text:
            # Skip only characters that are both non-ASCII and not letters.
            if ord(char) >= 128 and not unicodedata.category(char).startswith('L'):
                continue
            kept.append(char)
        return ''.join(kept)

class ReplaceMultipleSpaces(TextTransformation):
    """Transformation that collapses each whitespace run into one space."""

    def apply(self, text: str) -> str:
        """Return ``text`` with every ``\\s+`` match replaced by ``' '``.

        Leading and trailing runs are collapsed to a single space as well;
        they are not removed.
        """
        whitespace_run = re.compile(r'\s+')
        return whitespace_run.sub(' ', text)

class TextPipeline:
    """Ordered collection of transformations applied one after another."""

    def __init__(self):
        # Steps are executed in the order they were added.
        self.transformations = []

    def add_transformation(self, transformation: TextTransformation):
        """Append ``transformation`` as the last step of the pipeline."""
        self.transformations.append(transformation)

    def execute(self, text: str) -> str:
        """Run ``text`` through every registered step and return the result.

        Each step receives the output of the previous one. With no steps
        registered, ``text`` is returned unchanged.
        """
        result = text
        for step in self.transformations:
            result = step.apply(result)
        return result

# Example usage: build the cleaning pipeline. Order matters, because each
# step receives the output of the step before it.
pipeline = TextPipeline()
pipeline.add_transformation(RemoveEmojis())
pipeline.add_transformation(RemoveNonAsciiExceptAccents())
pipeline.add_transformation(ReplaceMultipleSpaces())
# Strip last: removing emojis or other characters can expose whitespace at
# either end of the text, which ReplaceMultipleSpaces only collapses to a
# single space and does not remove.
pipeline.add_transformation(StripWhitespace())

In [None]:
# Clean every article's content with the pipeline and print it next to its
# running index. ``bulk_find`` is called without any keyword arguments.
for index, doc in enumerate(Article.bulk_find()):
    cleaned = pipeline.execute(doc.content)
    print(index, cleaned)