In [1]:

import re
from typing import List, Optional, Tuple

class TextTokenizer:
    """An enhanced text tokenizer that handles various text elements including words,
    contractions, numbers, punctuation, and special characters, using named regex groups.
    """
    
    # Token specification with named groups
    TOKEN_SPECIFICATION = [
        ('NUMBER',      r'\b\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\b'),      # Integer or decimal number
        ('WORD',        r'\b\w+(?:-\w+)*\b'),                        # Words (including hyphenated)
        ('CONTRACTION', r"\b\w+'(?:\w+)?\b"),                        # Contractions like can't, don't
        ('PUNCTUATION', r'[.,!?;:"\'()\[\]{}]'),                     # Punctuation
        ('SPECIAL',     r'[$%&@#^*+=<>~`|\\/]'),                     # Special characters
        ('OTHER',       r'\S'),                                       # Any non-whitespace character
    ]
    
    def __init__(self, custom_patterns: Optional[List[Tuple[str, str]]] = None):
        """Initialize the tokenizer with optional custom patterns.
        
        Args:
            custom_patterns (list of tuples, optional): List of (name, pattern) tuples
                to include in tokenization.
        """
        self.token_specification = self.TOKEN_SPECIFICATION.copy()
        if custom_patterns:
            self.token_specification.extend(custom_patterns)
        
        # Combine all patterns into a single regex
        self.tokenizer_pattern = '|'.join('(?P<%s>%s)' % pair for pair in self.token_specification)
        self.compiled_pattern = re.compile(self.tokenizer_pattern)
    
    def tokenize(self, text: str, lowercase: bool = False) -> List[str]:
        """Tokenize the input text.
        
        Args:
            text (str): Input text to tokenize
            lowercase (bool): Whether to convert tokens to lowercase
        
        Returns:
            List[str]: List of tokens
        """
        if not text:
            return []
        
        if lowercase:
            text = text.lower()
        
        tokens = [match.group() for match in self.compiled_pattern.finditer(text)]
        return tokens
    
    def tokenize_with_types(self, text: str) -> List[Tuple[str, str]]:
        """Tokenize text and identify the type of each token.
        
        Args:
            text (str): Input text to tokenize
        
        Returns:
            List[tuple]: List of (token, token_type) pairs
        """
        tokens = []
        for match in self.compiled_pattern.finditer(text):
            token_type = match.lastgroup
            token = match.group()
            tokens.append((token, token_type))
        return tokens

def main():
    # Define your custom patterns
    custom_patterns = [
        ('EMOJI', r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF]'),  # Match emojis
        ('HASHTAG', r'#\w+'),                                         # Match hashtags
    ]
    
    # Initialize the tokenizer with custom patterns
    tokenizer = TextTokenizer(custom_patterns=custom_patterns)
    
    # Test cases
    test_texts = [
        "I love Python! 😊 #coding",
        "Let's meet at 5pm #meeting 🕔",
        "Stay positive! 👍 #motivation",
    ]
    
    for text in test_texts:
        print("\nOriginal text:", text)
        print("Tokens:", tokenizer.tokenize(text))
        print("Tokens with types:")
        for token, token_type in tokenizer.tokenize_with_types(text):
            print(f"  {token}: {token_type}")

if __name__ == "__main__":
    main()



Original text: I love Python! 😊 #coding
Tokens: ['I', 'love', 'Python', '!', '😊', '#', 'coding']
Tokens with types:
  I: WORD
  love: WORD
  Python: WORD
  !: PUNCTUATION
  😊: OTHER
  #: SPECIAL
  coding: WORD

Original text: Let's meet at 5pm #meeting 🕔
Tokens: ['Let', "'", 's', 'meet', 'at', '5pm', '#', 'meeting', '🕔']
Tokens with types:
  Let: WORD
  ': PUNCTUATION
  s: WORD
  meet: WORD
  at: WORD
  5pm: WORD
  #: SPECIAL
  meeting: WORD
  🕔: OTHER

Original text: Stay positive! 👍 #motivation
Tokens: ['Stay', 'positive', '!', '👍', '#', 'motivation']
Tokens with types:
  Stay: WORD
  positive: WORD
  !: PUNCTUATION
  👍: OTHER
  #: SPECIAL
  motivation: WORD
