# Import Libraries

In [None]:
import re
import unicodedata
from collections import Counter, defaultdict
import pandas as pd
from typing import List, Dict, Tuple
import matplotlib.pyplot as plt
import seaborn as sns
import os
import urllib.request

# Configure Bangla Fonts

In [None]:

def setup_bangla_fonts():
    """Download the Noto Sans Bengali font if needed and register it with matplotlib.

    The font file is stored under ``/usr/share/fonts/truetype/bangla/``. It is
    downloaded only when it is not already present, and it is registered with
    matplotlib's font manager on every call, so a previously downloaded file
    is usable without wiping the font cache or restarting.

    Returns:
        The font family name to put in ``plt.rcParams['font.family']``.

    Raises:
        OSError / urllib.error.URLError: if the font directory cannot be
            created or the download fails.
    """
    import matplotlib.font_manager as fm

    font_dir = '/usr/share/fonts/truetype/bangla/'
    os.makedirs(font_dir, exist_ok=True)

    # Download Noto Sans Bengali font
    font_url = 'https://github.com/googlefonts/noto-fonts/raw/main/hinted/ttf/NotoSansBengali/NotoSansBengali-Regular.ttf'
    font_path = os.path.join(font_dir, 'NotoSansBengali-Regular.ttf')

    if not os.path.exists(font_path):
        print("Downloading Bangla font...")
        # Download to a temporary name and rename afterwards, so an interrupted
        # download never leaves a truncated file that later runs would accept.
        partial_path = font_path + '.part'
        try:
            urllib.request.urlretrieve(font_url, partial_path)
            os.replace(partial_path, font_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Font downloaded successfully!")

    # Register the file with the live font manager. This is best-effort:
    # plotting still works with the fallback fonts if registration fails.
    try:
        fm.fontManager.addfont(font_path)
        print("Font registered with matplotlib successfully!")
    except Exception as e:
        print(f"  Font registration failed: {e}")
        print("You may need to restart the runtime for fonts to work properly")

    return 'Noto Sans Bengali'

# Make the Bangla font available and remember its family name
font_name = setup_bangla_fonts()

# Apply both matplotlib settings in one call: the Bangla font comes first in
# the family list, followed by the generic fallbacks
plt.rcParams.update({
    'font.family': [font_name, 'DejaVu Sans', 'Liberation Sans', 'sans-serif'],
    'axes.unicode_minus': False,  # Handle minus signs properly
})

print(f"Font configured: {font_name}")
print("If Bangla text doesn't display properly, restart runtime and run again")


Font configured: Noto Sans Bengali
If Bangla text doesn't display properly, restart runtime and run again


# Tokenizer Class

In [None]:
class BanglaTokenizer:
    """
    Regex-based tokenizer for Bangla text.

    Provides word-level and character-level tokenization. Both modes apply
    Unicode NFC normalization first and then scan the text left to right,
    trying the labelled patterns in declaration order; the first pattern that
    matches at the current position wins.

    Every token is a dict with the keys ``token`` (matched text), ``type``
    (pattern label), ``position`` (start index in the normalized text) and
    ``length`` (number of code points).
    """

    def __init__(self):
        # Word-level tokenization patterns (tried in this order)
        self.word_patterns = {
            # Bangla block minus the digits U+09E6-U+09EF; without the
            # exclusion WORD would swallow every digit run and NUMBER could
            # never match.
            'WORD': r'[\u0980-\u09E5\u09F0-\u09FF]+',
            'NUMBER': r'[\u09E6-\u09EF]+',  # Bangla digits
            'PUNCTUATION': r'[।,?!;:]|\u0964|\u0965',  # Bangla and borrowed punctuation
            'WHITESPACE': r'\s+',  # Standard whitespace
            'LATIN_WORD': r'[a-zA-Z]+',  # Latin characters (for mixed text)
            'ARABIC_NUM': r'[0-9]+',  # Arabic numerals
            'OTHER': r'[^\s]'  # Any other character
        }

        # Character-level tokenization patterns (hierarchical precedence)
        # NOTE(review): the signs U+0981-U+0983 and khanda ta U+09CE are not
        # covered by any Bangla-specific pattern here and fall through to OTHER.
        self.char_patterns = {
            'COMPOUND': r'[\u0995-\u09B9](?:\u09CD[\u0995-\u09B9])+',  # Compound consonants
            'CONSONANT_VOWEL': r'[\u0995-\u09B9][\u09BE-\u09CC]',  # Consonant + vowel diacritic
            'CONSONANT': r'[\u0995-\u09B9]',  # Individual consonants
            'INDEPENDENT_VOWEL': r'[\u0985-\u0994]',  # Independent vowels
            'DEPENDENT_VOWEL': r'[\u09BE-\u09CC]',  # Dependent vowel signs
            'MODIFIER': r'[\u09BC\u09CD\u09D7]',  # Modifiers and diacritics
            'BANGLA_DIGIT': r'[\u09E6-\u09EF]',  # Bangla digits
            'PUNCTUATION': r'[।,?!;:]|\u0964|\u0965',  # Punctuation
            'WHITESPACE': r'\s',  # Individual whitespace
            'LATIN_CHAR': r'[a-zA-Z]',  # Latin characters
            'ARABIC_DIGIT': r'[0-9]',  # Arabic numerals
            'OTHER': r'.'  # Any other character
        }

        # Compile patterns once so the scan loops do not recompile them
        self.word_regex = self._compile_patterns(self.word_patterns)
        self.char_regex = self._compile_patterns(self.char_patterns)

    def _compile_patterns(self, patterns: Dict[str, str]) -> List[Tuple[str, re.Pattern]]:
        """Compile each pattern, keeping the dict's order as match precedence."""
        return [(label, re.compile(pattern)) for label, pattern in patterns.items()]

    def normalize_text(self, text: str) -> str:
        """Apply Unicode NFC normalization"""
        return unicodedata.normalize('NFC', text)

    def _scan(self, text: str, compiled: List[Tuple[str, re.Pattern]],
              skip_labels: Tuple[str, ...] = ()) -> List[Dict[str, object]]:
        """Shared left-to-right scanner behind both tokenization modes.

        Args:
            text: Raw input text; it is NFC-normalized before scanning.
            compiled: ``(label, pattern)`` pairs in precedence order.
            skip_labels: Labels whose matches are consumed but not emitted.

        Returns:
            Token dicts in order of appearance.
        """
        normalized_text = self.normalize_text(text)
        text_length = len(normalized_text)
        tokens: List[Dict[str, object]] = []
        position = 0

        while position < text_length:
            for label, pattern in compiled:
                match = pattern.match(normalized_text, position)
                # Ignore zero-length matches so the scan always advances.
                if match is None or match.end() == position:
                    continue
                if label not in skip_labels:
                    token_text = match.group()
                    tokens.append({
                        'token': token_text,
                        'type': label,
                        'position': position,
                        'length': len(token_text)
                    })
                position = match.end()
                break
            else:
                # No pattern matched: emit the single character as UNKNOWN
                tokens.append({
                    'token': normalized_text[position],
                    'type': 'UNKNOWN',
                    'position': position,
                    'length': 1
                })
                position += 1

        return tokens

    def word_tokenize(self, text: str) -> List[Dict[str, object]]:
        """
        Perform word-level tokenization.

        Whitespace runs separate tokens and are not emitted themselves.
        Returns a list of token dicts (token, type, position, length).
        """
        return self._scan(text, self.word_regex, skip_labels=('WHITESPACE',))

    def character_tokenize(self, text: str) -> List[Dict[str, object]]:
        """
        Perform character-level tokenization with compound character handling.

        Conjuncts and consonant+vowel-sign pairs are kept as single tokens;
        each whitespace character is emitted as its own WHITESPACE token.
        Returns a list of token dicts (token, type, position, length).
        """
        return self._scan(text, self.char_regex)

    def analyze_tokens(self, tokens: List[Dict[str, object]], top_n: int = 10) -> Dict:
        """Summarize a token list.

        Args:
            tokens: Token dicts as returned by the tokenize methods.
            top_n: How many of the most frequent tokens to report.

        Returns:
            Dict with ``total_tokens``, ``unique_tokens``, ``token_frequency``
            and ``type_frequency`` (both ``Counter``), ``most_common_tokens``
            (list of ``(token, count)``) and the original ``tokens`` list.
        """
        token_freq = Counter(t['token'] for t in tokens)
        type_freq = Counter(t['type'] for t in tokens)

        return {
            'total_tokens': len(tokens),
            'unique_tokens': len(token_freq),
            'token_frequency': token_freq,
            'type_frequency': type_freq,
            'most_common_tokens': token_freq.most_common(top_n),
            'tokens': tokens
        }

# Load Text File

In [None]:
def load_text_file(file_path: str) -> str:
    """Load a text file, trying several encodings in turn.

    The file is read once as bytes and decoded with ``utf-8-sig`` (plain
    UTF-8, with any leading byte-order mark removed), then ``utf-16``, then
    ``cp1252``. The first encoding that decodes without error is used.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded text with surrounding whitespace stripped and line
        endings normalized to ``\\n``.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
        ValueError: if the file contains nothing but whitespace.
        UnicodeDecodeError: if none of the encodings can decode the file.
        OSError: if the file cannot be read.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Text file not found at: {file_path}\n"
                              f"Please upload a .txt file with Bangla text to: {file_path}")

    with open(file_path, 'rb') as f:
        raw = f.read()

    # 'utf-8-sig' goes first: it reads ordinary UTF-8 too, but also drops a
    # leading BOM that plain 'utf-8' would leave in the text as U+FEFF.
    encodings = ('utf-8-sig', 'utf-16', 'cp1252')

    for encoding in encodings:
        try:
            decoded = raw.decode(encoding)
        except UnicodeError:
            continue

        # Same newline translation that text-mode open() performs.
        content = decoded.replace('\r\n', '\n').replace('\r', '\n').strip()
        if not content:
            raise ValueError("File is empty")
        print(f"File loaded successfully with {encoding} encoding")
        return content

    # UnicodeDecodeError takes (encoding, object, start, end, reason).
    raise UnicodeDecodeError(
        '/'.join(encodings), raw, 0, len(raw),
        "Could not decode the file with any supported encoding",
    )

def get_sample_text() -> str:
    """Placeholder for built-in sample text; always raises NotImplementedError."""
    message = "Sample text is not provided. Please upload a .txt file with Bangla text."
    raise NotImplementedError(message)



# Results Generation

In [None]:

def _type_token_ratio(analysis: Dict) -> float:
    """Return unique/total tokens, or 0.0 when there are no tokens."""
    total = analysis['total_tokens']
    return analysis['unique_tokens'] / total if total else 0.0


def _top_tokens(analysis: Dict, limit: int) -> List:
    """Return up to ``limit`` (token, count) pairs, most frequent first.

    Uses the full ``token_frequency`` table when present, because
    ``most_common_tokens`` may hold fewer entries than ``limit``.
    """
    frequency = analysis.get('token_frequency')
    if frequency is not None:
        return Counter(frequency).most_common(limit)
    return list(analysis['most_common_tokens'][:limit])


def _first_visible_tokens(tokens: List[Dict], limit: int) -> List[Dict]:
    """Return the first ``limit`` tokens whose type is not WHITESPACE."""
    visible = []
    for token in tokens:
        if len(visible) >= limit:
            break
        if token['type'] != 'WHITESPACE':
            visible.append(token)
    return visible


def print_detailed_results(word_analysis: Dict, char_analysis: Dict, sample_size: int = 20):
    """Print a report of word-level and character-level tokenization results.

    Args:
        word_analysis: Analysis dict for the word-level tokens, with the keys
            ``total_tokens``, ``unique_tokens``, ``type_frequency``,
            ``most_common_tokens``, ``tokens`` and optionally
            ``token_frequency``.
        char_analysis: Same structure for the character-level tokens.
        sample_size: Number of non-whitespace sample tokens to list per level.

    The report covers summary statistics, the token type distribution, the
    most frequent tokens (top 10 words, top 15 characters) and a sample of
    the first tokens. An analysis with zero tokens prints a ratio of 0.000
    instead of raising ZeroDivisionError.
    """
    print("BANGLA TEXT TOKENIZATION RESULTS")
    print(" ")

    # Summary Statistics
    print("\nSUMMARY STATISTICS")
    print(" ")
    print(f"Word-level Tokenization:")
    print(f"  • Total tokens: {word_analysis['total_tokens']:,}")
    print(f"  • Unique tokens: {word_analysis['unique_tokens']:,}")
    print(f"  • Type-token ratio: {_type_token_ratio(word_analysis):.3f}")

    print(f"\nCharacter-level Tokenization:")
    print(f"  • Total tokens: {char_analysis['total_tokens']:,}")
    print(f"  • Unique tokens: {char_analysis['unique_tokens']:,}")
    print(f"  • Type-token ratio: {_type_token_ratio(char_analysis):.3f}")

    # Token Type Distribution
    print("\nTOKEN TYPE DISTRIBUTION")
    print(" ")

    # type_frequency is empty when there are no tokens, so the divisions
    # below never run with a zero total.
    print("Word-level:")
    for token_type, count in word_analysis['type_frequency'].items():
        percentage = (count / word_analysis['total_tokens']) * 100
        print(f"  • {token_type:<15}: {count:>4} ({percentage:>5.1f}%)")

    print("\nCharacter-level:")
    for token_type, count in char_analysis['type_frequency'].items():
        percentage = (count / char_analysis['total_tokens']) * 100
        print(f"  • {token_type:<20}: {count:>4} ({percentage:>5.1f}%)")

    # Most Common Tokens
    print("\nMOST FREQUENT TOKENS")
    print(" ")

    print("Word-level (Top 10):")
    for i, (token, count) in enumerate(_top_tokens(word_analysis, 10), 1):
        print(f"  {i:>2}. '{token}' → {count} times")

    print("\nCharacter-level (Top 15):")
    for i, (token, count) in enumerate(_top_tokens(char_analysis, 15), 1):
        print(f"  {i:>2}. '{token}' → {count} times")

    # Sample Tokens
    print(f"\nSAMPLE TOKENS (First {sample_size})")
    print(" ")

    print("Word-level tokens:")
    for i, token in enumerate(_first_visible_tokens(word_analysis['tokens'], sample_size), 1):
        print(f"  {i:>2}. [{token['type']:<12}] '{token['token']}'")

    print(f"\nCharacter-level tokens:")
    for i, token in enumerate(_first_visible_tokens(char_analysis['tokens'], sample_size), 1):
        print(f"  {i:>2}. [{token['type']:<18}] '{token['token']}'")



# Main Function

In [None]:
def main(text_file_path: str = "/content/Typed_GroundTruth.txt",
         output_dir: str = "/content"):
    """Run the tokenization pipeline end to end.

    Loads the text file, tokenizes it at word and character level, prints the
    detailed report and writes both token lists as CSV files.

    Args:
        text_file_path: Path of the Bangla text file to process. The default
            is the path that was previously hard-coded in this function.
        output_dir: Directory that receives ``word_tokens.csv`` and
            ``char_tokens.csv``.

    Returns:
        ``(word_analysis, char_analysis)`` on success, or ``(None, None)`` if
        the file is missing or any other error occurs; the error is printed
        rather than raised.
    """
    print("Bangla Text Tokenization Pipeline Starting...")
    print("REQUIREMENTS:")
    print("   • Upload a .txt file containing Bangla text")
    print("   • Update the 'text_file_path' variable with your file path")
    print("-" * 60)

    # Initialize tokenizer
    tokenizer = BanglaTokenizer()

    print(f"Looking for text file at: {text_file_path}")

    try:
        text = load_text_file(text_file_path)
        print(f"Input text length: {len(text)} characters")
        print(f"Input text preview: {text[:100]}...")

        # Perform tokenization
        print("\nPerforming word-level tokenization...")
        word_tokens = tokenizer.word_tokenize(text)
        word_analysis = tokenizer.analyze_tokens(word_tokens)

        print("Performing character-level tokenization...")
        char_tokens = tokenizer.character_tokenize(text)
        char_analysis = tokenizer.analyze_tokens(char_tokens)

        # Display results
        print_detailed_results(word_analysis, char_analysis)

        # Export results to CSV, one row per token dict
        print("\nExporting results...")

        word_df = pd.DataFrame(word_analysis['tokens'])
        word_df.to_csv(os.path.join(output_dir, 'word_tokens.csv'), index=False, encoding='utf-8')

        char_df = pd.DataFrame(char_analysis['tokens'])
        char_df.to_csv(os.path.join(output_dir, 'char_tokens.csv'), index=False, encoding='utf-8')

        print("Results exported to CSV files:")
        print("   • word_tokens.csv")
        print("   • char_tokens.csv")

        return word_analysis, char_analysis

    except FileNotFoundError as e:
        print(f"ERROR: {e}")
        print("\nTO FIX THIS:")
        print("1. Upload your Bangla text file to Colab")
        print("2. Update the 'text_file_path' variable in main() function")
        print("3. Make sure the file contains Bangla text in UTF-8 encoding")
        return None, None

    except Exception as e:
        # Top-level boundary of the pipeline: report the error and signal
        # failure through the return value instead of propagating.
        print(f"ERROR: {e}")
        return None, None



# Execute the pipeline

In [None]:
# Execute the pipeline
if __name__ == "__main__":
    # Run the pipeline; either result being None is treated as failure
    word_results, char_results = main()

    if word_results is None or char_results is None:
        print("\nPipeline failed. Please check the error messages above.")
        print("Make sure to upload your Bangla text file and update the file path.")
    else:
        print("\nPipeline completed successfully!")
        print("You can now analyze the results and CSV files.")

Bangla Text Tokenization Pipeline Starting...
REQUIREMENTS:
   • Upload a .txt file containing Bangla text
   • Update the 'text_file_path' variable with your file path
------------------------------------------------------------
Looking for text file at: /content/Typed_GroundTruth.txt
File loaded successfully with utf-8 encoding
Input text length: 1106 characters
Input text preview: ৫ আগস্ট অন্তর্বর্তী সরকারের প্রধান উপদেষ্টা অধ্যাপক মুহাম্মদ ইউনূস জাতীয় সংসদের দক্ষিণ প্লাজায় জুলাই...

Performing word-level tokenization...
Performing character-level tokenization...
BANGLA TEXT TOKENIZATION RESULTS
 

SUMMARY STATISTICS
 
Word-level Tokenization:
  • Total tokens: 185
  • Unique tokens: 138
  • Type-token ratio: 0.746

Character-level Tokenization:
  • Total tokens: 789
  • Unique tokens: 167
  • Type-token ratio: 0.212

TOKEN TYPE DISTRIBUTION
 
Word-level:
  • WORD           :  159 ( 85.9%)
  • PUNCTUATION    :   20 ( 10.8%)
  • OTHER          :    6 (  3.2%)

Character-level:
  • BA