# BPE vocabulary analysis

(C) Maxim Gansert, 2019, Mindscan

Recently I built some BPE vocabularies which had way too many tokens because of Arabic and Asian languages. This notebook is intended to review these vocabularies...

Current state: the vocabulary has been cleaned up, so the original reason for this notebook no longer applies.


In [None]:
import sys
sys.path.insert(0,'../src')

In [None]:
import functools 
from de.mindscan.fluentgenesis.bpe.bpe_model import BPEModel

In [None]:
# Create the BPEModel for "16K-full" and load its hyperparameters, tokens and BPE pairs.
# NOTE(review): the second argument looks like the directory holding the model files -
# confirm against BPEModel.
model = BPEModel("16K-full", "../src/de/mindscan/fluentgenesis/bpe/")
model.load_hparams()
# Token vocabulary; the following cells iterate it and call .items() on it.
model_vocabulary = model.load_tokens()
# BPE pair data; not referenced by the other cells visible in this notebook.
model_bpe_data = model.load_bpe_pairs()


In [None]:
# Histogram: len(token) -> number of vocabulary tokens with that length.
model_vocabulary_statistics_length = {}

for token in model_vocabulary:
    token_length = len(token)
    # Start at zero for a length seen for the first time, then count this token.
    seen_so_far = model_vocabulary_statistics_length.get(token_length, 0)
    model_vocabulary_statistics_length[token_length] = seen_so_far + 1

print(model_vocabulary_statistics_length)

The result is something like this:

{1: 24804, 2: 1462, 3: 2465, 4: 2213, 5: 1906, 6: 1639, 7: 1426, 8: 1255, 9: 971, 10: 738, 11: 505, 12: 349, 13: 277, 14: 202, 15: 156, 16: 107, 17: 84, 18: 64, 19: 44, 20: 35, 21: 22, 22: 21, 23: 8, 24: 9, 25: 13, 26: 7, 27: 3, 28: 3, 29: 2, 30: 4, 32: 2, 33: 2, 34: 1, 36: 1, 37: 1, 42: 1, 43: 1, 51: 1}

The intended dictionary size was about 16K tokens...


In [None]:
# Number of vocabulary tokens that are longer than one character.
# sum() over the (length, count) pairs is used instead of functools.reduce over
# the keys: a reduce without an initial value seeds the accumulator with the
# first key (a token length) rather than with a count, which skews the total.
sum(
    count
    for token_length, count in model_vocabulary_statistics_length.items()
    if token_length > 1
)


The result for all token lengths longer than one character is 16001 (oops, an off-by-one in the token calculation: the per-length counts listed above add up to 16000). We can see that we have another 24804 tokens of length one. Since we may not be able to encode every word with pairs, and to tackle the unknown-word problem, we have to emit all unpaired tokens at the end of the process. There are more than one and a half times as many unpaired tokens as paired tokens...

These originate from strings containing other languages, so let's identify them.




In [None]:
import collections

# Code point ranges that are treated as unsupported for the vocabulary.
# Each entry is a (first, last) pair of code points; is_unsupported_character
# tests both bounds inclusively. The trailing comments name the Unicode block(s)
# an entry is meant to cover. Some entries overlap; this only makes the table
# redundant, it does not change which code points match.
unsupported_vocab_ranges = [
        # Latin Extended
        (0x0100, 0x017F),  # Latin Extended-A
        (0x0180, 0x024F),  # Latin Extended-B
        (0x1E00, 0x1EFF),  # Latin Extended Additional
        (0x2C60, 0x2C7F),  # Latin Extended-C
        (0xA720, 0xA7FF),  # Latin Extended-D
        (0xAB30, 0xAB6F),  # Latin Extended-E
    
        # Diacritical marks
        (0x0300, 0x036F),  # Combining Diacritical Marks
        (0x1AB0, 0x1AFF),  # Combining Diacritical Marks Extended
        (0x1DC0, 0x1DFF),  # Combining Diacritical Marks Supplement
        (0x20D0, 0x20FF),  # Combining Diacritical Marks for Symbols
    
        # IPA & Phonetic Extensions
        (0x0250, 0x02AF),  # IPA Extensions
        (0x1D00, 0x1D7F),  # Phonetic Extensions
        (0x1D80, 0x1DBF),  # Phonetic Extensions Supplement
    
        # Spacing Modifier Letters
        (0x02B0, 0x02FF),  # Spacing Modifier Letters
    
        # Greek, Coptic
        (0x0370, 0x03FF),  # Greek and Coptic
        (0x1F00, 0x1FFF),  # Greek Extended
        (0x2C80, 0x2CFF),  # Coptic
    
        # Cyrillic
        (0x0400, 0x04FF),  # Cyrillic
        (0x0500, 0x052F),  # Cyrillic Supplement
        (0x2DE0, 0x2DFF),  # Cyrillic Extended-A
        (0xA640, 0xA69F),  # Cyrillic Extended-B
        (0x1C80, 0x1C8F),  # Cyrillic Extended-C
    
        # Armenian
        (0x0530, 0x058F),  # Armenian 
    
        # Hebrew
        (0x0590, 0x05FF),  # Hebrew
    
        # Arabic
        (0x0600, 0x06FF),  # Arabic
        (0x0750, 0x077F),  # Arabic Supplement
        (0x08A0, 0x08FF),  # Arabic Extended-A
        (0xFB50, 0xFDFF),  # Arabic Presentation Forms A
        (0xFE70, 0xFEFF),  # Arabic Presentation Forms B
    
        # Syriac
        (0x0700, 0x074F),  # Syriac
        (0x0860, 0x086F),  # Syriac Supplement
    
        # Thaana
        (0x0780, 0x07BF),  # Thaana
    
        # NKo
        (0x07C0, 0x07FF),  # NKo
    
        # Samaritan
        (0x0800, 0x083F),  # Samaritan
    
        # Mandaic
        (0x0840, 0x085F),  # Mandaic
    
        # Invalid range
        (0x0870, 0x089F),  # Invalid range

        # Indian Subcontinent Languages
        (0x0900, 0x097F),  # Devanagari
        (0xA8E0, 0xA8FF),  # Devanagari Extended
         
        # (0x0980, 0x09FF), # Bengali
        # ...
        # The next range starts at 0x0900 as well, so it also contains the
        # Devanagari range listed above.
        (0x0900, 0x0DFF),  # India - covers multiple languages
        (0xA830, 0xA83F),  # Common Indic Number Forms


        (0x0E00, 0x0E7F),  # Thai
        (0x0E80, 0x0EFF),  # Lao
    
        (0x0F00, 0x0FFF),  # Tibetan
    
        # Myanmar
        (0x1000, 0x109F),  # Myanmar
        (0xAA60, 0xAA7F),  # Myanmar Extended-A
        (0xA9E0, 0xA9FF),  # Myanmar Extended-B
    
        # Georgian
        (0x10A0, 0x10FF),  # Georgian
        (0x2D00, 0x2D2F),  # Georgian Supplement
        (0x1C90, 0x1CBF),  # Georgian Extended
    
        # Korean
        (0x1100, 0x11FF),  # Hangul Jamo    
        (0x3130, 0x318F),  # Hangul Compatibility Jamo    
        (0xAC00, 0xD7AF),  # Hangul Syllables
        (0xA960, 0xA97F),  # Hangul Jamo Extended-A
        (0xD7B0, 0xD7FF),  # Hangul Jamo Extended-B
    
        # Ethiopic
        (0x1200, 0x139f), # Ethiopic, Ethiopic Supplement
        (0xAB00, 0xAB2F), # Ethiopic Extended-A
        (0x2D80, 0x2DDF), # Ethiopic Extended

        # Cherokee
        (0x13A0, 0x13FF),  # Cherokee
        (0xAB70, 0xABBF),  # Cherokee Supplement
    
        # Canadian Aboriginal
        (0x1400, 0x167F),  # Unified Canadian Aboriginal Syllabics
        (0x18B0, 0x18FF),  # Unified Canadian Aboriginal Syllabics Extended
    
    
        (0x1680, 0x169F),  # Ogham
        (0x16A0, 0x16FF),  # Runic
        (0x1700, 0x171F),  # Tagalog
        (0x1720, 0x173F),  # Hanunoo
        (0x1740, 0x175F),  # Buhid
        (0x1760, 0x177F),  # Tagbanwa
        
        # Khmer
        (0x1780, 0x17FF),  # Khmer
        (0x19E0, 0x19FF),  # Khmer Symbols
    
        # Mongolian
        (0x1800, 0x18AF),  # Mongolian

        (0x1900, 0x194F),  # Limbu
        (0x1950, 0x197F),  # Tai Le
        (0x1980, 0x19DF),  # New Tai Lue
    
        (0x1A00, 0x1A1F),  # Buginese
        (0x1A20, 0x1AAF),  # Tai Tham
    
        (0x1B00, 0x1B7F),  # Balinese
    
        (0x1B80, 0x1BBF),  # Sundanese
        (0x1CC0, 0x1CCF),  # Sundanese Supplement
    
        (0x1BC0, 0x1BFF),  # Batak
    
        (0x1C00, 0x1C4F),  # Lepcha
        (0x1C50, 0x1C7F),  # Ol Chiki
        (0x1CD0, 0x1CFF),  # Vedic Extensions
    
        # Punctuation
        (0x2000, 0x206F),  # General Punctuation
        (0x2E00, 0x2E7F),  # Supplemental Punctuation
        (0x3000, 0x303F),  # CJK Symbols and Punctuation
        
        (0x2070, 0x209F),  # Superscripts and Subscripts
        (0x20A0, 0x20CF),  # Currency Symbols
    
    
        # Symbols
        (0x2100, 0x26ff), # Letterlike Symbols, ... Miscellaneous Symbols
        (0x2700, 0x27FF), # Dingbats & co
        (0x2800, 0x28FF), # Braille
        (0x2900, 0x2BFF), # Symbols, arrows, math
    
        (0x2D30, 0x2D7F),  # Tifinagh
        
        (0x2f00, 0x2FFF),  # Kangxi Radicals, Ideographic Description Characters
    

        # CJK
        # One wide range from 0x3000 up to 0xA4FF; it also contains the
        # 0x3000-0x303F punctuation range listed above.
        (0x3000, 0xa4FF),
        (0xFE30, 0xFE4F),  # CJK Compatibility Forms
        (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
        (0x2E80, 0x2EFF),  # CJK Radicals Supplement
    

        # Further scripts
        (0xA500, 0xA63F),  # Vai
        (0xA6A0, 0xA6FF),  # Bamum
        (0xA700, 0xA71F),  # Modifier Tone Letters
        (0xA800, 0xA82F),  # Syloti Nagri
        (0xA840, 0xA87F),  # Phags-pa
        (0xA880, 0xA8DF),  # Saurashtra
        (0xA900, 0xA92F),  # Kayah Li
        (0xA930, 0xA95F),  # Rejang
        (0xA980, 0xA9DF),  # Javanese
    
        (0xAA00, 0xAA5F),  # Cham
        (0xAA80, 0xAADF),  # Tai Viet
    
        (0xABC0, 0xABFF),  # Meetei Mayek
        (0xAAE0, 0xAAFF),  # Meetei Mayek Extensions

        # Private use and surrogates
        (0xE000,0xF8FF),  # Private Use Area
        (0xD800, 0xDB7F), # High Surrogates
        (0xDB80, 0xDBFF), # High Private Use Surrogates
        (0xDC00, 0xDFFF), # Low Surrogates
        # Glagolitic
        (0x2c00, 0x2c5f), # Glagolitic
        
    
        (0xFB00, 0xFB4F),  # Alphabetic Presentation Forms
        (0xFE00, 0xFE0F),  # Variation Selectors
        (0xFE10, 0xFE1F),  # Vertical Forms
        (0xFE20, 0xFE2F),  # Combining Half Marks
        (0xFE50, 0xFE6F),  # Small Form Variants
        
        (0xFF00, 0xFFEF),  # Halfwidth and Fullwidth Forms
    
        # Everything above 0xFFFF
        (0x010000, 0x10FFFF) # Basically everything that is not important enough to be in the first ~65000 codes

    ]

def is_unsupported_character(char):
    """Return True when the code point of ``char`` lies in an unsupported range.

    ``char`` is converted with ``ord``; the resulting code point is compared
    against every ``(bottom, top)`` pair in ``unsupported_vocab_ranges``, with
    both bounds counted as part of the range. Returns False when no pair matches.
    """
    code_point = ord(char)
    return any(
        low <= code_point <= high
        for low, high in unsupported_vocab_ranges
    )

# Keep the vocabulary entries of length one whose code point is not in any
# unsupported range, then list them in sorted order.
one_char_elements = (entry for entry in model_vocabulary if len(entry) == 1)
chars = (entry for entry in one_char_elements if not is_unsupported_character(entry))
charsasArray = list(chars)

orderedChars = sorted(charsasArray)
print(len(orderedChars))
print(orderedChars)

# Same characters again, shown as hexadecimal code points.
print(["0x%x" % ord(character) for character in orderedChars])
