## Work with pre-tokenizers

In [4]:
from tokenizers.pre_tokenizers import Split
from tokenizers import Regex

# Sample input: a run of four spaces between the words and a trailing emoji.
test_str = "Hello    world!🫨"

# Delimiter pattern: one or more whitespace characters.
ws_re = Regex(r'\s+')

# One Split pre-tokenizer per delimiter behavior being compared.
isolated_pre_tokenizer = Split(pattern=ws_re, behavior="isolated")
contiguous_pre_tokenizer = Split(pattern=ws_re, behavior="contiguous")

# Print the (piece, offsets) pairs each behavior yields for the same input.
# In the recorded output both lines are identical for this string —
# presumably because it contains only a single whitespace run (TODO confirm).
print("isolated:", isolated_pre_tokenizer.pre_tokenize_str(test_str))
print("contiguous:", contiguous_pre_tokenizer.pre_tokenize_str(test_str))

isolated: [('Hello', (0, 5)), ('    ', (5, 9)), ('world!\U0001fae8', (9, 16))]
contiguous: [('Hello', (0, 5)), ('    ', (5, 9)), ('world!\U0001fae8', (9, 16))]


In [28]:
# `ByteLevel` and `Sequence` are not imported by any cell shown in this
# notebook; import them here so the cell also runs on a fresh kernel.
# (`Split` and `Regex` come from the first pre-tokenizer cell.)
from tokenizers.pre_tokenizers import ByteLevel, Sequence

# Define regex pattern. Alternatives, in order:
#   - case-insensitive contraction suffixes ('s, 't, 're, 've, 'm, 'll, 'd)
#   - an optional single non-letter/non-digit/non-newline character followed by letters
#   - runs of one to three numeric characters
#   - an optional space, a run of non-space/non-letter/non-digit characters,
#     then any trailing newlines
#   - whitespace ending in one or more newlines
#   - whitespace that is not followed by a non-whitespace character
#   - any remaining whitespace
regex_pattern = (
    r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}|"
    r" ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
)

# Define pre-tokenizers: split on the pattern first, then apply the
# byte-level mapping to every resulting piece.
split = Split(pattern=Regex(regex_pattern), behavior="merged_with_next", invert=False)
# With use_regex=False the byte-level step on its own returns the whole input
# as a single piece (see the "just byte" line of the recorded output).
byte = ByteLevel(add_prefix_space=False, use_regex=False)
pre_tokenizer = Sequence([split, byte])

# Compare the byte-level step alone against the full split + byte-level sequence.
# In the recorded output the sequence splits "1000" into '100' and '0'.
test_str = "Hello    1000 world!🫨"
print("just byte:", byte.pre_tokenize_str(test_str))
print("sequence:", pre_tokenizer.pre_tokenize_str(test_str))

just byte: [('HelloĠĠĠĠ1000Ġworld!ðŁ«¨', (0, 21))]
sequence: [('Hello', (0, 5)), ('ĠĠĠ', (5, 8)), ('Ġ', (8, 9)), ('100', (9, 12)), ('0', (12, 13)), ('Ġworld', (13, 19)), ('!ðŁ«¨', (19, 21))]


In [26]:
# `ByteLevel` and `Sequence` are not imported by any cell shown in this
# notebook; import them here so the cell also runs on a fresh kernel.
# (`Split` and `Regex` come from the first pre-tokenizer cell.)
from tokenizers.pre_tokenizers import ByteLevel, Sequence

# Define regex pattern. The digit alternative here is
# `\d(?=(\d{2})+(\D|$))`: a single digit that is followed by one or more
# pairs of digits and then a non-digit or the end of the text. The other
# alternatives cover contraction suffixes, letter runs, punctuation runs
# and whitespace.
regex_pattern = (
    r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|"
    r"\d(?=(\d{2})+(\D|$))| ?[^\s\p{L}\p{N}]+[\r\n]*|"
    r"\s*[\r\n]+|\s+(?!\S)|\s+"
)

# Define pre-tokenizers: split on the pattern first, then apply the
# byte-level mapping to every resulting piece.
split = Split(pattern=Regex(regex_pattern), behavior="merged_with_next", invert=False)
byte = ByteLevel(add_prefix_space=False, use_regex=False)
pre_tokenizer = Sequence([split, byte])

test_str = "Hello    100000 world!🫨"
# NOTE: the label reads "isolated" although the Split behavior used here is
# "merged_with_next"; it is left unchanged so it matches the recorded output.
# NOTE(review): the recorded output splits "100000" into 'Ġ1', '00', '000'
# rather than two groups of three — confirm whether that is intended.
print("isolated:", pre_tokenizer.pre_tokenize_str(test_str))

isolated: [('Hello', (0, 5)), ('ĠĠĠ', (5, 8)), ('Ġ1', (8, 10)), ('00', (10, 12)), ('000', (12, 15)), ('Ġworld', (15, 21)), ('!ðŁ«¨', (21, 23))]


In [12]:
import re
def split_number(number: str, sep: str = " ") -> str:
    """Insert ``sep`` between groups of three digits, counted from the right.

    A separator is placed at every position that has a digit directly before
    it and only complete three-digit groups between it and the end of the
    string, so ``"1234567"`` becomes ``"1 234 567"``. If the digits do not
    run up to the end of the string (e.g. ``"1000 km"``), nothing matches and
    the input is returned unchanged.

    Args:
        number: The digit string to format.
        sep: Text inserted at each group boundary. Defaults to a single
            space, which reproduces the original behavior.

    Returns:
        ``number`` with ``sep`` inserted at every group boundary.
    """
    # (?<=\d)          -> a digit sits directly before this position
    # (?=(?:\d{3})+$)  -> only whole groups of three digits remain up to the end
    # Both parts are zero-width, so no character of the input is consumed.
    boundary = r'(?<=\d)(?=(?:\d{3})+$)'

    # A callable replacement inserts `sep` verbatim; a plain replacement
    # string would have its backslashes interpreted as escape sequences.
    return re.sub(boundary, lambda _match: sep, number)

# Demo: run a four-digit string through split_number and display the result
number = "1000"
formatted_number = split_number(number=number)
formatted_number

'1 000'

In [1]:
from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
# Wrap a BPE model (constructed without vocab or merges arguments) in a
# Tokenizer; "[UNK]" is passed as the model's unknown token.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

In [2]:
# Write the tokenizer to "tokenizer.json" (relative path, so it resolves
# against the current working directory). The literal has no placeholders,
# so it is a plain string rather than an f-string.
tokenizer.save("tokenizer.json")

In [5]:
import os
# Location of the fitting corpus; its last path component is used as the corpus name.
fitting_corpus_path = "/shared/3/projects/hiatus/TOKENIZER_wegmann/data/fitting-corpora/wikipedia"
corpus_name = os.path.split(fitting_corpus_path)[1]

In [6]:
corpus_name  # bare expression: shows the derived corpus name as the cell output

'wikipedia'