In [1]:
from tokenization_baichuan import BaichuanTokenizer

In [2]:
original = BaichuanTokenizer.from_pretrained(".")

In [3]:
from transformers.convert_slow_tokenizer import SpmConverter, LlamaConverter, GemmaConverter, _get_prepend_scheme
from tokenizers import decoders, normalizers, pre_tokenizers, processors, Tokenizer, AddedToken
from tokenizers.models import BPE

Barebones SPM converter. If we don't add any `AddedToken`s, we get 100% match on XNLI (but see below).

In [24]:
class BaichuanConverter(SpmConverter):
    handle_byte_fallback = True

    def vocab(self, proto):
        vocab = [
            (self.original_tokenizer.convert_ids_to_tokens(0), 0.0),
            (self.original_tokenizer.convert_ids_to_tokens(1), 0.0),
            (self.original_tokenizer.convert_ids_to_tokens(2), 0.0),
        ]
        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
        return vocab

    def unk_id(self, proto):
        unk_id = 0
        return unk_id

    def decoder(self, replacement, add_prefix_space):
        sequence = [
            decoders.Replace("▁", " "),
            decoders.ByteFallback(),
            decoders.Fuse(),
        ]
        return decoders.Sequence(sequence)

    def normalizer(self, proto):
        return normalizers.Replace(pattern=" ", content="▁")

    def pre_tokenizer(self, replacement, add_prefix_space):
        return None

    def post_processor(self):
        return None

    def tokenizer(self, proto):
        vocab_scores = self.vocab(proto)
        _, merges = self.SpmExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
        bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)}
        tokenizer = Tokenizer(
            BPE(
                bpe_vocab,
                merges,
                unk_token=proto.trainer_spec.unk_piece,
                fuse_unk=True,
                byte_fallback=self.handle_byte_fallback,
                dropout=None,
            )
        )

        # control tokens are special
        # user defined symbols are not
        # both user and control tokens are AddedTokens
        # Add user defined symbols (type == 4) from sentencepiece (https://github.com/google/sentencepiece/blob/6225e08edb2577757163b3f5dbba4c0b670ef445/src/sentencepiece_model.proto#L299C29-L299C33)
        spm_added_tokens = [
            (id, p.piece, p.type == 3 or p.piece in self.special_tokens)
            for id, p in enumerate(proto.pieces)
            if p.type in [3, 4]
        ]
        for t in spm_added_tokens:
            if t[1].endswith("{\n"):
                print(t)
        # print("added tokens", len(spm_added_tokens))
        # tokenizer.add_tokens(
        #     [
        #         AddedToken(token, normalized=False, special=special)
        #         for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
        #     ]
        # )

        return tokenizer


In [25]:
converter = BaichuanConverter(original)

In [26]:
converted = converter.converted()

(131334, '▁()=>{\n', False)
(131367, '(()=>{\n', False)
(131450, "'])){\n", False)
(131479, '()=>{\n', False)
(131491, '())){\n', False)
(131514, '▁""){\n', False)
(131524, '"])){\n', False)
(131591, "▁''){\n", False)
(131692, "']){\n", False)
(131806, '"]){\n', False)
(131817, '▁(){\n', False)
(131830, '▁=>{\n', False)
(131868, "')){\n", False)
(131895, '▁},{\n', False)
(131896, '))){\n', False)
(131898, '")){\n', False)
(131922, '[]){\n', False)
(131997, '▁//{\n', False)
(132005, '--){\n', False)
(132021, '])){\n', False)
(132112, '()){\n', False)
(132140, '++){\n', False)
(132156, ')=>{\n', False)
(132213, '":[{\n', False)
(132240, '▁@{\n', False)
(132246, ')){\n', False)
(132279, '▁^{\n', False)
(132289, "',{\n", False)
(132307, '▁){\n', False)
(132337, '{}{\n', False)
(132397, "'){\n", False)
(132427, '=>{\n', False)
(132457, '"){\n', False)
(132462, ']){\n', False)
(132509, '▁={\n', False)
(132532, '▁[{\n', False)
(132533, '},{\n', False)
(132538, '//{\n', False)
(132567, '::{\n'

In [7]:
from transformers import PreTrainedTokenizerFast

t_fast = PreTrainedTokenizerFast(
    tokenizer_object=converted,
    model_input_names=original.model_input_names,
    model_max_length=32768,
    clean_up_tokenization_spaces=False,
)

In [8]:
original.encode(" {\n")

[133035]

In [9]:
t_fast.encode(" {\n")

[133035]

^ These work because we didn't add them. The actual token is:

In [12]:
original.get_vocab()["▁{\n"]

133035

So it has a metaspace char. In addition, `"{\n"` is also in the list of added tokens as 133081 (see last line in the log above).

So, if we encode `" {\n"`, we would split by the added tokens regexp, which happens _before_ replacing the space by `▁`, and we would get a match for `"{\n"` and therefore encoding would not give the same result.

In [48]:
original.decode([133081])

'{\n'

If we skip the added tokens, then XNLI succeeds but Code Parrot fails:

In [27]:
original.encode('----------------------------------------------------------------------------\n')

[1414, 131098]

In [28]:
t_fast.encode('----------------------------------------------------------------------------\n')

[3604, 75215, 131143]

This is because those tokens need to be added :(

How can we reconcile both things?

In [49]:
original.encode("hello")

[18632]

In [50]:
t_fast.encode("hello")

[18632]

In [51]:
original.decode([18632])

'hello'

In [52]:
t_fast.decode([30109])

' hello'

In [53]:
t_fast.decode([18632])

'hello'

In [54]:
original.decode([30109])

' hello'

Testing on xnli

In [55]:
from datasets import load_dataset
from tqdm import tqdm

In [56]:
xnli = load_dataset("xnli", "all_languages", split="validation")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [57]:
def verify(lang, text):
    encoded_original = original.encode(text)
    encoded_fast = t_fast.encode(text)
    assert encoded_fast == encoded_original, f"Fast encode error: {lang} - {text}"
    decoded = original.decode(encoded_original)
    decoded_fast = t_fast.decode(encoded_fast, skip_special_tokens=True)
    assert decoded_fast == decoded, f"Fast decode error: {lang} - {text}"

In [58]:
for p in tqdm(xnli["premise"]):
    for lang, text in p.items():
        verify(lang, text)

100%|███████████████████████████████████████████████████████████████████████████████| 2490/2490 [00:16<00:00, 150.03it/s]


Testing on codeparrot

In [59]:
ds = load_dataset("codeparrot/github-code", streaming=True, trust_remote_code=True, split="train")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [60]:
skipped = 0
iterator = iter(ds)
for _ in tqdm(range(1000)):
    item = next(iterator)
    code = item["code"]
    lang = item["language"]
    if False and item["size"] > 1000:
        skipped += 1
        continue
    verify(lang, code)

  1%|▌                                                                                  | 7/1000 [00:02<05:41,  2.90it/s]


AssertionError: Fast encode error: C - /* Copyright information is at end of file */

#include "xmlrpc_config.h"

#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>

#include "stdargx.h"

#include "xmlrpc-c/base.h"
#include "xmlrpc-c/base_int.h"
#include "xmlrpc-c/string_int.h"


static void
getString(xmlrpc_env *const envP,
          const char **const formatP,
          va_listx *const argsP,
          xmlrpc_value **const valPP) {

    const char *str;
    size_t len;

    str = (const char *) va_arg(argsP->v, char*);
    if (*(*formatP) == '#') {
        ++(*formatP);
        len = (size_t) va_arg(argsP->v, size_t);
    } else
        len = strlen(str);

    *valPP = xmlrpc_string_new_lp(envP, len, str);
}


static void
getWideString(xmlrpc_env *const envP ATTR_UNUSED,
              const char **const formatP ATTR_UNUSED,
              va_listx *const argsP ATTR_UNUSED,
              xmlrpc_value **const valPP ATTR_UNUSED) {

#if HAVE_UNICODE_WCHAR
    wchar_t *wcs;
    size_t len;
    
    wcs = (wchar_t*) va_arg(argsP->v, wchar_t*);
    if (**formatP == '#') {
        (*formatP)++;
        len = (size_t) va_arg(argsP->v, size_t);
    } else
        len = wcslen(wcs);

    *valPP = xmlrpc_string_w_new_lp(envP, len, wcs);

#endif /* HAVE_UNICODE_WCHAR */
}


static void
getBase64(xmlrpc_env *const envP,
          va_listx *const argsP,
          xmlrpc_value **const valPP) {

    unsigned char *value;
    size_t length;

    value = (unsigned char *) va_arg(argsP->v, unsigned char*);
    length = (size_t) va_arg(argsP->v, size_t);

    *valPP = xmlrpc_base64_new(envP, length, value);
}


static void
        getValue(xmlrpc_env *const envP,
                 const char **const format,
                 va_listx *const argsP,
                 xmlrpc_value **const valPP);


static void
getArray(xmlrpc_env *const envP,
         const char **const formatP,
         char const delimiter,
         va_listx *const argsP,
         xmlrpc_value **const arrayPP) {

    xmlrpc_value *arrayP;

    arrayP = xmlrpc_array_new(envP);

    /* Add items to the array until we hit our delimiter. */

    while (**formatP != delimiter && !envP->fault_occurred) {

        xmlrpc_value *itemP;

        if (**formatP == '\0')
            xmlrpc_env_set_fault(
                    envP, XMLRPC_INTERNAL_ERROR,
                    "format string ended before closing ')'.");
        else {
            getValue(envP, formatP, argsP, &itemP);
            if (!envP->fault_occurred) {
                xmlrpc_array_append_item(envP, arrayP, itemP);
                xmlrpc_DECREF(itemP);
            }
        }
    }
    if (envP->fault_occurred)
        xmlrpc_DECREF(arrayP);

    *arrayPP = arrayP;
}


static void
getStructMember(xmlrpc_env *const envP,
                const char **const formatP,
                va_listx *const argsP,
                xmlrpc_value **const keyPP,
                xmlrpc_value **const valuePP) {


    /* Get the key */
    getValue(envP, formatP, argsP, keyPP);
    if (!envP->fault_occurred) {
        if (**formatP != ':')
            xmlrpc_env_set_fault(
                    envP, XMLRPC_INTERNAL_ERROR,
                    "format string does not have ':' after a "
                            "structure member key.");
        else {
            /* Skip over colon that separates key from value */
            (*formatP)++;

            /* Get the value */
            getValue(envP, formatP, argsP, valuePP);
        }
        if (envP->fault_occurred)
            xmlrpc_DECREF(*keyPP);
    }
}


static void
getStruct(xmlrpc_env *const envP,
          const char **const formatP,
          char const delimiter,
          va_listx *const argsP,
          xmlrpc_value **const structPP) {

    xmlrpc_value *structP;

    structP = xmlrpc_struct_new(envP);
    if (!envP->fault_occurred) {
        while (**formatP != delimiter && !envP->fault_occurred) {
            xmlrpc_value *keyP;
            xmlrpc_value *valueP;

            getStructMember(envP, formatP, argsP, &keyP, &valueP);

            if (!envP->fault_occurred) {
                if (**formatP == ',')
                    (*formatP)++;  /* Skip over the comma */
                else if (**formatP == delimiter) {
                    /* End of the line */
                } else
                    xmlrpc_env_set_fault(
                            envP, XMLRPC_INTERNAL_ERROR,
                            "format string does not have ',' or ')' after "
                                    "a structure member");

                if (!envP->fault_occurred)
                    /* Add the new member to the struct. */
                    xmlrpc_struct_set_value_v(envP, structP, keyP, valueP);

                xmlrpc_DECREF(valueP);
                xmlrpc_DECREF(keyP);
            }
        }
        if (envP->fault_occurred)
            xmlrpc_DECREF(structP);
    }
    *structPP = structP;
}


static void
mkArrayFromVal(xmlrpc_env *const envP,
               xmlrpc_value *const value,
               xmlrpc_value **const valPP) {

    if (xmlrpc_value_type(value) != XMLRPC_TYPE_ARRAY)
        xmlrpc_env_set_fault(envP, XMLRPC_INTERNAL_ERROR,
                             "Array format ('A'), non-array xmlrpc_value");
    else
        xmlrpc_INCREF(value);

    *valPP = value;
}


static void
mkStructFromVal(xmlrpc_env *const envP,
                xmlrpc_value *const value,
                xmlrpc_value **const valPP) {

    if (xmlrpc_value_type(value) != XMLRPC_TYPE_STRUCT)
        xmlrpc_env_set_fault(envP, XMLRPC_INTERNAL_ERROR,
                             "Struct format ('S'), non-struct xmlrpc_value");
    else
        xmlrpc_INCREF(value);

    *valPP = value;
}


static void
getValue(xmlrpc_env *const envP,
         const char **const formatP,
         va_listx *const argsP,
         xmlrpc_value **const valPP) {
/*----------------------------------------------------------------------------
   Get the next value from the list.  *formatP points to the specifier
   for the next value in the format string (i.e. to the type code
   character) and we move *formatP past the whole specifier for the
   next value.  We read the required arguments from 'argsP'.  We return
   the value as *valPP with a reference to it.

   For example, if *formatP points to the "i" in the string "sis",
   we read one argument from 'argsP' and return as *valP an integer whose
   value is the argument we read.  We advance *formatP to point to the
   last 's' and advance 'argsP' to point to the argument that belongs to
   that 's'.
-----------------------------------------------------------------------------*/
    char const formatChar = *(*formatP)++;

    switch (formatChar) {
        case 'i':
            *valPP =
                    xmlrpc_int_new(envP, (xmlrpc_int32) va_arg(argsP->v,
                                                               xmlrpc_int32));
            break;

        case 'b':
            *valPP =
                    xmlrpc_bool_new(envP, (xmlrpc_bool) va_arg(argsP->v,
                                                               xmlrpc_bool));
            break;

        case 'd':
            *valPP =
                    xmlrpc_double_new(envP, (double) va_arg(argsP->v, double));
            break;

        case 's':
            getString(envP, formatP, argsP, valPP);
            break;

        case 'w':
            getWideString(envP, formatP, argsP, valPP);
            break;

        case 't':
            *valPP = xmlrpc_datetime_new_sec(envP, va_arg(argsP->v, time_t));
            break;

        case '8':
            *valPP = xmlrpc_datetime_new_str(envP, va_arg(argsP->v, char*));
            break;

        case '6':
            getBase64(envP, argsP, valPP);
            break;

        case 'n':
            *valPP =
                    xmlrpc_nil_new(envP);
            break;

        case 'I':
            *valPP =
                    xmlrpc_i8_new(envP, (xmlrpc_int64) va_arg(argsP->v,
                                                              xmlrpc_int64));
            break;

        case 'p':
            *valPP =
                    xmlrpc_cptr_new(envP, (void *) va_arg(argsP->v, void*));
            break;

        case 'A':
            mkArrayFromVal(envP,
                           (xmlrpc_value *) va_arg(argsP->v, xmlrpc_value*),
                           valPP);
            break;

        case 'S':
            mkStructFromVal(envP,
                            (xmlrpc_value *) va_arg(argsP->v, xmlrpc_value*),
                            valPP);
            break;

        case 'V':
            *valPP = (xmlrpc_value *) va_arg(argsP->v, xmlrpc_value*);
            xmlrpc_INCREF(*valPP);
            break;

        case '(':
            getArray(envP, formatP, ')', argsP, valPP);
            if (!envP->fault_occurred) {
                XMLRPC_ASSERT(**formatP == ')');
                (*formatP)++;  /* Skip over closing parenthesis */
            }
            break;

        case '{':
            getStruct(envP, formatP, '}', argsP, valPP);
            if (!envP->fault_occurred) {
                XMLRPC_ASSERT(**formatP == '}');
                (*formatP)++;  /* Skip over closing brace */
            }
            break;

        default: {
            const char *const badCharacter = xmlrpc_makePrintableChar(
                    formatChar);
            xmlrpc_env_set_fault_formatted(
                    envP, XMLRPC_INTERNAL_ERROR,
                    "Unexpected character '%s' in format string", badCharacter);
            xmlrpc_strfree(badCharacter);
        }
    }
}


void
xmlrpc_build_value_va(xmlrpc_env *const envP,
                      const char *const format,
                      va_list const args,
                      xmlrpc_value **const valPP,
                      const char **const tailP) {

    XMLRPC_ASSERT_ENV_OK(envP);
    XMLRPC_ASSERT(format != NULL);

    if (strlen(format) == 0)
        xmlrpc_faultf(envP, "Format string is empty.");
    else {
        va_listx currentArgs;
        const char *formatCursor;

        init_va_listx(&currentArgs, args);
        formatCursor = &format[0];
        getValue(envP, &formatCursor, &currentArgs, valPP);

        if (!envP->fault_occurred)
            XMLRPC_ASSERT_VALUE_OK(*valPP);

        *tailP = formatCursor;
    }
}


xmlrpc_value *
xmlrpc_build_value(xmlrpc_env *const envP,
                   const char *const format,
                   ...) {

    va_list args;
    xmlrpc_value *retval;
    const char *suffix;

    va_start(args, format);
    xmlrpc_build_value_va(envP, format, args, &retval, &suffix);
    va_end(args);

    if (!envP->fault_occurred) {
        if (*suffix != '\0')
            xmlrpc_faultf(envP, "Junk after the format specifier: '%s'.  "
                                  "The format string must describe exactly "
                                  "one XML-RPC value "
                                  "(but it might be a compound value "
                                  "such as an array)",
                          suffix);

        if (envP->fault_occurred)
            xmlrpc_DECREF(retval);
    }
    return retval;
}


/* Copyright (C) 2001 by First Peer, Inc. All rights reserved.
** Copyright (C) 2001 by Eric Kidd. All rights reserved.
**
** Redistribution and use in source and binary forms, with or without
** modification, are permitted provided that the following conditions
** are met:
** 1. Redistributions of source code must retain the above copyright
**    notice, this list of conditions and the following disclaimer.
** 2. Redistributions in binary form must reproduce the above copyright
**    notice, this list of conditions and the following disclaimer in the
**    documentation and/or other materials provided with the distribution.
** 3. The name of the author may not be used to endorse or promote products
**    derived from this software without specific prior written permission. 
**  
** THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
** ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
** ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
** FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
** DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
** OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
** HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
** LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
** OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
** SUCH DAMAGE. */


In [62]:
encoded = original.encode(code)

In [63]:
fast_encoded = t_fast.encode(code)

In [64]:
len(encoded), len(fast_encoded)

(3302, 3303)

In [65]:
for i, (x, y) in enumerate(zip(encoded, fast_encoded)):
    if x != y:
        print(f"Mismatch at {i}: {x} != {y}")
        break

Mismatch at 1649: 1414 != 3604


In [66]:
original.decode([1414])

'--'

In [67]:
original.decode([3604])

'--------------------------------'

In [43]:
t_fast.decode([1414])

'--'

In [44]:
t_fast.decode([3604])

'--------------------------------'

In [84]:
original.encode("-----------------------------------------------------------------------------*/")

[3604, 3604, 17504, 4446]

In [85]:
t_fast.encode("-----------------------------------------------------------------------------*/")

[3604, 3604, 17504, 4446]

In [53]:
encoded[1649:]

[1414,
 131098,
 1306,
 4307,
 1316,
 2473,
 2292,
 1487,
 1316,
 2210,
 124127,
 1306,
 124166,
 4874,
 124153,
 3615,
 1339,
 1316,
 1974,
 4973,
 133119,
 1306,
 1377,
 1316,
 2473,
 2292,
 1340,
 1316,
 5637,
 3073,
 1400,
 124113,
 124127,
 124109,
 124127,
 1339,
 1316,
 2516,
 3654,
 133119,
 1306,
 3661,
 124133,
 1345,
 1435,
 2975,
 1552,
 4874,
 124153,
 3015,
 1316,
 3760,
 1974,
 4973,
 1377,
 1316,
 133119,
 1306,
 2473,
 2292,
 124127,
 1306,
 2151,
 2067,
 1316,
 3360,
 8571,
 1487,
 1629,
 4612,
 124153,
 6875,
 1306,
 2151,
 2099,
 133119,
 1306,
 1316,
 2292,
 1415,
 1552,
 2182,
 8293,
 1403,
 1308,
 6108,
 1339,
 1406,
 132869,
 1306,
 2358,
 2972,
 124128,
 1645,
 1552,
 4874,
 124153,
 3615,
 1339,
 1316,
 1481,
 124113,
 124145,
 1340,
 1316,
 3073,
 1481,
 8654,
 132858,
 1306,
 1435,
 2067,
 1637,
 7236,
 1487,
 1629,
 4612,
 124153,
 124154,
 1345,
 2099,
 1415,
 1552,
 2182,
 124153,
 1333,
 11974,
 5650,
 133119,
 1306,
 2292,
 1369,
 1316,
 7236,
 1435,
 2

In [55]:
fast_encoded[1649:]

[3604,
 75215,
 131143,
 1306,
 4307,
 1316,
 2473,
 2292,
 1487,
 1316,
 2210,
 124127,
 1306,
 124166,
 4874,
 124153,
 3615,
 1339,
 1316,
 1974,
 4973,
 133119,
 1306,
 1377,
 1316,
 2473,
 2292,
 1340,
 1316,
 5637,
 3073,
 1400,
 124113,
 124127,
 124109,
 124127,
 1339,
 1316,
 2516,
 3654,
 133119,
 1306,
 3661,
 124133,
 1345,
 1435,
 2975,
 1552,
 4874,
 124153,
 3015,
 1316,
 3760,
 1974,
 4973,
 1377,
 1316,
 133119,
 1306,
 2473,
 2292,
 124127,
 1306,
 2151,
 2067,
 1316,
 3360,
 8571,
 1487,
 1629,
 4612,
 124153,
 6875,
 1306,
 2151,
 2099,
 133119,
 1306,
 1316,
 2292,
 1415,
 1552,
 2182,
 8293,
 1403,
 1308,
 6108,
 1339,
 1406,
 132869,
 1306,
 2358,
 2972,
 124128,
 1645,
 1552,
 4874,
 124153,
 3615,
 1339,
 1316,
 1481,
 124113,
 124145,
 1340,
 1316,
 3073,
 1481,
 8654,
 132858,
 1306,
 1435,
 2067,
 1637,
 7236,
 1487,
 1629,
 4612,
 124153,
 124154,
 1345,
 2099,
 1415,
 1552,
 2182,
 124153,
 1333,
 11974,
 5650,
 133119,
 1306,
 2292,
 1369,
 1316,
 7236,
 

In [56]:
original.decode([1414, 131098])

'----------------------------------------------------------------------------\n'

In [86]:
original.encode('----------------------------------------------------------------------------\n')

[1414, 131098]

In [87]:
t_fast.encode('----------------------------------------------------------------------------\n')

[3604, 75215, 131143]

In [59]:
original.decode([1414])

'--'

In [60]:
original.decode([131098])

'--------------------------------------------------------------------------\n'

In [61]:
t_fast.decode([1414])

'--'

In [62]:
t_fast.decode([131098])

'--------------------------------------------------------------------------\n'

In [63]:
original.decode([3604])

'--------------------------------'

In [64]:
original.decode([75215])

'-----------------------------'

In [65]:
original.decode([131143])

'---------------\n'