In [18]:
from transformers import WhisperTokenizer

# Demo special tokens in Spanish and English

In [19]:
# Both tokenizers are loaded from the same checkpoint with the same task;
# only the `language` argument differs between them.
sp_tok, en_tok = (
    WhisperTokenizer.from_pretrained("openai/whisper-small", language=lang_name, task="transcribe")
    for lang_name in ("Spanish", "English")
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
empty_str_sp = sp_tok.encode('')
empty_str_sp

[50258, 50262, 50359, 50363, 50257]

In [15]:
sp_tok.decode(empty_str_sp)

'<|startoftranscript|><|es|><|transcribe|><|notimestamps|><|endoftext|>'

In [16]:
empty_str_en = en_tok.encode('')
empty_str_en

[50258, 50259, 50359, 50363, 50257]

In [17]:
en_tok.decode(empty_str_en)

'<|startoftranscript|><|en|><|transcribe|><|notimestamps|><|endoftext|>'

Do Spanish and English tokenizers differ at all when decoding?

In [22]:
en_tok.decode(empty_str_en), sp_tok.decode(empty_str_en)

('<|startoftranscript|><|en|><|transcribe|><|notimestamps|><|endoftext|>',
 '<|startoftranscript|><|en|><|transcribe|><|notimestamps|><|endoftext|>')

In [23]:
en_tok.decode(empty_str_sp), sp_tok.decode(empty_str_sp)

('<|startoftranscript|><|es|><|transcribe|><|notimestamps|><|endoftext|>',
 '<|startoftranscript|><|es|><|transcribe|><|notimestamps|><|endoftext|>')

In [28]:
en_tok.special_tokens_map == sp_tok.special_tokens_map

True

In [30]:
en_tok.all_special_ids == sp_tok.all_special_ids

True

Demo encoding and decoding non-special tokens

In [32]:
def remove_special_ids(l, tokenizer=None):
    """Return the token IDs in ``l`` that are not special-token IDs.

    Args:
        l: Iterable of token IDs, e.g. the result of ``tokenizer.encode(...)``.
        tokenizer: Object exposing ``all_special_ids``. When omitted, the
            notebook-level ``en_tok`` is used, as before.

    Returns:
        A new list holding the non-special IDs in their original order.
    """
    if tokenizer is None:
        tokenizer = en_tok
    # Read `all_special_ids` once and test membership against a set; the
    # previous lambda looked the attribute up again for every token.
    special_ids = set(tokenizer.all_special_ids)
    return [tok_id for tok_id in l if tok_id not in special_ids]
remove_special_ids(empty_str_en), remove_special_ids(empty_str_sp)

([], [])

In [34]:
# Encode the same English word with each tokenizer and compare the non-special IDs.
tuple(remove_special_ids(tokenizer.encode("Hello")) for tokenizer in (en_tok, sp_tok))

([15947], [15947])

In [35]:
# Same comparison with a Spanish word.
tuple(remove_special_ids(tokenizer.encode("Hola")) for tokenizer in (en_tok, sp_tok))

([48529], [48529])

It looks like the only difference between tokenizers for different languages is which language-ID token is emitted when encoding, so I won't compare them any further and will use the English tokenizer from here on.

In [39]:
def encode_no_special(s):
    """Encode ``s`` with ``en_tok`` and return only the non-special token IDs."""
    return remove_special_ids(en_tok.encode(s))


encode_no_special("Hello hello")

[15947, 7751]

In [40]:
encode_no_special("Dogs dogs")

[35, 664, 82, 7197]

In [50]:
def print_tokens(s):
    """Print the non-special tokens of ``s``, one per line.

    The input is echoed first. Each following line is tab-separated: the
    token ID, that single ID decoded with ``en_tok``, and the decoded text
    encoded with ``raw_unicode_escape`` so non-ASCII characters are visible
    as escapes.

    Args:
        s: Text to tokenize.
    """
    print("Input:", s)
    for token_id in encode_no_special(s):
        # Each ID is decoded on its own rather than as part of the sequence.
        piece = en_tok.decode(token_id)
        escaped = piece.encode('raw_unicode_escape')
        print(f"{token_id}\t{piece}\t{escaped}")

print_tokens("Hello, world")
print_tokens("Hello world")
print_tokens("Helloworld")

Input: Hello, world
15947	Hello	b'Hello'
11	,	b','
1002	 world	b' world'
Input: Hello world
15947	Hello	b'Hello'
1002	 world	b' world'
Input: Helloworld
39	H	b'H'
21348	ellow	b'ellow'
3445	orld	b'orld'


In [51]:
print_tokens("Bom dia meus caras")

Input: Bom dia meus caras
33	B	b'B'
298	om	b'om'
6801	 dia	b' dia'
28033	 meus	b' meus'
1032	 car	b' car'
296	as	b'as'


In [52]:
print_tokens("да свидания")

Input: да свидания
3444	да	b'\\u0434\\u0430'
43666	 свид	b' \\u0441\\u0432\\u0438\\u0434'
8831	ания	b'\\u0430\\u043d\\u0438\\u044f'


In [53]:
print_tokens("àpɾí jícə̀lò")

Input: àpɾí jícə̀lò
64	a	b'a'
136	�	b'\\ufffd'
222	�	b'\\ufffd'
79	p	b'p'
133	�	b'\\ufffd'
122	�	b'\\ufffd'
72	i	b'i'
32797	́	b'\\u0301'
32606	 ji	b' ji'
32797	́	b'\\u0301'
66	c	b'c'
7250	ə	b'\\u0259'
136	�	b'\\ufffd'
222	�	b'\\ufffd'
752	lo	b'lo'
136	�	b'\\ufffd'
222	�	b'\\ufffd'


In [56]:
def enc_dec(s):
    """Round-trip ``s`` through ``en_tok``: encode it, then decode with special tokens skipped."""
    return en_tok.decode(en_tok.encode(s), skip_special_tokens=True)


enc_dec("àpɾí jícə̀lò")

'àpɾí jícə̀lò'

# Change name of language in Whisper tokenizer?

In [57]:
en_tok.language

'english'

In [67]:
# Load another tokenizer from the same checkpoint, this time passing the
# language name in lower case, then inspect the stored `language` attribute.
tic_tok = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    language="english",
    task="transcribe",
)
tic_tok.language

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


'english'

In [68]:
# Encode a single space to see which ID it gets between the special tokens.
# (A dangling `tic_tok.` line was removed: it was a syntax error that stopped
# this cell from running.)
tic_tok.encode(' ')

[50258, 50259, 50359, 50363, 220, 50257]

: 