In [2]:
# Character-level tokenization demo: text -> characters -> integer IDs -> one-hot vectors.
import torch
import torch.nn.functional as F

text = "Tokenizing text is a core task of NLP"

# Character tokenization: every character (spaces included) becomes one token.
tokenized = list(text)
print(tokenized)

# Numericalization: map each distinct character to a unique integer.
# Sorting the vocabulary keeps the mapping deterministic across runs.
token2idx = {ch: idx for idx, ch in enumerate(sorted(set(tokenized)))}
print(token2idx)

# Kept as a plain list under its own name so that `input_ids` below is only ever a tensor.
token_ids = [token2idx[token] for token in tokenized]
print(token_ids)

# Plain integer IDs impose an artificial ordering between tokens,
# so convert them to one-hot vectors (the token encoding).
input_ids = torch.tensor(token_ids)
# TensorFlow equivalent: tf.one_hot(), whose `depth` argument corresponds to `num_classes`.
one_hot_encodings = F.one_hot(input_ids, num_classes=len(token2idx))

# Shape is (number of tokens, vocabulary size) == (37, 19) for this text.
# A bare `.shape` expression in the middle of a cell is discarded, so check it explicitly.
assert one_hot_encodings.shape == (len(tokenized), len(token2idx))

one_hot_encodings[0]  # 'T' maps to ID 4, so only index 4 is set

['T', 'o', 'k', 'e', 'n', 'i', 'z', 'i', 'n', 'g', ' ', 't', 'e', 'x', 't', ' ', 'i', 's', ' ', 'a', ' ', 'c', 'o', 'r', 'e', ' ', 't', 'a', 's', 'k', ' ', 'o', 'f', ' ', 'N', 'L', 'P']
{' ': 0, 'L': 1, 'N': 2, 'P': 3, 'T': 4, 'a': 5, 'c': 6, 'e': 7, 'f': 8, 'g': 9, 'i': 10, 'k': 11, 'n': 12, 'o': 13, 'r': 14, 's': 15, 't': 16, 'x': 17, 'z': 18}
[4, 13, 11, 7, 12, 10, 18, 10, 12, 9, 0, 16, 7, 17, 16, 0, 10, 15, 0, 5, 0, 6, 13, 14, 7, 0, 16, 5, 15, 11, 0, 13, 8, 0, 2, 1, 3]


tensor([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## サブワードトークン化

In [3]:
from transformers import AutoTokenizer

# Checkpoint name passed to AutoTokenizer.from_pretrained.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Loading the concrete tokenizer class by hand instead:
#   distilbert_tokenizer = DistilBertTokenizer.from_pretrained(model_ckpt)

# Encode the sentence defined in the previous cell.
encoded = tokenizer(text)
print(encoded)

# Map the IDs back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
print(tokens)

# Join the tokens back into a single string.
decoded_text = tokenizer.convert_tokens_to_string(tokens)
print(decoded_text)
# Other attributes worth inspecting:
#   tokenizer.vocab_size, tokenizer.model_max_length, tokenizer.model_input_names

{'input_ids': [101, 19204, 6026, 3793, 2003, 1037, 4563, 4708, 1997, 17953, 2361, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'token', '##izing', 'text', 'is', 'a', 'core', 'task', 'of', 'nl', '##p', '[SEP]']
[CLS] tokenizing text is a core task of nlp [SEP]


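トークン埋め込みまでされているのか？ → されていない。上の出力のとおり、トークナイザーが返すのは `input_ids` と `attention_mask` だけで、この段階ではまだ整数 ID のままである。埋め込み（ID からベクトルへの変換）はモデル側の埋め込み層で行われる。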