# Training a tokenizer from scratch / training a new tokenizer from an old one

## Step1: Build a corpus

In [None]:
!pip install datasets



In [None]:
# Load the dataset that will serve as the tokenizer-training corpus.
# 'code_search_net' is a large collection of functions written in several programming languages.
from datasets import load_dataset

# Load only the "python" configuration of the dataset
raw_dataset = load_dataset('code_search_net', "python")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

In [None]:
# Inspect the training split (column names and number of rows)
raw_dataset['train']

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 412178
})

In [None]:
# The 'whole_func_string' column is the text we will train the tokenizer on; look at the first row
raw_dataset['train'][0]['whole_func_string']

'def write_map_file(mapFNH, items, header):\n    """\n    Given a list of mapping items (in the form described by the parse_mapping_file method)\n    and a header line, write each row to the given input file with fields separated by tabs.\n\n    :type mapFNH: file or str\n    :param mapFNH: Either the full path to the map file or an open file handle\n\n    :type items: list\n    :param item: The list of row entries to be written to the mapping file\n\n    :type header: list or str\n    :param header: The descriptive column names that are required as the first line of\n                   the mapping file\n\n    :rtype: None\n    """\n    if isinstance(header, list):\n        header = "\\t".join(header) + "\\n"\n\n    with file_handle(mapFNH, "w") as mapF:\n        mapF.write(header)\n        for row in items:\n            mapF.write("\\t".join(row)+"\\n")'

In [None]:
# Build the corpus lazily with a generator expression: the train split is read
# in slices of 1000 rows, so the texts are never all held in memory at once
training_corpus = (
    raw_dataset['train'][start: start + 1000]['whole_func_string']
    for start in range(0, len(raw_dataset['train']), 1000)
)
training_corpus

<generator object <genexpr> at 0x7d5b3259bed0>

In [None]:
# Caveat: a generator can be consumed only once
gen = (i for i in range(10))
print(list(gen))
print(list(gen))

# The first list() call drains the generator, so the second call prints an empty list

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[]


In [None]:
# Wrap the generator expression in a function so that a fresh generator can be created on demand
def get_training_corpus():
  """Return a new generator over 1000-row slices of the 'whole_func_string' column of the train split."""
  return (
      raw_dataset['train'][offset: offset + 1000]['whole_func_string']
      for offset in range(0, len(raw_dataset['train']), 1000)
  )

training_corpus = get_training_corpus()

In [None]:
#Another way to generate training corpus using for and yield
def get_training_corpus(batch_size=1000):
  """Yield the training corpus batch by batch.

  Args:
    batch_size: number of rows of the train split read per batch (default 1000).

  Yields:
    The 'whole_func_string' values of each successive slice of `raw_dataset['train']`.
  """
  dataset = raw_dataset['train']
  for start_idx in range(0, len(dataset), batch_size):
    samples = dataset[start_idx: start_idx + batch_size]
    # The column name is all lower-case; 'whole_func_String' would raise a KeyError here
    yield samples['whole_func_string']

In [None]:
# Display the corpus generator bound earlier by `training_corpus = get_training_corpus()`
training_corpus

<generator object get_training_corpus.<locals>.<genexpr> at 0x7d5b31867370>

## Step2: Train a new tokenizer

In [None]:
# Load an existing tokenizer to use as the starting point.
# This way we do not have to define the tokenizer from scratch:
# the new tokenizer will work like GPT-2's, and only its vocabulary will change.
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained('gpt2')

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Tokenize a small Python function to see how the GPT-2 tokenizer handles code
example = '''def add_numbers(a,b):
"""Add the two numbers 'a' and 'b'."""
return a + b'''

tokens = old_tokenizer.tokenize(example)
tokens

['def',
 'Ġadd',
 '_',
 'n',
 'umbers',
 '(',
 'a',
 ',',
 'b',
 '):',
 'Ċ',
 '"""',
 'Add',
 'Ġthe',
 'Ġtwo',
 'Ġnumbers',
 "Ġ'",
 'a',
 "'",
 'Ġand',
 "Ġ'",
 'b',
 '\'."',
 '""',
 'Ċ',
 'return',
 'Ġa',
 'Ġ+',
 'Ġb']

Note that the GPT-2 tokenizer is not efficient on code: it splits on special characters, so the function name `add_numbers` alone is broken into four tokens (`Ġadd`, `_`, `n`, `umbers`).

In [None]:
# Train a new tokenizer on the code corpus; 52000 is the target vocabulary size.
# NOTE: training_corpus is a generator and can be consumed only once, so recreate it before re-running this cell.
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)

Note that `train_new_from_iterator` works only if you are using a fast tokenizer.

In [None]:
# Tokenize the same example with the newly trained tokenizer
tokens = tokenizer.tokenize(example)
tokens

In [None]:
# Number of tokens with the new tokenizer, then with the original GPT-2 tokenizer
print(len(tokens))
print(len(old_tokenizer.tokenize(example)))

# Fast tokenizer special power

1. The output of the tokenizer is usually a BatchEncoding object.

2. Besides parallelization, fast tokenizers also keep an offset mapping: for every token they record which word, and which span of characters in the original text, it came from.

In [None]:
from transformers import AutoTokenizer

# NOTE: `tokenizer` and `example` are re-bound here, replacing the code tokenizer and code sample used above
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."

# Calling the tokenizer directly returns a BatchEncoding
encoding = tokenizer(example)
encoding

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'input_ids': [101, 1422, 1271, 1110, 156, 7777, 2497, 1394, 1105, 146, 1250, 1120, 20164, 10932, 10289, 1107, 6010, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Confirm the type of the object returned by the tokenizer
print(type(encoding))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [None]:
# Two ways to check for a fast tokenizer: on the tokenizer itself, or on the encoding it produced
print(tokenizer.is_fast)
print(encoding.is_fast)

True
True


In [None]:
# List the tokens and, for each token, the index of the word it came from (None for [CLS] and [SEP])
print(list(encoding.tokens()))
list(encoding.word_ids())

['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in', 'Brooklyn', '.', '[SEP]']


[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]

Masking all the tokens that come from the same word together is called whole word masking.

In [None]:
# Get the character span of the word with index 3 (a word index, not a token index) and slice it out of the text
start, end = encoding.word_to_chars(3)
example[start:end]

'Sylvain'

In [None]:
!pip install tensorflow==2.14

Collecting tensorflow==2.14
  Downloading tensorflow-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (489.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.8/489.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.15,>=2.14 (from tensorflow==2.14)
  Downloading tensorboard-2.14.1-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow-estimator<2.15,>=2.14.0 (from tensorflow==2.14)
  Downloading tensorflow_estimator-2.14.0-py2.py3-none-any.whl (440 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m440.7/440.7 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras<2.15,>=2.14.0 (from tensorflow==2.14)
  Downloading keras-2.14.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collect

In [None]:
from transformers import pipeline

In [None]:
# Name the checkpoint explicitly rather than relying on the pipeline's implicit default,
# so the result does not change if the library's default model changes
token_classifier = pipeline(
    'token-classification',
    model='dbmdz/bert-large-cased-finetuned-conll03-english',
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

[{'entity': 'I-PER',
  'score': 0.99938285,
  'index': 4,
  'word': 'S',
  'start': 11,
  'end': 12},
 {'entity': 'I-PER',
  'score': 0.99815494,
  'index': 5,
  'word': '##yl',
  'start': 12,
  'end': 14},
 {'entity': 'I-PER',
  'score': 0.99590707,
  'index': 6,
  'word': '##va',
  'start': 14,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.99923277,
  'index': 7,
  'word': '##in',
  'start': 16,
  'end': 18},
 {'entity': 'I-ORG',
  'score': 0.9738931,
  'index': 12,
  'word': 'Hu',
  'start': 33,
  'end': 35},
 {'entity': 'I-ORG',
  'score': 0.976115,
  'index': 13,
  'word': '##gging',
  'start': 35,
  'end': 40},
 {'entity': 'I-ORG',
  'score': 0.9887976,
  'index': 14,
  'word': 'Face',
  'start': 41,
  'end': 45},
 {'entity': 'I-LOC',
  'score': 0.9932106,
  'index': 16,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [None]:
# Group the tokens that belong to the same entity
from transformers import pipeline

# Name the checkpoint explicitly rather than relying on the pipeline's implicit default
token_classifier = pipeline(
    'token-classification',
    model='dbmdz/bert-large-cased-finetuned-conll03-english',
    aggregation_strategy='simple',
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796019,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

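Options for `aggregation_strategy`:

1. 'simple' - the entity score is the mean of the scores of its tokens (for 'Sylvain' above, 0.9981694 is the mean of the scores of 'S', '##yl', '##va' and '##in')

2. 'first' - the entity score is the score of its first token (for 'Sylvain', the score of 'S')

3. 'max' - the entity score is the maximum score among its tokens (for 'Hugging Face' it would be 0.98879766, the score of 'Face')

4. 'average' - the entity score is the average of the scores of the words composing the entity (the result differs from 'simple' for 'Hugging Face', which is made of two words)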

# Result without pipeline

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

example = "My name is Sylvain and I work at Hugging Face in Brooklyn."

# Same checkpoint as the one used by the token-classification pipeline cells above
checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

# Tokenize to PyTorch tensors and run a forward pass through the model
inputs = tokenizer(example, return_tensors = 'pt')
output = model(**inputs)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Compare the shape of the input ids with the shape of the output logits
print(inputs['input_ids'].shape)
print(output.logits.shape)

torch.Size([1, 19])
torch.Size([1, 19, 9])


Read the output as 1 sequence with 19 tokens and the model has 9 different labels

In [None]:
import torch
import numpy as np

# Softmax over the last (label) dimension turns the logits into probabilities;
# [0] selects the single sequence in the batch
probabilities = torch.nn.functional.softmax(output.logits, dim = -1)[0].tolist()
# Index of the highest-scoring label for each token
predictions = output.logits.argmax(dim = -1)[0].tolist()
predictions

[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]

In [None]:
# Mapping from label index to label name
model.config.id2label

# Note that there are 9 labels, matching the last dimension of the logits

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [None]:
# Rebuild the pipeline-style output by hand: one entry per token whose predicted label is not 'O'
results = []
tokens = inputs.tokens()

for idx, pred in enumerate(predictions):
  label = model.config.id2label[pred]
  if label != 'O':
    # The score is the probability of the predicted label for this token
    results.append(
        {'entity': label, 'score': probabilities[idx][pred], 'word':tokens[idx]}
    )


In [None]:
# Show the per-token entities collected in the previous cell
print(results)

[{'entity': 'I-PER', 'score': 0.9993828535079956, 'word': 'S'}, {'entity': 'I-PER', 'score': 0.9981548190116882, 'word': '##yl'}, {'entity': 'I-PER', 'score': 0.995907187461853, 'word': '##va'}, {'entity': 'I-PER', 'score': 0.9992327690124512, 'word': '##in'}, {'entity': 'I-ORG', 'score': 0.9738931059837341, 'word': 'Hu'}, {'entity': 'I-ORG', 'score': 0.9761149883270264, 'word': '##gging'}, {'entity': 'I-ORG', 'score': 0.9887974858283997, 'word': 'Face'}, {'entity': 'I-LOC', 'score': 0.99321049451828, 'word': 'Brooklyn'}]


In [None]:
#Set the offset mapping
inputs_with_offsets = tokenizer(example, return_offsets_mapping = True)
inputs_with_offsets['offset_mapping']

[(0, 0),
 (0, 2),
 (3, 7),
 (8, 10),
 (11, 12),
 (12, 14),
 (14, 16),
 (16, 18),
 (19, 22),
 (23, 24),
 (25, 29),
 (30, 32),
 (33, 35),
 (35, 40),
 (41, 45),
 (46, 48),
 (49, 57),
 (57, 58),
 (0, 0)]

In [None]:
# (12, 14) is the offset pair of the token at index 5, '##yl'
example[12:14]

'yl'

In [None]:
# Same as the manual loop above, but each entry also carries the character span of its token
results = []
# Assignment (the original used `-`, which assigned nothing)
inputs_with_offsets = tokenizer(example, return_offsets_mapping = True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets['offset_mapping']

for idx, pred in enumerate(predictions):
  label = model.config.id2label[pred]
  if label != 'O':
    start, end = offsets[idx]
    results.append(
        {
            'entity': label,
            # Probability of the predicted label for this token, not the whole probability table
            'score': probabilities[idx][pred],
            'word': tokens[idx],
            'start': start,
            'end': end
        }
    )

# Print once, after the loop has finished, rather than on every iteration
print(results)