## BERT for token classification

In [1]:
from transformers import DataCollatorForTokenClassification, DistilBertForTokenClassification, \
                         DistilBertTokenizerFast, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# using a cased tokenizer because I think case will matter
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [4]:
snips_file = open('data/snips.train.txt', 'rb')
snips_rows = snips_file.readlines()

In [5]:
# This code segment parses the snips dataset into a more manageable format

utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
    if len(snip_row) == 2:  # skip over rows with no data
        continue
    if ' ' not in snip_row.decode():  # we've hit a sequence label
        sequence_labels.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens.append(label_for_utterances)
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    token, token_label = snip_row.decode().split(' ')
    token_label = token_label.strip()
    utterance += f'{token} '
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)
    

In [6]:
unique_sequence_labels = list(set(sequence_labels))
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels]

In [7]:
from functools import reduce
unique_token_labels = list(set(reduce(lambda x, y: x + y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]

In [8]:
from datasets import Dataset
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances, 
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)
snips_dataset = snips_dataset.train_test_split(test_size=0.2)

In [9]:
snips_dataset['train'][0]

{'utterance': 'play in your eyes by gareth gates on netflix',
 'label': 2,
 'tokens': ['play',
  'in',
  'your',
  'eyes',
  'by',
  'gareth',
  'gates',
  'on',
  'netflix'],
 'token_labels': [18, 45, 67, 67, 18, 0, 39, 18, 44]}

In [10]:
# The given "token_labels" may not match up with the BERT wordpiece tokenization so
#  this function will map them to the tokenization that BERT uses
#  -100 is a reserved for labels where we do not want to calculate losses so BERT doesn't waste time
#  trying to predict tokens like CLS or SEP

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"token_labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:  # Set the special tokens to -100.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # CLS and SEP are labeled as -100
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
snips_dataset['train'][0]

{'utterance': 'play in your eyes by gareth gates on netflix',
 'label': 2,
 'tokens': ['play',
  'in',
  'your',
  'eyes',
  'by',
  'gareth',
  'gates',
  'on',
  'netflix'],
 'token_labels': [18, 45, 67, 67, 18, 0, 39, 18, 44]}

In [12]:
# map our dataset from sequence classification to be for token classification
tok_clf_tokenized_snips = snips_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 10467/10467 [00:00<00:00, 15623.78 examples/s]
Map: 100%|██████████| 2617/2617 [00:00<00:00, 16420.59 examples/s]


In [13]:
tok_clf_tokenized_snips['train'][0]

{'utterance': 'play in your eyes by gareth gates on netflix',
 'label': 2,
 'tokens': ['play',
  'in',
  'your',
  'eyes',
  'by',
  'gareth',
  'gates',
  'on',
  'netflix'],
 'token_labels': [18, 45, 67, 67, 18, 0, 39, 18, 44],
 'input_ids': [101,
  1505,
  1107,
  1240,
  1257,
  1118,
  176,
  22914,
  1324,
  8257,
  1113,
  5795,
  2087,
  20711,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100,
  18,
  45,
  67,
  67,
  18,
  0,
  -100,
  -100,
  39,
  18,
  44,
  -100,
  -100,
  -100]}

In [14]:
tok_clf_tokenized_snips['train'] = tok_clf_tokenized_snips['train'].remove_columns(
    ['utterance', 'label', 'tokens', 'token_labels']
)

tok_clf_tokenized_snips['test'] = tok_clf_tokenized_snips['test'].remove_columns(
    ['utterance', 'label', 'tokens', 'token_labels']
)

tok_clf_tokenized_snips

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10467
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2617
    })
})

In [15]:
tok_data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [16]:
tok_clf_model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased', num_labels=len(unique_token_labels)
)

# Set our label dictionary
tok_clf_model.config.id2label = {i: l for i, l in enumerate(unique_token_labels)}

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
tok_clf_model.config.id2label[0], tok_clf_model.config.id2label[1]

('B-artist', 'B-object_type')

In [18]:
from transformers import Trainer, TrainingArguments
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_tok_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
        
    logging_steps=10,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

# Define the trainer:

trainer = Trainer(
    model=tok_clf_model,
    args=training_args,
    train_dataset=tok_clf_tokenized_snips['train'],
    eval_dataset=tok_clf_tokenized_snips['test'],
    data_collator=tok_data_collator
)



In [19]:
trainer.evaluate()


***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 4.449569225311279,
 'eval_model_preparation_time': 0.0015,
 'eval_runtime': 0.9672,
 'eval_samples_per_second': 2705.665,
 'eval_steps_per_second': 84.778}

In [20]:
trainer.train()

***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 65,246,280


Epoch,Training Loss,Validation Loss,Model Preparation Time
1,0.2132,0.1608,0.0015
2,0.0959,0.116164,0.0015



***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_tok_clf/results/checkpoint-328
Configuration saved in ./snips_tok_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_tok_clf/results/checkpoint-328/model.safetensors
Saving model checkpoint to ./snips_tok_clf/results/checkpoint-656
Configuration saved in ./snips_tok_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_tok_clf/results/checkpoint-656/model.safetensors

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_tok_clf/results/checkpoint-656
Configuration saved in ./snips_tok_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_tok_clf/results/checkpoint-656/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_tok_clf/results/checkpoint-656 (score: 0.11616412550210953).


TrainOutput(global_step=656, training_loss=0.39572874465730135, metrics={'train_runtime': 27.8413, 'train_samples_per_second': 751.904, 'train_steps_per_second': 23.562, 'total_flos': 129401180577648.0, 'train_loss': 0.39572874465730135, 'epoch': 2.0})

In [21]:
trainer.evaluate()


***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.11616412550210953,
 'eval_model_preparation_time': 0.0015,
 'eval_runtime': 0.7124,
 'eval_samples_per_second': 3673.439,
 'eval_steps_per_second': 115.102,
 'epoch': 2.0}

In [22]:
pipe = pipeline("token-classification", tok_clf_model, tokenizer=tokenizer)
pipe('Please add Here We Go by Dispatch to my road trip playlist')

Device set to use cuda:0
Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'entity': 'B-entity_name',
  'score': np.float32(0.894636),
  'index': 3,
  'word': 'Here',
  'start': 11,
  'end': 15},
 {'entity': 'I-entity_name',
  'score': np.float32(0.9581628),
  'index': 4,
  'word': 'We',
  'start': 16,
  'end': 18},
 {'entity': 'I-entity_name',
  'score': np.float32(0.9635129),
  'index': 5,
  'word': 'Go',
  'start': 19,
  'end': 21},
 {'entity': 'B-artist',
  'score': np.float32(0.851486),
  'index': 7,
  'word': 'Di',
  'start': 25,
  'end': 27},
 {'entity': 'I-artist',
  'score': np.float32(0.59642357),
  'index': 8,
  'word': '##sp',
  'start': 27,
  'end': 29},
 {'entity': 'I-entity_name',
  'score': np.float32(0.5817155),
  'index': 9,
  'word': '##atch',
  'start': 29,
  'end': 33},
 {'entity': 'B-playlist_owner',
  'score': np.float32(0.99413365),
  'index': 11,
  'word': 'my',
  'start': 37,
  'end': 39},
 {'entity': 'B-playlist',
  'score': np.float32(0.99492466),
  'index': 12,
  'word': 'road',
  'start': 40,
  'end': 44},
 {'entity': 'I-playli

In [23]:
pipe = pipeline("token-classification", tok_clf_model, tokenizer=tokenizer)
pipe('Rate the doog food 5 out of 5')

Device set to use cuda:0


[{'entity': 'B-object_name',
  'score': np.float32(0.97137284),
  'index': 2,
  'word': 'the',
  'start': 5,
  'end': 8},
 {'entity': 'I-object_name',
  'score': np.float32(0.98746026),
  'index': 3,
  'word': 'do',
  'start': 9,
  'end': 11},
 {'entity': 'I-object_name',
  'score': np.float32(0.9838907),
  'index': 4,
  'word': '##og',
  'start': 11,
  'end': 13},
 {'entity': 'I-object_name',
  'score': np.float32(0.9898357),
  'index': 5,
  'word': 'food',
  'start': 14,
  'end': 18},
 {'entity': 'B-rating_value',
  'score': np.float32(0.9962664),
  'index': 6,
  'word': '5',
  'start': 19,
  'end': 20},
 {'entity': 'B-best_rating',
  'score': np.float32(0.86972195),
  'index': 9,
  'word': '5',
  'start': 28,
  'end': 29}]