In [1]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, \
     DataCollatorWithPadding, pipeline
# from datasets import load_metric, Dataset
import evaluate
import numpy as np
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
snips_file = open('data/snips.train.txt', 'rb')

snips_rows = snips_file.readlines()

snips_rows[:20]

[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

In [3]:
# This code segment parses the snips dataset into a more manageable format

utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
    if len(snip_row) == 2:  # skip over rows with no data
        continue
    if ' ' not in snip_row.decode():  # we've hit a sequence label
        sequence_labels.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens.append(label_for_utterances)
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    token, token_label = snip_row.decode().split(' ')
    token_label = token_label.strip()
    utterance += f'{token} '
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)
    

In [4]:
print(utterances[:5])
print(tokenized_utterances[:5])
print(labels_for_tokens[:5])
print(sequence_labels[:5])

['listen to westbam alumb allergic on google music', 'add step to me to the 50 clásicos playlist', 'i give this current textbook a rating value of 1 and a best rating of 6', 'play the song little robin redbreast', 'please add iris dement to my playlist this is selena']
[['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music'], ['add', 'step', 'to', 'me', 'to', 'the', '50', 'clásicos', 'playlist'], ['i', 'give', 'this', 'current', 'textbook', 'a', 'rating', 'value', 'of', '1', 'and', 'a', 'best', 'rating', 'of', '6'], ['play', 'the', 'song', 'little', 'robin', 'redbreast'], ['please', 'add', 'iris', 'dement', 'to', 'my', 'playlist', 'this', 'is', 'selena']]
[['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service'], ['O', 'B-entity_name', 'I-entity_name', 'I-entity_name', 'O', 'O', 'B-playlist', 'I-playlist', 'O'], ['O', 'O', 'O', 'B-object_select', 'B-object_type', 'O', 'O', 'O', 'O', 'B-rating_value', 'O', 'O', 'O', 'O', 'O', 'B-best_rating'], ['O', 'O', '

In [5]:
len(labels_for_tokens), len(tokenized_utterances), len(utterances), len(sequence_labels)

(13084, 13084, 13084, 13084)

In [6]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [7]:
unique_sequence_labels = list(set(sequence_labels))
unique_sequence_labels

['GetWeather',
 'PlayMusic',
 'BookRestaurant',
 'AddToPlaylist',
 'SearchScreeningEvent',
 'SearchCreativeWork',
 'RateBook']

In [8]:
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels]

print(f'There are {len(unique_sequence_labels)} unique sequence labels')

There are 7 unique sequence labels


In [9]:
from functools import reduce

unique_token_labels = list(set(reduce(lambda x, y: x + y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]

print(f'There are {len(unique_token_labels)} unique token labels')

There are 72 unique token labels


In [10]:
print(labels_for_tokens[:5])

[[37, 37, 51, 37, 6, 37, 46, 24], [37, 4, 70, 70, 37, 37, 2, 23, 37], [37, 37, 37, 34, 40, 37, 37, 37, 37, 61, 37, 37, 37, 37, 37, 48], [37, 37, 27, 12, 13, 13], [37, 37, 51, 18, 37, 62, 37, 2, 23, 23]]


In [11]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print([unique_token_labels[l] for l in labels_for_tokens[0]])
print(utterances[0])
print(sequence_labels[0])
print(unique_sequence_labels[sequence_labels[0]])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
[37, 37, 51, 37, 6, 37, 46, 24]
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
1
PlayMusic


In [12]:
from datasets import Dataset
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances, 
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)
snips_dataset = snips_dataset.train_test_split(test_size=0.2)

In [13]:
snips_dataset['train'][0]

{'utterance': 'i am giving finding chandra a 2 out of 6 rating',
 'label': 6,
 'tokens': ['i',
  'am',
  'giving',
  'finding',
  'chandra',
  'a',
  '2',
  'out',
  'of',
  '6',
  'rating'],
 'token_labels': [37, 37, 37, 10, 49, 37, 61, 37, 37, 48, 37]}

In [14]:
# simple function to batch tokenize utterances with truncation
def preprocess_function(examples):
    return tokenizer(examples["utterance"], truncation=True)

In [15]:
seq_clf_tokenized_snips = snips_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 10467/10467 [00:01<00:00, 10142.20 examples/s]
Map: 100%|██████████| 2617/2617 [00:00<00:00, 22803.18 examples/s]


In [16]:
seq_clf_tokenized_snips['train'][0]

{'utterance': 'i am giving finding chandra a 2 out of 6 rating',
 'label': 6,
 'tokens': ['i',
  'am',
  'giving',
  'finding',
  'chandra',
  'a',
  '2',
  'out',
  'of',
  '6',
  'rating'],
 'token_labels': [37, 37, 37, 10, 49, 37, 61, 37, 37, 48, 37],
 'input_ids': [101,
  178,
  1821,
  2368,
  4006,
  22572,
  19799,
  170,
  123,
  1149,
  1104,
  127,
  5261,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the 
#  length of the longest element in the batch, making them all the same length. 
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
# Data Collator will pad data so that all examples are the same input length.
#  Attention mask is how we ignore attention scores for padding tokens

In [19]:
sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-cased', 
    num_labels=len(unique_sequence_labels),
)

# set an index -> label dictionary
sequence_clf_model.config.id2label = {i: l for i, l in enumerate(unique_sequence_labels)}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
sequence_clf_model.config.id2label[0]

'GetWeather'

In [22]:
from evaluate import load
metric = load("accuracy")

def compute_metrics(eval_pred):  # custom method to take in logits and calculate accuracy of the eval set
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 13.7MB/s]


In [23]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    
    # some deep learning parameters that the Trainer is able to take in
    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch'
)

# Define the trainer:

trainer = Trainer(
    model=sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)



In [24]:
# Get initial metrics
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, tokens, token_labels. If utterance, tokens, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.9494043588638306,
 'eval_model_preparation_time': 0.0016,
 'eval_accuracy': 0.14061902942300344,
 'eval_runtime': 7.1643,
 'eval_samples_per_second': 365.281,
 'eval_steps_per_second': 11.446}

In [25]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, tokens, token_labels. If utterance, tokens, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 65,786,887


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,0.0948,0.140052,0.0016,0.978984
2,0.0077,0.050379,0.0016,0.990065


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, tokens, token_labels. If utterance, tokens, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/model.safetensors
Saving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved in ./snips_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_clf/results/checkpoint-656/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, tokens, token_labels. If 

TrainOutput(global_step=656, training_loss=0.6675900561691024, metrics={'train_runtime': 35.937, 'train_samples_per_second': 582.52, 'train_steps_per_second': 18.254, 'total_flos': 131479665202746.0, 'train_loss': 0.6675900561691024, 'epoch': 2.0})

In [26]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, tokens, token_labels. If utterance, tokens, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.05037906393408775,
 'eval_model_preparation_time': 0.0016,
 'eval_accuracy': 0.9900649598777226,
 'eval_runtime': 0.7321,
 'eval_samples_per_second': 3574.55,
 'eval_steps_per_second': 112.003,
 'epoch': 2.0}

In [27]:
pipe = pipeline("text-classification", sequence_clf_model, tokenizer=tokenizer)
pipe('Please add Here We Go by Dispatch to my road trip playlist')

Device set to use cuda:0


[{'label': 'AddToPlaylist', 'score': 0.9942271709442139}]

In [28]:
trainer.save_model()

Saving model checkpoint to ./snips_clf/results
Configuration saved in ./snips_clf/results/config.json
Model weights saved in ./snips_clf/results/model.safetensors


In [29]:
pipe = pipeline("text-classification", "./snips_clf/results", tokenizer=tokenizer)
pipe('Please add Here We Go by Dispatch to my road trip playlist')

loading configuration file ./snips_clf/results/config.json
Model config DistilBertConfig {
  "_name_or_path": "./snips_clf/results",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "GetWeather",
    "1": "PlayMusic",
    "2": "BookRestaurant",
    "3": "AddToPlaylist",
    "4": "SearchScreeningEvent",
    "5": "SearchCreativeWork",
    "6": "RateBook"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype":

[{'label': 'AddToPlaylist', 'score': 0.9942271709442139}]

In [30]:
frozen_sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-cased', 
    num_labels=len(unique_sequence_labels),
)

loading configuration file config.json from cache at /home/nirmal/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/6ea81172465e8b0ad3fddeed32b986cdcdcffcf0/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.47.1",
  "vo

In [32]:
for param in frozen_sequence_clf_model.distilbert.parameters():
    param.requires_grad = False

In [33]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    
    # some deep learning parameters that the Trainer is able to take in
    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch'
)

# Define the trainer:

trainer = Trainer(
    model=frozen_sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [34]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, tokens, token_labels. If utterance, tokens, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.9544270038604736,
 'eval_model_preparation_time': 0.0016,
 'eval_accuracy': 0.14673290026748184,
 'eval_runtime': 0.7338,
 'eval_samples_per_second': 3566.403,
 'eval_steps_per_second': 111.748}

In [35]:
trainer.train()  # ~23min -> ~6min on my laptop with all of distilbert frozen with a worse loss/accuracy

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, tokens, token_labels. If utterance, tokens, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 595,975


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,1.8579,1.892203,0.0016,0.382117
2,1.7415,1.69188,0.0016,0.7906


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, tokens, token_labels. If utterance, tokens, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to ./snips_clf/results/checkpoint-328
Configuration saved in ./snips_clf/results/checkpoint-328/config.json
Model weights saved in ./snips_clf/results/checkpoint-328/model.safetensors
Saving model checkpoint to ./snips_clf/results/checkpoint-656
Configuration saved in ./snips_clf/results/checkpoint-656/config.json
Model weights saved in ./snips_clf/results/checkpoint-656/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, tokens, token_labels. If 

TrainOutput(global_step=656, training_loss=1.8773889169096947, metrics={'train_runtime': 16.2403, 'train_samples_per_second': 1289.017, 'train_steps_per_second': 40.393, 'total_flos': 131479665202746.0, 'train_loss': 1.8773889169096947, 'epoch': 2.0})

In [36]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, tokens, token_labels. If utterance, tokens, token_labels are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.6918796300888062,
 'eval_model_preparation_time': 0.0016,
 'eval_accuracy': 0.7905999235766145,
 'eval_runtime': 0.7466,
 'eval_samples_per_second': 3505.196,
 'eval_steps_per_second': 109.83,
 'epoch': 2.0}