In [1]:
# Install the required version of `datasets` in case an older version is present.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# After running this cell, choose "Kernel" -> "Restart" from the menu so the new version is picked up.
%pip install "datasets==2.15.0"



In [2]:
# Load the sms_spam dataset
# See: https://huggingface.co/datasets/sms_spam
from datasets import load_dataset 

In [3]:
# Load the "train" split of sms_spam, then hold out 20% of it as our test set
# using `train_test_split`. The shuffle is seeded so the partition is the same on every run.
raw_sms = load_dataset("sms_spam", split="train")
dataset = raw_sms.train_test_split(test_size=0.2, shuffle=True, seed=23)

# Names of the two partitions created above; later cells loop over them.
splits = ["train", "test"]

# Show a summary (features and row count) of the training partition.
dataset["train"]

Dataset({
    features: ['sms', 'label'],
    num_rows: 4459
})

In [4]:
# Look at the first training example. Would you call this message spam or ham?
first_example = dataset["train"][0]
first_example

{'sms': 'Had your mobile 10 mths? Update to the latest Camera/Video phones for FREE. KEEP UR SAME NUMBER, Get extra free mins/texts. Text YES for a call\n',
 'label': 1}

## Pre-process datasets

Now we are going to process our datasets by converting all the text into tokens for our models.

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def _tokenize_batch(batch):
    """Tokenize the "sms" texts of one batch with truncation enabled."""
    return tokenizer(batch["sms"], truncation=True)


# Tokenize every split in batches; the result is a dict keyed by split name.
tokenized_dataset = {
    name: dataset[name].map(_tokenize_batch, batched=True) for name in splits
}

# Show which columns the training split has after tokenization.
tokenized_dataset["train"]

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 4459
})

## Load and set up the model

In this case we are doing full fine-tuning, so we want every parameter of the model to be unfrozen (trainable).

In [7]:
from transformers import AutoModelForSequenceClassification 

# Human-readable names for the two classes: label 0 is "not spam", label 1 is "spam".
id2label = {0: "not spam", 1: "spam"}
label2id = {name: idx for idx, name in id2label.items()}

# Build the sequence-classification model from the distilbert-base-uncased checkpoint,
# with one output per class.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Mark every parameter of the model as trainable (full fine-tuning).
# Hint: Check the documentation at https://huggingface.co/transformers/v4.2.2/training.html
for weight in model.parameters():
    weight.requires_grad_(True)

In [9]:
# Print the model's module tree to inspect its layers.
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

## Let's train it!

Now it's time to train our model. We'll use the `Trainer` class.

First we'll define a function to compute our accuracy metric, then we'll create the `Trainer`.

In this instance, we will fill in some of the training arguments.


In [10]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments


In [11]:
def compute_metrics(eval_pred):
    """Compute the accuracy of a set of predictions.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is an
            array of scores that is reduced with ``argmax`` along axis 1 to
            get one predicted class per row; ``labels`` holds the values the
            predicted classes are compared against.

    Returns:
        A dict with the single key ``"accuracy"``: the fraction of rows whose
        predicted class equals the corresponding label.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    is_correct = predicted_classes == labels
    return {"accuracy": is_correct.mean()}

In [15]:

# Training configuration, kept separate from the Trainer so the hyperparameters are easy to find.
training_args = TrainingArguments(
    output_dir="./data/spam_not_spam",
    # Learning rate
    learning_rate=2e-5,
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and save a checkpoint after each epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Number of epochs to train for
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

# The HuggingFace Trainer class handles the PyTorch training and eval loop for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/1116 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.0725, 'learning_rate': 1.1039426523297491e-05, 'epoch': 0.9}


  0%|          | 0/140 [00:00<?, ?it/s]

{'eval_loss': 0.04967828467488289, 'eval_accuracy': 0.989237668161435, 'eval_runtime': 7.1221, 'eval_samples_per_second': 156.555, 'eval_steps_per_second': 19.657, 'epoch': 1.0}
{'loss': 0.0222, 'learning_rate': 2.078853046594982e-06, 'epoch': 1.79}


  0%|          | 0/140 [00:00<?, ?it/s]

{'eval_loss': 0.056275732815265656, 'eval_accuracy': 0.989237668161435, 'eval_runtime': 6.4236, 'eval_samples_per_second': 173.579, 'eval_steps_per_second': 21.795, 'epoch': 2.0}
{'train_runtime': 182.8564, 'train_samples_per_second': 48.771, 'train_steps_per_second': 6.103, 'train_loss': 0.04386217683874151, 'epoch': 2.0}


TrainOutput(global_step=1116, training_loss=0.04386217683874151, metrics={'train_runtime': 182.8564, 'train_samples_per_second': 48.771, 'train_steps_per_second': 6.103, 'train_loss': 0.04386217683874151, 'epoch': 2.0})