In [1]:
# install the require version of datasets in case you have an older version     
# You need to choose the "Kernel" -> "Restart" option from the menu after running this cell
! pip install "datasets==2.15.0"



In [2]:
# Load the sms_spam dataset
# See: https://huggingface.co/datasets/sms_spam
from datasets import load_dataset 

In [3]:
# Only the "train" split is requested from sms_spam here, so we build our own
# train/test partitions with train_test_split: 20% of the rows are held out
# for testing, and the shuffle uses a fixed seed so the split is reproducible.
dataset = (
    load_dataset("sms_spam", split="train")
    .train_test_split(seed=23, shuffle=True, test_size=0.2)
)

# Names of the partitions produced above; later cells iterate over these.
splits = ["train", "test"]

# Show the features and row count of the training partition.
dataset["train"]

Downloading readme:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/359k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

Dataset({
    features: ['sms', 'label'],
    num_rows: 4459
})

In [4]:
# Look at the first training example: does the message read like spam or ham?
dataset["train"][0]

{'sms': 'Had your mobile 10 mths? Update to the latest Camera/Video phones for FREE. KEEP UR SAME NUMBER, Get extra free mins/texts. Text YES for a call\n',
 'label': 1}

## Pre-process datasets

Now we are going to process our datasets by converting all the text into tokens for our models.

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize every example in each split.
# The message text lives in the "sms" column (the dataset's features are
# ['sms', 'label']), so that is the field handed to the tokenizer; indexing
# a non-existent "text" column would raise a KeyError.
# batched=True makes `map` pass the lambda a batch of rows at a time, so
# x["sms"] is a list of messages rather than a single string.
# padding="max_length" and truncation=True are forwarded to the tokenizer
# unchanged -- presumably this yields fixed-length inputs; confirm against the
# tokenizer documentation.
tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["sms"], padding="max_length", truncation=True),
        batched=True,
    )


