# Fine-tuning a masked language model (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the followin line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 4.2 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.2.2-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 7.8 MB/s 
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 50.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 58.6 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 63.0 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloadin

You will need to setup git, adapt your email and name in the following cell.

In [2]:
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/256M [00:00<?, ?B/s]

In [5]:
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [6]:
text = "This is a great [MASK]."

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [8]:
import torch

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [9]:
from datasets import load_dataset
import tensorflow as tf

url = 'https://github.com/eyalmazuz/ThreatIntelligenceCorpus/archive/refs/heads/master.zip' 


dataset = tf.keras.utils.get_file('master.zip', url,
                                  extract=True, cache_dir='.',
                                  cache_subdir='')

# dataset = load_dataset("text", data_dir='ThreatIntelligenceCorpus-master/Corpus', split='train')

security_dataset = dataset = load_dataset("text", data_dir='ThreatIntelligenceCorpus-master/Corpus')
security_dataset

Downloading data from https://github.com/eyalmazuz/ThreatIntelligenceCorpus/archive/refs/heads/master.zip
39870464/Unknown - 4s 0us/step

Resolving data files:   0%|          | 0/29 [00:00<?, ?it/s]



Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-52fb921e08ae8d0e/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-52fb921e08ae8d0e/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 441
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1327
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 283
    })
})

In [10]:
sample = security_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Blog: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: '


KeyError: ignored

In [12]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = security_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

  0%|          | 0/1 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1646 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 441
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 1327
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 283
    })
})

In [13]:
tokenizer.model_max_length

512

In [14]:
chunk_size = 128

In [15]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Blog {idx} length: {len(sample)}'")

'>>> Review 0 length: 10'
'>>> Review 1 length: 2'
'>>> Review 2 length: 90'


In [16]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated blogs length: {total_length}'")

'>>> Concatenated reviews length: 102'


In [17]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 102'


In [18]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [19]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 101
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 264
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 71
    })
})

In [20]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'being used in what appeared to be an active exploit kit to which some users were being redirected. as i started to collect the urls from these sites, a pattern began to emerge with which i was not immediately familiar. below is a sample of the urls. [SEP] [CLS] [SEP] [CLS] [SEP] [CLS] [SEP] [CLS] [SEP] [CLS] ffp495bf1aw. itshis [. ] party / d8667t121 [SEP] [CLS] [SEP] [CLS] 89pd19n3d7d3x9cuf. maylink [. ] space / 766p3d35i8ze [SEP] [CLS] [SEP] [CLS] 131n35co4'

In [21]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [22]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] pulling the brake on the [MASK] [MASK] train [SEP] [CLS] [SEP] [CLS] this blog functions into detail on recent work that unit 42 [MASK] [MASK] to clark [MASK] sites associated with the magnitude exploit kit ( ek ) [MASK] it details the investigation process [MASK] in identifying score algorithm used to generate domains used by the magnitude ek. [MASK] can use the provided data to identify possible domains that may be associated with the magnitude ek before [MASK] ’ re used and block them pre [MASK] emptively and so block rhode ek [MASK] before they happen. [SEP] [CLS] [MASK] assessment [SEP] [CLS] while hunting [MASK] new malware in [MASK] alto networks [MASK]focus, i stumbled across some adobe flash files'

'>>> dharma [MASK] in what appeared to be [MASK] active exploit kit to which some users were being redirected 2007 investors i [MASK] to collect the urls from these sites, a pattern [MASK] to emerge with which i was not immediately familiar. below is a [MASK] [MASK] you

In [23]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

    return default_data_collator(features)

In [24]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] pulling [MASK] [MASK] on the magnitude [MASK] train [SEP] [CLS] [SEP] [CLS] this blog goes into [MASK] on [MASK] work that unit 42 has [MASK] to identify malicious sites associated with the magnitude exploit kit ( ek [MASK]. it details the investigation process involved [MASK] identifying the algorithm used to generate [MASK] used by the magnitude ek. defenders can use the provided [MASK] to identify possible domains that may be associated with the magnitude [MASK] before they ’ re used [MASK] block [MASK] pre - emptively [MASK] so [MASK] [MASK] ek attacks [MASK] [MASK] [MASK]. [SEP] [CLS] initial [MASK] [SEP] [CLS] while [MASK] for new [MASK] [MASK] in palo alto networks autofocus, i stumbled across some adobe flash files'

'>>> [MASK] used [MASK] what [MASK] to [MASK] an active exploit kit [MASK] which some users were being redirected. as i started to collect [MASK] urls from these sites, a pattern [MASK] to emerge with which i was not immediately familiar. below [MASK] a

In [31]:
train_size = 80
test_size = 20

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 80
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 20
    })
})

In [32]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [35]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    logging_steps=logging_steps,
)

In [37]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
)

Cloning https://huggingface.co/ms15/distilbert-base-uncased-finetuned-imdb into local empty directory.


In [38]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64


>>> Perplexity: 31.63


In [39]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 80
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 6


Epoch,Training Loss,Validation Loss
1,3.1831,3.513566
2,3.6354,3.134987
3,3.1834,3.077654


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=6, training_loss=3.4044596354166665, metrics={'train_runtime': 252.0869, 'train_samples_per_second': 0.952, 'train_steps_per_second': 0.024, 'total_flos': 7953669365760.0, 'train_loss': 3.4044596354166665, 'epoch': 3.0})

In [40]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64


>>> Perplexity: 28.64


In [None]:
trainer.push_to_hub()

In [41]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [42]:
downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [43]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [44]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [45]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [46]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [47]:
from huggingface_hub import get_full_repo_name

model_name = "distilbert-base-uncased-finetuned-imdb-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'ms15/distilbert-base-uncased-finetuned-imdb-accelerate'

In [48]:
from huggingface_hub import Repository

output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

Cloning https://huggingface.co/ms15/distilbert-base-uncased-finetuned-imdb-accelerate into local empty directory.


In [49]:
from tqdm.auto import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/6 [00:00<?, ?it/s]

Configuration saved in distilbert-base-uncased-finetuned-imdb-accelerate/config.json


>>> Epoch 0: Perplexity: 22.528655021131435


Model weights saved in distilbert-base-uncased-finetuned-imdb-accelerate/pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-imdb-accelerate/tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-imdb-accelerate/special_tokens_map.json
Configuration saved in distilbert-base-uncased-finetuned-imdb-accelerate/config.json


>>> Epoch 1: Perplexity: 21.994872219201874


Model weights saved in distilbert-base-uncased-finetuned-imdb-accelerate/pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-imdb-accelerate/tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-imdb-accelerate/special_tokens_map.json
Several commits (2) will be pushed upstream.
Configuration saved in distilbert-base-uncased-finetuned-imdb-accelerate/config.json


>>> Epoch 2: Perplexity: 21.677055866374175


Model weights saved in distilbert-base-uncased-finetuned-imdb-accelerate/pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-imdb-accelerate/tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-imdb-accelerate/special_tokens_map.json
Several commits (3) will be pushed upstream.


In [50]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="huggingface-course/distilbert-base-uncased-finetuned-imdb"
)

https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpx9tpiedr


Downloading config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/bc4545eb3fd5fa91ded7b52c820d769ab2791e220c5fc4a098863c3ffba814d3.d93697f2459b6c5b21e398760e556bd3849c8d50e99fb4992d5ede6b6dfbfdc1
creating metadata file for /root/.cache/huggingface/transformers/bc4545eb3fd5fa91ded7b52c820d769ab2791e220c5fc4a098863c3ffba814d3.d93697f2459b6c5b21e398760e556bd3849c8d50e99fb4992d5ede6b6dfbfdc1
loading configuration file https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/bc4545eb3fd5fa91ded7b52c820d769ab2791e220c5fc4a098863c3ffba814d3.d93697f2459b6c5b21e398760e556bd3849c8d50e99fb4992d5ede6b6dfbfdc1
Model config DistilBertConfig {
  "_name_or_path": "huggingface-course/distilbert-base-uncased-finetuned-imdb",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "att

Downloading pytorch_model.bin:   0%|          | 0.00/256M [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/68293fbe100166528ae658fb82fba8342643c8286ef9c0bd1b68cd36f06e32c4.0f49b33894172bcdf49f481f47af60206c0432545e87f4e5599b4dafc0234557
creating metadata file for /root/.cache/huggingface/transformers/68293fbe100166528ae658fb82fba8342643c8286ef9c0bd1b68cd36f06e32c4.0f49b33894172bcdf49f481f47af60206c0432545e87f4e5599b4dafc0234557
loading weights file https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/68293fbe100166528ae658fb82fba8342643c8286ef9c0bd1b68cd36f06e32c4.0f49b33894172bcdf49f481f47af60206c0432545e87f4e5599b4dafc0234557
All model checkpoint weights were used when initializing DistilBertForMaskedLM.

All the weights of DistilBertForMaskedLM were initialized from the model checkpoint at huggingface-course/dis

Downloading tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/0e75ce48bd829d9de3854ca4dc6094c0873a91ccf82203e36f2416b14b47bc3e.42154c5fd30bfa7e34941d0d8ad26f8a3936990926fbe06b2da76dd749b1c6d4
creating metadata file for /root/.cache/huggingface/transformers/0e75ce48bd829d9de3854ca4dc6094c0873a91ccf82203e36f2416b14b47bc3e.42154c5fd30bfa7e34941d0d8ad26f8a3936990926fbe06b2da76dd749b1c6d4
https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp8kzjca09


Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/6ad739928d7876d3bc26435b31d4d319b0fc77cea3d8e74e8da3eeedd1967f84.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
creating metadata file for /root/.cache/huggingface/transformers/6ad739928d7876d3bc26435b31d4d319b0fc77cea3d8e74e8da3eeedd1967f84.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpbszenwhv


Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/1fdace8e42689b11ea1af0cdc221cd292866d751b934e7d88df01d61fe16332f.f471bd2d72c48b932f7be40446896b7e97c3be406ee93abfb500399bc606c829
creating metadata file for /root/.cache/huggingface/transformers/1fdace8e42689b11ea1af0cdc221cd292866d751b934e7d88df01d61fe16332f.f471bd2d72c48b932f7be40446896b7e97c3be406ee93abfb500399bc606c829
https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpgd_z99yj


Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/special_tokens_map.json in cache at /root/.cache/huggingface/transformers/998d766bcbe227da8ff9181cda6c27ee7aa93d73c1e44a437104333e3caf7a92.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
creating metadata file for /root/.cache/huggingface/transformers/998d766bcbe227da8ff9181cda6c27ee7aa93d73c1e44a437104333e3caf7a92.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/6ad739928d7876d3bc26435b31d4d319b0fc77cea3d8e74e8da3eeedd1967f84.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/1fdace8e42689b11ea1af0cdc2

In [1]:
preds = mask_filler(text)

cyber_text = "This is a cyber [MASK]"
cyber_preds = mask_filler(cyber_text)

for pred in cyber_preds:
    print(f">>> {pred['sequence']}")

NameError: ignored