


## Fine-tuning a masked language model



## Import the pre-trained DistilBERT model for masked language modeling

In [1]:
# Load the pre-trained checkpoint through the masked-LM auto class; the same
# checkpoint name is reused further down to load the matching tokenizer.
from transformers import AutoModelForMaskedLM
checkpoint = 'distilbert-base-uncased'
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

In [2]:
# Show the parameter count of the loaded model.
model.num_parameters()

66985530

In [3]:
# Example sentence containing a single [MASK] slot for the model to fill in.
sample_text = "This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
import torch

# Encode the sample sentence as PyTorch tensors.
inputs = tokenizer(sample_text, return_tensors = 'pt')

# Inference only: disable autograd so no computation graph is built or kept
# alive for the logits.
with torch.no_grad():
    sample_logits = model(**inputs).logits

In [6]:
# Locate the [MASK] position in the encoded sentence and take the vocabulary
# logits predicted for that position
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = sample_logits[0, mask_token_index, :]

# Keep the ids of the 10 highest-scoring candidates for the [MASK] slot
top_10_tokens = torch.topk(mask_token_logits, 10, dim=1).indices[0].tolist()

# Decode every candidate id once; the decoded words feed both printouts below
token_output_list = []
for token in top_10_tokens:
    token_output_list.append(tokenizer.decode([token]))

print(f"Top 10 tokens are {top_10_tokens} and corresponding words {token_output_list}")

# Show the sample sentence with each candidate substituted for the mask token
for candidate_word in token_output_list:
    print(f"'>>> {sample_text.replace(tokenizer.mask_token, candidate_word)}'")

Top 10 tokens are [3066, 3112, 6172, 2801, 8658, 6707, 9467, 18457, 6344, 16507] and corresponding words ['deal', 'success', 'adventure', 'idea', 'feat', 'mistake', 'shame', 'undertaking', 'achievement', 'coincidence']
'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'
'>>> This is a great mistake.'
'>>> This is a great shame.'
'>>> This is a great undertaking.'
'>>> This is a great achievement.'
'>>> This is a great coincidence.'


## Dataset
- Import the IMDb dataset
- Tokenize using the DistilBERT tokenizer
- Concatenate the reviews and split them into chunks of 128 tokens

In [7]:
!pip install datasets



In [8]:
from datasets import load_dataset

# Load the "imdb" dataset; its splits are displayed in the next cell.
imdb_dataset = load_dataset("imdb")

In [9]:
# Display the available splits with their columns and row counts.
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [10]:
def tokenize_function(examples):
  """Tokenize the 'text' column of a batch.

  When the notebook-level tokenizer is a fast tokenizer, a 'word_ids' entry
  is added that holds, for every example in the batch, the result of
  ``word_ids(i)`` on the batch encoding.
  """
  encoded = tokenizer(examples['text'])

  # Slow tokenizers do not get the extra column; return the plain encoding.
  if not tokenizer.is_fast:
    return encoded

  num_examples = len(encoded["input_ids"])
  encoded['word_ids'] = [encoded.word_ids(idx) for idx in range(num_examples)]
  return encoded

# Tokenize every split in batches and drop the raw 'text' and 'label' columns
# so only the tokenizer outputs remain. No truncation is requested here; the
# sequences are re-chunked to a fixed length by group_texts later on.
tokenized_imdb_dataset = imdb_dataset.map(tokenize_function, batched = True, remove_columns = ['text', 'label'])
tokenized_imdb_dataset

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [11]:
# Maximum sequence length reported by the tokenizer for this checkpoint.
tokenizer.model_max_length


512

In [12]:
# Number of tokens per chunk produced by group_texts.
chunk_size = 128

In [13]:
def group_texts(examples, size=None):
  """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

  Args:
    examples: Mapping from column name to a list of per-example sequences.
      Must contain an "input_ids" column.
    size: Chunk length in tokens. When omitted, the notebook-level
      ``chunk_size`` is used, so existing ``map(group_texts, ...)`` calls
      behave as before.

  Returns:
    A dict with the same columns, each holding a list of ``size``-long chunks,
    plus a "labels" column that is a copy of the "input_ids" chunk list.
    Trailing tokens that do not fill a whole chunk are dropped.
  """
  # Local import keeps this cell runnable on its own.
  from itertools import chain

  if size is None:
    size = chunk_size

  # Flatten each column in linear time. sum(lists, []) re-copies the
  # accumulated list on every addition, which is quadratic in the batch size.
  concatenated_examples = {
    k: list(chain.from_iterable(examples[k])) for k in examples.keys()
  }

  # The length of the first column is used for every column.
  total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])

  # Round down to a multiple of the chunk size so every chunk is full.
  total_length = (total_length // size) * size

  # Split each flattened column into consecutive chunks.
  result = {
    k: [t[i : i + size] for i in range(0, total_length, size)]
    for k, t in concatenated_examples.items()
  }

  # Create a new labels column
  result["labels"] = result["input_ids"].copy()

  return result

In [14]:
# Re-chunk every split with group_texts. Row counts change because the
# tokenized reviews are concatenated and re-split into chunk_size-token blocks.
lm_imdb_dataset = tokenized_imdb_dataset.map(group_texts, batched = True)
lm_imdb_dataset

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})



## Fine-tuning DistilBERT with the Trainer API


In [15]:
from transformers import DataCollatorForLanguageModeling

# Language-modeling collator configured with a masking probability of 0.15.
datacollator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm_probability = 0.15)

In [16]:
# Work on a subsample of the chunked training split: 10,000 chunks for
# training and 10% of that number for evaluation.
train_size = 10_000
test_size = int(0.1 * train_size)

# A fixed seed is passed to the split call.
downsampled_imdb_dataset = lm_imdb_dataset["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [17]:
from transformers import TrainingArguments
batch_size = 64
# Number of full batches in the training subset; passed to TrainingArguments
# as logging_steps further down.
logging_steps = len(downsampled_imdb_dataset["train"]) // batch_size
# Keep only the part of the checkpoint id after the last "/" (unchanged when
# there is none); used to build the output directory name.
model_name = checkpoint.split("/")[-1]

In [18]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget; the training configuration below
# sets push_to_hub=True and the final cell calls trainer.push_to_hub().
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
# Training configuration: evaluate once per epoch, use the same batch size for
# training and evaluation, and push the result to the Hugging Face Hub.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only runtimes instead of failing at argument validation.
    fp16=torch.cuda.is_available(),
    logging_steps=logging_steps,
)

In [22]:
from transformers import Trainer
# Wire the model, training arguments, downsampled train/test splits, the
# language-modeling collator and the tokenizer into a Trainer.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=downsampled_imdb_dataset["train"],
    eval_dataset=downsampled_imdb_dataset["test"],
    data_collator=datacollator,
    tokenizer=tokenizer, )

In [23]:
import math

# Baseline before fine-tuning: perplexity is reported as exp(eval_loss).
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


>>> Perplexity: 21.94


In [24]:
# Fine-tune the model on the downsampled training chunks.
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.6819,2.497822
2,2.5872,2.448819
3,2.525,2.483616


TrainOutput(global_step=471, training_loss=2.598209565359063, metrics={'train_runtime': 162.7061, 'train_samples_per_second': 184.381, 'train_steps_per_second': 2.895, 'total_flos': 994208670720000.0, 'train_loss': 2.598209565359063, 'epoch': 3.0})

In [25]:
# Re-evaluate after training so the perplexity can be compared with the
# value printed before fine-tuning.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 12.07


In [26]:
# Upload the fine-tuned model and training artifacts to the Hugging Face Hub.
trainer.push_to_hub()

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

events.out.tfevents.1704097858.53649cbadcaa.3336.1:   0%|          | 0.00/359 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

events.out.tfevents.1704097622.53649cbadcaa.3336.0:   0%|          | 0.00/6.00k [00:00<?, ?B/s]

'https://huggingface.co/soninimish/distilbert-base-uncased-finetuned-imdb/tree/main/'