


## Fine-tuning a masked language model



## Import the pre-trained DistilBERT model for masked language modeling

In [1]:
from transformers import AutoModelForMaskedLM
# Hub checkpoint id; reused below to load the matching tokenizer and to name the output directory.
checkpoint = 'distilbert-base-uncased'
# Load the pre-trained weights with a masked-language-modeling head on top.
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

In [2]:
# Total number of parameters in the loaded model (about 67M for this checkpoint).
model.num_parameters()

66985530

In [3]:
# Probe sentence; the literal "[MASK]" is the placeholder the model is asked to fill in.
sample_text = "This is a great [MASK]."

In [4]:
from transformers import AutoTokenizer

# Tokenizer loaded from the same checkpoint as the model, so vocabulary and special tokens match.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
import torch

# Encode the probe sentence as PyTorch tensors
inputs = tokenizer(sample_text, return_tensors = 'pt')

# Inference only: disable autograd so no computation graph is built or kept
# alive for the logits
with torch.no_grad():
    sample_logits = model(**inputs).logits

In [6]:
# Locate the [MASK] position in the encoded sentence and slice out the logits at that position
is_mask = inputs["input_ids"] == tokenizer.mask_token_id
mask_token_index = torch.where(is_mask)[1]
mask_token_logits = sample_logits[0, mask_token_index, :]

# Keep the ids of the ten highest-scoring vocabulary entries
top_10 = torch.topk(mask_token_logits, 10, dim=1)
top_10_tokens = top_10.indices[0].tolist()

# Decode each id to text once and reuse it for both printouts
token_output_list = [tokenizer.decode([token_id]) for token_id in top_10_tokens]

print(f"Top 10 tokens are {top_10_tokens} and corresponding words {token_output_list}")

for candidate in token_output_list:
    print(f"'>>> {sample_text.replace(tokenizer.mask_token, candidate)}'")

Top 10 tokens are [3066, 3112, 6172, 2801, 8658, 6707, 9467, 18457, 6344, 16507] and corresponding words ['deal', 'success', 'adventure', 'idea', 'feat', 'mistake', 'shame', 'undertaking', 'achievement', 'coincidence']
'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'
'>>> This is a great mistake.'
'>>> This is a great shame.'
'>>> This is a great undertaking.'
'>>> This is a great achievement.'
'>>> This is a great coincidence.'


## Dataset
- Import the dataset
- Tokenize it with the DistilBERT tokenizer
- Concatenate the reviews and split the result into chunks of 128 tokens

In [7]:
!pip install datasets



In [8]:
from datasets import load_dataset

# Load the IMDB review dataset; it comes with train, test and unsupervised splits.
imdb_dataset = load_dataset("imdb")

In [9]:
# Show the available splits, their columns and row counts
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [10]:
def tokenize_function(examples):
  """Tokenize a batch of reviews; no truncation or padding arguments are passed.

  `examples` is a batch exposing a 'text' column. The tokenizer output is
  returned; when a fast tokenizer is in use, a 'word_ids' entry is added that
  holds, for every review in the batch, the result of `word_ids(i)`.
  """
  encoded = tokenizer(examples['text'])
  if not tokenizer.is_fast:
    return encoded

  n_reviews = len(encoded["input_ids"])
  encoded['word_ids'] = [encoded.word_ids(idx) for idx in range(n_reviews)]
  return encoded

# Tokenize every split in batches; the raw 'text' and 'label' columns are dropped
# so only the tokenizer outputs remain.
tokenized_imdb_dataset = imdb_dataset.map(tokenize_function, batched = True, remove_columns = ['text', 'label'])
tokenized_imdb_dataset

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [11]:
# Longest sequence the model accepts. Reviews were tokenized without truncation, so some
# exceed it (hence the tokenizer warning); they are re-cut into fixed-size chunks below.
tokenizer.model_max_length


512

In [12]:
# Number of tokens per example after the reviews are concatenated and re-split
chunk_size = 128

In [13]:
def group_texts(examples, size=None):
  """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

  Args:
    examples: mapping from column name (e.g. 'input_ids', 'attention_mask',
      'word_ids') to a list of per-example lists. Must contain 'input_ids'.
    size: chunk length in tokens. Defaults to the notebook-level `chunk_size`.

  Returns:
    A dict with the same columns, each holding lists of exactly `size` items,
    plus a 'labels' column that is a copy of 'input_ids'. Trailing tokens that
    do not fill a whole chunk are dropped.

  Raises:
    ValueError: if the chunk length is not a positive integer.
  """
  from itertools import chain

  if size is None:
    size = chunk_size
  if size <= 0:
    raise ValueError(f"chunk size must be positive, got {size}")

  # Flatten each column in linear time; sum(lists, []) re-copies the
  # accumulated list at every step and is quadratic in the batch size.
  concatenated_examples = {
    k: list(chain.from_iterable(examples[k])) for k in examples.keys()
  }

  # Use the length of the first column, rounded down to a whole number of chunks
  first_column = next(iter(concatenated_examples.values()))
  total_length = (len(first_column) // size) * size

  # Split every column into consecutive chunks of `size` items
  result = {
    k: [t[i : i + size] for i in range(0, total_length, size)]
    for k, t in concatenated_examples.items()
  }

  # Labels start out as a copy of the inputs
  result["labels"] = result["input_ids"].copy()

  return result

In [14]:
# Apply the chunking batch by batch. Row counts change because each row is now a
# fixed-size chunk rather than one review; the leftover tokens of each batch are dropped.
lm_imdb_dataset = tokenized_imdb_dataset.map(group_texts, batched = True)
lm_imdb_dataset

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})



## Fine-tuning DistilBERT with the Trainer API


In [15]:
from transformers import DataCollatorForLanguageModeling

# Collator that builds masked-LM batches; mlm_probability is the fraction of tokens selected for masking.
datacollator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm_probability = 0.15)

In [31]:
# Draw a fixed-size subset of the chunked train split:
# 50,000 chunks for training and 20% of that (10,000) for evaluation.
train_size = 50000
test_size = int(0.2 * train_size)

# Fixed seed so the split is the same on every run
downsampled_imdb_dataset = lm_imdb_dataset["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
})

In [32]:
from transformers import TrainingArguments
# Per-device batch size, used for both training and evaluation
batch_size = 128
# 50000 // (128 * 4) = 97 steps between log entries
logging_steps = len(downsampled_imdb_dataset["train"]) // (batch_size * 4)
# Last path component of the checkpoint id; used to name the output directory
model_name = checkpoint.split("/")[-1]

In [35]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; needed because the training arguments set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [38]:
# Hyperparameters and settings for the training run.
# num_train_epochs is not set, so the library default applies (the run below reports 3 epochs).
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",  # local output directory
    overwrite_output_dir=True,
    # Evaluate periodically during training instead of only at the end.
    # NOTE(review): newer transformers releases appear to rename this to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    learning_rate=2e-5,
    weight_decay=0.05,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,  # upload results to the Hugging Face Hub (requires being logged in)
    fp16=True,  # mixed-precision training; NOTE(review): presumably requires a CUDA GPU — confirm for the target runtime
    logging_steps=logging_steps,)

In [39]:
# Trainer is the high-level Transformers interface for training, fine-tuning and evaluating models.
from transformers import Trainer
trainer = Trainer(
    model = model, # the pre-trained DistilBERT masked-LM loaded at the top of the notebook
    args = training_args, # hyperparameters and settings for the training process
    train_dataset=downsampled_imdb_dataset["train"], # 50,000-chunk training subset
    eval_dataset=downsampled_imdb_dataset["test"], # 10,000-chunk evaluation subset
    data_collator=datacollator, # masked-LM collator configured with mlm_probability=0.15
    tokenizer=tokenizer, )

In [40]:
# Baseline: perplexity of the pre-trained model on the evaluation subset, before fine-tuning.
# Perplexity is exp(eval loss), so lower is better.
# NOTE(review): the collator masks tokens at random, so this figure presumably varies between runs — confirm.
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [41]:
# Run fine-tuning; the log table reports training and validation loss every `logging_steps` steps.
trainer.train()

Step,Training Loss,Validation Loss
97,2.3758,2.326003
194,2.4041,2.302887
291,2.3842,2.289731


Step,Training Loss,Validation Loss
97,2.3758,2.326003
194,2.4041,2.302887
291,2.3842,2.289731
388,2.4085,2.292026
485,2.3689,2.289022
582,2.3637,2.285239
679,2.3612,2.265332
776,2.3557,2.265294
873,2.3535,2.268327
970,2.3431,2.263358


TrainOutput(global_step=1173, training_loss=2.3663244987286163, metrics={'train_runtime': 1025.4067, 'train_samples_per_second': 146.283, 'train_steps_per_second': 1.144, 'total_flos': 4971043353600000.0, 'train_loss': 2.3663244987286163, 'epoch': 3.0})

In [None]:
# Re-evaluate on the held-out subset now that fine-tuning has finished
eval_results = trainer.evaluate()

In [46]:
# Perplexity after fine-tuning, for comparison with the baseline computed before training
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")


9.694142205176487

## Result
Perplexity dropped from 22 to 9.69, which tells us the model has learned something about the domain of movie reviews!

In [25]:
# Upload the fine-tuned model weights and the training event logs to the Hugging Face Hub
trainer.push_to_hub()

events.out.tfevents.1704102396.43a849b10e5c.3971.1:   0%|          | 0.00/359 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

events.out.tfevents.1704101474.43a849b10e5c.3971.0:   0%|          | 0.00/6.00k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

'https://huggingface.co/soninimish/distilbert-base-uncased-finetuned-imdb/tree/main/'