In [3]:
!pip install transformers
!pip install torch



In [None]:
# Fine-tuning a pretrained language model on in-domain data is usually called domain adaptation.

In [2]:
from transformers import AutoModelForMaskedLM

# Checkpoint identifier; reused by later cells to load the matching tokenizer.
model_checkpoint = 'distilbert-base-uncased'
# Load the checkpoint with a masked-language-modeling head.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Report the model size in millions of parameters, next to the
# hard-coded BERT figure (110M) for comparison.
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(
    f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'",
    "'>>> BERT number of parameters: 110M'",
    sep="\n",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [5]:
from transformers import AutoTokenizer
import torch

# Fill-mask demo: ask the model for the five most likely replacements
# of the [MASK] token in a sample sentence.
text = 'This is a great [MASK]'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize the sentence into PyTorch tensors and run the model.
inputs = tokenizer(text, return_tensors='pt')
# Inference only: disable autograd so no computation graph is built
# for the forward pass (the resulting logits are unchanged).
with torch.no_grad():
    token_logits = model(**inputs).logits

# Find the location of [MASK] and extract its logits.
# torch.where returns one index tensor per dimension; [1] selects the
# positions along the second dimension of `input_ids`.
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits. Only the first
# row is used, i.e. the first [MASK] position found in the input.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Show the sentence with each candidate substituted for the mask token.
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")


'>>> This is a great !'
'>>> This is a great .'
'>>> This is a great deal'
'>>> This is a great adventure'
'>>> This is a great ;'


In [None]:
# For both auto-regressive and masked language modeling, a common preprocessing step is to concatenate all the examples
# and then split the whole corpus into chunks of equal size.
