In [5]:
from transformers import AutoModelForMaskedLM
# Name of the pretrained checkpoint; the same name is used later to load the tokenizer.
checkpoint = 'distilbert-base-uncased'
# Load the checkpoint through the masked-language-modelling auto class.
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [6]:
# Display the parameter count reported by the loaded model.
model.num_parameters()

66985530

In [7]:
# Example sentence containing a single [MASK] placeholder for the model to fill in.
sample_text = "This is a great [MASK]."

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint name that was used for the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
import torch

# Encode the sample sentence as PyTorch tensors.
inputs = tokenizer(sample_text, return_tensors = 'pt')

# Inference only: disable autograd so the forward pass does not build a
# computation graph and the resulting logits do not hold on to one.
with torch.no_grad():
    sample_logits = model(**inputs).logits

In [27]:
# Locate the column index of each [MASK] token, then slice out the logits at that position.
is_mask = inputs["input_ids"] == tokenizer.mask_token_id
mask_token_index = torch.where(is_mask)[1]
mask_token_logits = sample_logits[0, mask_token_index, :]

# Keep the ten vocabulary ids with the largest logits for the first [MASK].
top_10 = torch.topk(mask_token_logits, 10, dim=1)
top_10_tokens = top_10.indices[0].tolist()

# Decode each candidate id on its own so it can be shown as a word.
token_output_list = [tokenizer.decode([token]) for token in top_10_tokens]

print(f"Top 10 tokens are {top_10_tokens} and corresponding words {token_output_list}")

# Print the sample sentence once per candidate, with the mask token replaced by that word.
for token, word in zip(top_10_tokens, token_output_list):
    print(f"'>>> {sample_text.replace(tokenizer.mask_token, word)}'")

Top 10 tokens are [3066, 3112, 6172, 2801, 8658, 6707, 9467, 18457, 6344, 16507] and corresponding words ['deal', 'success', 'adventure', 'idea', 'feat', 'mistake', 'shame', 'undertaking', 'achievement', 'coincidence']
'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'
'>>> This is a great mistake.'
'>>> This is a great shame.'
'>>> This is a great undertaking.'
'>>> This is a great achievement.'
'>>> This is a great coincidence.'


## Dataset

In [29]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [30]:
from datasets import load_dataset

# Load the "imdb" dataset; the progress output shows train, test and unsupervised splits being generated.
imdb_dataset = load_dataset("imdb")

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating unsupervised split: 0 examples [00:00, ? examples/s]

In [33]:
# Display the loaded splits with their feature names and row counts.
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [34]:
def tokenize_function(examples):
  """Tokenize the `text` column of a batch.

  When the tokenizer is a fast tokenizer, a `word_ids` entry is added that
  holds, for every example in the batch, the result of `word_ids(i)` on the
  batch encoding.

  Args:
    examples: Batch mapping that provides a 'text' entry.

  Returns:
    The tokenizer output for the batch, with 'word_ids' added when available.
  """
  encoded = tokenizer(examples['text'])
  if not tokenizer.is_fast:
    # Without a fast tokenizer there is nothing more to attach.
    return encoded

  encoded['word_ids'] = [encoded.word_ids(idx) for idx, _ in enumerate(encoded["input_ids"])]
  return encoded

# Tokenize every split in batches and drop the raw columns so that only the
# tokenizer outputs remain.
tokenized_imdb_dataset = imdb_dataset.map(
  tokenize_function,
  batched=True,
  remove_columns=['text', 'label'],
)
tokenized_imdb_dataset

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [40]:
# Display the maximum sequence length reported by the tokenizer.
tokenizer.model_max_length


512

In [41]:
# Length, in tokens, of each chunk that group_texts produces.
chunk_size = 128

In [63]:
def group_texts(examples, block_size=None):
  """Concatenate every column of a batch and re-split it into fixed-size chunks.

  Args:
    examples: Mapping of column name -> list of per-example lists, as passed
      by `Dataset.map(..., batched=True)`. The flattened length of the first
      column determines how many chunks are produced for every column.
    block_size: Length of each output chunk. When omitted, the notebook-level
      `chunk_size` is used, so existing `map(group_texts, ...)` calls behave
      exactly as before.

  Returns:
    dict with the same columns, each a list of `block_size`-long lists, plus a
    `labels` column that is a copy of the chunked `input_ids`. A trailing
    remainder shorter than `block_size` is dropped.
  """
  from itertools import chain

  if block_size is None:
    block_size = chunk_size

  # Flatten each column in linear time. `sum(list_of_lists, [])` rebuilds the
  # accumulator on every addition, which is quadratic in the batch length.
  concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}

  total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])

  # Round down to a whole number of chunks; the leftover tail is discarded.
  total_length = (total_length // block_size) * block_size

  # Split every column into consecutive chunks of block_size
  result = {
    k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
    for k, t in concatenated_examples.items()
  }

  # Create a new labels column
  result["labels"] = result["input_ids"].copy()

  return result

In [64]:
# Regroup the tokenized examples of every split into fixed-length chunks.
lm_imdb_dataset = tokenized_imdb_dataset.map(
  group_texts,
  batched=True,
)
lm_imdb_dataset

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [62]:
# Print the summary of each split, followed by a blank line.
# NOTE(review): the recorded output of this cell has no `labels` column and its
# execution count precedes the cell that builds lm_imdb_dataset, so it appears
# to come from an earlier run -- re-run the notebook top to bottom to refresh it.
for split in lm_imdb_dataset.values():
  print(split,'\n')

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 7846783
}) 

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 7669275
}) 

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 15742051
}) 

