In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m11.8

# Masked Language Modeling (MLM) as in BERT

For MLM, we have started experimenting with different mask-token ratios to determine their impact on the model's performance. We have trained multiple models with varying ratios and are in the process of evaluating their performance on downstream tasks. We have also started exploring other self-supervised learning methods, such as denoising autoencoders and Contrastive Predictive Coding, to compare their effectiveness with MLM. Can you implement code to do this?

In [None]:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import GPT2Model, GPT2Tokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset

# Your existing MLM class definition here




class MLM(nn.Module):
    """GPT-2 backbone with a linear vocabulary head and built-in token masking.

    While the module is in training mode, ``forward`` corrupts its input with
    ``mask_input`` before running the transformer. In eval mode the input is
    passed through untouched so that inference sees exactly what the caller
    provided.

    NOTE(review): if the caller already masks batches (e.g. with a masking data
    collator), training-mode inputs are corrupted twice; the extra corruption
    is not reflected in the caller's labels.

    Args:
        model_name: Name or path handed to ``GPT2Tokenizer.from_pretrained`` and
            ``GPT2Model.from_pretrained``.
        vocab_size: Output width of the prediction head.
        mask_prob: Probability with which each non-mask token is selected for
            corruption. Defaults to 0.15.

    Raises:
        ValueError: If ``mask_prob`` is outside ``[0, 1]``.
    """

    def __init__(self, model_name, vocab_size, mask_prob=0.15):
        super().__init__()
        if not 0.0 <= mask_prob <= 1.0:
            raise ValueError(f"mask_prob must be in [0, 1], got {mask_prob}")
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        # The GPT-2 tokenizer is given EOS as its mask token here.
        self.tokenizer.mask_token = self.tokenizer.eos_token
        self.transformer = GPT2Model.from_pretrained(model_name)
        self.fc = nn.Linear(self.transformer.config.hidden_size, vocab_size)
        self.mask_token_id = self.tokenizer.mask_token_id
        self.mask_prob = mask_prob

    def forward(self, inputs):
        """Return vocabulary logits for ``inputs`` (an integer tensor of token ids).

        Random masking is applied only in training mode; in eval mode the ids
        reach the transformer unchanged.
        """
        if self.training:
            inputs = self.mask_input(inputs)
        # Element 0 of the transformer output holds the hidden states fed to the head.
        hidden_states = self.transformer(inputs)[0]
        return self.fc(hidden_states)

    def mask_input(self, inputs):
        """Return a corrupted copy of ``inputs``; the argument itself is not modified.

        Each token that is not already the mask token is selected with
        probability ``self.mask_prob``. Of the selected tokens, roughly 80% are
        replaced by the mask token and the rest by a random id drawn from
        ``[0, tokenizer.vocab_size)``. There is no "keep unchanged" share.
        """
        masked_inputs = inputs.clone()
        selected = (torch.rand_like(inputs, dtype=torch.float) < self.mask_prob) & (
            inputs != self.mask_token_id
        )
        replace_with_mask = (torch.rand_like(inputs, dtype=torch.float) < 0.8) & selected
        replace_with_random = selected & ~replace_with_mask

        masked_inputs[replace_with_mask] = self.mask_token_id
        # torch.randint needs a tuple of Python ints for `size`, and the random
        # ids must live on the same device as the tensor they are written into.
        num_random = int(replace_with_random.sum())
        masked_inputs[replace_with_random] = torch.randint(
            low=0,
            high=self.tokenizer.vocab_size,
            size=(num_random,),
            dtype=torch.long,
            device=inputs.device,
        )
        return masked_inputs


def evaluate(model, tokenizer, text):
    """Return the top-5 predicted token ids for every mask token in ``text``.

    The model is switched to eval mode for the forward pass (so dropout and
    any training-only behaviour are disabled) and restored to its previous
    mode afterwards. The encoded text is moved to the model's device.

    Args:
        model: Module mapping a ``(1, seq_len)`` id tensor to per-position logits.
        tokenizer: Tokenizer providing ``encode`` and ``mask_token_id``.
        text: Input string containing one or more mask tokens.

    Returns:
        CPU tensor of token ids: shape ``(5,)`` when the text contains exactly
        one mask token, ``(num_masks, 5)`` otherwise (the result is squeezed).

    Raises:
        ValueError: If the tokenizer has no mask token configured.
    """
    mask_id = tokenizer.mask_token_id
    if mask_id is None:
        raise ValueError(
            "tokenizer has no mask token; set tokenizer.mask_token before calling evaluate()"
        )

    input_ids = tokenizer.encode(text, return_tensors="pt")
    # Run on whatever device the model lives on (it may have been moved to GPU
    # by training); a parameter-free model is left on the CPU default.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
    mask_token_index = torch.where(input_ids == mask_id)[1]

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_ids)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    probabilities = torch.softmax(logits[0, mask_token_index], dim=1)
    top_5 = torch.topk(probabilities, 5, dim=1).indices.squeeze()
    return top_5.cpu()


def train(model, train_loader, optimizer, device, num_epochs=1):
    """Train ``model`` with cross-entropy on batches of ``input_ids`` / ``labels``.

    Args:
        model: Module mapping an id tensor to logits whose last dimension is
            the number of classes.
        train_loader: Iterable of dict batches with ``"input_ids"`` and
            ``"labels"`` tensors. Label positions equal to the loss's
            ``ignore_index`` (-100 by default) do not contribute to the loss.
        optimizer: Optimizer over the model's parameters.
        device: Device the model and each batch are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with the mean loss of each epoch. An epoch in which no batch
        produced a loss is reported as ``nan``.
    """
    model.train()
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    epoch_losses = []

    for epoch in range(num_epochs):
        running_loss = 0.0
        num_steps = 0

        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            # A batch with every label ignored yields a NaN mean loss; skip it
            # instead of stepping the optimizer on it.
            if not (labels != criterion.ignore_index).any():
                continue

            optimizer.zero_grad()
            logits = model(input_ids)
            loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_steps += 1

        # Report the epoch average rather than whichever batch happened to be last.
        mean_loss = running_loss / num_steps if num_steps else float("nan")
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {mean_loss}")

    return epoch_losses

# Seed PyTorch so the random masking and weight initialisation are repeatable.
torch.manual_seed(42)

# Load the WikiText-103 dataset
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")
# Drop rows whose text is empty or whitespace-only: they would tokenize to pure
# padding and give the collator nothing to mask.
train_dataset = dataset["train"].filter(lambda example: example["text"].strip() != "")

# Tokenize the dataset
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Configure the special tokens once, in the parent process. With num_proc > 1
# the map function runs in worker processes, so assignments made inside it do
# not reach the tokenizer instance the data collator uses below. The collator
# needs both a padding token and a mask token; EOS is reused for both, the same
# choice the MLM class makes for its mask token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.mask_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of rows to fixed-length (128) padded sequences."""
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
        return_special_tokens_mask=True,
    )


train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
train_dataset.set_format(type="torch", columns=["input_ids", "special_tokens_mask"])

# Create a DataLoader
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
train_loader = DataLoader(train_dataset, batch_size=16, collate_fn=data_collator)

# Initialize the custom GPT-2 MLM model
model_name = "gpt2"
gpt2_mlm = MLM(model_name, tokenizer.vocab_size)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = torch.optim.Adam(gpt2_mlm.parameters(), lr=1e-4)
train(gpt2_mlm, train_loader, optimizer, device, num_epochs=1)

# Save the trained model
torch.save(gpt2_mlm.state_dict(), "gpt2_mlm_trained.pth")



  0%|          | 0/3 [00:00<?, ?it/s]

Using mask_token, but it is not set yet.


ValueError: ignored

tensor(7.3700, grad_fn=<NllLossBackward0>)