<a href="https://colab.research.google.com/github/paulxiong/tinyTF/blob/main/MLM_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

manipulate a simple dataset, use it to train a MLM, don't use pre-trained mode, I just want to learn how it will be coded.
*italicized text*

In [1]:
import numpy as np
from transformers import BertTokenizer, BertForMaskedLM
import torch
from torch.utils.data import Dataset, DataLoader


In [2]:
sentences = [
    "I love to eat apples.",
    "She went to the park.",
    "The cat is sleeping on the mat.",
    "He plays the guitar.",
    "The book is on the table."
]


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in sentences]


In [4]:
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_sentences]

# Add special tokens and padding
max_length = max(len(ids) for ids in input_ids)
input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in input_ids]
attention_masks = [[1] * len(ids) + [0] * (max_length - len(ids)) for ids in input_ids]
token_type_ids = [[0] * max_length for _ in input_ids]

# Convert the lists to PyTorch tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
token_type_ids = torch.tensor(token_type_ids)


In [5]:
class MyDataset(Dataset):
    def __init__(self, input_ids, attention_masks, token_type_ids):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.token_type_ids = token_type_ids

    def __getitem__(self, index):
        return {
            'input_ids': self.input_ids[index],
            'attention_mask': self.attention_masks[index],
            'token_type_ids': self.token_type_ids[index]
        }

    def __len__(self):
        return len(self.input_ids)


In [6]:
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_sentences]

# Add special tokens and padding
max_length = max(len(ids) for ids in input_ids)
input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in input_ids]
attention_masks = [[1] * len(ids) + [0] * (max_length - len(ids)) for ids in input_ids]
token_type_ids = [[0] * max_length for _ in input_ids]

# Convert the lists to PyTorch tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
token_type_ids = torch.tensor(token_type_ids)


In [7]:
dataset = MyDataset(input_ids, attention_masks, token_type_ids)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Move the model to the device
model.to(device)

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()
# Train the model
num_epochs = 5
for epoch in range(num_epochs):
    for batch in dataloader:
        # Move batch to device (CPU or GPU)
        batch = {k: v.to(device) for k, v in batch.items()}

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            token_type_ids=batch['token_type_ids']
        )
        predictions = outputs.logits

        # Flatten the predictions and labels
        predictions = predictions.view(-1, tokenizer.vocab_size)
        labels = batch['input_ids'].view(-1)

        # Move the labels to the device
        labels = labels.to(device)

        # Calculate the loss
        loss = criterion(predictions, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Set the model to evaluation mode
model.eval()

# Example input sentence
input_sentence = "I like to eat"

# Tokenize the input sentence
input_tokens = tokenizer.tokenize(input_sentence)
input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
input_ids = torch.tensor([input_ids]).to(device)

# Generate predictions
with torch.no_grad():
    outputs = model(input_ids=input_ids)

predictions = outputs.logits
predicted_token_ids = torch.argmax(predictions, dim=-1).squeeze().tolist()

# Convert predicted token IDs back to tokens
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)

# Print the predicted tokens
predicted_sentence = " ".join(predicted_tokens)
print("Input: ", input_sentence)
print("Predicted: ", predicted_sentence)


Input:  I book
Predicted:  exposed ##ith


In [10]:
#!pip install transformers