In [1]:
import sys
# Put the parent directory on the import path so the project-local `src`
# package (imported below) resolves when the notebook is run from its own folder.
# This must stay ahead of the `src.data.dataio` import.
sys.path.append("../")
import transformers
# Lower the transformers logger to ERROR so informational/warning messages
# do not clutter the cell outputs.
transformers.logging.set_verbosity_error()

from functools import partial
from transformers import AutoTokenizer, AutoModelForMaskedLM
# Project-local data helpers: file listing, dataset wrapper and the two map functions used below.
from src.data.dataio import DataFiles, Dataset, remove_empty_fn, truncate_fn

In [3]:
# Hub checkpoint used for the tokenizer here and for the model weights later on.
PRETRAINED_MODEL = 'distilroberta-base'

# Source files are resolved from the listing in ../data/books.txt
# (presumably one URL per line -- confirm in DataFiles.from_url_file).
data_files = DataFiles.from_url_file(url_file="../data/books.txt")

# Stage 1: wrap the files and apply the empty-record filter.
dataset = Dataset(data_files).map(remove_empty_fn)

# Stage 2: cut every line into sub-lines with the tokenizer bound into truncate_fn.
# With max_seq_length=3 the sample output shows single-token pieces; presumably two
# of the three positions are reserved for special tokens -- confirm in truncate_fn.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=PRETRAINED_MODEL)
split_line_fn = partial(truncate_fn, tokenizer=tokenizer, max_seq_length=3, fill_to_max=True)
dataset = dataset.map(split_line_fn)

# Preview the first six records; range(6) bounds the loop, so the dataset
# is not advanced past the sixth item.
for position, record in zip(range(6), dataset):
    print(record)

Using custom data configuration default-6508e13455e6899d
Reusing dataset text (/home/c_spino/.cache/huggingface/datasets/text/default-6508e13455e6899d/0.0.0/0080d89f73ff0c9a11dfd854d463ea39d3cb8ed8a266110648767bd2b894d30d)
Loading cached processed dataset at /home/c_spino/.cache/huggingface/datasets/text/default-6508e13455e6899d/0.0.0/0080d89f73ff0c9a11dfd854d463ea39d3cb8ed8a266110648767bd2b894d30d/cache-100993ac6f5918fb.arrow


  0%|          | 0/50 [00:00<?, ?ba/s]

{'file_id': 0, 'line_id': 0, 'subline_id': 0, 'text': '�'}
{'file_id': 0, 'line_id': 0, 'subline_id': 1, 'text': '�'}
{'file_id': 0, 'line_id': 0, 'subline_id': 2, 'text': '�'}
{'file_id': 0, 'line_id': 0, 'subline_id': 3, 'text': 'The'}
{'file_id': 0, 'line_id': 0, 'subline_id': 4, 'text': ' Project'}
{'file_id': 0, 'line_id': 0, 'subline_id': 5, 'text': ' Gutenberg'}


In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers.data.data_collator import DataCollatorForLanguageModeling

# --- Masked-language-model fine-tuning --------------------------------------
# Seed torch so the random token masking and dropout are repeatable run to run.
torch.manual_seed(42)

# The collator pads a list of per-example features, selects tokens for masking
# with probability 0.25 (special tokens are excluded through the
# special_tokens_mask) and returns `labels` set to -100 everywhere except at
# the selected positions, so only those positions contribute to the loss.
collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.25)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path=PRETRAINED_MODEL)
model.to(device)
model.train()

loader = DataLoader(dataset, batch_size=4)

# NOTE(review): transformers.AdamW is deprecated upstream in favour of
# torch.optim.AdamW; kept as-is here because the two differ in their default
# eps / weight_decay -- TODO migrate with explicit hyper-parameters.
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(1):
    for i, batch in enumerate(loader):
        # Tokenize without padding or tensor conversion: the collator does both.
        encoded = tokenizer(batch["text"], truncation=True, return_special_tokens_mask=True)

        # Re-shape the column-wise encoding into one feature dict per example,
        # which is the input format the collator expects.
        features = [
            {key: column[row] for key, column in encoded.items()}
            for row in range(len(batch["text"]))
        ]

        # Yields input_ids (with masks applied), attention_mask and labels.
        mlm_batch = collator(features)

        # When the random draw masks nothing, every label is -100 and the
        # mean loss would be NaN -- skip such batches.
        if not (mlm_batch["labels"] != -100).any():
            continue

        # Move to the model's device only after collation; the collator
        # builds new CPU tensors.
        mlm_batch = {key: value.to(device) for key, value in mlm_batch.items()}

        optim.zero_grad()
        outputs = model(**mlm_batch)
        loss = outputs.loss
        print(f"epoch {epoch} step {i}: loss = {loss.item():.4f}")
        loss.backward()
        optim.step()

# Leave the model in inference mode (dropout off) for the cells that follow.
model.eval()

In [None]:
import torch
# Fill-mask sanity check: take the most likely token at every position of the
# prompt (including the two <mask> slots) and decode the result back to text.
model.eval()  # dropout off, so repeated runs give the same prediction

prompt_inputs = tokenizer.batch_encode_plus(["Montreal is a <mask> city, but Toronto is <mask>."], return_tensors="pt")
# The inputs must live on the same device as the model, which may be on the GPU.
prompt_inputs = prompt_inputs.to(model.device)

# No gradients are needed for inference.
with torch.no_grad():
    logits = model(**prompt_inputs)["logits"]

# argmax over the vocabulary axis; log_softmax is monotonic, so applying it
# first would not change which token wins.
out = torch.argmax(logits, dim=-1).cpu()
tokenizer.batch_decode(out)