In [2]:
import sys
# Add the parent directory to the import path; the `src` package imported
# below is presumably located there -- this line must run before that import.
sys.path.append("../")
import transformers
# Restrict transformers' logging to error-level messages.
transformers.logging.set_verbosity_error()

from functools import partial
from transformers import AutoTokenizer, AutoModelForMaskedLM
from src.data.dataio import DataFiles, Dataset, remove_empty_fn, truncate_fn

In [3]:
# Checkpoint name used for the tokenizer here and for the model in later cells.
PRETRAINED_MODEL = 'distilroberta-base'

# Build the dataset from the files listed in ../data/books.txt, then apply
# remove_empty_fn (presumably drops blank lines -- TODO confirm in src.data.dataio).
data_files = DataFiles.from_url_file(url_file="../data/books.txt")
dataset = Dataset(data_files).map(remove_empty_fn)

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=PRETRAINED_MODEL)

# Second mapping pass: truncate_fn bound to this tokenizer and its settings.
# NOTE(review): max_seq_length=3 looks very small; confirm its unit in truncate_fn.
truncate = partial(truncate_fn, tokenizer=tokenizer, max_seq_length=3, fill_to_max=True)
dataset = dataset.map(truncate)

# Preview the first six records; zip() stops once range(6) is exhausted.
for _, record in zip(range(6), dataset):
    print(record)

Using custom data configuration default-6508e13455e6899d
Reusing dataset text (/Users/od/.cache/huggingface/datasets/text/default-6508e13455e6899d/0.0.0/0080d89f73ff0c9a11dfd854d463ea39d3cb8ed8a266110648767bd2b894d30d)
Loading cached processed dataset at /Users/od/.cache/huggingface/datasets/text/default-6508e13455e6899d/0.0.0/0080d89f73ff0c9a11dfd854d463ea39d3cb8ed8a266110648767bd2b894d30d/cache-5c9c1703635afe71.arrow
Loading cached processed dataset at /Users/od/.cache/huggingface/datasets/text/default-6508e13455e6899d/0.0.0/0080d89f73ff0c9a11dfd854d463ea39d3cb8ed8a266110648767bd2b894d30d/cache-d52af9fa55cc70fb.arrow


{'file_id': 0, 'line_id': 0, 'subline_id': 0, 'text': '\ufeffThe Project Gutenberg EBook of A Christmas Carol, by Charles Dickens'}
{'file_id': 0, 'line_id': 2, 'subline_id': 0, 'text': 'This eBook is for the use of anyone anywhere at no cost and with'}
{'file_id': 0, 'line_id': 3, 'subline_id': 0, 'text': 'almost no restrictions whatsoever.  You may copy it, give it away or'}
{'file_id': 0, 'line_id': 4, 'subline_id': 0, 'text': 're-use it under the terms of the Project Gutenberg License included'}
{'file_id': 0, 'line_id': 5, 'subline_id': 0, 'text': 'with this eBook or online at www.gutenberg.org'}
{'file_id': 0, 'line_id': 8, 'subline_id': 0, 'text': 'Title: A Christmas Carol</s>Author: Charles Dickens'}


In [4]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers.data.data_collator import DataCollatorForLanguageModeling

# Masked-language-model collator; it is only referenced by the disabled
# training loop further down.
collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.25)

# Prefer the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path=PRETRAINED_MODEL)
model.to(device)
model.train()

loader = DataLoader(dataset, batch_size=4)

# torch.optim.AdamW is used because transformers.AdamW is deprecated (and
# removed in recent transformers releases). eps and weight_decay are pinned to
# the values the transformers implementation used by default, so the update
# rule stays the same.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Fine-tuning loop, currently disabled.
# NOTE(review): `labels` is taken from the un-masked input_ids before the
# collator runs, and the collator's own labels are ignored -- confirm this is
# intended before re-enabling.
# for epoch in range(1):
#     for i, batch in enumerate(loader):
#         optim.zero_grad()
#         batch = tokenizer(batch["text"], truncation=True, padding=True, return_special_tokens_mask=True, return_tensors="pt")
#         batch = batch.to(device)
#         attention_mask = batch["attention_mask"]
#         labels = batch['input_ids']
#
#         batch = collator(features=(batch,))
#         input_ids = batch["input_ids"].squeeze(0)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         print(loss)
#         loss.backward()
#         optim.step()

# Leave the model in inference mode: later cells only run predictions, and
# staying in train mode would keep dropout active and make them non-deterministic.
# The trailing semicolon suppresses the model repr in the cell output.
model.eval();

In [45]:
import copy
import random
from abc import abstractmethod
from typing import List, Tuple
from collections import Counter


In [None]:
from src.models.masking import RandomMask, LengthBasedMask

# Word-level masking demo. Each `mask` callable is given a batch holding one
# whitespace-tokenised sentence (a list containing one list of words).
# NOTE(review): the constructor arguments are presumably a masking ratio, an
# optional strategy and the mask token -- confirm against src.models.masking.
randomMaskInstance = RandomMask(0.5, '<mask>')
randomMask = randomMaskInstance.mask

# Show the first six records before and after random masking.
for _, record in zip(range(6), dataset):
    input_string = [record['text'].split()]
    print(input_string)
    print(randomMask(input_string))

lengthMaskInstance = LengthBasedMask(0.5, 'all', '<mask>')
lengthMask = lengthMaskInstance.mask

# Same preview using the length-based masker.
for _, record in zip(range(6), dataset):
    input_string = [record['text'].split()]
    print(input_string)
    print(lengthMask(input_string))

In [58]:
import torch

# Mask the first six records with the length-based masker and let the model
# fill in every position with its highest-scoring token.

# Inference only: switch off dropout so predictions are deterministic.
model.eval()

for _, x in zip(range(6), dataset):
    input_string = [x['text'].split()]
    print('input_string is', input_string)
    # The masker works on a batch; take the single sentence back out.
    masked_tokens = lengthMask(input_string)[0]
    print('masked_tokens is', masked_tokens)
    masked_sentence = ' '.join(masked_tokens)
    print('masked_sentence is', masked_sentence)

    # Move the encoded inputs to the model's device, otherwise the forward
    # pass fails when the model lives on the GPU.
    encoded = tokenizer([masked_sentence], return_tensors="pt").to(model.device)
    with torch.no_grad():  # no gradients are needed for prediction
        logits = model(**encoded).logits
    # argmax over the vocabulary axis; a softmax beforehand would not change it.
    out = logits.argmax(dim=-1).cpu()
    print('output is', tokenizer.batch_decode(out))

input_string is [['\ufeffThe', 'Project', 'Gutenberg', 'EBook', 'of', 'A', 'Christmas', 'Carol,', 'by', 'Charles', 'Dickens']]
masked_tokens is ['\ufeffThe', '<mask>', '<mask>', 'EBook', 'of', 'A', '<mask>', 'Carol,', 'by', '<mask>', '<mask>']
masked_sentence is ﻿The <mask> <mask> EBook of A <mask> Carol, by <mask> <mask>
output is ['<s>\ufeffThe Best Illustrated EBook of A Christmas Carol, by Karen Robinson</s>']
input_string is [['This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with']]
masked_tokens is ['<mask>', '<mask>', 'is', 'for', 'the', 'use', 'of', '<mask>', '<mask>', 'at', 'no', '<mask>', '<mask>', '<mask>']
masked_sentence is <mask> <mask> is for the use of <mask> <mask> at no <mask> <mask> <mask>
output is ['<s>This article is for the use of Adobe links at no time cost.</s>']
input_string is [['almost', 'no', 'restrictions', 'whatsoever.', 'You', 'may', 'copy', 'it,', 'give', 'it', 'away', 'or']]
masked_tokens is ['<mask>',

In [5]:
import torch
# Fill the <mask> slots of one example sentence with the model's top prediction.
model.eval()  # inference only: disable dropout so the result is deterministic
# Inputs must live on the same device as the model.
encoded = tokenizer(["Montreal is a <mask> city, but Toronto is <mask>."], return_tensors="pt").to(model.device)
with torch.no_grad():  # no gradients are needed for prediction
    # argmax over the vocabulary axis; a softmax beforehand would not change it.
    out = model(**encoded).logits.argmax(dim=-1).cpu()
tokenizer.batch_decode(out)

['<s>Montreal is a beautiful city, but Toronto is unique.</s>']