In [2]:
import pandas as pd
from os.path import dirname, join
import numpy as np
from transformers import AutoModelForMaskedLM,  AutoTokenizer
from datasets import load_dataset, Dataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

import torch
# Seed torch's global random number generator.
torch.manual_seed(0)

# Pretrained checkpoint names; one of them is later handed to `from_pretrained`.
bert_model_tiny = 'google/bert_uncased_L-2_H-128_A-2'
bert_model_mini = 'google/bert_uncased_L-4_H-256_A-4'
bert_model_med = 'google/bert_uncased_L-8_H-512_A-8'
bert_model_base = 'google/bert_uncased_L-12_H-768_A-12'
longformer_model = 'allenai/longformer-base-4096'

# Short alias -> checkpoint name lookup.
models_dict = dict(
    mini=bert_model_mini,
    med=bert_model_med,
    tiny=bert_model_tiny,
    base=bert_model_base,
    longformer=longformer_model,
)


In [3]:
def truncate(x, n):
    """Keep the first element of ``x`` followed by its last ``n`` elements.

    Sequences that already fit in ``n + 1`` elements are returned unchanged
    (as a new list), so the first element is never duplicated.

    Args:
        x: Sequence to shorten (e.g. a list of token ids).
        n: Number of trailing elements to keep after the first one; must be
            non-negative.

    Returns:
        A new list with at most ``n + 1`` elements.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    size = len(x)
    # Short (or empty) input: nothing to cut. Without this guard the tail
    # slice would overlap the head and repeat x[0].
    if size <= n + 1:
        return list(x)
    # Slice from an explicit start index: x[-n:] would return the whole
    # sequence when n == 0.
    return [x[0]] + list(x[size - n:])

class TorchDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes raw strings on access.

    Each item is padded up to ``max_length``. Items that tokenize to more than
    ``max_length`` tokens are cut down by ``truncate``, which keeps the first
    token and the last ``max_length - 1`` tokens.

    Args:
        text: Indexable collection of strings.
        tokenizer: Tokenizer object exposing ``batch_encode_plus``.
        max_length: Number of positions in every returned tensor.
    """

    def __init__(self, text, tokenizer, max_length):
        self.text = text
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Tokenize ``self.text[idx]`` and return a dict of 1-D tensors.

        The dict has one entry per field returned by the tokenizer, each cut
        to ``max_length`` positions.
        """
        # truncation=False is deliberate: the tokenizer must not cut the
        # sequence itself, because `truncate` below keeps the tail instead.
        # padding='max_length' alone pads short items; the deprecated
        # `pad_to_max_length` flag is not needed alongside it.
        encoded = self.tokenizer.batch_encode_plus(
            [self.text[idx]],
            max_length=self.max_length,
            padding='max_length',
            truncation=False,
            return_special_tokens_mask=True,
        )

        # One slot is reserved for the first token, the rest for the tail.
        tail_size = self.max_length - 1
        # The batch holds a single item, hence rows[0].
        return {
            key: torch.tensor(truncate(rows[0], tail_size))
            for key, rows in encoded.items()
        }

    def __len__(self):
        """Return the number of raw text items."""
        return len(self.text)

In [7]:
# Pick the checkpoint by its short alias in `models_dict`.
bert_model_name = 'tiny'
max_len = 512
model_name = models_dict[bert_model_name]
bert_model = AutoModelForMaskedLM.from_pretrained(model_name)

# `model_max_length` is the supported name for the tokenizer's length limit;
# `max_len` is a deprecated alias for it. `truncation` / `padding` are
# encode-time options and are passed where the tokenizer is actually called
# (see TorchDataset.__getitem__), so they are not given here.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=max_len)

# Collator configured for masked-language-modeling with a 0.15 masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)




Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Three toy sentences of different lengths used to exercise the dataset.
text = [
    'My name is slim shade and I am an aspiring AI Engineer',
    'I am an aspiring AI Engineer',
    'My name is Slim',
]

In [12]:
# Wrap the toy sentences in the tokenizing dataset, 10 positions per item.
train_data = text
input_dataset = TorchDataset(text=train_data, tokenizer=tokenizer, max_length=10)


In [13]:
# Sample 0 tokenizes to more than 10 tokens: the first token plus the last 9 are kept.
input_dataset[0]

{'input_ids': tensor([  101,  8703,  1998,  1045,  2572,  2019, 22344,  9932,  3992,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'special_tokens_mask': tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 1])}

In [14]:
# Sample 1 is shorter than 10 tokens: it is padded at the end (attention_mask is 0 there).
input_dataset[1]

{'input_ids': tensor([  101,  1045,  2572,  2019, 22344,  9932,  3992,   102,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0]),
 'special_tokens_mask': tensor([1, 0, 0, 0, 0, 0, 0, 1, 1, 1])}

In [15]:
# Sample 2 is the shortest: the last four positions are padding.
input_dataset[2]

{'input_ids': tensor([  101,  2026,  2171,  2003, 11754,   102,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0]),
 'special_tokens_mask': tensor([1, 0, 0, 0, 0, 1, 1, 1, 1, 1])}

In [16]:
# Token ids only, for the sample that was cut down to 10 positions.
input_dataset[0]['input_ids']

tensor([  101,  8703,  1998,  1045,  2572,  2019, 22344,  9932,  3992,   102])

In [17]:
# Decode back to text to see which part of the sentence survived the cut.
tokenizer.decode(input_dataset[0]['input_ids'])

'[CLS] shade and i am an aspiring ai engineer [SEP]'

In [18]:
# Decode a padded sample; the padding positions show up as [PAD].
tokenizer.decode(input_dataset[2]['input_ids'])

'[CLS] my name is slim [SEP] [PAD] [PAD] [PAD] [PAD]'