In [1]:
from transformers import BertTokenizer,BertForMaskedLM
import torch

In [2]:
text = ("After Abraham Lincoln won the November 1860 presidential "
        "election on an anti-slavery platform, an initial seven "
        "slave states declared their secession from the country "
        "to form the Confederacy. War broke out in April 1861 "
        "when secessionist forces attacked Fort Sumter in South "
        "Carolina, just over a month after Lincoln's "
        "inauguration.")

In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
inputs = tokenizer(text,return_tensors="pt")

In [5]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [6]:
inputs["labels"] = inputs.input_ids.detach().clone()

In [7]:
inputs.input_ids.shape[1]

62

In [8]:
rand = torch.rand(inputs.input_ids.shape)
mask_arr = rand<.15

In [9]:
mask_arr

tensor([[False, False,  True, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False,  True, False,  True,  True, False, False, False, False,
         False, False, False, False,  True,  True,  True,  True, False, False,
         False, False]])

In [10]:
mask = mask_array = rand<.15 * (inputs.input_ids!=101) * (inputs.input_ids!=101)

In [11]:
mask

tensor([[False, False,  True, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False,  True, False,  True,  True, False, False, False, False,
         False, False, False, False,  True,  True,  True,  True, False, False,
         False, False]])

In [12]:
selection = torch.flatten(mask[0].nonzero()).tolist()

In [13]:
selection

[2, 42, 44, 45, 54, 55, 56, 57]

In [14]:
torch.flatten(mask[0].nonzero())

tensor([ 2, 42, 44, 45, 54, 55, 56, 57])

In [15]:
inputs.input_ids [0,selection] = 103

In [16]:
inputs.input_ids

tensor([[  101,  2044,   103,  5367,  2180,  1996,  2281,  7313,  4883,  2602,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,  3988,  2698,
          6658,  2163,  4161,  2037, 22965,  2013,  1996,  2406,  2000,  2433,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,   103,  4457,   103,   103,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,   103,   103,   103,   103,  1055, 17331,
          1012,   102]])

In [17]:
outputs = model(**inputs)

In [18]:
outputs.loss

tensor(0.7246, grad_fn=<NllLossBackward0>)

In [19]:
outputs.keys()

odict_keys(['loss', 'logits'])

In [20]:
outputs.logits.shape

torch.Size([1, 62, 30522])

In [21]:
torch.flatten((inputs.input_ids[0]==103).nonzero()).tolist()

[2, 42, 44, 45, 54, 55, 56, 57]

### masked language training

In [22]:
from transformers import BertTokenizer,BertForMaskedLM
import torch

In [23]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
with open("./clean.txt","r") as fp:
    text = fp.read().split("\n")
    

In [3]:
text[0:5]

['From my grandfather Verus I learned good morals and the government of my temper.',
 'From the reputation and remembrance of my father, modesty and a manly character.',
 'From my mother, piety and beneficence, and abstinence, not only from evil deeds, but even from evil thoughts; and further, simplicity in my way of living, far removed from the habits of the rich.',
 'From my great-grandfather, not to have frequented public schools, and to have had good teachers at home, and to know that on such things a man should spend liberally.',
 "From my governor, to be neither of the green nor of the blue party at the games in the Circus, nor a partizan either of the Parmularius or the Scutarius at the gladiators' fights; from him too I learned endurance of labour, and to want little, and to work with my own hands, and not to meddle with other people's affairs, and not to be ready to listen to slander."]

In [26]:
inputs = tokenizer(text,return_tensors="pt",max_length=512,truncation=True,padding="max_length")

In [27]:
inputs["labels"] = inputs.input_ids.detach().clone()

In [28]:
inputs.input_ids.shape

torch.Size([507, 512])

In [29]:
rand = torch.rand(inputs.input_ids.shape)

In [30]:
mask_arr = rand<.15

In [31]:
mask_arr = mask_arr *(inputs.input_ids!=101) *(inputs.input_ids!=103) * (inputs.input_ids!=0)

In [32]:
mask_arr.shape

torch.Size([507, 512])

In [33]:
selection = []
for i in range(inputs.input_ids.shape[0]):
    selection.append(torch.flatten(mask_arr[i].nonzero()).tolist())
    

In [34]:
selection[0]

[7, 11]

In [35]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i,selection[i]]=103
    

In [36]:
inputs.input_ids

tensor([[  101,  2013,  2026,  ...,     0,     0,     0],
        [  101,   103,  1996,  ...,     0,     0,     0],
        [  101,  2013,  2026,  ...,     0,     0,     0],
        ...,
        [  101,  3459,   103,  ...,     0,     0,     0],
        [  101,  2043, 15223,  ...,     0,     0,     0],
        [  101,  7887,  3288,  ...,     0,     0,     0]])

In [37]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [38]:
class MediattaionDataset(Dataset):
    def __init__(self,encodings):
        self.encodings = encodings
    def __getitem__(self,idx):
        return {key: torch.tensor(value[idx]) for key,value in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [39]:
dataset = MediattaionDataset(inputs)

In [40]:
# dataset[0]

In [41]:
loader = DataLoader(dataset,batch_size=16,shuffle=True)

In [42]:
device  = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [43]:
device


device(type='cpu')

In [44]:
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [45]:
from transformers import AdamW
# activate train mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(),lr=5e-5)



In [46]:
len(text)/16

31.6875

In [47]:
batch_size=16

In [48]:
from tqdm import tqdm  # for our progress bar

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())


  return {key: torch.tensor(value[idx]) for key,value in self.encodings.items()}
Epoch 0: 100%|██████████████████████| 32/32 [16:44<00:00, 31.40s/it, loss=0.394]
Epoch 1: 100%|███████████████████| 32/32 [1:06:58<00:00, 125.59s/it, loss=0.142]


### training with Trainer API

In [1]:
from transformers import BertTokenizer,BertForMaskedLM
import torch

In [4]:
text[0:1]

['From my grandfather Verus I learned good morals and the government of my temper.']

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
inputs = tokenizer(text,truncation=True,max_length=512,padding="max_length",return_tensors="pt")

In [7]:
inputs["labels"] = inputs.input_ids.detach().clone()

In [8]:
rand_arr = torch.rand(inputs["input_ids"].shape)
mask_arr = (inputs.input_ids !=101) * (inputs.input_ids !=102) *(inputs.input_ids !=0) * (rand_arr<.15)

In [9]:
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

In [10]:
selection = []
for i in range(inputs.input_ids.shape[0]):
    selection.append(torch.flatten(mask_arr[i].nonzero()).tolist())
    

In [11]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i,selection[i]]=103

In [12]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [13]:
from torch.utils.data import Dataset
class Meditation(Dataset):
    def __init__(self,encodings):
        self.encodings = encodings
    def __getitem__(self,idx):
        return{key :torch.tensor(value[idx]) for key,value in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [14]:
Dataset = Meditation(inputs)

In [15]:
from transformers import TrainingArguments

args = TrainingArguments(output_dir = "out",
                        per_device_train_batch_size=16,
                        num_train_epochs=2)

In [16]:
from transformers import Trainer


In [17]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=Dataset
)

In [19]:
trainer.train()