<a href="https://colab.research.google.com/github/odunayo12/bibleBert/blob/main/tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# !pip install transformers

In [None]:
# !git clone https://github.com/odunayo12/bibleBert.git

In [None]:
from transformers import RobertaTokenizerFast,RobertaConfig, RobertaForMaskedLM, AdamW
from pathlib import Path
import os
from tokenizers import ByteLevelBPETokenizer
# from tokenizer import *
from tqdm.auto import tqdm
from pathlib import Path
import torch
import numpy as np

In [None]:
# import all text files from data folder

if not os.path.exists('/content/txt_data'):
    os.mkdir('/content/txt_data')

paths = [str(f) for f in Path(r"/content/txt_data").glob('*.txt')]

In [None]:
tokenizer = ByteLevelBPETokenizer()
# pre-train the model
tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,
                special_tokens=['<s>', '<pad>',
                                '</s>', '<unk>', '<mask>'])

In [None]:
# saves token as txt and json
if not os.path.exists('/content/tokens'):
    os.mkdir('/content/tokens')

tokenizer.save_model('/content/tokens')

['/content/tokens/vocab.json', '/content/tokens/merges.txt']

In [None]:
tokenizer = RobertaTokenizerFast.from_pretrained("/content/tokens")
tokenizer('Abraham!')


file /content/tokens/config.json not found
file /content/tokens/config.json not found


{'input_ids': [0, 21073, 5, 2], 'attention_mask': [1, 1, 1, 1]}

In [None]:
tokenizer("sin").input_ids

[0, 27965, 2]

In [None]:
input_ids = tokenizer("sin you die").input_ids

In [None]:
for p in tqdm(paths):
    with open(p, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    batch = tokenizer(lines, max_length=512,
                      padding='max_length', truncation=True)
len(batch)

  0%|          | 0/8 [00:00<?, ?it/s]

2

In [None]:
labels = torch.tensor([v for k, v in batch.items()][0])
mask = torch.tensor([v for k, v in batch.items()][1])

In [None]:
print(labels)

tensor([[  0, 372, 310,  ...,   1,   1,   1],
        [  0,   6, 372,  ...,   1,   1,   1],
        [  0,   6, 372,  ...,   1,   1,   1],
        ...,
        [  0,   6, 372,  ...,   1,   1,   1],
        [  0, 372, 310,  ...,   1,   1,   1],
        [  0,   2,   1,  ...,   1,   1,   1]])


In [None]:

# make copy of labels tensor, this will be input_ids
input_ids = labels.detach().clone()
# create random array of floats with equal dims to input_ids
rand = torch.rand(input_ids.shape)
# mask random 15% where token is not 0 [PAD], 1 [CLS], or 2 [SEP]
mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 1) * (input_ids != 2)
# loop through each row in input_ids tensor (cannot do in parallel)
for i in range(input_ids.shape[0]):
    # get indices of mask positions from mask array
    selection = torch.flatten(mask_arr[i].nonzero()).tolist()
    # mask input_ids
    input_ids[i, selection] = 3  # our custom [MASK] token == 3

In [None]:
input_ids.shape

torch.Size([31103, 512])

In [None]:
input_ids[0][:100]

tensor([   0,  373,  398,  440,    3,    3,  281,  473,    3,  225,  644,  316,
          30,   21, 1092,    3, 2322,  438, 3672,  264,  925,  273,  264,    3,
          18,    2,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1])

In [None]:
encodings = {'input_ids': input_ids,
             'attention_mask': mask,
             'labels': labels}

In [None]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # store encodings internally
        self.encodings = encodings

    def __len__(self):
        # return the number of samples
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        # return dictionary of input_ids, attention_mask, and labels for index i
        return {key: tensor[i] for key,
                tensor in self.encodings.items()}


In [None]:
dataset = Dataset(encodings)

In [None]:
loader = torch.utils.data.DataLoader(
    dataset, batch_size=8, shuffle=True)

In [None]:
tokenizer.vocab_size

30522

In [None]:
config = RobertaConfig(
    vocab_size= tokenizer.vocab_size,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [None]:
# initialize the RoBERTa model with a language modeling (LM) head.
model = RobertaForMaskedLM(config)

In [None]:
# # Setup GPU/CPU usage.
torch.cuda.empty_cache()
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
# # move the model over to the selected device
model.to(device)

In [None]:
# Activate the training mode of our model, and initialize our optimizer (Adam with weighted decay - reduces chance of overfitting).
# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)

In [None]:
epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

# saves token as txt and json
if not os.path.exists('/content/bibleBert'):
    os.mkdir('/content/bibleBert')

model.save_pretrained('/content/bibleBert')

  0%|          | 0/3888 [00:00<?, ?it/s]

  0%|          | 0/3888 [00:00<?, ?it/s]