In [1]:
from transformers import BertTokenizer, BertForPreTraining, AutoTokenizer, AutoModelForPreTraining
import torch
from glob import glob
import random
from torch.utils.data import DataLoader
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertForPreTraining.from_pretrained('bert-base-uncased')

In [2]:
def chunks(xs, n):
    """Yield successive n-sized chunks from a list."""
    n = max(1, n)
    return (xs[i:i+n] for i in range(0, len(xs), n))

In [4]:
text_files = glob("lecture_texts/*.txt")
len(text_files)

21

In [5]:
text = ""
for text_file_path in text_files:
    with open(text_file_path, "r", encoding="utf") as f:
        text += f.read()

In [6]:
# with open("lecture_texts/Matematička logika i računarstvo - Čačić - 1.12.2022. - 1. sat (192kbit_AAC)_hrv.txt", "r", encoding="utf") as f:
#     text = f.read()

In [7]:
sentences = text.split(".")
sentences = list(map(lambda x: x.lstrip().rstrip(), sentences))
bag = [sentence for sentence in sentences if sentence != ""]
bag = [sentence+"." for sentence in bag]
bag_size = len(bag)

In [8]:
bag_size

5911

In [9]:
paragraph_size = 10
paragraphs = list(chunks(bag, paragraph_size))
len(paragraphs)

592

In [10]:
sentence_a = []
sentence_b = []
label = []

for sentences in paragraphs:
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [11]:
for i in range(3):
    print(label[i])
    print(sentence_a[i] + '\n---')
    print(sentence_b[i] + '\n')

1
Imali smo niz formula A0, A1 itd.
---
Ima svojstvo konačnih presijeka, dakle, filtr generiran s njom je pravi.

0
Stvarno bih rekao da ovak na prvi pogled da sve štima i nalikuje na ovaj naš dokaz, jedino što ne ide sa nizom formula nego ide sa skupom formula, ali mislim da se vrlo jednostavno može svesti na ovo što smo mi radili, na obliku kojem smo mi radili, tako da mislim da će to biti jedna od zadataka za zadaću, dakle da pročitate taj dokaz na Math Overflowu i da ga zapišete u ovoj našoj notaciji kad smo već kod toga.
---
Dakle, prva zadaća će biti objavljena do kraja tjedna, dakle do Božića, onda ćete naravno ne samo preko Božićnih praznika, ali ono tamo negdje do kraja sječnja bi bilo lijepo da je riješite i pošaljete.

1
Naravno, kad god ih imam konačno mnogo, onda nema nekog, ne pada nam na pamet neki ljepši način da to napravimo, nego da jednostavno odaberemo jednog od njih i kažemo to je to.
---
Ali mislim da sam uspio, samo sad.



In [12]:
tokenizer = AutoTokenizer.from_pretrained("tbs17/MathBERT")
model = AutoModelForPreTraining.from_pretrained("tbs17/MathBERT")
if torch.cuda.is_available():
    model = model.to(torch.device("cuda"))
model.device

device(type='cuda', index=0)

In [13]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [14]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T
inputs.next_sentence_label[:5]

tensor([[1],
        [0],
        [1],
        [0],
        [1]])

In [15]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [16]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [17]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )
selection[0]

[14, 16, 32, 39, 42]

In [18]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [19]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [20]:
dataset = OurDataset(inputs)

In [21]:
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [22]:
from torch.optim import Adam
from tqdm import tqdm

device = model.device
optim = Adam(model.parameters(), lr=1e-6, weight_decay=0.3)#, weight_decay=True)

epochs = 5

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(data_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 296/296 [01:09<00:00,  4.27it/s, loss=0.879]
Epoch 1: 100%|██████████| 296/296 [01:04<00:00,  4.59it/s, loss=1.21] 
Epoch 2: 100%|██████████| 296/296 [01:04<00:00,  4.56it/s, loss=0.289]
Epoch 3: 100%|██████████| 296/296 [01:05<00:00,  4.51it/s, loss=0.365]
Epoch 4: 100%|██████████| 296/296 [01:05<00:00,  4.52it/s, loss=0.651]


In [23]:
model.save_pretrained("./MathBERT_hr")
tokenizer.save_pretrained("./MathBERT_hr")

('./MathBERT_hr\\tokenizer_config.json',
 './MathBERT_hr\\special_tokens_map.json',
 './MathBERT_hr\\vocab.txt',
 './MathBERT_hr\\added_tokens.json',
 './MathBERT_hr\\tokenizer.json')