In [1]:
from transformers import AutoTokenizer, AutoModelForPreTraining
import torch
from glob import glob
import random
from torch.utils.data import DataLoader

In [2]:
def chunks(xs, n):
    """Yield successive n-sized chunks from a list."""
    n = max(1, n)
    return (xs[i:i+n] for i in range(0, len(xs), n))

In [3]:
text_files = glob("lecture_texts/*.txt")
len(text_files)

35

In [4]:
text = ""
for text_file_path in text_files:
    with open(text_file_path, "r", encoding="utf") as f:
        text += f.read()

In [5]:
sentences = text.split(".")
sentences = list(map(lambda x: x.lstrip().rstrip(), sentences))
bag = [sentence for sentence in sentences if sentence != ""]
bag = [sentence+"." for sentence in bag]
bag_size = len(bag)

In [6]:
bag_size

11085

In [7]:
paragraph_size = 5
paragraphs = list(chunks(bag, paragraph_size))
len(paragraphs)

2217

In [8]:
sentence_a = []
sentence_b = []
label = []

for sentences in paragraphs:
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [9]:
for i in range(3):
    print(label[i])
    print(sentence_a[i] + '\n---')
    print(sentence_b[i] + '\n')

0
Prvi put smo bili zapeli u jednom dijelu samo kod kregove interpolacijske leme.
---
Nije baš bilo jasno kako primijeniti ovaj postupak.

0
Tako da i dalje sačuvamo svojstvo da između ovih F i G nekako nema interpolanta, čak nikad dodam te Bove.
---
I onda je bilo nekako aha, znači svaki pojedini B je konzistentan sa Fom, dakle ovo je ispunjivo.

1
Stvarno bih rekao da ovak na prvi pogled da sve štima i nalikuje na ovaj naš dokaz, jedino što ne ide sa nizom formula nego ide sa skupom formula, ali mislim da se vrlo jednostavno može svesti na ovo što smo mi radili, na obliku kojem smo mi radili, tako da mislim da će to biti jedna od zadataka za zadaću, dakle da pročitate taj dokaz na Math Overflowu i da ga zapišete u ovoj našoj notaciji kad smo već kod toga.
---
Pod ovom dodatnom predopstavkom da teorija nema konečno.



In [10]:
tokenizer = AutoTokenizer.from_pretrained("tbs17/MathBERT")
model = AutoModelForPreTraining.from_pretrained("tbs17/MathBERT")
if torch.cuda.is_available():
    model = model.to(torch.device("cuda"))
model.device

device(type='cuda', index=0)

In [11]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [12]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T
inputs.next_sentence_label[:5]

tensor([[0],
        [0],
        [1],
        [0],
        [0]])

In [13]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [14]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [15]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )
selection[0]

[3, 4, 21, 30, 37, 39, 49, 53]

In [16]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [17]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [18]:
dataset = OurDataset(inputs)

In [19]:
data_loader = DataLoader(dataset, batch_size=3, shuffle=True)

In [20]:
from torch.optim import Adam
from tqdm import tqdm

device = model.device
optim = Adam(model.parameters(), lr=1e-6, weight_decay=0.3)#, weight_decay=True)

epochs = 5

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(data_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        next_sentence_label = batch['next_sentence_label'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        next_sentence_label=next_sentence_label,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 739/739 [03:42<00:00,  3.33it/s, loss=0.92] 
Epoch 1: 100%|██████████| 739/739 [03:42<00:00,  3.33it/s, loss=0.93] 
Epoch 2: 100%|██████████| 739/739 [03:38<00:00,  3.39it/s, loss=0.988]
Epoch 3: 100%|██████████| 739/739 [03:38<00:00,  3.38it/s, loss=1.2]  
Epoch 4: 100%|██████████| 739/739 [03:39<00:00,  3.37it/s, loss=0.733]


In [22]:
model.save_pretrained("./MathBERT_hr")
tokenizer.save_pretrained("./MathBERT_hr")

('./MathBERT_hr\\tokenizer_config.json',
 './MathBERT_hr\\special_tokens_map.json',
 './MathBERT_hr\\vocab.txt',
 './MathBERT_hr\\added_tokens.json',
 './MathBERT_hr\\tokenizer.json')