In [5]:


import torch

# gpu or cpu?
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# tokenizer object
from transformers import BertTokenizerFast #, BertForMaskedLM
alephbert_tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base')

# training method
from transformers import AdamW

# loading last tune file:
import pickle
import io

class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else: return super().find_class(module, name)

tune_path = '../myFirstTune/tune_6_10.pkl'

if torch.cuda.is_available():
    alephbert = pickle.load(open(tune_path,'rb'))
else:
    alephbert = CPU_Unpickler(open(tune_path, 'rb')).load()

# swithing to correct processor mode
alephbert.to(device)

# swithing to training mode
alephbert.train()
# if not finetuning - disable dropout

# ATTENTION! you might want to slice the raw data
with open('../../data/wikipedia/wikipedia_FT.raw', 'r') as raw:
    raw_list = raw.read().split('.')
    l = len(raw_list)
    # slicing:
    # raw_list = raw_list[(6*l)//10:(8*l)//10]
    raw_list = raw_list[:10]
    striped_list = [sentence.strip() for sentence in raw_list]
    # len_of_sentences = [len(sentence.split()) for sentence in striped_list]
    # pd.Series(len_of_sentences).hist(bins = 50)
    # most of the data is ~25 long sentences...
    # tokenizing text:
    tokenized_text = alephbert_tokenizer(striped_list,max_length=25, padding='max_length', return_tensors='pt', truncation=True, )
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

# adding labels to data
tokenized_text['labels'] = tokenized_text.input_ids.detach().clone()

# randomly choosing mask words:
rand = torch.rand(tokenized_text.input_ids.shape)
mask_arr = (rand < 0.15)*(tokenized_text.input_ids > 2)

# get mask index from tokenizer
mask_index = alephbert_tokenizer.convert_tokens_to_ids('[MASK]')

# get indexes of chosen words to be masked
selection = []
for i in range(mask_arr.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
        
    )

# change the chosen words to mask index
for i in range(mask_arr.shape[0]):
    tokenized_text.input_ids[i, selection[i]] = mask_index

# a class for mimiking a dataset
class MakeDataSet(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

dataset = MakeDataSet(tokenized_text)

# for batching
batch_size = 30
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# optimizer method
optim = AdamW(alephbert.parameters(), lr=1e-5)


# loading bar
from tqdm import tqdm

epochs = 1

In [6]:

# training! do not interupt
for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)

    for batch in loop:
        # batch is a part of the dataset, loaded by the dataloader
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # feeding inputs to the model
        outputs = alephbert(input_ids, attention_mask=attention_mask, labels=labels)

        # loss
        loss = outputs.loss
        loss.backward()
        optim.step()

        # description
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

save_path = '../myFirstTune/deleteme.pkl'

pickle.dump(alephbert, open(save_path, 'wb'))


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 1/1 [00:02<00:00,  2.94s/it, loss=0.304]


# **Test_FT**

In [7]:

model_beforeFT = alephbert
model_afterFT = pickle.load(open(save_path, 'rb'))


hebrew_text = '[CLS]  כל טענה [MASK] יש להוכיח . [SEP]'

tokenized_text = alephbert_tokenizer.tokenize(hebrew_text)
indexed_tokens = alephbert_tokenizer.convert_tokens_to_ids(tokenized_text)


# Create the segments tensors.
segments_ids = [0] * len(tokenized_text)

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens]).to(device)

segments_tensors = torch.tensor([segments_ids]).to(device)

# Load pre-trained model (weights)
model_beforeFT.eval()
model_afterFT.eval()

# Predict all tokens
with torch.no_grad():
    predictions = model_beforeFT(tokens_tensor, segments_tensors)
    predictions_FT = model_afterFT(tokens_tensor, segments_tensors)

masked_index = tokenized_text.index('[MASK]')

predicted_sorted = torch.argsort(predictions[0][0, masked_index], descending=True)
predicted_sorted_FT = torch.argsort(predictions_FT[0][0, masked_index], descending=True)

print('before:')
print(alephbert_tokenizer.convert_ids_to_tokens([token.item() for token in predicted_sorted[:20]]))
print('after:')
print(alephbert_tokenizer.convert_ids_to_tokens([token.item() for token in predicted_sorted_FT[:20]]))

print('index of מתמטית: ')
print('before:', alephbert_tokenizer.convert_ids_to_tokens([token.item() for token in predicted_sorted]).index('מתמטית'))
print('after:', alephbert_tokenizer.convert_ids_to_tokens([token.item() for token in predicted_sorted_FT]).index('מתמטית'))

print('index of מדעית: ')
print('before:', alephbert_tokenizer.convert_ids_to_tokens([token.item() for token in predicted_sorted]).index('מדעית'))
print('after:', alephbert_tokenizer.convert_ids_to_tokens([token.item() for token in predicted_sorted_FT]).index('מדעית'))

print('index of רצויה: ')
print('before:', alephbert_tokenizer.convert_ids_to_tokens([token.item() for token in predicted_sorted]).index('רצויה'))
print('after:', alephbert_tokenizer.convert_ids_to_tokens([token.item() for token in predicted_sorted_FT]).index('רצויה'))

print('index of וטענה: ')
print('before:', alephbert_tokenizer.convert_ids_to_tokens([token.item() for token in predicted_sorted]).index('וטענה'))
print('after:', alephbert_tokenizer.convert_ids_to_tokens([token.item() for token in predicted_sorted_FT]).index('וטענה'))

before:
['כזו', 'זו', 'אחרת', ',', 'זאת', 'כזאת', 'שהיא', 'שכזו', 'אפשרית', 'נוספת', '-', 'עובדתית', 'קיימת', '–', 'כאמור', 'נכונה', 'בה', 'סבירה', 'עליה', 'משפטית']
after:
['כזו', 'זו', 'אחרת', ',', 'זאת', 'כזאת', 'שהיא', 'שכזו', 'אפשרית', 'נוספת', '-', 'עובדתית', 'קיימת', '–', 'כאמור', 'נכונה', 'בה', 'סבירה', 'עליה', 'משפטית']
index of מתמטית: 
before: 75
after: 75
index of מדעית: 
before: 53
after: 53
index of רצויה: 
before: 209
after: 209
index of וטענה: 
before: 24
after: 24


# delete the last tune: /!\

In [8]:
import os
os.remove(save_path)