# TODO
* Ensure that during pre training, both sentences fit into the model at the same time -> DONE but not tested
* think about visualizations for text, preprocessing text, etc.
* implement interfaces of the task sheet
* cleaner code

In [394]:
#pip install datasets
import datasets

In [395]:
SEQ_LEN = 64 # maximum sequence length
VOCAB_SIZE = 30522  # = len(tokenizer.vocab)
N_SEGMENTS = 3 # number of segmentation labels
EMBED_SIZE = 768 # size of embedding vector
DROPOUT = 0.1 # dropout chance

## Pretraining

### bookcorpus

In [396]:
# Download + load data from cache or online AUTOMATICALLY
# https://huggingface.co/docs/datasets/loading#slice-splits


from datasets import load_dataset

dataset = load_dataset("bookcorpus") # alternative, less size datasets.load_dataset("bookcorpus", split="train[:10%]")
# split="train[10:20]")
# saved here on windows C:\Users\morit\.cache\huggingface

print(dataset)

Found cached dataset bookcorpus (C:/Users/Johannes/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 74004228
    })
})


#### Saving huggingface Dataset on disk

In [397]:
# manual save to disk

#folder_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\pretraining"
#full_path = folder_path+r"\bookcorpus"

#dataset.save_to_disk(full_path)

#### Loading hf dataset from disk

In [398]:
# manual load from disk

#dataset = datasets.load_dataset(full_path)

#### slicing hf dataset

In [399]:
dataset["train"][66]["text"]

'her parents rattled along to each other as they made their way through the tree-lined suburbs where megan had grown up .'

#### Standard dataloader - not sufficient we need tokenized output -> implement own dataset classes

In [400]:
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset["train"], batch_size=2)
batch = next(iter(dataloader))
batch

{'text': ['usually , he would be tearing around the living room , playing with his toys .',
  'but just one look at a minion sent him practically catatonic .']}

#### Tokenizer - use pretrained, at least for prototype

In [401]:
# https://huggingface.co/docs/transformers/preprocessing
# https://huggingface.co/docs/transformers/main_classes/tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Choose an appropriate tokenizer

In [402]:
#tokenizer.model_max_length = SEQ_LEN # might not be correct in case of pretraining where we add CLS at the end, check that

#### Tokenizer parameters

In [403]:
tokenizer.truncation_side 

'right'

In [404]:
tokenizer.model_max_length # we might need to fixate this

512

In [405]:
tokenizer.mask_token

'[MASK]'

In [406]:
tokenizer.vocab['[MASK]']

103

#### Tokenizer example usage

In [407]:
text = "hi i am moritz, who are you ?"#["hi i am moritz", "no you are not moritz, you are kevin"]
encoded_input = tokenizer(text)#,padding=True, truncation=True)
# , return_tensors='pt') use this for pt tensors
encoded_input

{'input_ids': [101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [408]:
encoded_input["input_ids"]

[101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102]

In [409]:
tokenizer.decode(encoded_input["input_ids"])

'[CLS] hi i am moritz, who are you? [SEP]'

#### Custom Dataset

In [410]:
n_rows = None
n_rows is None

True

In [411]:
from torch.utils.data import Dataset, DataLoader
import random
import itertools
import torch

class Bookcorpus(Dataset):  
    
    def __init__(self, tokenizer, seq_len=64, split="train", n_rows=None):
        """
        n_rows == None means take the whole dataset
        """
     
        if not split in ["train"]:
            raise ValueError("For Bookcorpus there is only a train split")
            
        self.n_rows = n_rows # is only inititialized if __len__() is called
        self.tokenizer = tokenizer 
        self.seq_len = seq_len
        self.split = split
        self.dataset = None # only loaded id needed
    
    # apply lazy loading
    def load_memory(self):
        if self.n_rows is not None:
            self.dataset = load_dataset("bookcorpus", split=self.split+"[0:"+str(self.n_rows)+"]") # [split]
        else:
            self.dataset = load_dataset("bookcorpus") # [split]

    def __len__(self):
        if self.dataset is None:
            self.load_memory() # only loaded if required
        return len(self.dataset)

    def __getitem__(self, item): 
        if self.dataset is None:
            self.load_memory() # only loaded if required
        
        # Create a random pair of sentences, if is_next is true if they are subsequent
        s1, s2, is_next = self.get_sentence_pair(item)
        
        # Replace 15% of the words in each line with masks/random words/the word itself
        s1_random, s1_label = self.random_masking(s1)
        s2_random, s2_label = self.random_masking(s2)
        
        # Step 3: Adding CLS and SEP tokens to the start and end of sentences # copied 
         # Adding PAD token for labels
        cls = [self.tokenizer.vocab['[CLS]']]
        sep = [self.tokenizer.vocab['[SEP]']] 
        pad = [self.tokenizer.vocab['[PAD]']]
        
        # append separating tokens to sequence       
        s1 = cls + s1_random + sep       
        s2 = s2_random + sep
        s1_label = pad + s1_label + pad
        s2_label = s2_label + pad
               
        # add segement label, adding padding
        segment = ([1 for i in range(len(s1))]+[2 for i in range(len(s2))])[:self.seq_len]
        # generate 1 input for model
        model_input = (s1+s2)[:self.seq_len]
        model_label = (s1_label + s2_label)[:self.seq_len]
        # add padding where input is shorter than sequence
        padding = [self.tokenizer.vocab['[PAD]'] for _ in range(self.seq_len - len(model_input))]
        model_input.extend(padding)
        model_label.extend(padding)
        segment.extend(padding)
        


        output = {
            "input": torch.tensor(model_input),
            "label": torch.tensor(model_label),
            "segment": torch.tensor(segment),
            "is_next": torch.tensor(is_next)
        }

        return {key: value.clone().detach() for key, value in output.items()}        
        #return  {"s1":s1, "s2":s2, "is_next_label":is_next_label}
        #return {"t1_random":t1_random, "t1_label":t1_label, "t2_random":t2_random, "t2_label":t2_label}
    
    def get_sentence_pair(self, index): 
        '''gets sentence pair as dicitinary s1, s2, isNext'''
        isNext = random.random() > 0.5 # if number > 0.5 isNext is positive
        
        s1 = self.dataset[index]["text"]
        if isNext and index + 1 < len(self.dataset): # select two subsequent lines
            s2 = self.dataset[index+1]["text"]
            return s1, s2, 1 # line1, line2, subsequent
        else: # select two non-Subsequent lines (index+1 is excluded from random selection)
            s2 = self.get_random_line(index+1)["text"]
            return s1, s2, 0 # line1, line2, subsequent
        
    def get_random_line(self, excludedIndex): 
        '''return random single sentence excluding'''
        randIndex = random.randint(1, self.__len__()-1)
            
        # ensure that randIndex is not next sentence
        while randIndex == excludedIndex:
            randIndex = random.randint(1, self.__len__()-1)
        
        return self.dataset[randIndex]
    
    def random_masking(self, sentence):
        words = sentence.split()
        masked_out = []
        masked_labels = []
        
        for i, word in enumerate(words):
            rnd_number1 = random.random() # continuous number from [0,1]
            rnd_number2 = random.random() # continuous number from [0,1]

            # turn word into token, remove [CLS], [SEP]
            token = self.tokenizer(word)['input_ids'] 
            token = token[1:-1]

            # replace a word with a probability of 15%
            if rnd_number1 < 0.15:

                # with 80% chance replace word by mask
                if rnd_number2 < 0.8:
                    for j in range(len(token)):
                        masked_out.append(self.tokenizer.vocab['[MASK]'])
                # with 10% chance replace word by random word
                elif rnd_number2 < 0.9:
                    for k in range(len(token)):
                        masked_out.append(random.randrange(len(self.tokenizer.vocab)))
                # with 10% chance word remains
                else:
                    masked_out.append(token)

                # set corresponding label
                masked_labels.append(token)
            # 85% don't change anything
            else:
                masked_out.append(token)
                # create corrsponding 0-label
                for l in range(len(token)):
                    masked_labels.append(0)
                
        # flatten output
        masked_out = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in masked_out]))
        print(masked_out)
        masked_labels = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in masked_labels]))
        print(masked_labels)

        # check for correct length
        assert len(masked_out) == len(masked_labels)
        #assert len(output) == self.seq_len, "sequence length not fixed! "+str(len(output)) # from moritz
        return masked_out, masked_labels

#### Testing the Dataset and Dataloaders

In [412]:
test = Bookcorpus(tokenizer, n_rows = 100)
len(test)

Found cached dataset bookcorpus (C:/Users/Johannes/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f)


100

In [413]:
dl = DataLoader(test,batch_size=2,shuffle=False)

In [414]:
#is sequence length fixed?
"""for i in range(1,1000):
    batch = next(iter(dl))
    for j in range(1,2): # batchsize
        length_ = len(batch["bert_input"][j])
        #print(length_)
        assert length_==SEQ_LEN, "sequence size is not "+str(SEQ_LEN)+": "+ str(length_)"""

'for i in range(1,1000):\n    batch = next(iter(dl))\n    for j in range(1,2): # batchsize\n        length_ = len(batch["bert_input"][j])\n        #print(length_)\n        assert length_==SEQ_LEN, "sequence size is not "+str(SEQ_LEN)+": "+ str(length_)'

In [415]:
batch = next(iter(dl))
batch

[2788, 1010, 2002, 2052, 2022, 13311, 103, 1996, 2542, 2282, 1010, 2652, 2007, 2010, 10899, 1012]
[0, 0, 0, 0, 2022, 0, 2105, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[2021, 2074, 2028, 2298, 2012, 1037, 7163, 2239, 2741, 2032, 8134, 4937, 22436, 2594, 1012]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[2021, 2074, 2028, 2298, 2012, 1037, 7163, 2239, 2741, 2032, 103, 4937, 22436, 2594, 1012]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8134, 0, 0, 0, 0]
[2008, 2018, 2042, 12756, 1005, 1055, 2933, 2043, 2016, 103, 2032, 5102, 3041, 1012]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 2288, 0, 0, 0, 0]


{'input': tensor([[  101,  2788,  1010,  2002,  2052,  2022, 13311,   103,  1996,  2542,
           2282,  1010,  2652,  2007,  2010, 10899,  1012,   102,  2021,  2074,
           2028,  2298,  2012,  1037,  7163,  2239,  2741,  2032,  8134,  4937,
          22436,  2594,  1012,   102,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0],
         [  101,  2021,  2074,  2028,  2298,  2012,  1037,  7163,  2239,  2741,
           2032,   103,  4937, 22436,  2594,  1012,   102,  2008,  2018,  2042,
          12756,  1005,  1055,  2933,  2043,  2016,   103,  2032,  5102,  3041,
           1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,

#### Visualize encoded sequence

In [416]:
from itertools import chain
flattened = list(chain(*((batch["input"]))))
tokenizer.decode(flattened)

"[CLS] usually, he would be tearing [MASK] the living room, playing with his toys. [SEP] but just one look at a minion sent him practically catatonic. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [CLS] but just one look at a minion sent him [MASK] catatonic. [SEP] that had been megan's plan when she [MASK] him dressed earlier. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

## Embedding

In [417]:
import torch
import torch.nn as nn
import math


class PositionEmbedding(torch.nn.Module):
    def __init__(self, embed_size, seq_len):
        super().__init__()
        n = 10000 # scalar for pos encoding
        # create embedding matrix dim(seq_len  x embed_size)
        self.embed_matrix = torch.zeros(seq_len, embed_size).float()
        # positional encoding not to be updated while gradient descent
        self.embed_matrix.require_grad = False
        
        # compute embedding for each position in input
        for position in range(seq_len):
            # run trough every component of embedding vector for each position with stride 2
            for c in range(0, embed_size, 2):
                # even 
                self.embed_matrix[position,c] = math.sin(position/(n**(2*c/embed_size)))
                # uneven
                self.embed_matrix[position,c+1] = math.cos(position/(n**(2*c/embed_size)))
        
        # self.embed_matrix =  embed_matrix.unsqueeze(0) 
    def forward(self, x):
        return self.embed_matrix
            

class BERTEmbedding(torch.nn.Module):
    def __init__(self, vocab_size, seq_len, n_segments=N_SEGMENTS, embed_size=EMBED_SIZE, dropout=DROPOUT):
        super().__init__()
        # token embedding: transforms (vocabulary size, number of tokens) into (vocabulary size, number of tokens, length of embdding vector)
        self.token = nn.Embedding(vocab_size, embed_size, padding_idx=0) # padding remains 0 during training
        # segment embedding for sentence 1, sentence 2, padding
        self.segment = nn.Embedding(n_segments, embed_size, padding_idx=0)
        # embedding of position
        self.position = PositionEmbedding(embed_size, seq_len) 
        # droput probability per token
        self.dropout = nn.Dropout(p=dropout)
    
    def forward(self, sequence, seg_label):
        return self.dropout(self.token(sequence) + self.segment(seg_label) + self.position(sequence))        
    

In [418]:
# embedding test: tokenized sequence
sample_seq = batch['input'][0] 
sample_seg = batch['segment'][0]
print(sample_seq.size())
print(sample_seq)
print(sample_seg.size())
print(sample_seg)

bert = BERTEmbedding(VOCAB_SIZE, SEQ_LEN, N_SEGMENTS)

batch_embed = bert(batch['input'][0], batch['segment'][0])

print(batch_embed.size())

torch.Size([64])
tensor([  101,  2788,  1010,  2002,  2052,  2022, 13311,   103,  1996,  2542,
         2282,  1010,  2652,  2007,  2010, 10899,  1012,   102,  2021,  2074,
         2028,  2298,  2012,  1037,  7163,  2239,  2741,  2032,  8134,  4937,
        22436,  2594,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
torch.Size([64])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
torch.Size([64, 768])


## Finetuning

Cant be downloaded automatically from huggingface. Needs to be downloaded manually:

1) download from kaggle and 
2) extract in finetuning folder 
3) Delete the zips

In [419]:
# toxic_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\finetuning\kaggle-toxic_comment"
toxic_path = r"C:\Users\Johannes\Project Machine Learning\datasets\finetuning\toxic_comment"
toxic_dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)
toxic_dataset

Found cached dataset jigsaw_toxicity_pred (C:/Users/Johannes/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-ebae0308d0d3f840/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 159571
    })
    test: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 63978
    })
})

#### Test with standard dataloader

In [420]:
from torch.utils.data import DataLoader
dataloader = DataLoader(toxic_dataset["train"], batch_size=1, shuffle = True)
batch = next(iter(dataloader))
batch

{'comment_text': ['Hi enemy\n\nYou will NEVER be able to get rid of me faggot.'],
 'toxic': tensor([1]),
 'severe_toxic': tensor([0]),
 'obscene': tensor([1]),
 'threat': tensor([0]),
 'insult': tensor([1]),
 'identity_hate': tensor([1])}

#### Standard Tokenizer not sufficient, padding is missing and probably also truncation

In [421]:
encoded_input = tokenizer(batch["comment_text"])
encoded_input

{'input_ids': [[101, 7632, 4099, 2017, 2097, 2196, 2022, 2583, 2000, 2131, 9436, 1997, 2033, 6904, 13871, 4140, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [422]:
from itertools import chain
flattened = list(chain(*(encoded_input["input_ids"])))
tokenizer.decode(flattened)

'[CLS] hi enemy you will never be able to get rid of me faggot. [SEP]'

#### Custom Dataset

In [471]:
class ToxicComment(Dataset):
    
    def __init__(self, tokenizer, seq_len=SEQ_LEN, split="train", n_rows:int=None):
        
        if not split in ["train","test"]:
            raise ValueError("Parameter has to be 'train' or 'test'")       
        
        self.dataset = None # only loaded if needed
        self.n_rows = n_rows # only loaded if needed
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.split = split
        
        
    # apply lazy loading
    def load_memory(self):
        if self.n_rows is not None:
            n_rows_str = f"[0:{self.n_rows}]" if self.n_rows is not None else ""
            self.dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path, split=f"{self.split}{n_rows_str}")#[split]
        else:
            self.dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)#[split]

    def __len__(self):
        if self.dataset is None:
            self.load_memory() # only loaded if required
        return len(self.dataset)

    def __getitem__(self, item):
        if self.dataset is None:
            self.load_memory() # only loaded if required
        
        # Step 1: get row
        output = self.dataset[item]

        # Step 2: tokenize comment
        output["input"] = self.tokenizer(
            output["comment_text"],
            max_length=self.seq_len ,
            padding="max_length", 
            truncation=True, 
            return_tensors='pt'
        )["input_ids"]
        
        # flatten output
        output["input"] = output["input"].squeeze()
        
        output.pop("comment_text") #delete raw text
        
        # Step 3: add segment_label like in pretraining task for consistency 
        # output["label"] = torch.zeros(self.seq_len) # this is not embedded anywhere so we can cut this
        output["segment"] = torch.ones(self.seq_len)
        
        # Step 4: collect different labels to one tensor 
        # TODO: desired?
        
        return output

    
    def get_sent(self, index): #selfmade
        '''gets sentence pair as dicitinary s1, s2, isNext'''
        isNext = random.random() > 0.5
        
        t1 = self.dataset[index]["text"]
        if isNext:
            t2 = self.dataset[index+1]["text"]
            return t1, t2, 1
        else:
            t2 = self.get_random_line(index+1)["text"]
            return t1, t2, 0
        

#### Test Dataset

In [472]:
test2 = ToxicComment(tokenizer=tokenizer, seq_len=SEQ_LEN, split = "train", n_rows = 100)
len(test2)

Found cached dataset jigsaw_toxicity_pred (C:/Users/Johannes/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-ebae0308d0d3f840/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85)


100

In [473]:
dl2 = DataLoader(test2,batch_size=1,shuffle=False)
batch = next(iter(dl2))
batch

{'toxic': tensor([0]),
 'severe_toxic': tensor([0]),
 'obscene': tensor([0]),
 'threat': tensor([0]),
 'insult': tensor([0]),
 'identity_hate': tensor([0]),
 'input': tensor([[  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
          18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
           1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
           3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
           1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
           1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
           6486,  1012, 16327,   102]]),
 'segment': tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])}

In [474]:
len(batch)

8

In [475]:
len(batch["input"][0])

64

### Embedding

In [428]:
# embedding test: tokenized sequence
sample_seq = batch['input'][0] 
sample_seg = batch['segment'][0]
print(f'sample_seq size {sample_seq.size()}')
print(sample_seq)
print(f'sample_seg size {sample_seg.size()}')
print(sample_seg)

bert = BERTEmbedding(VOCAB_SIZE, SEQ_LEN)

batch_embed = bert(batch['input'][0].long(), batch['segment'][0].long())

print(batch_embed.size())

sample_seq size torch.Size([64])
tensor([  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
        18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
         1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
         3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
         1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
         1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
         6486,  1012, 16327,   102])
sample_seg size torch.Size([64])
tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
torch.Size([64, 768])


# Model

In [429]:
import torch
import torch.nn as nn

# attention heads
class MultiHeadAttention(nn.Module):
    def __init__(self, number_heads, model_dimension):
        super(MultiHeadAttention, self).__init__()
        
        # model dimension must be divideable into equal parts for the attention heads
        assert model_dimension%number_heads == 0
        self.number_heads = number_heads
        self.att_head_dim = int(model_dimension/number_heads)
        
        # attention mechanism: query, key, value are linear embeddings -> embedding matrix dim: (model_dimension x model_dimension)
        self.query = nn.Linear(model_dimension, model_dimension)
        self.key = nn.Linear(model_dimension, model_dimension)
        self.value = nn.Linear(model_dimension, model_dimension)
        self.lin_output = nn.Linear(model_dimension, model_dimension)
    
    def forward(self, query, key, value, mask):
        
        # output dim (batch_size x seq_len x model_dimension) 
        query = self.query(query)
        key = self.key(key)        
        value = self.value(value) 
        
        # transform q,k,v to fit attention heads:(batch_size x seq_len x model_dimension) -> (batch_size x number_heads x seq_len x att_head_dim)
        query = query.view(query.shape[0], query.shape[1], self.number_heads, self.att_head_dim)
        query = query.permute(0,2,1,3)
        key = key.view(key.shape[0], key.shape[1], self.number_heads, self.att_head_dim)
        key = key.permute(0,2,1,3)
        value = value.view(value.shape[0], value.shape[1], self.number_heads, self.att_head_dim)
        value = value.permute(0,2,1,3)
        
        # calculate dot product between each query and each key and normaliz the output, output dim: (batch_size x number_heads x seq_len x seq_len)
        score = torch.matmul(query, key.permute(0, 1, 3, 2)) 
        score_n = score / math.sqrt(self.att_head_dim) # normalize: <q,k>/sqrt(d_k)
        
        # mask 0 with -infinity so it becomes 0 after softmax, output dim: (batch_size x number_heads x seq_len x seq_len)
        score_m = score_n.masked_fill(mask == 0, -1e10)    
        
        # softmax scores along each query, output dim: (batch_size x number_heads x seq_len x seq_len)
        score_w = nn.functional.softmax(score_m, dim=-1) 
        
        # multiply with value matrix: output weighted sum for each query, output dim: (batch_size x number_heads x seq_len x att_head_dim)
        weighted_sum = torch.matmul(score_w, value)
        
        # concatenate attention heads to 1 output, output dim: (batch_size x number_heads x model_dimension)
        weighted_sum = weighted_sum.permute(0, 2, 1, 3).contiguous().view(weighted_sum.shape[0], -1, self.number_heads * self.att_head_dim)
        
        # linear embedding for output
        out = self.lin_output(weighted_sum)      
        return out    

In [430]:
# feedforward layer
class FeedForwardLayer(nn.Module):
    def __init__(self, model_dimension, hidden_dimension):
        super(FeedForwardLayer, self).__init__()
        
        # linear layer
        self.linear1 = nn.Linear(model_dimension, hidden_dimension)
        self.linear2 = nn.Linear(hidden_dimension, model_dimension)
        # non-linearity
        self.non_linear = nn.ReLU()
    
    def forward(self, x):
        return self.linear2(self.non_linear(self.linear1(x)))       

In [431]:
# encoder stacks together all the previous modules
class Encoder(nn.Module):
    def __init__(self, model_dimension=EMBED_SIZE, number_heads=12, ff_hidden_dim=EMBED_SIZE*4):
        super(Encoder, self).__init__()
        # attention heads
        self.multihead_attention = MultiHeadAttention (number_heads, model_dimension)
        # normalisation layer
        self.normlayer = nn.LayerNorm(model_dimension)
        self.feedforward_layer = FeedForwardLayer(model_dimension, hidden_dimension=ff_hidden_dim)
    
    # also residuals possible here
    def forward(self, x, mask):
        # embeddings: (batch_size, max_len, d_model)
        # encoder mask: (batch_size, 1, 1, max_len)
        # result: (batch_size, max_len, d_model)
        # input x 3x to generate query, key, value
        x = self.normlayer(self.multihead_attention(x, x, x, mask))
        return self.normlayer(self.feedforward_layer(x))

In [448]:
# model class according to task sheet
class Model(nn.Module):
    def __init__(self, vocab_size, model_dimension, number_layers=12, number_heads=12):
        super().__init__()
        self.model_dimension=model_dimension
        self.number_layers=number_layers
        self.number_heads=number_heads
        # hidden layer dimenion of FF is 4*model_dimension (see paper)
        self.ff_hidden_layer = 4*model_dimension
        # embedding of input 
        self.embedding = BERTEmbedding(vocab_size=vocab_size, seq_len=SEQ_LEN, embed_size=model_dimension)
        # stack encoders
        self.encoders = torch.nn.ModuleList() # create empty module list
        for _ in range(self.number_layers):
            self.encoders.append(Encoder(model_dimension=model_dimension, number_heads=number_heads, ff_hidden_dim=4*model_dimension))
        
    def forward(self, x, segment_info):
        # mask to mark the padded tokens
        mask = (x > 0).unsqueeze(1).repeat(1,x.size(1),1).unsqueeze(1)
        x = self.embedding(x, segment_info) # copied: what is segment_info? to be changed
        # run trough encoders
        for encoder in self.encoders:
            x =encoder.forward(x, mask)
        return x

In [449]:
# pretraining
class MaskedPrediction(nn.Module):
    """
    This class predicts the original token which was replaced by a mask. 
    """
    def __init__(self, bert_out, vocab_size):
        super().__init__()
        self.linear = nn.Linear(bert_out, vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        return self.softmax(self.linear(x))

In [450]:
# pretraining
class NextSentencePrediction(nn.Module):
    """
    class to predict two classes: is next, not_next
    """
    def __init__(self, bert_out):
        super().__init__()
        self.linear = nn.Linear(bert_out, 2)
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        return self.softmax(self.linear(x[:, 0]))

In [451]:
# finetuning
class ToxicityPrediction(nn.Module):
    """
    class to predict multivariate class of toxicity
    """
    def __init__(self, bert_out):
        super().__init__()
        self.tox_classes = 6 # there are 6 classes of toxicity in the dataset
        self.linear = nn.Linear(bert_out, self.tox_classes)
        self.softmax = nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        return self.softmax(self.linear(x))

In [452]:
# fuse to one model 
class BERT(nn.Module):
    """
    pertraining: masked token prediction, next sentence prediction, 
    finetuning: toxic comment prediction
    """
    def __init__(self, base_model, vocab_size):
        super().__init__()
        # base BERT model
        self.base_model = base_model
        # masked token classfication layer
        self.masked_pred = MaskedPrediction(self.base_model.model_dimension, vocab_size)
        # next sentence predicton layer
        self.next_sentence = NextSentencePrediction(self.base_model.model_dimension)
        # toxic comment classfication layer
        self.toxic_comment = ToxicityPrediction(self.base_model.model_dimension)
    
    def forward(self, x, segment_label):
        x = self.base_model(x, segment_label)
        return self.next_sentence(x), self.masked_pred(x)

# Training

In [453]:
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import tqdm
import torch.nn as nn

# copied: reproduce
class TrainBERT:
    def __init__(self, model, train_dataloader, test_dataloader=None, learning_rate=1e-4, weight_decay=0.01, betas=(0.9, 0.999), log_freq=10, device='cuda'):
        self.device = device
        self.model = model
        self.training_data = train_dataloader
        self.testing_data = test_dataloader

        # optimizer: Adam
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate, betas=betas, weight_decay=weight_decay)
        self.scheduler = StepLR(self.optimizer, step_size=5, gamma=0.1)

        # cost function negative log likelihood loss for masked token prediction
        self.criterion = nn.NLLLoss(ignore_index=0)
        self.log_freq = log_freq
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def training(self, epoch):
        self.iteration(epoch, self.training_data)

    def testing(self, epoch):
        self.iteration(epoch, self.testing_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        mode = "train" if train else "test"

        # progress bar
        data_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc="EP_%s:%d" % (mode, epoch),
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        for i, data in data_iter:
            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. forward the next_sentence_prediction and masked_lm model
            next_sent_output, mask_lm_output = self.model.forward(data["input"], data["segment"])

            # 2-1. NLL(negative log likelihood) loss of is_next classification result
            next_loss = self.criterion(next_sent_output, data["is_next"])

            # 2-2. NLLLoss of predicting masked token word
            # transpose to (m, vocab_size, seq_len) vs (m, seq_len)
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["label"])

            # 2-3. Adding next_loss and mask_loss: 3.4 Pre-training Procedure
            loss = next_loss + mask_loss

            # 3. backward and optimization only in train
            if train:
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            # next sentence prediction accuracy
            correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix)
        )

        print(
            f"EP{epoch}, {mode}: \
            avg_loss={avg_loss / len(data_iter)}, \
            total_acc={total_correct * 100.0 / total_element}"
        )


In [460]:
'''test run'''
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_data, _ = load_data("bookcorpus", transformation=tokenizer, n_train=100)
# train_data = Bookcorpus(tokenizer, n_rows = 100)

train_loader = DataLoader(
   train_data, batch_size=32, shuffle=True, pin_memory=True)
# def __init__(self, vocab_size=VOCAB_SIZE, model_dimension=EMBED_SIZE, number_layers=12, number_heads=12):
bert_model = Model(
  vocab_size=VOCAB_SIZE,
  model_dimension=EMBED_SIZE
)

bert_lm = BERT(bert_model, len(tokenizer.vocab))
bert_trainer = TrainBERT(bert_lm, train_loader, device='cpu')
epochs = 20

for epoch in range(epochs):
  bert_trainer.training(epoch)

Found cached dataset bookcorpus (C:/Users/Johannes/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f)


Total Parameters: 131956802


EP_train:0:   0%|| 0/4 [00:00<?, ?it/s]

[2002, 6973, 1998, 2059, 11361, 28339, 2014, 2388, 1005, 1055, 2608, 2005, 2014, 2269, 1005, 1055, 103, 1010, 2029, 103, 12756, 103, 103]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2612, 0, 0, 2081, 0, 2868, 1012]
[2002, 2001, 2107, 1037, 2158, 1005, 1055, 2158, 2525, 103]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1012]
[2002, 3866, 2068, 103, 2172, 2000, 2022, 6380, 103]
[0, 0, 0, 2205, 0, 0, 0, 0, 1012]
[1036, 1036, 2182, 2057, 2024, 1010, 103, 103, 2016, 2056, 27726, 1012]
[0, 0, 0, 0, 0, 0, 1005, 1005, 0, 0, 0, 0]
[2012, 2008, 2051, 1010, 2016, 2018, 2014, 2540, 2275, 2006, 2183, 2000, 103, 2082, 1998, 3352, 103, 3460, 1012]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2966, 0, 0, 0, 1037, 0, 0]
[2013, 1996, 2051, 2016, 103, 1037, 2210, 103, 1010, 2016, 2018, 2359, 2498, 103, 2084, 2000, 2393, 2111, 1012]
[0, 0, 0, 0, 2001, 0, 0, 2611, 0, 0, 0, 0, 0, 2062, 0, 0, 0, 0, 0]
[2096, 2009, 2018, 2042, 2053, 3160, 2008, 2016, 2359, 2032, 2004, 23834, 2005, 6701, 1010, 2016, 2018, 2042, 5186, 8686, 2043, 

torch.Size([32])


EP_train:0:   0%|| 0/4 [00:12<?, ?it/s]


KeyboardInterrupt: 

## Functions for report

In [None]:
"""class BertTokenizer():
    def __init__(self, task_type="pretrain"):
        if not task_type in ["pretrain", "text_classification_multi"]:
            raise ValueError("task not implemented")
        pass
    
    def __call__()"""
# i noticed we dont need any callable class to do transformation on the datasets since everything is handeled by our dataloaders
# ie we dont need rescaling etc.
# maybe ask supervisor if we need to save back the tokenized text or if it is okay to do it on the fly and leave the load_data transformation parameter at None

In [458]:
#def __init__(self, tokenizer, seq_len=64, split="train", n_rows=None):

def load_data(dataset:str, transformation=None, n_train:int=None, n_test:int=None): # transformation callable
    
    if dataset == "bookcorpus":
        train = Bookcorpus(
            tokenizer=transformation,
            seq_len=SEQ_LEN,
            split="train",
            n_rows=n_train
        )
        return train, None
    
    elif dataset == "jigsaw_toxicity_pred":
        train = ToxicComment(
            tokenizer=transformation,
            seq_len=SEQ_LEN,
            split="train",
            n_rows=n_train
        )
        
        test = ToxicComment(
            tokenizer=transformation,
            seq_len=SEQ_LEN,
            split="test",
            n_rows=n_test
        )
        return train, test
    
    else:
        raise NotImplementedError("Dataset not implemented")

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train, test = load_data("jigsaw_toxicity_pred", transformation=tokenizer, n_train=1000, n_test=100)

In [None]:
next(iter(train))

In [None]:
next(iter(test))

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train, test = load_data("bookcorpus", transformation=tokenizer, n_train=1000, n_test=100)

In [None]:
test is None

In [None]:
def show(x, outfile:str=None): # can have more args