In [1]:
#pip install datasets
import datasets

In [2]:
SEQ_LEN = 512

## Pretraining

### bookcorpus

In [3]:
# Download + load data from cache or online AUTOMATICALLY
# https://huggingface.co/docs/datasets/loading#slice-splits


from datasets import load_dataset

dataset = load_dataset("bookcorpus") # alternative, less size datasets.load_dataset("bookcorpus", split="train[:10%]")
# saved here on windows C:\Users\morit\.cache\huggingface

print(dataset)

  table = cls._concat_blocks(blocks, axis=0)


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 74004228
    })
})


#### Saving

In [4]:
# manual save to disk

#folder_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\pretraining"
#full_path = folder_path+r"\bookcorpus"

#dataset.save_to_disk(full_path)

#### Loading

In [None]:
# manual load from disk

#dataset = datasets.load_dataset(full_path)

#### Tests with torch

In [None]:
#pip install transformers

In [5]:
dataset.with_format("torch")

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 74004228
    })
})

#### slicing

In [6]:
dataset["train"][4]["text"]

'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .'

#### Standard dataloader - not working?

In [7]:
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset["train"], batch_size=2)
batch = next(iter(dataloader))
batch

{'text': ['usually , he would be tearing around the living room , playing with his toys .',
  'but just one look at a minion sent him practically catatonic .']}

#### Tokenizer - use pretrained, at least for prototype

In [8]:
# https://huggingface.co/docs/transformers/preprocessing
# https://huggingface.co/docs/transformers/main_classes/tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Choose an appropriate tokenizer

In [9]:
#tokenizer.model_max_length = SEQ_LEN # might not be correct in case of pretraining where we add CLS at the end, check that

In [10]:
tokenizer.truncation_side 

'right'

In [11]:
tokenizer.model_max_length # we might need to fixate this

512

In [12]:
text = "hi i am moritz, who are you ?"#["hi i am moritz", "no you are not moritz, you are kevin"]
encoded_input = tokenizer(text)#,padding=True, truncation=True)
# , return_tensors='pt') use this for pt tensors
encoded_input

{'input_ids': [101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:
encoded_input["input_ids"]

[101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102]

In [14]:
tokenizer.decode(encoded_input["input_ids"])

'[CLS] hi i am moritz, who are you? [SEP]'

In [15]:
tokenizer.mask_token

'[MASK]'

In [16]:
tokenizer.vocab['[MASK]']

103

#### Custom Dataset

In [17]:
from torch.utils.data import Dataset, DataLoader
import random
import itertools
import torch

class Bookcorpus(Dataset):
    
    def __init__(self, tokenizer, seq_len=SEQ_LEN):
        self.dataset = load_dataset("bookcorpus")["train"]
        self.nrows = len(self.dataset) 
        self.tokenizer = tokenizer
        self.seq_len = seq_len

    def __len__(self):
        return self.nrows

    def __getitem__(self, item): # TODO Where is truncation if sequence is to long? How is ensured that both sentences fit into the sequence?
        
        # Step 1: get random sentence pair, either negative or positive (saved as is_next_label)
        s1, s2, is_next_label = self.get_sent(item)
        
        # Step 2: replace random words in EACH sentence with mask / random words # copied 
        t1_random, t1_label = self.random_word(s1)
        t2_random, t2_label = self.random_word(s2)
        
        # Step 3: Adding CLS and SEP tokens to the start and end of sentences # copied 
         # Adding PAD token for labels
        t1 = [self.tokenizer.vocab['[CLS]']] + t1_random + [self.tokenizer.vocab['[SEP]']]
        t2 = t2_random + [self.tokenizer.vocab['[SEP]']]
        t1_label = [self.tokenizer.vocab['[PAD]']] + t1_label + [self.tokenizer.vocab['[PAD]']]
        t2_label = t2_label + [self.tokenizer.vocab['[PAD]']]

        # Step 4: combine sentence 1 and 2 as one input # copied 
        # adding PAD tokens to make the sentence same length as seq_len
        segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]
        padding = [self.tokenizer.vocab['[PAD]'] for _ in range(self.seq_len - len(bert_input))]
        bert_input.extend(padding), bert_label.extend(padding), segment_label.extend(padding)

        output = {"bert_input": bert_input,
                  "bert_label": bert_label,
                  "segment_label": segment_label,
                  "is_next": is_next_label}

        return {key: torch.tensor(value) for key, value in output.items()}
        
        
        
        #return  {"s1":s1, "s2":s2, "is_next_label":is_next_label}
        #return {"t1_random":t1_random, "t1_label":t1_label, "t2_random":t2_random, "t2_label":t2_label}
    
    def get_sent(self, index): #selfmade
        '''gets sentence pair as dicitinary s1, s2, isNext'''
        isNext = random.random() > 0.5
        
        t1 = self.dataset[index]["text"]
        if isNext:
            t2 = self.dataset[index+1]["text"]
            return t1, t2, 1
        else:
            t2 = self.get_random_line(index+1)["text"]
            return t1, t2, 0
        
    def get_random_line(self, excludedIndex): #selfmade
        '''return random single sentence excluding'''
        randIndex = random.randint(1, self.__len__())
            
        # ensure that randIndex is not next sentence
        while randIndex == excludedIndex:
            randIndex = random.randint(1, self.__len__())
        
        return self.dataset[randIndex]

    def random_word(self, sentence): #copied
        tokens = sentence.split()
        output_label = []
        output = []

        # 15% of the tokens would be replaced
        for i, token in enumerate(tokens):
            prob = random.random()

            # remove cls and sep token
            token_id = self.tokenizer(token)['input_ids'][1:-1]

            if prob < 0.15:
                prob /= 0.15

                # 80% chance change token to mask token
                if prob < 0.8:
                    for i in range(len(token_id)):
                        output.append(self.tokenizer.vocab['[MASK]'])

                # 10% chance change token to random token
                elif prob < 0.9:
                    for i in range(len(token_id)):
                        output.append(random.randrange(len(self.tokenizer.vocab)))

                # 10% chance change token to current token
                else:
                    output.append(token_id)

                output_label.append(token_id)

            else:
                output.append(token_id)
                for i in range(len(token_id)):
                    output_label.append(0)

        # flattening
        output = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output]))
        output_label = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output_label]))
        assert len(output) == len(output_label)
        #assert len(output) == self.seq_len, "sequence length not fixed! "+str(len(output)) # from moritz
        return output, output_label

#### Testing the Dataset and Dataloaders

In [18]:
test = Bookcorpus(tokenizer)

  table = cls._concat_blocks(blocks, axis=0)


In [19]:
dl = DataLoader(test,batch_size=2,shuffle=False)

In [20]:
next(iter(dl))

{'bert_input': tensor([[ 101, 2788, 1010,  ...,    0,    0,    0],
         [ 101, 2021, 2074,  ...,    0,    0,    0]]),
 'bert_label': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'segment_label': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'is_next': tensor([1, 1])}

In [25]:
#is sewuence length fixed?
for i in range(1,1000):
    batch = next(iter(dl))
    for j in range(1,2): # batchsize
        length_ = len(batch["bert_input"][j])
        #print(length_)
        assert length_==SEQ_LEN, "sequence size is not "+str(SEQ_LEN)+": "+ str(length_)

## Finetuning - download from kaggle and extract in finetuning folder. Delete the zips


In [26]:
toxic_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\finetuning\kaggle-toxic_comment"
toxic_dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)
toxic_dataset

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 159571
    })
    test: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 63978
    })
})

#### Test with standard dataloader

In [27]:
from torch.utils.data import DataLoader
dataloader = DataLoader(toxic_dataset["train"], batch_size=1, shuffle = True)
batch = next(iter(dataloader))
batch

{'comment_text': ["Right Sector \n\nRight Sector is involved in the current conflict occurring in the Donbass region. I am now aware you have a biasness for your lack of justification in removing material which is well known. Because you seem to think it's fine to remove the Right Sector from the info-box, I will remove the RNU , as I can see no fairer way then to remove a Russian-centric group while you unjustifiably removed a Ukrainian-centric group, and labeled the addition of them as not 'constructive'. There is a better case to be made for Right Sector involvement in the conflict rather than RNU involvement."],
 'toxic': tensor([0]),
 'severe_toxic': tensor([0]),
 'obscene': tensor([0]),
 'threat': tensor([0]),
 'insult': tensor([0]),
 'identity_hate': tensor([0])}

In [28]:
encoded_input = tokenizer(batch["comment_text"])
encoded_input

{'input_ids': [[101, 2157, 4753, 2157, 4753, 2003, 2920, 1999, 1996, 2783, 4736, 10066, 1999, 1996, 2123, 22083, 2015, 2555, 1012, 1045, 2572, 2085, 5204, 2017, 2031, 1037, 13827, 2791, 2005, 2115, 3768, 1997, 19777, 1999, 9268, 3430, 2029, 2003, 2092, 2124, 1012, 2138, 2017, 4025, 2000, 2228, 2009, 1005, 1055, 2986, 2000, 6366, 1996, 2157, 4753, 2013, 1996, 18558, 1011, 3482, 1010, 1045, 2097, 6366, 1996, 29300, 2226, 1010, 2004, 1045, 2064, 2156, 2053, 4189, 2121, 2126, 2059, 2000, 6366, 1037, 2845, 1011, 9358, 7277, 2177, 2096, 2017, 4895, 29427, 10128, 2401, 6321, 3718, 1037, 5969, 1011, 9358, 7277, 2177, 1010, 1998, 12599, 1996, 2804, 1997, 2068, 2004, 2025, 1005, 26157, 1005, 1012, 2045, 2003, 1037, 2488, 2553, 2000, 2022, 2081, 2005, 2157, 4753, 6624, 1999, 1996, 4736, 2738, 2084, 29300, 2226, 6624, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [31]:
from itertools import chain
flattened = list(chain(*(encoded_input["input_ids"])))

In [32]:
tokenizer.decode(flattened)

"[CLS] right sector right sector is involved in the current conflict occurring in the donbass region. i am now aware you have a biasness for your lack of justification in removing material which is well known. because you seem to think it's fine to remove the right sector from the info - box, i will remove the rnu, as i can see no fairer way then to remove a russian - centric group while you unjustifiably removed a ukrainian - centric group, and labeled the addition of them as not'constructive '. there is a better case to be made for right sector involvement in the conflict rather than rnu involvement. [SEP]"

#### Custom Dataset

In [33]:
class ToxicComment(Dataset):
    
    def __init__(self, tokenizer, seq_len=SEQ_LEN, split="train",):
        
        if not split in ["train","test"]:
            raise ValueError("Parameter has to be 'train' or 'test'")
        
        self.dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)[split]
        self.nrows = len(self.dataset) 
        self.tokenizer = tokenizer
        self.seq_len = seq_len

    def __len__(self):
        return self.nrows

    def __getitem__(self, item):
        
        # Step 1: get row
        row = self.dataset[item]
        #print(row)
        
        # Step 2: encode comment
        row["bert_input"] = tokenizer(row["comment_text"],padding=True, truncation=True, return_tensors='pt')["input_ids"]
        
        # TODO
        #print(row["bert_input"])
        
        return row["bert_input"]

    
    def get_sent(self, index): #selfmade
        '''gets sentence pair as dicitinary s1, s2, isNext'''
        isNext = random.random() > 0.5
        
        t1 = self.dataset[index]["text"]
        if isNext:
            t2 = self.dataset[index+1]["text"]
            return t1, t2, 1
        else:
            t2 = self.get_random_line(index+1)["text"]
            return t1, t2, 0
        

In [34]:
test2 = ToxicComment(tokenizer)
dl2 = DataLoader(test2,batch_size=1,shuffle=False)
next(iter(dl2))

tensor([[[  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
          18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
           1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
           3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
           1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
           1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
           6486,  1012, 16327,  1012,  4229,  1012,  2676,   102]]])

In [None]:
len(next(iter(dl2)))

In [None]:
len(next(iter(dl))["bert_input"][0])

* Ensure that during pre training, both sentences fit into the model at the same time -> seq_len