# TODO
* Ensure that during pre training, both sentences fit into the model at the same time -> seq_len 512
* padding for ToxicComment dataset
* think about visualizations for text, preprocessing text, etc.
* implement interfaces of the task sheet
* cleaner code

In [1]:
#pip install datasets
import datasets

In [2]:
SEQ_LEN = 64

## Pretraining

### bookcorpus

In [3]:
# Download + load data from cache or online AUTOMATICALLY
# https://huggingface.co/docs/datasets/loading#slice-splits


from datasets import load_dataset

dataset = load_dataset("bookcorpus") # alternative, less size datasets.load_dataset("bookcorpus", split="train[:10%]")
# saved here on windows C:\Users\morit\.cache\huggingface

print(dataset)

  table = cls._concat_blocks(blocks, axis=0)


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 74004228
    })
})


#### Saving

In [4]:
# manual save to disk

#folder_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\pretraining"
#full_path = folder_path+r"\bookcorpus"

#dataset.save_to_disk(full_path)

#### Loading

In [5]:
# manual load from disk

#dataset = datasets.load_dataset(full_path)

#### Tests with torch

In [6]:
#pip install transformers

In [7]:
dataset.with_format("torch")

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 74004228
    })
})

#### slicing

In [8]:
dataset["train"][4]["text"]

'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .'

#### Standard dataloader - not working?

In [9]:
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset["train"], batch_size=2)
batch = next(iter(dataloader))
batch

{'text': ['usually , he would be tearing around the living room , playing with his toys .',
  'but just one look at a minion sent him practically catatonic .']}

#### Tokenizer - use pretrained, at least for prototype

In [10]:
# https://huggingface.co/docs/transformers/preprocessing
# https://huggingface.co/docs/transformers/main_classes/tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Choose an appropriate tokenizer

In [11]:
#tokenizer.model_max_length = SEQ_LEN # might not be correct in case of pretraining where we add CLS at the end, check that

In [12]:
tokenizer.truncation_side 

'right'

In [13]:
tokenizer.model_max_length # we might need to fixate this

512

In [14]:
text = "hi i am moritz, who are you ?"#["hi i am moritz", "no you are not moritz, you are kevin"]
encoded_input = tokenizer(text)#,padding=True, truncation=True)
# , return_tensors='pt') use this for pt tensors
encoded_input

{'input_ids': [101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
encoded_input["input_ids"]

[101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102]

In [16]:
tokenizer.decode(encoded_input["input_ids"])

'[CLS] hi i am moritz, who are you? [SEP]'

In [17]:
tokenizer.mask_token

'[MASK]'

In [18]:
tokenizer.vocab['[MASK]']

103

#### Custom Dataset

In [19]:
from torch.utils.data import Dataset, DataLoader
import random
import itertools
import torch

class Bookcorpus(Dataset):
    
    def __init__(self, tokenizer, seq_len=64):
        self.dataset = load_dataset("bookcorpus")["train"]
        self.nrows = len(self.dataset) 
        self.tokenizer = tokenizer
        self.seq_len = seq_len

    def __len__(self):
        return self.nrows

    def __getitem__(self, item): # TODO Where is truncation if sequence is to long? How is ensured that both sentences fit into the sequence?
        
        # Step 1: get random sentence pair, either negative or positive (saved as is_next_label)
        s1, s2, is_next_label = self.get_sent(item)
        
        # Step 2: replace random words in EACH sentence with mask / random words # copied 
        t1_random, t1_label = self.random_word(s1)
        t2_random, t2_label = self.random_word(s2)
        
        # Step 3: Adding CLS and SEP tokens to the start and end of sentences # copied 
         # Adding PAD token for labels
        t1 = [self.tokenizer.vocab['[CLS]']] + t1_random + [self.tokenizer.vocab['[SEP]']]
        t2 = t2_random + [self.tokenizer.vocab['[SEP]']]
        t1_label = [self.tokenizer.vocab['[PAD]']] + t1_label + [self.tokenizer.vocab['[PAD]']]
        t2_label = t2_label + [self.tokenizer.vocab['[PAD]']]

        # Step 4: combine sentence 1 and 2 as one input # copied 
        # adding PAD tokens to make the sentence same length as seq_len
        segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]
        padding = [self.tokenizer.vocab['[PAD]'] for _ in range(self.seq_len - len(bert_input))]
        bert_input.extend(padding), bert_label.extend(padding), segment_label.extend(padding)

        output = {"bert_input": bert_input,
                  "bert_label": bert_label,
                  "segment_label": segment_label,
                  "is_next": is_next_label}

        return {key: torch.tensor(value) for key, value in output.items()}
        
        
        
        #return  {"s1":s1, "s2":s2, "is_next_label":is_next_label}
        #return {"t1_random":t1_random, "t1_label":t1_label, "t2_random":t2_random, "t2_label":t2_label}
    
    def get_sent(self, index): #selfmade
        '''gets sentence pair as dicitinary s1, s2, isNext'''
        isNext = random.random() > 0.5
        
        t1 = self.dataset[index]["text"]
        if isNext:
            t2 = self.dataset[index+1]["text"]
            return t1, t2, 1
        else:
            t2 = self.get_random_line(index+1)["text"]
            return t1, t2, 0
        
    def get_random_line(self, excludedIndex): #selfmade
        '''return random single sentence excluding'''
        randIndex = random.randint(1, self.__len__())
            
        # ensure that randIndex is not next sentence
        while randIndex == excludedIndex:
            randIndex = random.randint(1, self.__len__())
        
        return self.dataset[randIndex]

    def random_word(self, sentence): #copied
        tokens = sentence.split()
        output_label = []
        output = []

        # 15% of the tokens would be replaced
        for i, token in enumerate(tokens):
            prob = random.random()

            # remove cls and sep token
            token_id = self.tokenizer(token)['input_ids'][1:-1]

            if prob < 0.15:
                prob /= 0.15

                # 80% chance change token to mask token
                if prob < 0.8:
                    for i in range(len(token_id)):
                        output.append(self.tokenizer.vocab['[MASK]'])

                # 10% chance change token to random token
                elif prob < 0.9:
                    for i in range(len(token_id)):
                        output.append(random.randrange(len(self.tokenizer.vocab)))

                # 10% chance change token to current token
                else:
                    output.append(token_id)

                output_label.append(token_id)

            else:
                output.append(token_id)
                for i in range(len(token_id)):
                    output_label.append(0)

        # flattening
        output = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output]))
        output_label = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output_label]))
        assert len(output) == len(output_label)
        #assert len(output) == self.seq_len, "sequence length not fixed! "+str(len(output)) # from moritz
        return output, output_label

#### Testing the Dataset and Dataloaders

In [20]:
test = Bookcorpus(tokenizer)

  table = cls._concat_blocks(blocks, axis=0)


In [21]:
dl = DataLoader(test,batch_size=2,shuffle=False)

In [22]:
#is sewuence length fixed?
"""for i in range(1,1000):
    batch = next(iter(dl))
    for j in range(1,2): # batchsize
        length_ = len(batch["bert_input"][j])
        #print(length_)
        assert length_==SEQ_LEN, "sequence size is not "+str(SEQ_LEN)+": "+ str(length_)"""

'for i in range(1,1000):\n    batch = next(iter(dl))\n    for j in range(1,2): # batchsize\n        length_ = len(batch["bert_input"][j])\n        #print(length_)\n        assert length_==SEQ_LEN, "sequence size is not "+str(SEQ_LEN)+": "+ str(length_)'

In [23]:
batch = next(iter(dl))
batch

{'bert_input': tensor([[  101,  2788,  1010,  2002,  2052,   103, 13311,  2105,  1996,  2542,
           2282,  1010,  2652,   103,  2010,   103,  1012,   102,  2002,  2106,
           2025,   103,  2040,   103,  3344, 11249,  1996,  4028,  1010, 17726,
           2002,  2354,  1996, 10984,  2798,  2001,  3331,  2055,  2001,  8321,
           1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0],
         [  101,  2021,  2074,  2028,  2298,  2012,  1037,  7163,  2239,   103,
           2032,  8134,  4937, 22436,  2594,   103,   102,   103,   103,  2042,
          12756,  1005,  1055,  2933,  2043,  2016,  2288,  2032,  5102,  3041,
           1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,  

In [24]:
from itertools import chain
flattened = list(chain(*((batch["bert_input"]))))
tokenizer.decode(flattened)

"[CLS] usually, he would [MASK] tearing around the living room, playing [MASK] his [MASK]. [SEP] he did not [MASK] who [MASK] carried illustrations the murder,evic he knew the timing charles was talking about was accurate. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [CLS] but just one look at a minion [MASK] him practically catatonic [MASK] [SEP] [MASK] [MASK] been megan's plan when she got him dressed earlier. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

## Finetuning - download from kaggle and extract in finetuning folder. Delete the zips


In [25]:
toxic_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\finetuning\kaggle-toxic_comment"
toxic_dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)
toxic_dataset

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 159571
    })
    test: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 63978
    })
})

#### Test with standard dataloader

In [26]:
from torch.utils.data import DataLoader
dataloader = DataLoader(toxic_dataset["train"], batch_size=1, shuffle = True)
batch = next(iter(dataloader))
batch

{'comment_text': ["Brahma Kumaris article \n\nHi Jossi. Hope you are well. As someone who has some familiarity with the article, although I know you haven't been involved with it for a good while, I was hoping you might be able to stop by and have a look at the current discussion on the external links wording in the article. I am in conflict with two other editors - I think we all have POVs about the wording. Hence, I am trying to bring in indepedent editors/admin to give their comments. Your thoughts on the wording, but also the websites that are being linked to (and whether they are appropriate or not) would be of great value. Regards"],
 'toxic': tensor([0]),
 'severe_toxic': tensor([0]),
 'obscene': tensor([0]),
 'threat': tensor([0]),
 'insult': tensor([0]),
 'identity_hate': tensor([0])}

In [27]:
encoded_input = tokenizer(batch["comment_text"])
encoded_input

{'input_ids': [[101, 11655, 22444, 9600, 2483, 3720, 7632, 8183, 18719, 1012, 3246, 2017, 2024, 2092, 1012, 2004, 2619, 2040, 2038, 2070, 24666, 2007, 1996, 3720, 1010, 2348, 1045, 2113, 2017, 4033, 1005, 1056, 2042, 2920, 2007, 2009, 2005, 1037, 2204, 2096, 1010, 1045, 2001, 5327, 2017, 2453, 2022, 2583, 2000, 2644, 2011, 1998, 2031, 1037, 2298, 2012, 1996, 2783, 6594, 2006, 1996, 6327, 6971, 2773, 2075, 1999, 1996, 3720, 1012, 1045, 2572, 1999, 4736, 2007, 2048, 2060, 10195, 1011, 1045, 2228, 2057, 2035, 2031, 13433, 15088, 2055, 1996, 2773, 2075, 1012, 6516, 1010, 1045, 2572, 2667, 2000, 3288, 1999, 27427, 13699, 14728, 3372, 10195, 1013, 4748, 10020, 2000, 2507, 2037, 7928, 1012, 2115, 4301, 2006, 1996, 2773, 2075, 1010, 2021, 2036, 1996, 11744, 2008, 2024, 2108, 5799, 2000, 1006, 1998, 3251, 2027, 2024, 6413, 2030, 2025, 1007, 2052, 2022, 1997, 2307, 3643, 1012, 12362, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [28]:
from itertools import chain
flattened = list(chain(*(encoded_input["input_ids"])))
tokenizer.decode(flattened)

"[CLS] brahma kumaris article hi jossi. hope you are well. as someone who has some familiarity with the article, although i know you haven't been involved with it for a good while, i was hoping you might be able to stop by and have a look at the current discussion on the external links wording in the article. i am in conflict with two other editors - i think we all have povs about the wording. hence, i am trying to bring in indepedent editors / admin to give their comments. your thoughts on the wording, but also the websites that are being linked to ( and whether they are appropriate or not ) would be of great value. regards [SEP]"

#### Custom Dataset

In [58]:
class ToxicComment(Dataset):
    
    def __init__(self, tokenizer, seq_len=64, split="train",):
        
        if not split in ["train","test"]:
            raise ValueError("Parameter has to be 'train' or 'test'")
        
        self.dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)[split]
        self.nrows = len(self.dataset) 
        self.tokenizer = tokenizer
        self.seq_len = seq_len

    def __len__(self):
        return self.nrows

    def __getitem__(self, item):
        
        # Step 1: get row
        output = self.dataset[item]
        #print(output)
        
        # Step 2: tokenize comment
        output["bert_input"] = tokenizer(
            output["comment_text"],
            max_length=self.seq_len ,
            padding="max_length", 
            truncation=True, 
            return_tensors='pt'
        )["input_ids"]
        
        output.pop("comment_text") #delete raw text
        
        # Step 3: add bert_label and segment_label like in pretraining task for consistency TODO: Correct?
        output["bert_label"] = torch.zeros(self.seq_len)
        output["segment_label"] = torch.ones(self.seq_len)
        
        # Step 4: collect different labels to one tensor 
        # TODO: necessary?
        
        return output

    
    def get_sent(self, index): #selfmade
        '''gets sentence pair as dicitinary s1, s2, isNext'''
        isNext = random.random() > 0.5
        
        t1 = self.dataset[index]["text"]
        if isNext:
            t2 = self.dataset[index+1]["text"]
            return t1, t2, 1
        else:
            t2 = self.get_random_line(index+1)["text"]
            return t1, t2, 0
        

In [60]:
test2 = ToxicComment(tokenizer, seq_len=SEQ_LEN, split = "train")
dl2 = DataLoader(test2,batch_size=10,shuffle=False)
next(iter(dl2))

{'toxic': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'severe_toxic': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'obscene': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'threat': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'insult': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'identity_hate': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'bert_input': tensor([[[  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
           18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
            1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
            3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
            1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
            1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
            6486,  1012, 16327,   102]],
 
         [[  101,  1040,  1005, 22091,  2860,   999,  2002,  3503,  2023,  4281,
            6120,  1045,  1005,  1049,  9428,  5881,  200

In [31]:
len(next(iter(dl2)))

1

In [32]:
len(next(iter(dl))["bert_input"][0])

64