# TODO
* Ensure that during pre training, both sentences fit into the model at the same time -> DONE but not teted
* think about visualizations for text, preprocessing text, etc.
* implement interfaces of the task sheet
* cleaner code

In [1]:
#pip install datasets
import datasets

In [2]:
SEQ_LEN = 64

## Pretraining

### bookcorpus

In [3]:
# Download + load data from cache or online AUTOMATICALLY
# https://huggingface.co/docs/datasets/loading#slice-splits


from datasets import load_dataset

dataset = load_dataset("bookcorpus") # alternative, less size datasets.load_dataset("bookcorpus", split="train[:10%]")
# split="train[10:20]")
# saved here on windows C:\Users\morit\.cache\huggingface

print(dataset)

  table = cls._concat_blocks(blocks, axis=0)


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 74004228
    })
})


#### Saving huggingface Dataset on disk

In [4]:
# manual save to disk

#folder_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\pretraining"
#full_path = folder_path+r"\bookcorpus"

#dataset.save_to_disk(full_path)

#### Loading hf dataset from disk

In [5]:
# manual load from disk

#dataset = datasets.load_dataset(full_path)

#### slicing hf dataset

In [6]:
dataset["train"][4]["text"]

'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .'

#### Standard dataloader - not sufficient we need tokenized output -> implement own dataset classes

In [7]:
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset["train"], batch_size=2)
batch = next(iter(dataloader))
batch

{'text': ['usually , he would be tearing around the living room , playing with his toys .',
  'but just one look at a minion sent him practically catatonic .']}

#### Tokenizer - use pretrained, at least for prototype

In [8]:
# https://huggingface.co/docs/transformers/preprocessing
# https://huggingface.co/docs/transformers/main_classes/tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Choose an appropriate tokenizer

In [9]:
#tokenizer.model_max_length = SEQ_LEN # might not be correct in case of pretraining where we add CLS at the end, check that

#### Tokenizer parameters

In [10]:
tokenizer.truncation_side 

'right'

In [11]:
tokenizer.model_max_length # we might need to fixate this

512

In [12]:
tokenizer.mask_token

'[MASK]'

In [13]:
tokenizer.vocab['[MASK]']

103

#### Tokenizer example usage

In [14]:
text = "hi i am moritz, who are you ?"#["hi i am moritz", "no you are not moritz, you are kevin"]
encoded_input = tokenizer(text)#,padding=True, truncation=True)
# , return_tensors='pt') use this for pt tensors
encoded_input

{'input_ids': [101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
encoded_input["input_ids"]

[101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102]

In [16]:
tokenizer.decode(encoded_input["input_ids"])

'[CLS] hi i am moritz, who are you? [SEP]'

#### Custom Dataset

In [17]:
n_rows = None
n_rows is None

True

In [18]:
from torch.utils.data import Dataset, DataLoader
import random
import itertools
import torch

class Bookcorpus(Dataset): # TODO rewrite 
    
    def __init__(self, tokenizer, seq_len=64, split="train", n_rows=None):
        """
        n_rows None means take the whole dataset
        """
        
        if not split in ["train"]:
            raise ValueError("For Bookcorpus there is only a train split")
            
        if n_rows is not None:
            self.dataset = load_dataset("bookcorpus", split=split+"[0:"+str(n_rows)+"]")#[split]
        else:
            self.dataset = load_dataset("bookcorpus")#[split]
            
        self.n_rows = len(self.dataset) 
        self.tokenizer = tokenizer
        self.seq_len = seq_len

    def __len__(self):
        return self.n_rows

    def __getitem__(self, item): # TODO Where is truncation if sequence is to long? How is ensured that both sentences fit into the sequence?
        
        # Step 1: get random sentence pair, either negative or positive (saved as is_next_label)
        s1, s2, is_next_label = self.get_sent(item)
        
        # Step 2: replace random words in EACH sentence with mask / random words # copied 
        t1_random, t1_label = self.random_word(s1)
        t2_random, t2_label = self.random_word(s2)
        
        # Step 3: Adding CLS and SEP tokens to the start and end of sentences # copied 
         # Adding PAD token for labels
        t1 = [self.tokenizer.vocab['[CLS]']] + t1_random + [self.tokenizer.vocab['[SEP]']]
        t2 = t2_random + [self.tokenizer.vocab['[SEP]']]
        t1_label = [self.tokenizer.vocab['[PAD]']] + t1_label + [self.tokenizer.vocab['[PAD]']]
        t2_label = t2_label + [self.tokenizer.vocab['[PAD]']]

        # Step 4: combine sentence 1 and 2 as one input # copied 
        # adding PAD tokens to make the sentence same length as seq_len
        segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]
        padding = [self.tokenizer.vocab['[PAD]'] for _ in range(self.seq_len - len(bert_input))]
        bert_input.extend(padding), bert_label.extend(padding), segment_label.extend(padding)

        output = {"bert_input": bert_input,
                  "bert_label": bert_label,
                  "segment_label": segment_label,
                  "is_next": is_next_label}

        return {key: torch.tensor(value) for key, value in output.items()}
        
        
        
        #return  {"s1":s1, "s2":s2, "is_next_label":is_next_label}
        #return {"t1_random":t1_random, "t1_label":t1_label, "t2_random":t2_random, "t2_label":t2_label}
    
    def get_sent(self, index): #selfmade
        '''gets sentence pair as dicitinary s1, s2, isNext'''
        isNext = random.random() > 0.5
        
        t1 = self.dataset[index]["text"]
        if isNext:
            t2 = self.dataset[index+1]["text"]
            return t1, t2, 1
        else:
            t2 = self.get_random_line(index+1)["text"]
            return t1, t2, 0
        
    def get_random_line(self, excludedIndex): #selfmade
        '''return random single sentence excluding'''
        randIndex = random.randint(1, self.__len__())
            
        # ensure that randIndex is not next sentence
        while randIndex == excludedIndex:
            randIndex = random.randint(1, self.__len__())
        
        return self.dataset[randIndex]

    def random_word(self, sentence): #copied
        tokens = sentence.split()
        output_label = []
        output = []

        # 15% of the tokens would be replaced
        for i, token in enumerate(tokens):
            prob = random.random()

            # remove cls and sep token
            token_id = self.tokenizer(token)['input_ids'][1:-1]

            if prob < 0.15:
                prob /= 0.15

                # 80% chance change token to mask token
                if prob < 0.8:
                    for i in range(len(token_id)):
                        output.append(self.tokenizer.vocab['[MASK]'])

                # 10% chance change token to random token
                elif prob < 0.9:
                    for i in range(len(token_id)):
                        output.append(random.randrange(len(self.tokenizer.vocab)))

                # 10% chance change token to current token
                else:
                    output.append(token_id)

                output_label.append(token_id)

            else:
                output.append(token_id)
                for i in range(len(token_id)):
                    output_label.append(0)

        # flattening
        output = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output]))
        output_label = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output_label]))
        assert len(output) == len(output_label)
        #assert len(output) == self.seq_len, "sequence length not fixed! "+str(len(output)) # from moritz
        return output, output_label

#### Testing the Dataset and Dataloaders

In [19]:
test = Bookcorpus(tokenizer, n_rows = 100)
len(test)

100

In [20]:
dl = DataLoader(test,batch_size=2,shuffle=False)

In [21]:
#is sewuence length fixed?
"""for i in range(1,1000):
    batch = next(iter(dl))
    for j in range(1,2): # batchsize
        length_ = len(batch["bert_input"][j])
        #print(length_)
        assert length_==SEQ_LEN, "sequence size is not "+str(SEQ_LEN)+": "+ str(length_)"""

'for i in range(1,1000):\n    batch = next(iter(dl))\n    for j in range(1,2): # batchsize\n        length_ = len(batch["bert_input"][j])\n        #print(length_)\n        assert length_==SEQ_LEN, "sequence size is not "+str(SEQ_LEN)+": "+ str(length_)'

In [22]:
batch = next(iter(dl))
batch

{'bert_input': tensor([[  101,  2788,  1010,  2002,  2052,  2022, 13311,  2105,  1996,  2542,
           2282,  1010,  2652,   103,  2010, 10899,  1012,   102,  2044,  2008,
            103,  2002,  2018, 18397,   694,  2686,  2042,  4699,  1999,  2151,
           1997,  1996,  4620,   103, 22028, 12756,  2741,  1012,   102,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0],
         [  101,  2021,  2074,  2028,  2298,  2012,   103,  7163,  2239,  2741,
           2032,  8134,  4937, 22436,  2594,  1012,   102,  2016,  2001,  1050,
           1005,  1056,  4527,  2000,  2156,  2027,  2018,  3369,  2431,  2019,
           3178,  2077,  1996, 18336,  2318,  1012,   102,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,  

#### Visualize encoded sequence

In [23]:
from itertools import chain
flattened = list(chain(*((batch["bert_input"]))))
tokenizer.decode(flattened)

"[CLS] usually, he would be tearing around the living room, playing [MASK] his toys. [SEP] after that [MASK] he had digitally [unused689] space been interested in any of the pictures [MASK] emails megan sent. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [CLS] but just one look at [MASK] minion sent him practically catatonic. [SEP] she wasn't surprised to see they had arrived half an hour before the baptism started. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

## Finetuning

Cant be downloaded automatically from huggingface. Needs to be downloaded manually:

1) download from kaggle and 
2) extract in finetuning folder 
3) Delete the zips

In [24]:
toxic_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\finetuning\kaggle-toxic_comment"
toxic_dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)
toxic_dataset

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 159571
    })
    test: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 63978
    })
})

#### Test with standard dataloader

In [25]:
from torch.utils.data import DataLoader
dataloader = DataLoader(toxic_dataset["train"], batch_size=1, shuffle = True)
batch = next(iter(dataloader))
batch

{'comment_text': ['All Grown Up\n\nPlease stop purposefully adding nonsense to Wikipedia.  You have been reported to Wikipedia:Vandalism in progress  19:27, Jun 16, 2005 (UTC)'],
 'toxic': tensor([0]),
 'severe_toxic': tensor([0]),
 'obscene': tensor([0]),
 'threat': tensor([0]),
 'insult': tensor([0]),
 'identity_hate': tensor([0])}

#### Standard Tokenizer not sufficient, padding is missing and probably also truncation

In [26]:
encoded_input = tokenizer(batch["comment_text"])
encoded_input

{'input_ids': [[101, 2035, 4961, 2039, 3531, 2644, 3800, 7699, 5815, 14652, 2000, 16948, 1012, 2017, 2031, 2042, 2988, 2000, 16948, 1024, 3158, 9305, 2964, 1999, 5082, 2539, 1024, 2676, 1010, 12022, 2385, 1010, 2384, 1006, 11396, 1007, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [27]:
from itertools import chain
flattened = list(chain(*(encoded_input["input_ids"])))
tokenizer.decode(flattened)

'[CLS] all grown up please stop purposefully adding nonsense to wikipedia. you have been reported to wikipedia : vandalism in progress 19 : 27, jun 16, 2005 ( utc ) [SEP]'

#### Custom Dataset

In [39]:
class ToxicComment(Dataset):
    
    def __init__(self, tokenizer, seq_len=64, split="train", n_rows:int=None):
        
        if not split in ["train","test"]:
            raise ValueError("Parameter has to be 'train' or 'test'")
            
        if n_rows is not None:
            self.dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path, split=split+"[0:"+str(n_rows)+"]")#[split]
        else:
            self.dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)#[split]
        
        
        self.nrows = len(self.dataset) 
        self.tokenizer = tokenizer
        self.seq_len = seq_len

    def __len__(self):
        return self.nrows

    def __getitem__(self, item):
        
        # Step 1: get row
        output = self.dataset[item]
        #print(output)
        
        # Step 2: tokenize comment
        output["bert_input"] = tokenizer(
            output["comment_text"],
            max_length=self.seq_len ,
            padding="max_length", 
            truncation=True, 
            return_tensors='pt'
        )["input_ids"]
        
        output.pop("comment_text") #delete raw text
        
        # Step 3: add bert_label and segment_label like in pretraining task for consistency TODO: Correct?
        output["bert_label"] = torch.zeros(self.seq_len)
        output["segment_label"] = torch.ones(self.seq_len)
        
        # Step 4: collect different labels to one tensor 
        # TODO: desired?
        
        return output

    
    def get_sent(self, index): #selfmade
        '''gets sentence pair as dicitinary s1, s2, isNext'''
        isNext = random.random() > 0.5
        
        t1 = self.dataset[index]["text"]
        if isNext:
            t2 = self.dataset[index+1]["text"]
            return t1, t2, 1
        else:
            t2 = self.get_random_line(index+1)["text"]
            return t1, t2, 0
        

#### Test Dataset

In [40]:
test2 = ToxicComment(tokenizer=tokenizer, seq_len=SEQ_LEN, split = "train", n_rows = 100)
len(test2)

100

In [41]:
dl2 = DataLoader(test2,batch_size=10,shuffle=False)
next(iter(dl2))

{'toxic': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'severe_toxic': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'obscene': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'threat': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'insult': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'identity_hate': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'bert_input': tensor([[[  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
           18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
            1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
            3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
            1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
            1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
            6486,  1012, 16327,   102]],
 
         [[  101,  1040,  1005, 22091,  2860,   999,  2002,  3503,  2023,  4281,
            6120,  1045,  1005,  1049,  9428,  5881,  200

In [42]:
len(next(iter(dl2)))

9

In [43]:
len(next(iter(dl))["bert_input"][0])

64

## Functions for report

In [32]:
"""class BertTokenizer():
    def __init__(self, task_type="pretrain"):
        if not task_type in ["pretrain", "text_classification_multi"]:
            raise ValueError("task not implemented")
        pass
    
    def __call__()"""
# i noticed we dont need any callable class to do transformation on the datasets since everything is handeled by our dataloaders
# ie we dont need rescaling etc.
# maybe ask supervisor if we need to save back the tokenized text or if it is okay to do it on the fly and leave the load_data transformation parameter at None

'class BertTokenizer():\n    def __init__(self, task_type="pretrain"):\n        if not task_type in ["pretrain", "text_classification_multi"]:\n            raise ValueError("task not implemented")\n        pass\n    \n    def __call__()'

In [33]:
def load_data(dataset:str, transformation=None, n_train:int, n_test:int=None): # transformation callable
    if dataset == "bookcorpus":
        
        
        pass
    elif dataset == "jigsaw_toxicity_pred":
        ToxicComment(tokenizer)
        pass
    else:
        pass


    
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train, test = load_data("jigsaw_toxicity_pred", transformation=tokenizer, n_train=1000, n_test=100)

SyntaxError: non-default argument follows default argument (1178960395.py, line 1)

In [None]:
def show(x, outfile:str=None): # can have more args