In [2]:
#pip install datasets
import datasets

## Pretraining

### bookcorpus

In [3]:
# Load the BookCorpus dataset. `load_dataset` takes the data from the local
# Hugging Face cache when it is there and downloads it otherwise.
# Split-slicing reference: https://huggingface.co/docs/datasets/loading#slice-splits


from datasets import load_dataset

# Smaller alternative (loads only a slice of the train split):
#   datasets.load_dataset("bookcorpus", split="train[:10%]")
dataset = load_dataset("bookcorpus")
# On Windows the cache ended up under C:\Users\morit\.cache\huggingface

print(dataset)

  table = cls._concat_blocks(blocks, axis=0)


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 74004228
    })
})


#### Saving

In [4]:
# manual save to disk

#folder_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\pretraining"
#full_path = folder_path+r"\bookcorpus"

#dataset.save_to_disk(full_path)

#### Loading

In [5]:
# manual load from disk (data written with save_to_disk is read back with load_from_disk)

#dataset = datasets.load_from_disk(full_path)

#### Tests with torch

In [7]:
#pip install transformers

In [1]:
# Load the pretrained tokenizer of the "bert-base-uncased" checkpoint.
# NOTE: the `tokenizer` name is rebound further down by the BertTokenizer cell.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Choose an appropriate tokenizer

In [6]:
# Show the dataset with the "torch" output format applied.
# NOTE(review): the returned object is only displayed, not assigned; presumably
# `dataset` itself keeps its previous format — confirm, and assign the result
# if later cells are meant to use the torch format.
dataset.with_format("torch")

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 74004228
    })
})

#### slicing

In [36]:
# Row access: index the train split by position, then read the row's "text" field.
dataset["train"][4]["text"]

'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .'

#### Standard dataloader - not working?

In [34]:
# Wrap the raw train split directly in a PyTorch DataLoader, two rows per batch.
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset["train"], batch_size=2)

In [35]:
# Pull the first batch. As the output below shows, it is a dict whose "text"
# entry is a list of raw (untokenized) sentences.
batch = next(iter(dataloader))
batch

{'text': ['usually , he would be tearing around the living room , playing with his toys .',
  'but just one look at a minion sent him practically catatonic .']}

#### Tokenizer - use pretrained, at least for prototype

In [143]:
# Use the pretrained BERT tokenizer for the prototype. This rebinds the
# `tokenizer` name that an earlier cell created with AutoTokenizer.
# References:
#   https://huggingface.co/docs/transformers/preprocessing
#   https://huggingface.co/docs/transformers/main_classes/tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Choose an appropriate tokenizer

In [144]:
# Tokenize one example sentence and display the encoding.
# Alternative input that was tried (a list of sentences):
#   ["hi i am moritz", "no you are not moritz, you are kevin"]
text = "hi i am moritz, who are you ?"
# Optional tokenizer arguments that were tried: padding=True, truncation=True,
# and return_tensors='pt' (use this for PyTorch tensors).
encoded_input = tokenizer(text)
encoded_input

{'input_ids': [101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [139]:
# Token ids only; the decode cell shows that the first/last ids (101, 102) are [CLS] and [SEP].
encoded_input["input_ids"]

[101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102]

In [140]:
# Map the ids back to text; the special tokens appear in the decoded string.
tokenizer.decode(encoded_input["input_ids"])

'[CLS] hi i am moritz, who are you? [SEP]'

In [145]:
# String form of the tokenizer's mask token (used for masked-language-model inputs).
tokenizer.mask_token

'[MASK]'

#### Custom Dataset

In [167]:
from torch.utils.data import Dataset, DataLoader
import random
import itertools
import torch

class Bookcorpus(Dataset):
    """BookCorpus sentence pairs prepared for BERT-style pretraining.

    Every item combines two sentences into one fixed-length example with
    masked-language-model targets and a next-sentence label.

    Args:
        tokenizer: callable tokenizer exposing a ``vocab`` mapping that
            contains '[CLS]', '[SEP]', '[PAD]' and '[MASK]'. Calling it on a
            string must return a dict whose 'input_ids' list is wrapped in one
            leading and one trailing special token.
        seq_len: length every returned sequence is truncated / padded to.
    """

    def __init__(self, tokenizer, seq_len=64):
        self.dataset = load_dataset("bookcorpus")["train"]
        self.nrows = len(self.dataset)
        self.tokenizer = tokenizer
        self.seq_len = seq_len

    def __len__(self):
        """Number of rows (sentences) in the underlying dataset."""
        return self.nrows

    def __getitem__(self, item):
        """Build the training example whose first sentence is row ``item``.

        Returns:
            dict of ``torch.Tensor``:
                bert_input    -- token ids, length ``seq_len``
                bert_label    -- original ids at masked positions, 0 elsewhere
                segment_label -- 1 for sentence one, 2 for sentence two, 0 for padding
                is_next       -- 1 if sentence two really follows sentence one, else 0
        """
        # Step 1: sentence pair, either consecutive (positive) or random (negative).
        s1, s2, is_next_label = self.get_sent(item)

        # Step 2: mask / corrupt random words in EACH sentence.
        t1_random, t1_label = self.random_word(s1)
        t2_random, t2_label = self.random_word(s2)

        vocab = self.tokenizer.vocab
        cls_id, sep_id, pad_id = vocab['[CLS]'], vocab['[SEP]'], vocab['[PAD]']

        # Step 3: add [CLS]/[SEP]. Special tokens are never prediction targets,
        # so they get label 0 -- the same "no target" value random_word uses.
        t1 = [cls_id] + t1_random + [sep_id]
        t2 = t2_random + [sep_id]
        t1_label = [0] + t1_label + [0]
        t2_label = t2_label + [0]

        # Step 4: concatenate and cut to seq_len.
        # NOTE: for long pairs the cut can remove the trailing [SEP].
        segment_label = ([1] * len(t1) + [2] * len(t2))[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]

        # Pad to seq_len: [PAD] for the tokens, 0 for labels and segment ids
        # (those two are not token ids, so they must not depend on the PAD id).
        n_pad = self.seq_len - len(bert_input)
        bert_input.extend([pad_id] * n_pad)
        bert_label.extend([0] * n_pad)
        segment_label.extend([0] * n_pad)

        output = {"bert_input": bert_input,
                  "bert_label": bert_label,
                  "segment_label": segment_label,
                  "is_next": is_next_label}

        return {key: torch.tensor(value) for key, value in output.items()}

    def get_sent(self, index):
        """Return ``(sentence_1, sentence_2, is_next)`` for row ``index``.

        With probability ~0.5 sentence_2 is the row that follows ``index``
        (is_next = 1); otherwise it is a random row that is not the following
        one (is_next = 0). The last row has no successor, so it always yields
        a negative pair instead of indexing past the end of the dataset.
        """
        t1 = self.dataset[index]["text"]
        has_next = index + 1 < len(self)

        if has_next and random.random() > 0.5:
            return t1, self.dataset[index + 1]["text"], 1
        return t1, self.get_random_line(index + 1)["text"], 0

    def get_random_line(self, excludedIndex):
        """Return a uniformly random row whose index differs from ``excludedIndex``.

        Raises:
            ValueError: if the dataset is empty, or if its only row is the
                excluded one.
        """
        n = len(self)
        if 0 <= excludedIndex < n:
            if n < 2:
                raise ValueError("no row left to sample after excluding the only row")
            # Draw from the n-1 admissible rows and step over the excluded one;
            # this needs no retry loop and covers every valid index incl. 0.
            randIndex = random.randrange(n - 1)
            if randIndex >= excludedIndex:
                randIndex += 1
        else:
            # Nothing to exclude inside the valid range.
            randIndex = random.randrange(n)

        return self.dataset[randIndex]

    def random_word(self, sentence):
        """Apply BERT-style random masking to a whitespace-split sentence.

        Each word is selected with probability 0.15. A selected word has all
        its sub-token ids replaced by [MASK] (80%), by random vocabulary ids
        (10%), or kept unchanged (10%).

        Returns:
            (output, output_label): two flat lists of equal length. ``output``
            holds the (possibly corrupted) token ids; ``output_label`` holds
            the original ids at selected positions and 0 everywhere else.
        """
        mask_id = self.tokenizer.vocab['[MASK]']
        vocab_size = len(self.tokenizer.vocab)

        output = []
        output_label = []

        for token in sentence.split():
            prob = random.random()

            # Sub-token ids of this word, without the wrapping special tokens.
            token_id = self.tokenizer(token)['input_ids'][1:-1]

            if prob < 0.15:
                # Rescale to [0, 1) to choose the kind of corruption.
                prob /= 0.15

                if prob < 0.8:
                    # 80%: replace by the mask token
                    output.extend([mask_id] * len(token_id))
                elif prob < 0.9:
                    # 10%: replace by random vocabulary ids
                    output.extend(random.randrange(vocab_size) for _ in token_id)
                else:
                    # 10%: keep the original ids
                    output.extend(token_id)

                output_label.extend(token_id)
            else:
                output.extend(token_id)
                output_label.extend([0] * len(token_id))

        assert len(output) == len(output_label)
        return output, output_label

#### Testing the Dataset and Dataloaders

In [168]:
# Instantiate the custom dataset with the default seq_len (64); this loads the
# bookcorpus train split inside the constructor.
test = Bookcorpus(tokenizer)

In [173]:
# Batch the custom dataset in its stored order, 128 examples per batch.
dl = DataLoader(test, batch_size=128, shuffle=False)

In [174]:
# Fetch the first batch: a dict of tensors keyed bert_input, bert_label,
# segment_label and is_next.
next(iter(dl))

{'bert_input': tensor([[  101,  2788,  1010,  ...,     0,     0,     0],
         [  101,  2021,  2074,  ...,     0,     0,     0],
         [  101,  2008,  2018,  ...,     0,     0,     0],
         ...,
         [  101,  2002,  2018,  ...,     0,     0,     0],
         [  101,  9036, 15235,  ...,     0,     0,     0],
         [  101,  1036,  1036,  ...,     0,     0,     0]]),
 'bert_label': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'segment_label': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'is_next': tensor([0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
         1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,

In [175]:
# Speed test: draw several consecutive batches from the loader.
# A single iterator is created up front and advanced in the loop. Calling
# `next(iter(dl))` inside the loop would build a fresh iterator on every pass
# and, with shuffle=False, fetch the same first batch again and again.
N_SPEED_TEST_BATCHES = 9  # same number of fetches as the previous range(1, 10)

batch_iter = iter(dl)
for _ in range(N_SPEED_TEST_BATCHES):
    next(batch_iter)

## Finetuning
