# TODO
* Ensure that during pre training, both sentences fit into the model at the same time -> DONE but not teted
* think about visualizations for text, preprocessing text, etc.
* implement interfaces of the task sheet
* cleaner code

In [1]:
#pip install datasets
import datasets

In [2]:
SEQ_LEN = 64 # maximum sequence length
VOCAB_SIZE = 30522  # = len(tokenizer.vocab)
N_SEGMENTS = 3 # number of segmentation labels
EMBED_SIZE = 768 # size of embedding vector
DROPOUT = 0.1 # dropout chance

## Pretraining

### bookcorpus

In [3]:
# Download + load data from cache or online AUTOMATICALLY
# https://huggingface.co/docs/datasets/loading#slice-splits


from datasets import load_dataset

dataset = load_dataset("bookcorpus") # alternative, less size datasets.load_dataset("bookcorpus", split="train[:10%]")
# split="train[10:20]")
# saved here on windows C:\Users\morit\.cache\huggingface

print(dataset)

Found cached dataset bookcorpus (C:/Users/Johannes/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 74004228
    })
})


#### Saving huggingface Dataset on disk

In [7]:
# manual save to disk

#folder_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\pretraining"
#full_path = folder_path+r"\bookcorpus"

#dataset.save_to_disk(full_path)

#### Loading hf dataset from disk

In [8]:
# manual load from disk

#dataset = datasets.load_dataset(full_path)

#### slicing hf dataset

In [9]:
dataset["train"][66]["text"]

'her parents rattled along to each other as they made their way through the tree-lined suburbs where megan had grown up .'

#### Standard dataloader - not sufficient we need tokenized output -> implement own dataset classes

In [10]:
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset["train"], batch_size=2)
batch = next(iter(dataloader))
batch

{'text': ['usually , he would be tearing around the living room , playing with his toys .',
  'but just one look at a minion sent him practically catatonic .']}

#### Tokenizer - use pretrained, at least for prototype

In [11]:
# https://huggingface.co/docs/transformers/preprocessing
# https://huggingface.co/docs/transformers/main_classes/tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Choose an appropriate tokenizer

In [12]:
#tokenizer.model_max_length = SEQ_LEN # might not be correct in case of pretraining where we add CLS at the end, check that

#### Tokenizer parameters

In [13]:
tokenizer.truncation_side 

'right'

In [14]:
tokenizer.model_max_length # we might need to fixate this

512

In [15]:
tokenizer.mask_token

'[MASK]'

In [16]:
tokenizer.vocab['[MASK]']

103

#### Tokenizer example usage

In [17]:
text = "hi i am moritz, who are you ?"#["hi i am moritz", "no you are not moritz, you are kevin"]
encoded_input = tokenizer(text)#,padding=True, truncation=True)
# , return_tensors='pt') use this for pt tensors
encoded_input

{'input_ids': [101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
encoded_input["input_ids"]

[101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102]

In [19]:
tokenizer.decode(encoded_input["input_ids"])

'[CLS] hi i am moritz, who are you? [SEP]'

#### Custom Dataset

In [20]:
n_rows = None
n_rows is None

True

In [21]:
from torch.utils.data import Dataset, DataLoader
import random
import itertools
import torch

class Bookcorpus(Dataset): # TODO rewrite 
    
    def __init__(self, tokenizer, seq_len=64, split="train", n_rows=None):
        """
        n_rows == None means take the whole dataset
        """
     
        if not split in ["train"]:
            raise ValueError("For Bookcorpus there is only a train split")
            
        self.n_rows = n_rows # is only inititialized if __len__() is called
        self.tokenizer = tokenizer 
        self.seq_len = seq_len
        self.split = split
        self.dataset = None # only loaded id needed
    
    # apply lazy loading
    def load_memory(self):
        if self.n_rows is not None:
            self.dataset = load_dataset("bookcorpus", split=self.split+"[0:"+str(self.n_rows)+"]") # [split]
        else:
            self.dataset = load_dataset("bookcorpus") # [split]

    def __len__(self):
        if self.dataset is None:
            self.load_memory() # only loaded if required
        return len(self.dataset)

    def __getitem__(self, item): 
        if self.dataset is None:
            self.load_data() # only loaded if required
        
        # Create a random pair of sentences, if subseq is true if they are subsequent
        s1, s2, subseq = self.get_sentence_pair(item)
        
        # Replace 15% of the words in each line with masks/random words/the word itself
        s1_random, s1_label = self.random_masking(s1)
        s2_random, s2_label = self.random_masking(s2)
        
        # Step 3: Adding CLS and SEP tokens to the start and end of sentences # copied 
         # Adding PAD token for labels
        cls = [self.tokenizer.vocab['[CLS]']]
        sep = [self.tokenizer.vocab['[SEP]']] 
        pad = [self.tokenizer.vocab['[PAD]']]
        
        # append separating tokens to sequence       
        s1 = cls + s1_random + sep       
        s2 = s2_random + sep
        s1_label = pad + s1_label + pad
        s2_label = s2_label + pad
               
        # add segement label, adding padding
        segment = ([1 for i in range(len(s1))]+[2 for i in range(len(s2))])[:self.seq_len]
        # generate 1 input for model
        model_input = (s1+s2)[:self.seq_len]
        model_label = (s1_label + s2_label)[:self.seq_len]
        # add padding where input is shorter than sequence
        padding = [self.tokenizer.vocab['[PAD]'] for _ in range(self.seq_len - len(model_input))]
        model_input.extend(padding)
        model_label.extend(padding)
        segment.extend(padding)
        


        output = {
            "input": torch.tensor(model_input),
            "label": torch.tensor(model_label),
            "segment": torch.tensor(segment),
            "subseq": torch.tensor(subseq)
        }

        return {key: value.clone().detach() for key, value in output.items()}        
        #return  {"s1":s1, "s2":s2, "is_next_label":is_next_label}
        #return {"t1_random":t1_random, "t1_label":t1_label, "t2_random":t2_random, "t2_label":t2_label}
    
    def get_sentence_pair(self, index): 
        '''gets sentence pair as dicitinary s1, s2, isNext'''
        isNext = random.random() > 0.5 # if number > 0.5 isNext is positive
        
        t1 = self.dataset[index]["text"]
        if isNext: # select two subsequent lines
            t2 = self.dataset[index+1]["text"]
            return t1, t2, 1 # line1, line2, subsequent
        else: # select two non-Subsequent lines (index+1 is excluded from random selection)
            t2 = self.get_random_line(index+1)["text"]
            return t1, t2, 0 # line1, line2, subsequent
        
    def get_random_line(self, excludedIndex): 
        '''return random single sentence excluding'''
        randIndex = random.randint(1, self.__len__())
            
        # ensure that randIndex is not next sentence
        while randIndex == excludedIndex:
            randIndex = random.randint(1, self.__len__())
        
        return self.dataset[randIndex]
    
    def random_masking(self, sentence):
        words = sentence.split()
        masked_out = []
        masked_labels = []
        
        for i, word in enumerate(words):
            rnd_number1 = random.random() # continuous number from [0,1]
            rnd_number2 = random.random() # continuous number from [0,1]

            # turn word into token, remove [CLS], [SEP]
            token = self.tokenizer(word)['input_ids'] 
            token = token[1:-1]

            # replace a word with a probability of 15%
            if rnd_number1 < 0.15:

                # with 80% chance replace word by mask
                if rnd_number2 < 0.8:
                    for j in range(len(token)):
                        masked_out.append(self.tokenizer.vocab['[MASK]'])
                # with 10% chance replace word by random word
                elif rnd_number2 < 0.9:
                    for k in range(len(token)):
                        masked_out.append(random.randrange(len(self.tokenizer.vocab)))
                # with 10% chance word remains
                else:
                    masked_out.append(token)

                # set corresponding label
                masked_labels.append(token)
            # 85% don't change anything
            else:
                masked_out.append(token)
                # create corrsponding 0-label
                for l in range(len(token)):
                    masked_labels.append(0)
                
        # flatten output
        masked_out = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in masked_out]))
        print(masked_out)
        masked_labels = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in masked_labels]))
        print(masked_labels)

        # check for correct length
        assert len(masked_out) == len(masked_labels)
        #assert len(output) == self.seq_len, "sequence length not fixed! "+str(len(output)) # from moritz
        return masked_out, masked_labels

#### Testing the Dataset and Dataloaders

In [22]:
test = Bookcorpus(tokenizer, n_rows = 100)
len(test)

Found cached dataset bookcorpus (C:/Users/Johannes/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f)


100

In [23]:
dl = DataLoader(test,batch_size=2,shuffle=False)

In [24]:
#is sequence length fixed?
"""for i in range(1,1000):
    batch = next(iter(dl))
    for j in range(1,2): # batchsize
        length_ = len(batch["bert_input"][j])
        #print(length_)
        assert length_==SEQ_LEN, "sequence size is not "+str(SEQ_LEN)+": "+ str(length_)"""

'for i in range(1,1000):\n    batch = next(iter(dl))\n    for j in range(1,2): # batchsize\n        length_ = len(batch["bert_input"][j])\n        #print(length_)\n        assert length_==SEQ_LEN, "sequence size is not "+str(SEQ_LEN)+": "+ str(length_)'

In [25]:
batch = next(iter(dl))
batch

[2788, 1010, 2002, 2052, 2022, 13311, 2105, 1996, 2542, 2282, 1010, 2652, 103, 2010, 10899, 1012]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2007, 0, 0, 0]
[2002, 1005, 1040, 2464, 1996, 103, 2471, 2011, 6707, 103, 6195, 103, 103, 1037, 2210, 2402, 2005, 1996, 103, 9476, 1010, 2021, 2007, 3080, 103, 1010, 25294, 2007, 2014, 3428, 103, 6701, 2001, 2411, 103, 2000, 2477, 2008, 2020, 3080, 1012]
[0, 0, 0, 0, 0, 3185, 0, 0, 0, 1010, 0, 2002, 2001, 0, 0, 0, 2005, 0, 18720, 0, 0, 0, 0, 0, 12334, 0, 2247, 0, 0, 0, 1010, 0, 0, 0, 6086, 0, 0, 0, 0, 0, 0]
[103, 2074, 103, 2298, 2012, 103, 7163, 2239, 2741, 2032, 8134, 4937, 22436, 2594, 1012]
[2021, 0, 2028, 0, 0, 1037, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[2008, 2018, 6793, 12756, 1005, 1055, 2933, 2043, 2016, 2288, 10839, 5102, 103, 1012]
[0, 0, 2042, 0, 0, 0, 0, 0, 0, 0, 2032, 0, 3041, 0]


{'input': tensor([[  101,  2788,  1010,  2002,  2052,  2022, 13311,  2105,  1996,  2542,
           2282,  1010,  2652,   103,  2010, 10899,  1012,   102,  2002,  1005,
           1040,  2464,  1996,   103,  2471,  2011,  6707,   103,  6195,   103,
            103,  1037,  2210,  2402,  2005,  1996,   103,  9476,  1010,  2021,
           2007,  3080,   103,  1010, 25294,  2007,  2014,  3428,   103,  6701,
           2001,  2411,   103,  2000,  2477,  2008,  2020,  3080,  1012,   102,
              0,     0,     0,     0],
         [  101,   103,  2074,   103,  2298,  2012,   103,  7163,  2239,  2741,
           2032,  8134,  4937, 22436,  2594,  1012,   102,  2008,  2018,  6793,
          12756,  1005,  1055,  2933,  2043,  2016,  2288, 10839,  5102,   103,
           1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,

#### Visualize encoded sequence

In [26]:
from itertools import chain
flattened = list(chain(*((batch["input"]))))
tokenizer.decode(flattened)

"[CLS] usually, he would be tearing around the living room, playing [MASK] his toys. [SEP] he'd seen the [MASK] almost by mistake [MASK] considering [MASK] [MASK] a little young for the [MASK] cartoon, but with older [MASK], argus with her brothers [MASK] mason was often [MASK] to things that were older. [SEP] [PAD] [PAD] [PAD] [PAD] [CLS] [MASK] just [MASK] look at [MASK] minion sent him practically catatonic. [SEP] that had supporters megan's plan when she got hurling dressed [MASK]. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

## Embedding

Positional Embedding (see To DO) must be altered otherwise use nn.Embedding

In [29]:
import torch
import torch.nn as nn
import math


class PositionEmbedding(torch.nn.Module):
    def __init__(self, embed_size, seq_len):
        super().__init__()
        n = 10000 # scalar for pos encoding
        # create embedding matrix dim(seq_len  x embed_size)
        self.embed_matrix = torch.zeros(seq_len, embed_size).float()
        # positional encoding not to be updated while gradient descent
        self.embed_matrix.require_grad = False
        
        # compute embedding for each position in input
        for position in range(seq_len):
            # run trough every component of embedding vector for each position with stride 2
            for c in range(0, embed_size, 2):
                # even 
                self.embed_matrix[position,c] = math.sin(position/(n**(2*c/embed_size)))
                # uneven
                self.embed_matrix[position,c+1] = math.cos(position/(n**(2*c/embed_size)))
        
        # self.embed_matrix =  embed_matrix.unsqueeze(0) 
    def forward(self, x):
        return self.embed_matrix
            

class BERTEmbedding(torch.nn.Module):
    def __init__(self, vocab_size, embed_size, seq_len=SEQ_LEN, n_segments=N_SEGMENTS, dropout=DROPOUT):
        super().__init__()
        # token embedding: transforms (vocabulary size, number of tokens) into (vocabulary size, number of tokens, length of embdding vector)
        self.token = nn.Embedding(vocab_size, embed_size, padding_idx=0) # padding remains 0 during training
        # segment embedding for sentence 1, sentence 2, padding
        self.segment = nn.Embedding(n_segments, embed_size, padding_idx=0)
        # embedding of position
        self.position = PositionEmbedding(embed_size, seq_len) 
        # droput probability per token
        self.dropout = nn.Dropout(p=dropout)
    
    def forward(self, sequence, seg_label):
        return self.dropout(self.token(sequence) + self.segment(seg_label) + self.position(sequence))        
    

In [30]:
# embedding test: tokenized sequence
sample_seq = batch['input'][0] 
sample_seg = batch['segment'][0]
print(sample_seq.size())
print(sample_seq)
print(sample_seg.size())
print(sample_seg)

bert = BERTEmbedding(VOCAB_SIZE, EMBED_SIZE)

batch_embed = bert(batch['input'][0], batch['segment'][0])

print(batch_embed.size())

torch.Size([64])
tensor([  101,  2788,  1010,  2002,  2052,  2022, 13311,  2105,  1996,  2542,
         2282,  1010,  2652,   103,  2010, 10899,  1012,   102,  2002,  1005,
         1040,  2464,  1996,   103,  2471,  2011,  6707,   103,  6195,   103,
          103,  1037,  2210,  2402,  2005,  1996,   103,  9476,  1010,  2021,
         2007,  3080,   103,  1010, 25294,  2007,  2014,  3428,   103,  6701,
         2001,  2411,   103,  2000,  2477,  2008,  2020,  3080,  1012,   102,
            0,     0,     0,     0])
torch.Size([64])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0])
torch.Size([64, 768])


## Finetuning

Cant be downloaded automatically from huggingface. Needs to be downloaded manually:

1) download from kaggle and 
2) extract in finetuning folder 
3) Delete the zips

In [100]:
toxic_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\finetuning\kaggle-toxic_comment"
toxic_dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)
toxic_dataset

Downloading and preparing dataset jigsaw_toxicity_pred/default to C:/Users/Johannes/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-2c7f4622bb3d8449/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85...


FileNotFoundError: C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\finetuning\kaggle-toxic_comment does not exist. Make sure you insert a manual dir via `datasets.load_dataset('jigsaw_toxicity_pred', data_dir=...)`. Manual download instructions:             To use jigsaw_toxicity_pred you have to download it manually from Kaggle: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data
            You can manually download the data from it's homepage or use the Kaggle CLI tool (follow the instructions here: https://www.kaggle.com/docs/api)
            Please extract all files in one folder and then load the dataset with:
            `datasets.load_dataset('jigsaw_toxicity_pred', data_dir='/path/to/extracted/data/')`

#### Test with standard dataloader

In [186]:
from torch.utils.data import DataLoader
dataloader = DataLoader(toxic_dataset["train"], batch_size=1, shuffle = True)
batch = next(iter(dataloader))
batch

NameError: name 'toxic_dataset' is not defined

#### Standard Tokenizer not sufficient, padding is missing and probably also truncation

In [26]:
encoded_input = tokenizer(batch["comment_text"])
encoded_input

{'input_ids': [[101, 2035, 4961, 2039, 3531, 2644, 3800, 7699, 5815, 14652, 2000, 16948, 1012, 2017, 2031, 2042, 2988, 2000, 16948, 1024, 3158, 9305, 2964, 1999, 5082, 2539, 1024, 2676, 1010, 12022, 2385, 1010, 2384, 1006, 11396, 1007, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [27]:
from itertools import chain
flattened = list(chain(*(encoded_input["input_ids"])))
tokenizer.decode(flattened)

'[CLS] all grown up please stop purposefully adding nonsense to wikipedia. you have been reported to wikipedia : vandalism in progress 19 : 27, jun 16, 2005 ( utc ) [SEP]'

#### Custom Dataset

In [39]:
class ToxicComment(Dataset):
    
    def __init__(self, tokenizer, seq_len=64, split="train", n_rows:int=None):
        
        if not split in ["train","test"]:
            raise ValueError("Parameter has to be 'train' or 'test'")
            
        if n_rows is not None:
            self.dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path, split=split+"[0:"+str(n_rows)+"]")#[split]
        else:
            self.dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)#[split]
        
        
        self.nrows = len(self.dataset) 
        self.tokenizer = tokenizer
        self.seq_len = seq_len

    def __len__(self):
        return self.nrows

    def __getitem__(self, item):
        
        # Step 1: get row
        output = self.dataset[item]
        #print(output)
        
        # Step 2: tokenize comment
        output["bert_input"] = tokenizer(
            output["comment_text"],
            max_length=self.seq_len ,
            padding="max_length", 
            truncation=True, 
            return_tensors='pt'
        )["input_ids"]
        
        output.pop("comment_text") #delete raw text
        
        # Step 3: add bert_label and segment_label like in pretraining task for consistency TODO: Correct?
        output["bert_label"] = torch.zeros(self.seq_len)
        output["segment_label"] = torch.ones(self.seq_len)
        
        # Step 4: collect different labels to one tensor 
        # TODO: desired?
        
        return output

    
    def get_sent(self, index): #selfmade
        '''gets sentence pair as dicitinary s1, s2, isNext'''
        isNext = random.random() > 0.5
        
        t1 = self.dataset[index]["text"]
        if isNext:
            t2 = self.dataset[index+1]["text"]
            return t1, t2, 1
        else:
            t2 = self.get_random_line(index+1)["text"]
            return t1, t2, 0
        

#### Test Dataset

In [185]:
test2 = ToxicComment(tokenizer=tokenizer, seq_len=SEQ_LEN, split = "train", n_rows = 100)
len(test2)

NameError: name 'ToxicComment' is not defined

In [41]:
dl2 = DataLoader(test2,batch_size=10,shuffle=False)
next(iter(dl2))

{'toxic': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'severe_toxic': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'obscene': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'threat': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'insult': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 'identity_hate': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'bert_input': tensor([[[  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
           18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
            1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
            3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
            1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
            1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
            6486,  1012, 16327,   102]],
 
         [[  101,  1040,  1005, 22091,  2860,   999,  2002,  3503,  2023,  4281,
            6120,  1045,  1005,  1049,  9428,  5881,  200

In [42]:
len(next(iter(dl2)))

9

In [43]:
len(next(iter(dl))["bert_input"][0])

64

## Functions for report

In [32]:
"""class BertTokenizer():
    def __init__(self, task_type="pretrain"):
        if not task_type in ["pretrain", "text_classification_multi"]:
            raise ValueError("task not implemented")
        pass
    
    def __call__()"""
# i noticed we dont need any callable class to do transformation on the datasets since everything is handeled by our dataloaders
# ie we dont need rescaling etc.
# maybe ask supervisor if we need to save back the tokenized text or if it is okay to do it on the fly and leave the load_data transformation parameter at None

'class BertTokenizer():\n    def __init__(self, task_type="pretrain"):\n        if not task_type in ["pretrain", "text_classification_multi"]:\n            raise ValueError("task not implemented")\n        pass\n    \n    def __call__()'

In [45]:
#def __init__(self, tokenizer, seq_len=64, split="train", n_rows=None):

def load_data(dataset:str, transformation=None, n_train:int=None, n_test:int=None): # transformation callable
    
    if dataset == "bookcorpus":
        train = Bookcorpus(
            tokenizer=transformation,
            seq_len=SEQ_LEN,
            split="train",
            n_rows=n_train
        )
        return train, None
    
    elif dataset == "jigsaw_toxicity_pred":
        train = ToxicComment(
            tokenizer=transformation,
            seq_len=SEQ_LEN,
            split="train",
            n_rows=n_train
        )
        
        test = ToxicComment(
            tokenizer=transformation,
            seq_len=SEQ_LEN,
            split="test",
            n_rows=n_test
        )
        return train, test
    
    else:
        raise NotImplementedError("Dataset not implemented")

In [46]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train, test = load_data("jigsaw_toxicity_pred", transformation=tokenizer, n_train=1000, n_test=100)

In [47]:
next(iter(train))

{'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0,
 'bert_input': tensor([[  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
          18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
           1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
           3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
           1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
           1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
           6486,  1012, 16327,   102]]),
 'bert_label': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'segment_label': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,

In [48]:
next(iter(test))

{'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0,
 'bert_input': tensor([[ 101, 4067, 2017, 2005, 4824, 1012, 1045, 2228, 2200, 3811, 1997, 2017,
          1998, 2052, 2025, 7065, 8743, 2302, 6594, 1012,  102,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0]]),
 'bert_label': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'segment_label': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 

In [49]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train, test = load_data("bookcorpus", transformation=tokenizer, n_train=1000, n_test=100)

In [51]:
test is None

True

In [None]:
def show(x, outfile:str=None): # can have more args