# TODO
* think about visualizations for text, preprocessing text, etc.
* cleaner code

In [91]:
# imports
import datasets
from datasets import load_dataset
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm import tqdm

In [48]:
SEQ_LEN = 64 # maximum sequence length
VOCAB_SIZE = 30522  # = len(tokenizer.vocab)
N_SEGMENTS = 3 # number of segmentation labels
EMBED_SIZE = 768 # size of embedding vector
DROPOUT = 0.1 # dropout chance

#### Tokenizer - use pretrained, at least for prototype

In [49]:
#tokenizer.model_max_length = SEQ_LEN # might not be correct in case of pretraining where we add CLS at the end, check that

In [50]:
# https://huggingface.co/docs/transformers/preprocessing
# https://huggingface.co/docs/transformers/main_classes/tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Choose an appropriate tokenizer

#### Tokenizer parameters

In [51]:
tokenizer.truncation_side 

'right'

In [52]:
tokenizer.model_max_length # we might need to fixate this

512

In [53]:
tokenizer.mask_token

'[MASK]'

In [54]:
tokenizer.vocab['[MASK]']

103

#### Tokenizer example usage

In [55]:
text = "hi i am moritz, who are you ?"#["hi i am moritz", "no you are not moritz, you are kevin"]
encoded_input = tokenizer(text)#,padding=True, truncation=True)
# , return_tensors='pt') use this for pt tensors
encoded_input

{'input_ids': [101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [56]:
encoded_input["input_ids"]

[101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102]

In [57]:
tokenizer.decode(encoded_input["input_ids"])

'[CLS] hi i am moritz, who are you? [SEP]'

## Finetuning

Cant be downloaded automatically from huggingface. Needs to be downloaded manually:

1) download from kaggle and 
2) extract in finetuning folder 
3) Delete the zips

In [58]:
# toxic_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\finetuning\kaggle-toxic_comment"
toxic_path = r"C:\Users\Johannes\Project Machine Learning\datasets\finetuning\toxic_comment"
toxic_dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)
toxic_dataset

Found cached dataset jigsaw_toxicity_pred (C:/Users/Johannes/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-ebae0308d0d3f840/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 159571
    })
    test: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 63978
    })
})

#### Test with standard dataloader

In [127]:
from torch.utils.data import DataLoader
dataloader = DataLoader(toxic_dataset["train"], batch_size=1, shuffle = True)
dataset_length = len(train)
print("Length of dataset:", dataset_length)
batch = next(iter(dataloader))
batch

Length of dataset: 10000


{'comment_text': ['fuck off you stupid aspy asshole'],
 'toxic': tensor([1]),
 'severe_toxic': tensor([1]),
 'obscene': tensor([1]),
 'threat': tensor([0]),
 'insult': tensor([1]),
 'identity_hate': tensor([0])}

#### Standard Tokenizer not sufficient, padding is missing and probably also truncation

In [60]:
encoded_input = tokenizer(batch["comment_text"])
encoded_input

{'input_ids': [[101, 2031, 2017, 2464, 2026, 7514, 2000, 2115, 3437, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [61]:
from itertools import chain
flattened = list(chain(*(encoded_input["input_ids"])))
tokenizer.decode(flattened)

'[CLS] have you seen my reply to your answer? [SEP]'

#### Custom Dataset

In [132]:
class ToxicComment(Dataset):
    
    def __init__(self, tokenizer, seq_len=SEQ_LEN, split="train", n_rows:int=None):
        
        if not split in ["train","test"]:
            raise ValueError("Parameter has to be 'train' or 'test'")  
        
        self.n_rows = n_rows
        self.split = split
        self.tokenizer = tokenizer
        self.seq_len = seq_len
            
        if self.n_rows is not None:
            n_rows_str = f"[0:{self.n_rows}]" if self.n_rows is not None else ""
            self.dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path, split=f"{self.split}{n_rows_str}")#[split]
        else:
            self.dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)#[split]
        
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        
        # Step 1: get row
        output = self.dataset[item]

        # Step 2: tokenize comment
        output["input"] = self.tokenizer(
            output["comment_text"],
            max_length=self.seq_len ,
            padding="max_length", 
            truncation=True, 
            return_tensors='pt'
        )["input_ids"]
        
        # flatten output
        output["input"] = output["input"].squeeze()
        
        output.pop("comment_text") #delete raw text
        
        # Step 3: add segment_label like in pretraining task for consistency 
        output["segment"] = torch.ones(self.seq_len)
        
        # Step 4: collect different labels to one tensor 
        labels = torch.cat([output[key] if isinstance(output[key], torch.Tensor) else torch.tensor([output[key]]) for key in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],dim=-1)
        output["labels"] = labels
        
        return output

#### Test Dataset

In [131]:
test2 = ToxicComment(tokenizer=tokenizer, seq_len=SEQ_LEN, split = "train", n_rows = 100)
len(test2)

AttributeError: 'ToxicComment' object has no attribute 'split'

In [78]:
dl2 = DataLoader(test2,batch_size=1,shuffle=False)
batch = next(iter(dl2))
batch

{'toxic': tensor([0]),
 'severe_toxic': tensor([0]),
 'obscene': tensor([0]),
 'threat': tensor([0]),
 'insult': tensor([0]),
 'identity_hate': tensor([0]),
 'input': tensor([[  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
          18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
           1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
           3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
           1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
           1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
           6486,  1012, 16327,   102]]),
 'segment': tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 'l

In [79]:
len(batch)

9

In [80]:
len(batch["input"][0])

64

# Embedding

In [81]:
class PositionEmbedding(torch.nn.Module):
    def __init__(self, embed_size, seq_len):
        super().__init__()
        n = 10000 # scalar for pos encoding
        # create embedding matrix dim(seq_len  x embed_size)
        self.embed_matrix = torch.zeros(seq_len, embed_size).float()
        # positional encoding not to be updated while gradient descent
        self.embed_matrix.require_grad = False
        
        # compute embedding for each position in input
        for position in range(seq_len):
            # run trough every component of embedding vector for each position with stride 2
            for c in range(0, embed_size, 2):
                # even 
                self.embed_matrix[position,c] = math.sin(position/(n**(2*c/embed_size)))
                # uneven
                self.embed_matrix[position,c+1] = math.cos(position/(n**(2*c/embed_size)))
        
        # self.embed_matrix =  embed_matrix.unsqueeze(0) 
    def forward(self, x):
        return self.embed_matrix

In [82]:
class BERTEmbedding(torch.nn.Module):
    def __init__(self, vocab_size, seq_len, embed_size=EMBED_SIZE, dropout=DROPOUT):
        super().__init__()
        # token embedding: transforms (vocabulary size, number of tokens) into (vocabulary size, number of tokens, length of embdding vector)
        self.token = nn.Embedding(vocab_size, embed_size, padding_idx=0) # padding remains 0 during training
        # embedding of position
        self.position = PositionEmbedding(embed_size, seq_len) 
        # droput probability per token
        self.dropout = nn.Dropout(p=dropout)
    
    def forward(self, sequence):
        return self.dropout(self.token(sequence) + self.position(sequence))        
    

In [83]:
# embedding test: tokenized sequence
sample_seq = batch['input'][0] 
print(f'sample_seq size {sample_seq.size()}')
print(sample_seq)

bert = BERTEmbedding(VOCAB_SIZE, SEQ_LEN)

batch_embed = bert(batch['input'][0].long())

print(batch_embed.size())

sample_seq size torch.Size([64])
tensor([  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
        18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
         1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
         3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
         1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
         1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
         6486,  1012, 16327,   102])
torch.Size([64, 768])


# Model

In [84]:
# attention heads
class MultiHeadAttention(nn.Module):
    def __init__(self, number_heads, model_dimension):
        super(MultiHeadAttention, self).__init__()
        
        # model dimension must be divideable into equal parts for the attention heads
        assert model_dimension%number_heads == 0
        self.number_heads = number_heads
        self.att_head_dim = int(model_dimension/number_heads)
        
        # attention mechanism: query, key, value are linear embeddings -> embedding matrix dim: (model_dimension x model_dimension)
        self.query = nn.Linear(model_dimension, model_dimension)
        self.key = nn.Linear(model_dimension, model_dimension)
        self.value = nn.Linear(model_dimension, model_dimension)
        self.lin_output = nn.Linear(model_dimension, model_dimension)
    
    def forward(self, query, key, value, mask):
        
        # output dim (batch_size x seq_len x model_dimension) 
        query = self.query(query)
        key = self.key(key)        
        value = self.value(value) 
        
        # transform q,k,v to fit attention heads:(batch_size x seq_len x model_dimension) -> (batch_size x number_heads x seq_len x att_head_dim)
        query = query.view(query.shape[0], query.shape[1], self.number_heads, self.att_head_dim)
        query = query.permute(0,2,1,3)
        key = key.view(key.shape[0], key.shape[1], self.number_heads, self.att_head_dim)
        key = key.permute(0,2,1,3)
        value = value.view(value.shape[0], value.shape[1], self.number_heads, self.att_head_dim)
        value = value.permute(0,2,1,3)
        
        # calculate dot product between each query and each key and normaliz the output, output dim: (batch_size x number_heads x seq_len x seq_len)
        score = torch.matmul(query, key.permute(0, 1, 3, 2)) 
        score_n = score / math.sqrt(self.att_head_dim) # normalize: <q,k>/sqrt(d_k)
        
        # mask 0 with -infinity so it becomes 0 after softmax, output dim: (batch_size x number_heads x seq_len x seq_len)
        score_m = score_n.masked_fill(mask == 0, -1e10)    
        
        # softmax scores along each query, output dim: (batch_size x number_heads x seq_len x seq_len)
        score_w = nn.functional.softmax(score_m, dim=-1) 
        
        # multiply with value matrix: output weighted sum for each query, output dim: (batch_size x number_heads x seq_len x att_head_dim)
        weighted_sum = torch.matmul(score_w, value)
        
        # concatenate attention heads to 1 output, output dim: (batch_size x number_heads x model_dimension)
        weighted_sum = weighted_sum.permute(0, 2, 1, 3).contiguous().view(weighted_sum.shape[0], -1, self.number_heads * self.att_head_dim)
        
        # linear embedding for output
        out = self.lin_output(weighted_sum)      
        return out    

In [85]:
# feedforward layer
class FeedForwardLayer(nn.Module):
    def __init__(self, model_dimension, hidden_dimension):
        super(FeedForwardLayer, self).__init__()
        
        # linear layer
        self.linear1 = nn.Linear(model_dimension, hidden_dimension)
        self.linear2 = nn.Linear(hidden_dimension, model_dimension)
        # non-linearity
        self.non_linear = nn.ReLU()
    
    def forward(self, x):
        return self.linear2(self.non_linear(self.linear1(x)))       

In [86]:
# encoder stacks together all the previous modules
class Encoder(nn.Module):
    def __init__(self, model_dimension=EMBED_SIZE, number_heads=12, ff_hidden_dim=EMBED_SIZE*4):
        super(Encoder, self).__init__()
        # attention heads
        self.multihead_attention = MultiHeadAttention (number_heads, model_dimension)
        # normalisation layer
        self.normlayer = nn.LayerNorm(model_dimension)
        self.feedforward_layer = FeedForwardLayer(model_dimension, hidden_dimension=ff_hidden_dim)
    
    # also residuals possible here
    def forward(self, x, mask):
        # embeddings: (batch_size, max_len, d_model)
        # encoder mask: (batch_size, 1, 1, max_len)
        # result: (batch_size, max_len, d_model)
        # input x 3x to generate query, key, value
        x = self.normlayer(self.multihead_attention(x, x, x, mask))
        return self.normlayer(self.feedforward_layer(x))

In [87]:
# base class for BERT
class BERTBase(nn.Module):
    # __init__ function takes hyperparameters, initializes the model accordingly and sets up trainable parameters
    def __init__(self, vocab_size, model_dimension, number_layers, number_heads):
        super().__init__()
        self.model_dimension=model_dimension
        self.number_layers=number_layers
        self.number_heads=number_heads
        # hidden layer dimenion of FF is 4*model_dimension (see paper)
        self.ff_hidden_layer = 4*model_dimension
        # embedding of input 
        self.embedding = BERTEmbedding(vocab_size=vocab_size, seq_len=SEQ_LEN, embed_size=model_dimension)
        # stack encoders
        self.encoders = torch.nn.ModuleList() # create empty module list
        for _ in range(self.number_layers):
            self.encoders.append(Encoder(model_dimension=model_dimension, number_heads=number_heads, ff_hidden_dim=4*model_dimension))
        
    def forward(self, x):
        # mask to mark the padded tokens
        mask = (x > 0).unsqueeze(1).repeat(1,x.size(1),1).unsqueeze(1)
        x = self.embedding(x) 
        # run trough encoders
        for encoder in self.encoders:
            x =encoder.forward(x, mask)
        return x

In [88]:
# finetuning
class ToxicityPrediction(nn.Module):
    """
    class to predict multivariate class of toxicity
    """
    def __init__(self, bert_out):
        super().__init__()
        self.tox_classes = 6 # there are 6 classes of toxicity in the dataset
        self.linear = nn.Linear(bert_out, self.tox_classes)
        self.softmax = nn.LogSoftmax(dim=-1) # not necessary, included in torch.nn.CrossEntropyLoss
        
    def forward(self, x):
        # recieve output dimension (batch_size, self.tox_classes)
        return self.softmax(self.linear(x[:, 0]))

In [89]:
class Model(nn.Module):
    """
    Model class according to Milestone 1 task sheet
    """
    def __init__(self, vocab_size, model_dimension, number_layers=12, number_heads=12):
        super().__init__()
        # base BERT model
        self.base_model = BERTBase(vocab_size, model_dimension, number_layers, number_heads)
        # toxic comment classfication layer
        self.toxic_comment = ToxicityPrediction(self.base_model.model_dimension)
    
    def forward(self, x):
        x = self.base_model(x)
        return self.toxic_comment(x)

# Training

In [124]:
class TrainBERT:
    def __init__(self, model, train_dataloader, epochs, test_dataloader=None, learning_rate=0.001, threshold=0.5, device='cuda'):
        
        # hyperparameters for optimization
        self.device = device
        self.bar = None
        self.model = model
        self.epochs = epochs
        self.training_data = train_dataloader
        self.testing_data = test_dataloader

        # optimizer: Adam
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        # learning rate scheduler
        self.scheduler = StepLR(self.optimizer, step_size=5, gamma=0.1)

        # cost function cross entropy loss for predicting classes of toxicity
        self.criterion = nn.CrossEntropyLoss()
        
        # predictions threshold above which predictions are set True
        self.threshold = threshold 
        
        # run training
        for epoch in range(self.epochs):
            self.training(epoch)

    def training(self, epoch):
        # init stats
        avg_loss = 0.0
        corrects_sum = 0
        trues_sum = 0
        
        # set back progress bar
        self.bar = None
        # create new progress bar
        self.bar = tqdm(total=len(self.training_data.dataset), desc=f'Training epoch {epoch+1}', leave=True, position=0)

        for i, data in enumerate(self.training_data):
            
            # send data to GPU/CPU
            data ={key: value.to(self.device) for key, value in data.items()}
            
            # labels convert to float()
            labels = data['labels'].float()
            
            # forward pass: comments trough model
            output = self.model.forward(data['input'])
            
            # compute loss with labels (input, target)
            loss = self.criterion(output, labels)
            
            # backward pass for training
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
            # average loss per batch
            avg_loss += loss.item()
            
            # compute accuracy 
            # softmax the output vector to get probabilites
            predictions = nn.functional.softmax(output, dim=1)
            # use threshold to determine which of the outputs are considered True
            predictions = torch.ge(predictions, self.threshold).int()
            # compare with the label and count correct classifications
            corrects_sum += (predictions == labels).sum().item()
            # sump up total number of Trues in labels for batch
            trues_sum += labels.nelement()
            
            # update progress bar
            self.bar.update(self.training_data.batch_size)
        
        # update learning rate scheduler
        self.scheduler.step() 
        # print stats
        print(f'Trainig epoch: {epoch+1}\nAvg. training loss: {(avg_loss / len(self.training_data)):.2f}\nAccuracy: {(corrects_sum * 100.0 / trues_sum):.2f}')   

In [120]:
# Training test
# set up tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# load test dataset
train, test = load_data("jigsaw_toxicity_pred", transformation=tokenizer, n_train=100, n_test=None)

# set up dataloader
train_loader = DataLoader(train, batch_size=32, shuffle=True)

# set up BERT model
bert = Model(vocab_size=VOCAB_SIZE, model_dimension=EMBED_SIZE, number_layers=12, number_heads=12)

# number of epochs
epochs = 3

# train model
bert_trainer = TrainBERT(bert_lm, train_loader, epochs, device='cpu')

Found cached dataset jigsaw_toxicity_pred (C:/Users/Johannes/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-ebae0308d0d3f840/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85)


0


Training epoch 1:   0%|                                                                        | 0/100 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [125]:
# Training (for cluster)
# set up tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# load the entire training data (length 10000) into train
train, _ = load_data("jigsaw_toxicity_pred", transformation=tokenizer, n_train=10000, n_test=None)

# set up dataloader
train_loader = DataLoader(train, batch_size=128, shuffle=True)

# set up BERT model
bert_lm = Model(vocab_size=VOCAB_SIZE, model_dimension=EMBED_SIZE, number_layers=12, number_heads=12)

# number of epochs
epochs = 10

# train model (device to be updated according to cluster GPU)
bert_trainer = TrainBERT(bert_lm, train_loader, epochs, device='cuda:0')

Found cached dataset jigsaw_toxicity_pred (C:/Users/Johannes/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-ebae0308d0d3f840/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85)
Training epoch 1:   0%|                                                                      | 0/10000 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Functions for report

In [None]:
"""class BertTokenizer():
    def __init__(self, task_type="pretrain"):
        if not task_type in ["pretrain", "text_classification_multi"]:
            raise ValueError("task not implemented")
        pass
    
    def __call__()"""
# i noticed we dont need any callable class to do transformation on the datasets since everything is handeled by our dataloaders
# ie we dont need rescaling etc.
# maybe ask supervisor if we need to save back the tokenized text or if it is okay to do it on the fly and leave the load_data transformation parameter at None

In [93]:
#def __init__(self, tokenizer, seq_len=64, split="train", n_rows=None):
def load_data(dataset:str, transformation=None, n_train:int=None, n_test:int=None): # transformation callable
    
    if dataset == "bookcorpus":
        train = Bookcorpus(
            tokenizer=transformation,
            seq_len=SEQ_LEN,
            split="train",
            n_rows=n_train
        )
        return train, None
    
    elif dataset == "jigsaw_toxicity_pred":
        train = ToxicComment(
            tokenizer=transformation,
            seq_len=SEQ_LEN,
            split="train",
            n_rows=n_train
        )
        
        test = ToxicComment(
            tokenizer=transformation,
            seq_len=SEQ_LEN,
            split="test",
            n_rows=n_test
        )
        return train, test
    
    else:
        raise NotImplementedError("Dataset not implemented")

In [204]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train, test = load_data("jigsaw_toxicity_pred", transformation=tokenizer, n_train=1000, n_test=100)

In [205]:
next(iter(train))

Found cached dataset jigsaw_toxicity_pred (C:/Users/Johannes/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-ebae0308d0d3f840/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85)


{'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0,
 'input': tensor([  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
         18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
          1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
          3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
          1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
          1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
          6486,  1012, 16327,   102]),
 'segment': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'labels': tensor([0, 0, 0, 0, 0, 0])}

In [206]:
next(iter(test))

Found cached dataset jigsaw_toxicity_pred (C:/Users/Johannes/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-ebae0308d0d3f840/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85)


{'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0,
 'input': tensor([ 101, 4067, 2017, 2005, 4824, 1012, 1045, 2228, 2200, 3811, 1997, 2017,
         1998, 2052, 2025, 7065, 8743, 2302, 6594, 1012,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]),
 'segment': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'labels': tensor([0, 0, 0, 0, 0, 0])}

In [207]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train, test = load_data("bookcorpus", transformation=tokenizer, n_train=1000, n_test=100)

In [208]:
test is None

True

In [209]:
def show(x, outfile:str=None): # can have more args

SyntaxError: incomplete input (2645215428.py, line 1)