# TODO
* think about visualizations for text, preprocessing text, etc.
* cleaner code

In [2]:
import datasets
from datasets import load_dataset
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm import tqdm

In [3]:
SEQ_LEN = 64 # maximum sequence length
VOCAB_SIZE = 30522  # = len(tokenizer.vocab)
N_SEGMENTS = 3 # number of segmentation labels
EMBED_SIZE = 768 # size of embedding vector
DROPOUT = 0.1 # dropout chance

#### Tokenizer - use pretrained, at least for prototype

In [4]:
#tokenizer.model_max_length = SEQ_LEN # might not be correct in case of pretraining where we add CLS at the end, check that

In [5]:
# https://huggingface.co/docs/transformers/preprocessing
# https://huggingface.co/docs/transformers/main_classes/tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # Choose an appropriate tokenizer

#### Tokenizer parameters

In [6]:
tokenizer.truncation_side 

'right'

In [7]:
tokenizer.model_max_length # we might need to fixate this

512

In [8]:
tokenizer.mask_token

'[MASK]'

In [9]:
tokenizer.vocab['[MASK]']

103

#### Tokenizer example usage

In [10]:
text = "hi i am moritz, who are you ?"#["hi i am moritz", "no you are not moritz, you are kevin"]
encoded_input = tokenizer(text)#,padding=True, truncation=True)
# , return_tensors='pt') use this for pt tensors
encoded_input

{'input_ids': [101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
encoded_input["input_ids"]

[101, 7632, 1045, 2572, 28461, 1010, 2040, 2024, 2017, 1029, 102]

In [12]:
tokenizer.decode(encoded_input["input_ids"])

'[CLS] hi i am moritz, who are you? [SEP]'

## Finetuning

Cant be downloaded automatically from huggingface. Needs to be downloaded manually:

1) download from kaggle and 
2) extract in finetuning folder 
3) Delete the zips

In [14]:
#toxic_path = r"C:\Users\morit\OneDrive\UNI\Master\WS23\PML\repo\bert_from_scratch.toxic_comment\datasets\finetuning\kaggle-toxic_comment"
toxic_path = r"C:\Users\Johannes\Project Machine Learning\datasets\finetuning\toxic_comment"
toxic_dataset = load_dataset("jigsaw_toxicity_pred", data_dir=toxic_path)
toxic_dataset

Found cached dataset jigsaw_toxicity_pred (C:/Users/Johannes/.cache/huggingface/datasets/jigsaw_toxicity_pred/default-ebae0308d0d3f840/1.1.0/9cf096ac4341c35839bc8a9f6a19d93e18e5ad3d84cf05f690d2bc6f7384af85)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 159571
    })
    test: Dataset({
        features: ['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 63978
    })
})

#### Test with standard dataloader

In [15]:
from torch.utils.data import DataLoader
dataloader = DataLoader(toxic_dataset["train"], batch_size=1, shuffle = True)
dataset_length = len(toxic_dataset["train"])
print("Length of dataset:", dataset_length)
batch = next(iter(dataloader))
batch

Length of dataset: 159571


{'comment_text': ['Reply to Berean Hunter. \n\nHi I found the source here. http://en.wikipedia.org/wiki/Battle_of_Trenton\n\nAnd for the battle of New Orleans I saw the source on the History Channel Documentary: First Invasion War of 1812.'],
 'toxic': tensor([0]),
 'severe_toxic': tensor([0]),
 'obscene': tensor([0]),
 'threat': tensor([0]),
 'insult': tensor([0]),
 'identity_hate': tensor([0])}

In [16]:
dataloader = DataLoader(toxic_dataset["train"], batch_size=1, shuffle = True)

sums = {
 'nothing' : 0,
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0
}

for i in range(dataset_length):
    if i == 1000:
        break
    batch = next(iter(dataloader))
    for key in batch.keys():
        if key in sums:
            sums[key] += batch[key]


In [17]:
sums

{'nothing': 0,
 'toxic': tensor([87]),
 'severe_toxic': tensor([7]),
 'obscene': tensor([45]),
 'threat': tensor([3]),
 'insult': tensor([36]),
 'identity_hate': tensor([4])}

In [18]:
dataset_length = 159571
weights = {
 'nothing' : 1/(143346/dataset_length),
 'toxic': 1/(15294/dataset_length),
 'severe_toxic':  1/(1595/dataset_length),
 'obscene':  1/(8449/dataset_length),
 'threat':  1/(478/dataset_length),
 'insult':  1/(7877/dataset_length),
 'identity_hate':  1/(1405/dataset_length)
}

summe = sum(weights.values())

for key in weights.keys():
    weights[key] = weights[key]/(summe/7)
weights

{'nothing': 0.013027581601785167,
 'toxic': 0.1221035512154764,
 'severe_toxic': 1.170816120557678,
 'obscene': 0.22102635960344377,
 'threat': 3.9068027453755154,
 'insult': 0.23707651546140618,
 'identity_hate': 1.3291471261846948}

In [19]:
dataset_length = 159571
weights = {
 'nothing' : (143346/dataset_length),
 'toxic': (15294/dataset_length),
 'severe_toxic':  (1595/dataset_length),
 'obscene':  (8449/dataset_length),
 'threat':  (478/dataset_length),
 'insult':  (7877/dataset_length),
 'identity_hate':  (1405/dataset_length)
}

summe = sum(weights.values())

for key in weights.keys():
    weights[key] = weights[key]/(summe/7)
weights

{'nothing': 5.623175898321041,
 'toxic': 0.5999529264082849,
 'severe_toxic': 0.06256864898791778,
 'obscene': 0.3314373136670328,
 'threat': 0.018750980699827394,
 'insult': 0.30899890161619326,
 'identity_hate': 0.055115330299701865}

In [20]:
sum(weights.values())

7.0

In [21]:
dataset_length = 159571
weights = {
 'nothing' : (143346/TRAIN_TOTAL),
 'toxic': (TRAIN_TOTAL/15294),
 'severe_toxic':  (TRAIN_TOTAL/1595),
 'obscene':  (TRAIN_TOTAL/8449),
 'threat':  (TRAIN_TOTAL/478),
 'insult':  (TRAIN_TOTAL/7877),
 'identity_hate':  (TRAIN_TOTAL/1405)
}

summe = sum(weights.values())

for key in weights.keys():
    weights[key] = weights[key]
weights

{'nothing': 0.8983211235124177,
 'toxic': 10.433568719759382,
 'severe_toxic': 100.04451410658307,
 'obscene': 18.886377086045687,
 'threat': 333.8305439330544,
 'insult': 20.25783927891329,
 'identity_hate': 113.57366548042705}

In [23]:
# weight_for_class_i = total_samples / (num_samples_in_class_i * num_classes)
NUM_CLASSES = 7
TRAIN_TOTAL = 159571
CLASS_WEIGHTS = {
 'toxic': (TRAIN_TOTAL/(15294*NUM_CLASSES)),
 'severe_toxic': (TRAIN_TOTAL/(1595*NUM_CLASSES)),
 'obscene':  TRAIN_TOTAL/(8449*NUM_CLASSES),
 'threat':  TRAIN_TOTAL/(478*NUM_CLASSES),
 'insult': TRAIN_TOTAL/(7877*NUM_CLASSES),
 'identity_hate':  TRAIN_TOTAL/(1405*NUM_CLASSES)
}

for key in CLASS_WEIGHTS.keys():
    CLASS_WEIGHTS[key] = CLASS_WEIGHTS[key]
CLASS_WEIGHTS

{'toxic': 1.4905098171084832,
 'severe_toxic': 14.292073443797582,
 'obscene': 2.698053869435098,
 'threat': 47.69007770472206,
 'insult': 2.893977039844756,
 'identity_hate': 16.22480935434672}

In [24]:
sum(CLASS_WEIGHTS.values())

85.2895012292547