# Import the training, test and validation set

In [18]:
import numpy as np
import datasets
from collections import Counter
import evaluate
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
torch.cuda.empty_cache()

In [19]:
#load a dataset form the HuggingFace Hub
hate = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech')

# check the dataset structure
print(hate)

# check the size of the dataset
print(hate.shape)

hate["train"][0]

DatasetDict({
    train: Dataset({
        features: ['comment_id', 'annotator_id', 'platform', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'hate_speech_score', 'text', 'infitms', 'outfitms', 'annotator_severity', 'std_err', 'annotator_infitms', 'annotator_outfitms', 'hypothesis', 'target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'target_race', 'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'target_religion', 'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'target_origin', 'target_gender_men', 'target

{'comment_id': 47777,
 'annotator_id': 10873,
 'platform': 3,
 'sentiment': 0.0,
 'respect': 0.0,
 'insult': 0.0,
 'humiliate': 0.0,
 'status': 2.0,
 'dehumanize': 0.0,
 'violence': 0.0,
 'genocide': 0.0,
 'attack_defend': 0.0,
 'hatespeech': 0.0,
 'hate_speech_score': -3.9,
 'text': 'Yes indeed. She sort of reminds me of the elder lady that played the part in the movie "Titanic" who was telling her story!!! And I wouldn\'t have wanted to cover who I really am!! I would be proud!!!! WE should be proud of our race no matter what it is!!',
 'infitms': 0.81,
 'outfitms': 1.88,
 'annotator_severity': 0.36,
 'std_err': 0.34,
 'annotator_infitms': 1.35,
 'annotator_outfitms': 1.23,
 'hypothesis': -1.1301777576839678,
 'target_race_asian': True,
 'target_race_black': True,
 'target_race_latinx': True,
 'target_race_middle_eastern': True,
 'target_race_native_american': True,
 'target_race_pacific_islander': True,
 'target_race_white': True,
 'target_race_other': False,
 'target_race': True,
 

In [20]:
train_testvalid = hate['train'].train_test_split(test_size=0.2)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
hate = datasets.DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

print(hate)

DatasetDict({
    train: Dataset({
        features: ['comment_id', 'annotator_id', 'platform', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'hate_speech_score', 'text', 'infitms', 'outfitms', 'annotator_severity', 'std_err', 'annotator_infitms', 'annotator_outfitms', 'hypothesis', 'target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'target_race', 'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'target_religion', 'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'target_origin', 'target_gender_men', 'target

In [21]:
#check class distribution in training and testing set

Counter(hate["train"]["hate_speech_score"])

Counter({2.73: 1284,
         2.14: 1265,
         2.1: 1116,
         2.27: 958,
         0.12: 785,
         1.48: 749,
         1.47: 731,
         1.63: 725,
         1.27: 723,
         -0.94: 705,
         2.45: 691,
         -1.89: 686,
         -3.01: 679,
         1.57: 678,
         -1.5: 668,
         2.54: 668,
         2.66: 663,
         3.62: 660,
         -2.77: 657,
         1.96: 657,
         2.36: 656,
         3.91: 652,
         -4.3: 652,
         -4.44: 651,
         -4.41: 651,
         2.01: 650,
         1.42: 645,
         -5.4: 635,
         2.97: 630,
         3.44: 628,
         -2.34: 625,
         3.16: 623,
         0.14: 620,
         3.13: 616,
         3.53: 616,
         3.42: 612,
         -2.43: 609,
         -3.98: 606,
         -2.36: 599,
         -4.8: 589,
         -5.43: 560,
         -4.86: 555,
         -2.61: 523,
         -2.35: 512,
         -0.42: 485,
         -0.23: 481,
         -5.12: 469,
         -0.96: 457,
         0.35: 443,


In [22]:
hate_small = hate.select_columns(["text", "hate_speech_score"])
hate_small = hate_small.rename_column("hate_speech_score", "label")


# Tokenizer

In [23]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [24]:
def tokenize_function(examples):

    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [25]:
tokenized_hate_small = hate_small.map(tokenize_function, batched=True)

Map: 100%|██████████| 108444/108444 [00:16<00:00, 6447.17 examples/s]
Map: 100%|██████████| 13556/13556 [00:02<00:00, 6621.06 examples/s]
Map: 100%|██████████| 13556/13556 [00:02<00:00, 6240.07 examples/s]


In [26]:
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [28]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_hate_small["train"],
    eval_dataset=tokenized_hate_small["test"],
    compute_metrics=compute_metrics,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 3.81 GiB of which 9.31 MiB is free. Including non-PyTorch memory, this process has 3.79 GiB memory in use. Of the allocated memory 3.66 GiB is allocated by PyTorch, and 44.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.train()

  0%|          | 0/40668 [01:15<?, ?it/s]
  0%|          | 0/40668 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 3.81 GiB of which 9.31 MiB is free. Including non-PyTorch memory, this process has 3.79 GiB memory in use. Of the allocated memory 3.66 GiB is allocated by PyTorch, and 41.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)