# Import the training, test and validation sets

In [36]:
import getpass
import os
import re
import torch
import transformers
import datasets
import evaluate
import pandas as pd
import numpy as np
from datasets import *

In [38]:
# Load the Measuring Hate Speech dataset from the HuggingFace Hub
hate = datasets.load_dataset("ucberkeley-dlab/measuring-hate-speech")

# Show the dataset structure first, then its size, one per line
print(hate, hate.shape, sep="\n")

DatasetDict({
    train: Dataset({
        features: ['comment_id', 'annotator_id', 'platform', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'hate_speech_score', 'text', 'infitms', 'outfitms', 'annotator_severity', 'std_err', 'annotator_infitms', 'annotator_outfitms', 'hypothesis', 'target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'target_race', 'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'target_religion', 'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'target_origin', 'target_gender_men', 'target

In [42]:
# Build an 80% / 10% / 10% train / test / validation split.
#
# A fixed seed is passed to both calls so that the split (and therefore any
# metric computed on it) is the same every time the notebook is run.
#
# NOTE: this cell rebinds `hate`. Re-running it without reloading the dataset
# first will split the already-reduced 'train' split again.
SPLIT_SEED = 42

# Hold out 20% of the data for test + validation
train_testvalid = hate['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Split the 20% hold-out in half: 10% test, 10% validation
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Gather the three splits into a single DatasetDict
hate = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train'],
})

print(hate)

DatasetDict({
    train: Dataset({
        features: ['comment_id', 'annotator_id', 'platform', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'hate_speech_score', 'text', 'infitms', 'outfitms', 'annotator_severity', 'std_err', 'annotator_infitms', 'annotator_outfitms', 'hypothesis', 'target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'target_race', 'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'target_religion', 'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'target_origin', 'target_gender_men', 'target

In [43]:
# Count the values of the `annotator_gender_men` column in the training
# and test splits (one Counter printed per split, train first).

from collections import Counter

for split_name in ("train", "test"):
    print(Counter(hate[split_name]["annotator_gender_men"]))

Counter({False: 49813, True: 36942})
Counter({False: 6303, True: 4542})


# Tokenizer

In [44]:
from transformers import AutoTokenizer

# AutoTokenizer.from_pretrained selects the tokenizer class that matches the
# checkpoint's architecture and returns an instance of it.
tokenizer = AutoTokenizer.from_pretrained(
    "google/bert_uncased_L-2_H-128_A-2",
    model_max_length=512,
)

In [45]:
# Explore the tokenizer: as the last expression of the cell, its repr
# (class, vocabulary size, max length, special tokens) is displayed.
tokenizer

BertTokenizerFast(name_or_path='google/bert_uncased_L-2_H-128_A-2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [48]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level tokenizer.

    The tokenizer is called with two options (descriptions paraphrased from
    the HuggingFace documentation):

    * ``padding="max_length"``: pad to the length given by the ``max_length``
      argument or, when none is provided, to the maximum length accepted by
      the model. Padding is applied even for a single sequence.
    * ``truncation=True`` (same as ``'longest_first'``): truncate to that
      same maximum length, removing tokens one at a time from the longest
      sequence until the proper length is reached.

    Returns whatever the tokenizer returns for the given texts.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


#  Apply tokenize_function to every split of the dataset.
#  batched=True asks map to call the function on batches of examples
#  instead of one example at a time, which speeds up processing.
#  API reference: https://huggingface.co/docs/datasets/v1.11.0/package_reference/main_classes.html?highlight=dataset%20map#datasets.Dataset.map
#  More on batch mode: https://huggingface.co/docs/datasets/en/about_map_batch
tokenized_hate = hate.map(tokenize_function, batched=True)

In [49]:
# Show the structure of the dataset returned by the map call
print(tokenized_hate)

DatasetDict({
    train: Dataset({
        features: ['comment_id', 'annotator_id', 'platform', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'hate_speech_score', 'text', 'infitms', 'outfitms', 'annotator_severity', 'std_err', 'annotator_infitms', 'annotator_outfitms', 'hypothesis', 'target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'target_race', 'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'target_religion', 'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'target_origin', 'target_gender_men', 'target

In [54]:
# print the first review (the raw "text" field) of the tokenized training split
# 1 line of code
### BEGIN SOLUTION
print(tokenized_hate["train"][0]["text"])
### END SOLUTION


# print the label of the first review of the tokenized dataset
# (the "hate_speech_score" column is the value read as the label here)
# 1 line of code
### BEGIN SOLUTION
print(tokenized_hate["train"][0]["hate_speech_score"])
### END SOLUTION

# print the token ids ("input_ids") of the first review of the tokenized dataset
# Can you spot the [CLS] and [SEP] tokens in the tokenized review?
# Hint: in this tokenizer's vocabulary [CLS] has id 101 and [SEP] has id 102.
# 1 line of code
### BEGIN SOLUTION
print(tokenized_hate["train"][0]["input_ids"])
### END SOLUTION


# print the attention_mask of the first review of the tokenized dataset
# 1 line of code
### BEGIN SOLUTION
print(tokenized_hate["train"][0]["attention_mask"])
### END SOLUTION


yeah, heres a clue for the nigger author of this article. Chinks hate niggers, Kikes hate niggers, Spicks hate niggers and Towelheads hate niggers. EVen niggers hate niggers. Black Africans hate the gauche lazy American blacks because they have no morals or work ethic. Niggrus Americanus is the lowest form of human (if you can even call them that) on the planet, we should erase them
3.91
[101, 3398, 1010, 2182, 2015, 1037, 9789, 2005, 1996, 9152, 13327, 3166, 1997, 2023, 3720, 1012, 5413, 5705, 5223, 9152, 13327, 2015, 1010, 11382, 9681, 5223, 9152, 13327, 2015, 1010, 11867, 6799, 2015, 5223, 9152, 13327, 2015, 1998, 10257, 13038, 5223, 9152, 13327, 2015, 1012, 2130, 9152, 13327, 2015, 5223, 9152, 13327, 2015, 1012, 2304, 18076, 5223, 1996, 11721, 19140, 13971, 2137, 10823, 2138, 2027, 2031, 2053, 25288, 2030, 2147, 3802, 16066, 1012, 9152, 13871, 7946, 2137, 2271, 2003, 1996, 7290, 2433, 1997, 2529, 1006, 2065, 2017, 2064, 2130, 2655, 2068, 2008, 1007, 2006, 1996, 4774, 1010, 2057, 23

In [56]:
# print the token_type_ids of the first review of the tokenized dataset
# (presumably all 0 because each example is a single text, not a pair of sequences)
# 1 line of code
### BEGIN SOLUTION
print(tokenized_hate["train"][0]["token_type_ids"])
### END SOLUTION

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 