In [39]:
import transformers
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import numpy as np
import evaluate

In [40]:
# Hugging Face Hub id of the pretrained checkpoint; the cells below pass it to
# both the model loader and the tokenizer loader.
model_name = "facebook/roberta-hate-speech-dynabench-r4-target"

In [41]:
# Load the pretrained sequence-classification model for hate-speech detection
# from the checkpoint named by `model_name`.
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)

In [42]:
# Load the tokenizer from the same checkpoint as the model.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

In [43]:
# Read the labelled tweets from the local CSV file; the rows are accessed
# through the "train" split.
labeled_csv_path = "data/input/labeled_data.csv"
dataset = load_dataset("csv", data_files=labeled_csv_path)

# Preview the first record to see which columns are available.
dataset["train"][0]

Using custom data configuration default-fcd45673ee3a1c07
Found cached dataset csv (C:/Users/Tobias/.cache/huggingface/datasets/csv/default-fcd45673ee3a1c07/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/1 [00:00<?, ?it/s]

{'Unnamed: 0': 0,
 'count': 3,
 'hate_speech': 0,
 'offensive_language': 0,
 'neither': 3,
 'class': 2,
 'tweet': "!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out..."}

In [44]:
# Upper bound on tokens kept per tweet. Without an explicit max_length,
# padding="max_length" pads every example to the tokenizer's model maximum,
# which wastes GPU memory and compute on short tweets.
MAX_TOKENS = 128

# Target ids for the binary classification head.
# NOTE(review): assumes the checkpoint maps id 1 -> "hate" and id 0 -> "nothate";
# verify with `model.config.id2label` before training.
HATE_ID = 1
NOT_HATE_ID = 0

# NOTE(review): assumes the CSV's `class` column follows the vote-column order
# hate_speech / offensive_language / neither (so 0 = hate speech), which is
# consistent with the sample row where neither=3 votes gives class=2 — confirm
# against the dataset's documentation.
HATE_SPEECH_CLASS = 0


def tokenize_function(examples):
    """Tokenize a batch of tweets and attach the binary training target.

    Args:
        examples: batch dict as passed by `Dataset.map(batched=True)`; the
            "tweet" and "class" columns are read.

    Returns:
        The tokenizer output for the tweets (truncated to MAX_TOKENS, not
        padded) with an added "labels" list: HATE_ID where `class` equals
        HATE_SPEECH_CLASS, NOT_HATE_ID otherwise (offensive-but-not-hate and
        neutral tweets are both treated as not hate).
    """
    # No padding here: the Trainer is given the tokenizer, so its default
    # collator pads each batch to the longest sequence in that batch.
    encoded = tokenizer(examples["tweet"], truncation=True, max_length=MAX_TOKENS)
    # The Trainer drops the `class` column as unused, so without an explicit
    # "labels" column the model receives no targets and cannot compute a loss.
    encoded["labels"] = [
        HATE_ID if tweet_class == HATE_SPEECH_CLASS else NOT_HATE_ID
        for tweet_class in examples["class"]
    ]
    return encoded


tokenized_dataset = dataset.map(tokenize_function, batched=True)

Loading cached processed dataset at C:/Users/Tobias/.cache/huggingface/datasets/csv/default-fcd45673ee3a1c07/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-197a45c3c547060c.arrow


In [45]:
# Shuffle once with a fixed seed, then carve two disjoint 100-row slices out of
# the same permutation: rows 0-99 for training, rows 100-199 for evaluation.
shuffled_train = tokenized_dataset["train"].shuffle(seed=42)
small_train_dataset = shuffled_train.select(range(100))
small_eval_dataset = shuffled_train.select(range(100, 200))

Loading cached shuffled indices for dataset at C:/Users/Tobias/.cache/huggingface/datasets/csv/default-fcd45673ee3a1c07/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-a72a05bf7f124dbc.arrow
Loading cached shuffled indices for dataset at C:/Users/Tobias/.cache/huggingface/datasets/csv/default-fcd45673ee3a1c07/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a\cache-a72a05bf7f124dbc.arrow


In [58]:
# Hyperparameters for the fine-tuning run; "test_trainer" is the output directory.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate at the end of every epoch.
    evaluation_strategy="epoch",
)

# Accuracy metric consumed by compute_metrics.
metric = evaluate.load("accuracy")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [59]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: a pair that unpacks into (predictions, labels).
            Presumably the predictions are per-class scores with classes
            along axis 1 — confirm against the Trainer's output.

    Returns:
        Whatever `metric.compute` returns for the predicted class ids
        versus the reference labels.
    """
    model_outputs, reference_labels = eval_pred
    # Pick the highest-scoring class index for every example.
    predicted_ids = np.argmax(model_outputs, axis=1)
    return metric.compute(predictions=predicted_ids, references=reference_labels)

In [60]:
# Bundle the model, hyperparameters, data subsets, tokenizer and metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [61]:
# Fine-tune the model on the small training subset.
# NOTE(review): the output recorded for this cell ends in a CUDA out-of-memory
# error and reports a per-device batch size of 1, while the TrainingArguments
# cell sets 16 — the recorded output looks stale; re-run from a fresh kernel
# to confirm.
trainer.train()

***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 300
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet, count, class, neither, offensive_language, Unnamed: 0, hate_speech. If tweet, count, class, neither, offensive_language, Unnamed: 0, hate_speech are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.


RuntimeError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 10.00 GiB total capacity; 8.37 GiB already allocated; 0 bytes free; 8.39 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [17]:
# Wrap the model and tokenizer in an inference pipeline for classifying raw text.
# The task name is the generic "sentiment-analysis"; the labels come from the
# model itself (the recorded result below shows 'hate').
# NOTE(review): this cell's execution count (17) is lower than the cells above,
# so the recorded result was produced out of order — presumably before any
# fine-tuning; confirm on a fresh Restart & Run All.
pipeline = transformers.pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [26]:
# Run the classifier on a sample sentence; the recorded result is a list with
# one dict holding the predicted label and its score.
pipeline(" bitch who do you love ")

[{'label': 'hate', 'score': 0.9976891279220581}]