### Fine tune an HF model on Italian Hate Speech detection
* starting point: MilaNLProc/hate-ita-xlm-r-large
* fine tuned on: Paul/hatecheck-italian (its `test` split, re-split 90/10 into training and validation sets)
* note: the set of emails generated for one Town demo (contained in a csv file) is not loaded in this notebook

In [1]:
import numpy as np
from datasets import load_dataset, concatenate_datasets, ClassLabel, Features, Value
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import TrainingArguments, Trainer
import evaluate
from sklearn.metrics import f1_score

[2023-11-27 19:10:59,781] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Hugging Face Hub id of the dataset used for fine-tuning
# (only the `test_case` and `label_gold` columns are read)

DATASET_NAME = "Paul/hatecheck-italian"

# checkpoint that fine-tuning starts from (tokenizer and model are both loaded from it)
PRETRAINED_MODEL_NAME = "MilaNLProc/hate-ita-xlm-r-large"

# schema applied when loading the dataset: raw text plus a two-class label
# (index 0 = 'non-hateful', index 1 = 'hateful')
features = Features({
    'test_case': Value('string'),
    'label_gold': ClassLabel(names=['non-hateful', 'hateful'])
})

In [3]:
# labels are defined in English since the original dataset uses English labels

In [4]:
# all tools here
# tokenizer and sequence-classification model are loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# two output labels, matching the two ClassLabel names declared in `features`
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=2)

# accuracy metric from the `evaluate` library; consumed by compute_metrics
metric = evaluate.load("accuracy")

def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and truncation
    enabled, and its output is returned unchanged so it can be used directly
    as a ``Dataset.map`` callback.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

def compute_metrics(eval_pred):
    """Compute macro-F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: an object that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys ``"f1_macro"`` and ``"accuracy"``.
    """
    logits, labels = eval_pred
    # predicted class = index of the largest logit along the last axis
    predicted_ids = np.argmax(logits, axis=-1)

    accuracy = metric.compute(predictions=predicted_ids, references=labels)["accuracy"]
    macro_f1 = f1_score(labels, predicted_ids, average="macro")

    scores = {"f1_macro": macro_f1, "accuracy": accuracy}
    return scores

# for my custom dataset
def remove_non_breaking_space(example):
    """Replace non-breaking spaces (U+00A0) in ``example['text']`` with plain spaces.

    Works both for a single example (``example['text']`` is a ``str``) and for
    a batch as produced by ``Dataset.map(..., batched=True)``
    (``example['text']`` is a sequence of ``str``), so it can be used the same
    way as ``to_lower``.

    Args:
        example: mapping with a ``'text'`` entry; it is updated in place.

    Returns:
        The same ``example`` object, with ``'text'`` cleaned.
    """
    text = example['text']
    if isinstance(text, str):
        # single (non-batched) example
        example['text'] = text.replace('\xa0', ' ')
    else:
        # batched call: clean every string in the batch
        example['text'] = [s.replace('\xa0', ' ') for s in text]
    return example

def to_lower(example):
    """Lower-case the ``text`` field of a single example or of a batch.

    With ``Dataset.map(..., batched=True)`` the ``'text'`` entry is a sequence
    of strings and every element is lower-cased. With a non-batched call it is
    a single ``str``; that case is handled explicitly, because iterating over
    the string would otherwise yield a list of single characters.

    Args:
        example: mapping with a ``'text'`` entry; it is updated in place.

    Returns:
        The same ``example`` object, with ``'text'`` lower-cased.
    """
    text = example['text']
    if isinstance(text, str):
        # single (non-batched) example
        example['text'] = text.lower()
    else:
        # batched call: lower-case every string in the batch
        example['text'] = [s.lower() for s in text]

    return example

### Load and preprocess the data

In [5]:
# load the Italian hate speech dataset, keeping only the two columns declared in `features`
hatecheck_splits = load_dataset(
    DATASET_NAME,
    usecols=['test_case', 'label_gold'],
    features=features,
)

# only the "test" split is used; shuffle it with a fixed seed
dataset = hatecheck_splits["test"].shuffle(seed=42)

In [6]:
# sanity check: show the first record of the shuffled dataset
dataset[0]

{'test_case': 'Le recensioni di quel film sono semplicemente patetiche.',
 'label_gold': 0}

In [7]:
# confirm the loaded schema matches the `features` definition (string text + ClassLabel)
dataset.features

{'test_case': Value(dtype='string', id=None),
 'label_gold': ClassLabel(names=['non-hateful', 'hateful'], id=None)}

In [8]:
# rename the columns to "text" / "label" ("text" is what to_lower and
# tokenize_function read), then lower-case and tokenize in batches
dataset = dataset.rename_columns({"test_case": "text", "label_gold": "label"})

dataset = dataset.map(to_lower, batched=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [9]:
# hold out 10% of the samples for validation; the fixed seed keeps the split
# (and therefore the reported metrics) reproducible across kernel restarts
tokenized_datasets_train_test_split = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

In [10]:
# build the training and validation datasets from the 90/10 split
# NOTE(review): no custom CSV data is added to the training set in this notebook
train_ds = tokenized_datasets_train_test_split["train"].shuffle(seed=42)
eval_ds = tokenized_datasets_train_test_split["test"].shuffle(seed=42)

In [11]:
# report the size of each split
print(f"We have {len(train_ds)} samples in training dataset...")
print(f"We have {len(eval_ds)} samples in validation dataset...")

We have 3321 samples in training dataset...
We have 369 samples in validation dataset...


In [12]:
# show the columns available after tokenization
train_ds

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3321
})

### Prepare for training

In [13]:
# output directory for checkpoints; also used below when saving the model and tokenizer
SAVE_DIR = "haspeech_ita"

EPOCHS = 3

# per-device training batch size (passed to TrainingArguments below)
batch_size = 8

# Evaluate, log and checkpoint every 100 steps. The best checkpoint, selected by
# the "accuracy" value returned from compute_metrics, is reloaded at the end.
training_args = TrainingArguments(output_dir=SAVE_DIR,
                                  evaluation_strategy="steps",
                                  save_strategy="steps",
                                  logging_strategy="steps",
                                  num_train_epochs=EPOCHS,
                                  logging_steps=100,
                                  save_steps=100,
                                  eval_steps=100,
                                  # NOTE(review): the logged run ends at global_step=624, so a
                                  # 500-step warmup covers most of training - confirm this is intended
                                  warmup_steps=500,
                                  learning_rate=2e-5,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="accuracy",
                                  greater_is_better=True,
                                  save_total_limit=1,
                                  # use the constant defined above instead of a second hard-coded 8
                                  per_device_train_batch_size=batch_size
                                 )

In [14]:
# bundle the model, the training arguments, both dataset splits and the metric callback;
# compute_metrics supplies the f1_macro / accuracy values
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
)

### Training

In [15]:
# run the fine-tuning; the cell output is the returned TrainOutput
trainer.train()

Step,Training Loss,Validation Loss,F1 Macro,Accuracy
100,0.4001,0.172156,0.919847,0.937669
200,0.1669,0.035503,0.986532,0.98916
300,0.0824,0.083852,0.986368,0.98916
400,0.0672,0.027654,0.993266,0.99458
500,0.0761,0.048303,0.993266,0.99458
600,0.0413,0.024969,0.996623,0.99729


TrainOutput(global_step=624, training_loss=0.13370367174203962, metrics={'train_runtime': 1117.1415, 'train_samples_per_second': 8.918, 'train_steps_per_second': 0.559, 'total_flos': 9284832172836864.0, 'train_loss': 0.13370367174203962, 'epoch': 3.0})

### Saving

In [16]:
# write the model and the tokenizer files into SAVE_DIR
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

('haspeech_ita/tokenizer_config.json',
 'haspeech_ita/special_tokens_map.json',
 'haspeech_ita/sentencepiece.bpe.model',
 'haspeech_ita/added_tokens.json',
 'haspeech_ita/tokenizer.json')

### Save to HF Hub

In [17]:
# The first positional parameter of Trainer.push_to_hub is the commit message, not the
# repository name, so passing SAVE_DIR there only set the commit message to the
# directory name. Pass an explicit, descriptive commit message instead.
trainer.push_to_hub(commit_message="Fine-tuned model for Italian hate speech detection")

events.out.tfevents.1701111657.eee07ad15600.1142.0:   0%|          | 0.00/5.54k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.16k [00:00<?, ?B/s]

events.out.tfevents.1701112130.eee07ad15600.1142.1:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

events.out.tfevents.1701112270.eee07ad15600.2832.0:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

'https://huggingface.co/luigisaetta/haspeech_ita/tree/main/'