### Fine tune an HF model on Italian Sentiment Analysis
* starting point: the neuraly/bert-base-italian-cased-sentiment model
* fine tuned on: cardiffnlp/tweet_sentiment_multilingual, italian subset
* plus a set of emails generated for one Town demo (contained in a csv file)

In [1]:
import numpy as np
from datasets import load_dataset, concatenate_datasets, ClassLabel, Features, Value
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import TrainingArguments, Trainer
import evaluate

[2023-11-24 11:44:38,807] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Hub identifiers: the checkpoint to fine tune and the public dataset (with its subset)
PRETRAINED_MODEL_NAME = "neuraly/bert-base-italian-cased-sentiment"
DATASET_NAME = "cardiffnlp/tweet_sentiment_multilingual"
SUBSET_NAME = "italian"

# hand labelled csv file: emails and their sentiment, to be added to the training dataset
FILE_NAME = "labelled_text.csv"

In [3]:
# shared tools used by the rest of the notebook: tokenizer, model and metric

# tokenizer loaded from the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# sequence classification model configured for 3 labels
model = AutoModelForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME,
    num_labels=3,
)

# accuracy metric, used by compute_metrics during evaluation
metric = evaluate.load("accuracy")

def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the shared tokenizer.

    The tokenizer is called with padding="max_length" and truncation=True
    and its output is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

def compute_metrics(eval_pred):
    """Compute the metric from a (logits, labels) pair.

    The predicted class is the index of the highest logit along the last axis;
    predictions and labels are then handed to the shared `metric` object.
    """
    logits, labels = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

# cleaning helper for the custom (csv) dataset
def remove_non_breaking_space(example):
    """Replace every non-breaking space in example['text'] with a regular space.

    The example is updated in place and the same object is returned.
    """
    pieces = example['text'].split('\xa0')
    example['text'] = ' '.join(pieces)
    return example

### Load and preprocess the data

In [4]:
# load the cardiff dataset from the Hub, restricted to the italian subset
dataset = load_dataset(path=DATASET_NAME, name=SUBSET_NAME)

In [5]:
# have a look at one example: the record at index 100 of the train split
dataset["train"][100]

{'text': 'articolo di Valerio Valentini per http Mario Monti ha presentato, nei giorni scorsi, la... http',
 'label': 1}

In [6]:
# show the features (column names and types) of the train split
dataset["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None)}

#### Adding my emails to the training dataset

In [7]:
DELIMITER = ","

# the label column (int in the csv) is declared as ClassLabel, with the same names
# as the cardiff dataset, in order to be able to concatenate the two datasets
label_names = ['negative', 'neutral', 'positive']
features = Features({
    'text': Value('string'),
    'label': ClassLabel(names=label_names)
})

# the csv is loaded as a set of splits: take (the only) train split
csv_splits = load_dataset('csv', data_files=FILE_NAME, delimiter=DELIMITER, features=features)

# remove non breaking space from every text
my_csv_ds = csv_splits["train"].map(remove_non_breaking_space)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

In [8]:
# have a look at one example of the custom dataset: the record at index 1
my_csv_ds[1]

{'text': 'Buongiorno sono utente9  volevo chiedere alcune informazioni relative al servizio4 e dove devo recarmi per poter sbrigare le pratiche relative  grazie  Matteo',
 'label': 1}

In [9]:
# tokenize, in batches, all the splits of the cardiff dataset...
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

# ...and the custom dataset loaded from the csv
my_csv_ds_tokenized = my_csv_ds.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

In [10]:
# build the training and validation datasets
# the validation split is the only one used for evaluation
eval_ds = tokenized_datasets["validation"].shuffle(seed=42)

# shuffled train and test splits of the cardiff dataset
train_ds, test_ds = (
    tokenized_datasets[split].shuffle(seed=42) for split in ("train", "test")
)

# final training dataset: train split + test split + custom dataset from csv
# (the custom dataset is added only to the training dataset;
# note that the test split is merged into training as well)
training_parts = [train_ds, test_ds, my_csv_ds_tokenized]
ext_train_ds = concatenate_datasets(training_parts)

In [12]:
# report the number of samples in the final training and validation datasets
print(f"We have {len(ext_train_ds)} samples in training dataset...")
print(f"We have {len(eval_ds)} samples in validation dataset...")

We have 2771 samples in training dataset...
We have 324 samples in validation dataset...


### Prepare for training

In [13]:
# local directory used as output_dir for the training run
SAVE_DIR = "sentiment_ita"

EPOCHS = 14

training_args = TrainingArguments(
    output_dir=SAVE_DIR,
    # optimization schedule
    num_train_epochs=EPOCHS,
    learning_rate=4e-5,
    warmup_steps=800,
    # evaluate, log and save a checkpoint every 100 steps
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,
    # model selection: best model = highest accuracy, loaded at the end of training
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

In [14]:
# the trainer receives the model, the training arguments, the extended training
# dataset, the validation dataset and the function computing the accuracy
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ext_train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
)

### Training

In [15]:
# launch the fine tuning with the arguments and datasets configured in the trainer
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss,Accuracy
100,1.4695,1.028592,0.617284
200,0.7416,0.785186,0.645062
300,0.6165,0.786511,0.657407
400,0.5304,0.862937,0.682099
500,0.4296,0.831812,0.691358
600,0.282,1.058694,0.691358
700,0.2344,1.329301,0.657407
800,0.1327,1.549295,0.67284
900,0.1603,1.809868,0.660494
1000,0.1048,1.756739,0.669753


TrainOutput(global_step=2436, training_loss=0.20304089768717834, metrics={'train_runtime': 1009.115, 'train_samples_per_second': 38.444, 'train_steps_per_second': 2.414, 'total_flos': 1.0207221927340032e+16, 'train_loss': 0.20304089768717834, 'epoch': 14.0})

### Saving

In [16]:
# save the model and the tokenizer in the same local directory (SAVE_DIR)
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

('sentiment_ita/tokenizer_config.json',
 'sentiment_ita/special_tokens_map.json',
 'sentiment_ita/vocab.txt',
 'sentiment_ita/added_tokens.json',
 'sentiment_ita/tokenizer.json')

### Save to HF Hub

In [17]:
# The first positional parameter of Trainer.push_to_hub is the commit message,
# not the repository name, so passing SAVE_DIR there only produced a commit
# titled "sentiment_ita". The target repository is taken from the training
# arguments (presumably hub_model_id, else the output_dir name - confirm against
# the installed transformers version). Pass an explicit commit message instead.
trainer.push_to_hub(
    commit_message="Fine tuned on tweet_sentiment_multilingual (italian) + custom emails"
)

training_args.bin:   0%|          | 0.00/4.16k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1700826307.5aec582beaec.38279.0:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

'https://huggingface.co/luigisaetta/sentiment_ita/tree/main/'