First of all, let's import.

In [1]:
# import
import torch
import numpy as np
import pandas as pd
import os
from typing import Dict
import torch
from datasets import load_dataset
from datasets import load_metric
from transformers import DataCollatorWithPadding
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
) 

Now, we set some model and training's parameters.

In [2]:
torch.cuda.is_available()

True

In [3]:
### Model Parameters

language_model_name = "distilbert-base-uncased"

### Training Argurments
batch_size = 32
output_dir = "training_dir"
trained = False

# optim
learning_rate = 1e-4
weight_decay = 0.001

# training
epochs = 1
device = "cuda" if torch.cuda.is_available() else "cpu"

set_seed(42)

print(f"Language Model: {language_model_name} and Device: {device}")

Language Model: distilbert-base-uncased and Device: cuda


In [4]:
# load the dataset
dataset = load_dataset("tommasobonomo/sem_augmented_fever_nli")

# Let's see an example...
print(dataset)
print(dataset['train'][0])
print("Label of the first sentence:", dataset['train'][0]['label']) 

# Take a look at the labels
all_labels = []
for label in dataset['train'].unique("label"):
    all_labels.append(label)

print(all_labels) ## ['ENTAILMENT', 'NEUTRAL', 'CONTRADICTION'] == ['SUPPORTS', 'NOT ENOUGH INFO', 'REFUTES']

# Label mapping

label_map = {
    'ENTAILMENT': 0,
    'NEUTRAL': 1,
    'CONTRADICTION': 2
}

def convert_labels(example):
    example['label'] = label_map[example['label']]
    return example

DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl'],
        num_rows: 51086
    })
    validation: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl'],
        num_rows: 2288
    })
    test: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl'],
        num_rows: 2287
    })
})
{'id': '150448', 'premise': "Roman Atwood . He is best known for his vlogs , where he posts updates about his life on a daily basis . His vlogging channel , `` RomanAtwoodVlogs '' , has a total of 3.3 billion views and 11.9 million subscribers . He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks .", 'hypothesis': 'Roman Atwood is a content creator.', 'label': 'ENTAILMENT', 'wsd': {'premise': [{'index': 0, 'text': 'Roman', 'pos': 'ADJ', 'lemma': 'roman', 'bnSynsetId': 'bn:00109913a', 'wnSynsetOffset': '2921569a', 'nltkSynset': 'roman.a.01'}, {'index': 1, 'text': '

In [5]:
# Define the metrics

def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")

   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels, average='macro')["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [6]:
## Initialize the model

if os.path.exists(output_dir) and os.path.isdir(output_dir):
    print("Loading the fine-tuned model from the saved directory...")
    model = AutoModelForSequenceClassification.from_pretrained(output_dir).to(device)
    tokenizer = AutoTokenizer.from_pretrained(output_dir)
    trained = True
else:
    print("Training the model from scratch...")
    model = AutoModelForSequenceClassification.from_pretrained(language_model_name, 
                                                                   ignore_mismatched_sizes=True,
                                                                   output_attentions=False, output_hidden_states=False,
                                                                   num_labels=3).to(device)
    tokenizer = AutoTokenizer.from_pretrained(language_model_name)
    trained = False

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], padding=True, truncation=True)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.map(convert_labels)

Loading the fine-tuned model from the saved directory...


In [7]:
# Let's see the tokenized dataset and the first example

print(tokenized_datasets)
print(f"First example premise: {tokenized_datasets['train'][0]['premise']} and hypothesis: {tokenized_datasets['train'][0]['hypothesis']}")
print("Tokenized input: ", tokenized_datasets['train'][0]['input_ids'])

# Print the labels
print("Label of the first sentence:", tokenized_datasets['train'][0]['label'])
all_labels = []
for label in tokenized_datasets['train'].unique("label"):
    all_labels.append(label)
print(all_labels)

# Test the corrispondence by decoding the input_ids
print("Decoded input: ", tokenizer.decode(tokenized_datasets['train'][0]['input_ids']))



DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl', 'input_ids', 'attention_mask'],
        num_rows: 51086
    })
    validation: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl', 'input_ids', 'attention_mask'],
        num_rows: 2288
    })
    test: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label', 'wsd', 'srl', 'input_ids', 'attention_mask'],
        num_rows: 2287
    })
})
First example premise: Roman Atwood . He is best known for his vlogs , where he posts updates about his life on a daily basis . His vlogging channel , `` RomanAtwoodVlogs '' , has a total of 3.3 billion views and 11.9 million subscribers . He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks . and hypothesis: Roman Atwood is a content creator.
Tokenized input:  [101, 3142, 2012, 3702, 1012, 2002, 2003, 2190, 2124, 2005, 2010, 1058, 21197, 2015, 1010, 2073, 2002, 8466, 14409

### MODEL TRAINING

In [8]:
os.makedirs(output_dir, exist_ok=True)

# Define the training arguments
training_args = TrainingArguments(
    output_dir=output_dir,                            # output directory [Mandatory]
    num_train_epochs=epochs,                          # total number of training epochs
    per_device_train_batch_size=batch_size,           # batch size per device during training
    per_device_eval_batch_size=batch_size,            # batch size for evaluation
    warmup_steps=500,                                 # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                        # strength of weight decay
    save_strategy="no",
    learning_rate=learning_rate                       # learning rate
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                                       # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                                # training arguments, defined above
    train_dataset=tokenized_datasets["train"],         # training dataset
    eval_dataset=tokenized_datasets["validation"],     # evaluation dataset
    tokenizer=tokenizer,                               # the tokenizer
    data_collator=data_collator,                       # data collator
    compute_metrics=compute_metrics                    # the callback that computes metrics of interest
)

In [9]:
# Train the model
if not trained:
    print("Training the model...")
    trainer.train()
else:
    print("Model already trained.")

Model already trained.


In [10]:
if not trained:
    print("Saving the model...")
    trainer.save_model(output_dir)
    trained = True
else:
    print("Model already saved.")

Model already saved.


In [11]:
# evaluate the model
if trained:
    print("Evaluating the model...")
    metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
    # print the metrics
    print("Evaluation results for the initial test dataset: ", metrics)


Evaluating the model...


  0%|          | 0/72 [00:00<?, ?it/s]

  load_accuracy = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluation results for the initial test dataset:  {'eval_loss': 0.7723000049591064, 'eval_accuracy': 0.7022299956274596, 'eval_f1': 0.686098316835566, 'eval_runtime': 20.2788, 'eval_samples_per_second': 112.778, 'eval_steps_per_second': 3.551}


In [12]:
# Now let's do evaluation on the Adversarial Fever Dataset.
# Let's start loading the new dataset.

adversarial_dataset = load_dataset("iperbole/adversarial_fever_nli")

# Let's see an example...
print(adversarial_dataset)
print(adversarial_dataset['test'][0])

DatasetDict({
    test: Dataset({
        features: ['part', 'cid', 'premise', 'hypothesis', 'label'],
        num_rows: 337
    })
})
{'part': 'manual_adversarial', 'cid': 58846, 'premise': 'Johnny Galecki . He is known for playing David Healy in the ABC sitcom Roseanne from 1992 -- 1997 and Dr. Leonard Hofstadter in the CBS sitcom The Big Bang Theory since 2007 .', 'hypothesis': 'The number of sitcoms from France in which Johnny Galecki has played a character is greater or equal to 2', 'label': 'NEUTRAL'}


In [13]:
# Tokenize and convert the labels
tokenized_adversarial = adversarial_dataset.map(tokenize_function, batched=True)
tokenized_adversarial = tokenized_adversarial.map(convert_labels)

print(tokenized_adversarial)
print(tokenized_adversarial['test'][0]['label'])

DatasetDict({
    test: Dataset({
        features: ['part', 'cid', 'premise', 'hypothesis', 'label', 'input_ids', 'attention_mask'],
        num_rows: 337
    })
})
1


In [14]:
# Let's do the evaluation
eval_results = trainer.evaluate(eval_dataset= tokenized_adversarial["test"])
print("Evaluation results for the Adversarial Fever Dataset: ", eval_results)


  0%|          | 0/11 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluation results for the Adversarial Fever Dataset:  {'eval_loss': 1.291935682296753, 'eval_accuracy': 0.5014836795252225, 'eval_f1': 0.5038331002848744, 'eval_runtime': 3.6556, 'eval_samples_per_second': 92.188, 'eval_steps_per_second': 3.009}


In [15]:
# Now let's work on another task: we are going to train the model on the dataset composed by the concatenation of the initial training Fever Dataset and my augmented dataset.
# Let's start loading the new dataset.
from datasets import DatasetDict


augmented_dataset = DatasetDict.load_from_disk('dataset_dict')
# Drop the 'wsd' and 'srl' features. -> Labels already converted.
augmented_dataset['test'] = augmented_dataset['test'].remove_columns(['wsd', 'srl'])
print("My augmented dataset: \n", augmented_dataset)

# We use the same training parameters, tokenizer and model as before. -> different training dir.
output_dir_augmented = "training_dir_augmented"


My augmented dataset: 
 DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label'],
        num_rows: 61086
    })
    validation: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label'],
        num_rows: 7288
    })
    test: Dataset({
        features: ['id', 'premise', 'hypothesis', 'label'],
        num_rows: 2287
    })
})


In [16]:
## Initialize the model

if os.path.exists(output_dir_augmented) and os.path.isdir(output_dir_augmented):
    print("Loading the fine-tuned model from the saved directory...")
    model_aug = AutoModelForSequenceClassification.from_pretrained(output_dir_augmented).to(device)
    tokenizer_aug = AutoTokenizer.from_pretrained(output_dir_augmented)
    trained = True
else:
    print("Training the new model from scratch...")
    model_aug = AutoModelForSequenceClassification.from_pretrained(language_model_name, 
                                                                   ignore_mismatched_sizes=True,
                                                                   output_attentions=False, output_hidden_states=False,
                                                                   num_labels=3).to(device)
    tokenizer_aug = AutoTokenizer.from_pretrained(language_model_name)
    trained = False

data_collator = DataCollatorWithPadding(tokenizer=tokenizer_aug)

def tokenize_function(examples):
    return tokenizer_aug(examples["premise"], examples["hypothesis"], padding=True, truncation=True)

# Tokenize the dataset
tokenized_augmented_dataset = augmented_dataset.map(tokenize_function, batched=True)

Loading the fine-tuned model from the saved directory...


In [17]:
# Let's check if the tokenization is correct
print("Tokenized input: ", tokenized_augmented_dataset['train'][0]['input_ids'])

Tokenized input:  [101, 3142, 2012, 3702, 1012, 2002, 2003, 2190, 2124, 2005, 2010, 1058, 21197, 2015, 1010, 2073, 2002, 8466, 14409, 2055, 2010, 2166, 2006, 1037, 3679, 3978, 1012, 2010, 1058, 21197, 4726, 3149, 1010, 1036, 1036, 3142, 4017, 3702, 2615, 21197, 2015, 1005, 1005, 1010, 2038, 1037, 2561, 1997, 1017, 1012, 1017, 4551, 5328, 1998, 2340, 1012, 1023, 2454, 17073, 1012, 2002, 2036, 2038, 2178, 7858, 3149, 2170, 1036, 1036, 3142, 4017, 3702, 1005, 1005, 1010, 2073, 2002, 8466, 26418, 2015, 1012, 102, 3142, 2012, 3702, 2003, 1037, 4180, 8543, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [18]:
os.makedirs(output_dir_augmented, exist_ok=True)

# Define the training arguments
training_args = TrainingArguments(
    output_dir=output_dir_augmented,                  # output directory [Mandatory]
    num_train_epochs=epochs,                          # total number of training epochs
    per_device_train_batch_size=batch_size,           # batch size per device during training
    per_device_eval_batch_size=batch_size,                    # batch size for evaluation
    warmup_steps=500,                                 # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                        # strength of weight decay
    save_strategy="no",
    learning_rate=learning_rate                       # learning rate
)

# Initialize the Trainer
trainer_aug = Trainer(
    model=model_aug,                                            # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                                         # training arguments, defined above
    train_dataset=tokenized_augmented_dataset["train"],         # training dataset
    eval_dataset=tokenized_augmented_dataset["validation"],     # evaluation dataset
    tokenizer=tokenizer_aug,                                    # the tokenizer
    data_collator=data_collator,                                # data collator
    compute_metrics=compute_metrics                             # the callback that computes metrics of interest
)

In [19]:
# Train the model
if not trained:
    print("Training the model...")
    trainer_aug.train()
else:
    print("Model already trained.")

Model already trained.


In [20]:
if not trained:
    print("Saving the model...")
    trainer_aug.save_model(output_dir_augmented)
    trained = True
else:
    print("Model already saved.")

Model already saved.


In [21]:
# evaluate the model
if trained:
    print("Evaluating the model...")
    metrics = trainer_aug.evaluate(eval_dataset=tokenized_augmented_dataset["test"])
    # print the metrics
    print("Evaluation results for the initial test dataset: ", metrics)


Evaluating the model...


  0%|          | 0/72 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluation results for the initial test dataset:  {'eval_loss': 0.8025291562080383, 'eval_accuracy': 0.7092260603410582, 'eval_f1': 0.692296233672235, 'eval_runtime': 19.431, 'eval_samples_per_second': 117.699, 'eval_steps_per_second': 3.705}


In [22]:
#Finally let's do evaluation on the Adversarial Fever Dataset.
# Let's do the evaluation
new_eval_results = trainer_aug.evaluate(eval_dataset= tokenized_adversarial["test"])
print("Evaluation results for the Adversarial Fever Dataset: ", new_eval_results)

  0%|          | 0/11 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Evaluation results for the Adversarial Fever Dataset:  {'eval_loss': 1.3428587913513184, 'eval_accuracy': 0.5133531157270029, 'eval_f1': 0.5140298545393321, 'eval_runtime': 3.6459, 'eval_samples_per_second': 92.433, 'eval_steps_per_second': 3.017}
