Make sure your version of Transformers is at least 4.11.0 since the functionality was introduced in that version:

In [1]:
import transformers

print(transformers.__version__)

4.32.1


# Fine-tuning a model on a token classification task

This notebook is built to run on any token classification task, with any model checkpoint from the [Model Hub](https://huggingface.co/models) as long as that model has a version with a token classification head and a fast tokenizer (check on [this table](https://huggingface.co/transformers/index.html#bigtable) if this is the case). It might just need some small adjustments if you decide to use a different dataset than the one used here. Depending on you model and the GPU you are using, you might need to adjust the batch size to avoid out-of-memory errors. Set those three parameters, then the rest of the notebook should run smoothly:

In [2]:
# Setting up the task and model for natural language processing:
# - 'task' specifies the type of task. It can be 'ner' (Named Entity Recognition), 
#   'pos' (Part-of-Speech tagging), or 'chunk' (chunking). Here, 'ner' is chosen.
# - 'model_checkpoint' is set to use 'distilbert-base-uncased', which is a lightweight 
#   version of the BERT model and is not case-sensitive.
# - 'batch_size' is set to 16, defining the number of samples to work through
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

## Loading the dataset

We will load our data locally, define the feature columns, and represent it in DataseDict format.

In [3]:
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel, Sequence, Value, Features, load_dataset, load_metric

In [4]:
# Importing necessary libraries and modules.
# pandas is used for data manipulation and analysis.
# Dataset, DatasetDict, ClassLabel, Sequence, Value, Features are from the 'datasets' library for handling structured data.
# We used ChatGPT, prompted the dataset output of the original template and step by step built the same structure for our data.

def load_data(file_path):
    """
    Load the TSV file into a Pandas DataFrame.
    
    Args:
    file_path (str): Path to the .tsv file.

    Returns:
    DataFrame: A DataFrame containing the loaded data.
    """
    
    # Reading data from a TSV file into a DataFrame with specified column names.
    df = pd.read_csv(file_path, delimiter='\t', header=None, names=[
        'chapter', 'sentence_num', 'token_num', 'token', 'lemma', 
        'pos', 'constituency', 'negation_cue', 'negation_scope', 'negated_event'
    ])

    # Dropping rows where any column has missing values (NA).
    df = df.dropna(how='any')

    return df

def transform_data(df, pos_label_mapping, negation_scope_label_mapping):
    """
    Transform the DataFrame into a format suitable for machine learning models.

    Args:
    df (DataFrame): The input data frame.
    pos_label_mapping (dict): Mapping for POS tags to indices.
    negation_scope_label_mapping (dict): Mapping for negation scopes to indices.

    Returns:
    list: A list of dictionaries, each representing a transformed data record.
    """
    # Grouping the data by 'chapter' and 'sentence_num' for processing.
    grouped = df.groupby(['chapter', 'sentence_num'])
    transformed_data = []

    # Transforming the grouped data into a format suitable for training.
    for (chapter, sentence_num), group in grouped:
        record = {
            'id': f"{chapter}_{sentence_num}",
            'tokens': group['token'].tolist(),
            'lemmas': group['lemma'].tolist(),
            'pos': [pos_label_mapping[label] for label in group['pos'].tolist()],
            'constituency': group['constituency'].tolist(),
            'negation_cue': group['negation_cue'].tolist(),
            'negation_scope': [negation_scope_label_mapping[label] for label in group['negation_scope'].tolist()],
            'negated_event': group['negated_event'].tolist()
        }
        transformed_data.append(record)
    return transformed_data

# Paths to the training, testing, and development data files.
training_file_path = './data/bert_input_training.tsv' # Update with your file path
test_file_path = './data/bert_input_test.tsv' # Update with your file path
dev_file_path = './data/bert_input_dev.tsv'

# Loading the data from the specified file paths.
training_data = load_data(training_file_path)
test_data = load_data(test_file_path)
dev_data = load_data(dev_file_path)

# Identifying unique labels for POS tags and negation scopes.
# Extracting and combining POS tags from training and development datasets.
training_pos_tags = training_data['pos']
dev_pos_tags = dev_data['pos']
combined_pos_tags = pd.concat([training_pos_tags, dev_pos_tags])
unique_pos_tags = sorted(combined_pos_tags.unique().tolist())

# Creating a mapping of POS tags and negation scopes to unique indices.
pos_label_mapping = {label: idx for idx, label in enumerate(unique_pos_tags)}
unique_negation_scopes = sorted(training_data['negation_scope'].unique().tolist())
negation_scope_label_mapping = {label: idx for idx, label in enumerate(unique_negation_scopes)}

# Transforming the data to the required format.
transformed_training_data = transform_data(training_data, pos_label_mapping, negation_scope_label_mapping)
transformed_test_data = transform_data(test_data, pos_label_mapping, negation_scope_label_mapping)
transformed_dev_data = transform_data(dev_data, pos_label_mapping, negation_scope_label_mapping)

# Defining the features for the Hugging Face dataset.
features = Features({
    'id': Value(dtype='string'),
    'tokens': Sequence(feature=Value(dtype='string')),
    'lemmas': Sequence(feature=Value(dtype='string')),
    'pos': Sequence(feature=ClassLabel(names=unique_pos_tags)),
    'constituency': Sequence(feature=Value(dtype='string')),
    'negation_cue': Sequence(feature=Value(dtype='string')),
    'negation_scope': Sequence(feature=ClassLabel(names=unique_negation_scopes)),
    'negated_event': Sequence(feature=Value(dtype='string')),
})

# Converting the transformed data to Hugging Face datasets.
hf_training_dataset = Dataset.from_pandas(pd.DataFrame(transformed_training_data), features=features)
hf_test_dataset = Dataset.from_pandas(pd.DataFrame(transformed_test_data), features=features)
hf_dev_dataset = Dataset.from_pandas(pd.DataFrame(transformed_dev_data), features=features)

# Creating a DatasetDict to hold the training, validation, and test datasets.
dataset_dict = DatasetDict({
    'train': hf_training_dataset,
    'validation': hf_dev_dataset,
    'test': hf_test_dataset
})

# Printing the dataset dictionary to check its structure and contents
# The datasets object itself is DatasetDict, which contains one key for the training, validation and test set
# We can see the training, validation and test sets all have a column for the tokens (the input texts split into words) and one column of labels

print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'lemmas', 'pos', 'constituency', 'negation_cue', 'negation_scope', 'negated_event'],
        num_rows: 3780
    })
    validation: Dataset({
        features: ['id', 'tokens', 'lemmas', 'pos', 'constituency', 'negation_cue', 'negation_scope', 'negated_event'],
        num_rows: 816
    })
    test: Dataset({
        features: ['id', 'tokens', 'lemmas', 'pos', 'constituency', 'negation_cue', 'negation_scope', 'negated_event'],
        num_rows: 1118
    })
})


## Preprocessing the data

Before we can feed those texts to our model, we need to preprocess them. This is done by a 🤗 Transformers `Tokenizer` which will (as the name indicates) tokenize the inputs (including converting the tokens to their corresponding IDs in the pretrained vocabulary) and put it in a format the model expects, as well as generate the other inputs that model requires.

To do all of this, we instantiate our tokenizer with the `AutoTokenizer.from_pretrained` method, which will ensure:

- we get a tokenizer that corresponds to the model architecture we want to use,
- we download the vocabulary used when pretraining this specific checkpoint.

That vocabulary will be cached, so it's not downloaded again the next time we run the cell.

In [5]:
# Importing the AutoTokenizer class from the transformers library.
from transformers import AutoTokenizer

# Initializing the tokenizer with a pre-trained model.
# 'model_checkpoint' specifies the pre-trained model to use.
# This tokenizer will be used to convert text into tokens that the model can understand.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


The following assertion ensures that our tokenizer is a fast tokenizers (backed by Rust) from the 🤗 Tokenizers library. Those fast tokenizers are available for almost all models, and we will need some of the special features they have for our preprocessing.

In [6]:
# Importing the transformers library.
import transformers

# Asserting that the 'tokenizer' object is an instance of PreTrainedTokenizerFast.
# This assertion ensures that the tokenizer has the desired properties and functionalities
# of the PreTrainedTokenizerFast class, which is optimized for speed and efficiency.
# If the assertion fails, it will raise an AssertionError.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)


In [7]:
label_all_tokens = True

We're now ready to write the function that will preprocess our samples. We feed them to the `tokenizer` with the argument `truncation=True` (to truncate texts that are bigger than the maximum size allowed by the model) and `is_split_into_words=True` (as seen above). Then we align the labels with the token ids using the strategy we picked:

In [8]:
def find_max_length_across_datasets(dataset_dict):
    # This function is added to the block in order to maintain a consistent max_length for padding.
    max_length = 0
    for dataset_name, dataset in dataset_dict.items():
        for example in dataset:
            # 'tokens' is assumed to be the key for tokenized text
            length = len(example["tokens"])
            if length > max_length:
                max_length = length
    return max_length
# We find the longest input througout the dataset and keep that as the input length
max_length = find_max_length_across_datasets(dataset_dict)
    
def tokenize_and_align_labels(examples):
    
    # Tokenizing the input text. 'truncation=True' ensures inputs fit model max size.
    # 'is_split_into_words=True' indicates the input is already split into words.
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding='max_length', max_length=max_length)

    labels = []  # Initialize a list to store aligned labels for each tokenized input.

    # Iterate over each example. 'enumerate' provides a counter 'i' and the 'label' (negation scope).
    for i, label in enumerate(examples["negation_scope"]):
        # Get word IDs for each token to map them back to their respective words.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None  # Initialize a variable to track the previous word index.
        label_ids = []  # List to hold the labels for each token in the current example.

        # Iterate over each word ID in 'word_ids'.
        for word_idx in word_ids:
            # Assign -100 to special tokens (word ID is None), which are ignored in loss calculation.
            if word_idx is None:
                label_ids.append(-100)
            # Assign the label to the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For subsequent tokens of the same word:
            else:
                # If 'label_all_tokens' is True, use the same label; otherwise, use -100.
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx  # Update the previous word index.

        labels.append(label_ids)  # Add the list of labels for this example to the main list.

    # Add the aligned labels to the tokenized inputs.
    tokenized_inputs["labels"] = labels
    return tokenized_inputs  # Return the tokenized inputs with aligned labels.


To apply this function on all the sentences (or pairs of sentences) in our dataset, we just use the `map` method of our `dataset` object we created earlier. This will apply the function on all the elements of all the splits in `dataset`, so our training, validation and testing data will be preprocessed in one single command.

In [9]:
# Map the 'tokenize_and_align_labels' function to the 'dataset_dict' with batch processing (batched=True).
tokenized_datasets = dataset_dict.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3780 [00:00<?, ? examples/s]

Map:   0%|          | 0/816 [00:00<?, ? examples/s]

Map:   0%|          | 0/1118 [00:00<?, ? examples/s]

## Fine-tuning the model

Now that our data is ready, we can download the pretrained model and fine-tune it. Since all our tasks are about token classification, we use the `AutoModelForTokenClassification` class. Like with the tokenizer, the `from_pretrained` method will download and cache the model for us. The only thing we have to specify is the number of labels for our problem (which we can get from the features, as seen before):

In [11]:
# Import necessary libraries and classes from the 'transformers' library.
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Extracting the list of Negation Scope tag names from the 'hf_training_dataset'.
# This is done by accessing the 'negation_scope' feature in the dataset's features, 
# and then retrieving the names associated with this feature.
# The result is stored in 'label_list', which will contain all the unique Negation Scope tags 
# used in the training dataset.
label_list = hf_training_dataset.features["negation_scope"].feature.names

# Create a model instance using 'AutoModelForTokenClassification' and load the pretrained model using 'from_pretrained'.
# 'model_checkpoint' should contain the model name or path to the pretrained model.
# 'num_labels' is the number of unique labels for your token classification task, obtained from the features.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# At this point, 'model' contains the pretrained model with the specified number of output labels for token classification.

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The warning is telling us we are throwing away some weights (the `vocab_transform` and `vocab_layer_norm` layers) and randomly initializing some other (the `pre_classifier` and `classifier` layers). This is absolutely normal in this case, because we are removing the head used to pretrain the model on a masked language modeling objective and replacing it with a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do.

To instantiate a `Trainer`, we will need to define three more things. The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments), which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model, and all other arguments are optional:

In [12]:
# Extract the model name from the 'model_checkpoint' path.
model_name = model_checkpoint.split("/")[-1]

# Create a TrainingArguments object to configure the fine-tuning process.
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",  # Specify the output directory for the fine-tuned model.
    evaluation_strategy="epoch",        # Evaluation strategy, in this case, "epoch".
    learning_rate=1e-4,                # Learning rate for fine-tuning.
    per_device_train_batch_size=batch_size,  # Batch size for training on each device.
    per_device_eval_batch_size=batch_size,   # Batch size for evaluation on each device.
    num_train_epochs=1,                # Number of training epochs.
    weight_decay=0.01,                 # Weight decay for regularization.
    push_to_hub=False,                 # Whether or not to push the model to the Hugging Face Model Hub.
)

# After this code block, the 'args' object contains the configuration for the fine-tuning process.


Then we will need a data collator that will batch our processed examples together while applying padding to make them all the same size (each pad will be padded to the length of its longest example). There is a data collator for this task in the Transformers library, that not only pads the inputs, but also the labels:

In [13]:
from transformers import DataCollatorForTokenClassification
from evaluate import load

data_collator = DataCollatorForTokenClassification(tokenizer)


The last thing to define for our `Trainer` is how to compute the metrics from the predictions. Here we will load the [`seqeval`](https://github.com/chakki-works/seqeval) metric (which is commonly used to evaluate results on the CONLL dataset) via the Datasets library.
We changed the method of setting the metric as older method was going to be depracated : https://huggingface.co/docs/evaluate/choosing_a_metric

In [14]:
metric = load("seqeval")

So we will need to do a bit of post-processing on our predictions:
- select the predicted index (with the maximum logit) for each token
- convert it to its string label
- ignore everywhere we set a label of -100

The following function does all this post-processing on the result of `Trainer.evaluate` (which is a namedtuple containing predictions and labels) before applying the metric:

In [16]:
import numpy as np

# Define a function 'compute_metrics' that takes predictions and labels as input and calculates evaluation metrics.
def compute_metrics(p):
    predictions, labels = p

    # Find the index with the maximum logit for each token (argmax over the last dimension).
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (-100), convert predicted and true labels to their string representations.
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Calculate evaluation metrics using 'metric.compute'.
    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Return a dictionary containing precision, recall, F1 score, and accuracy.
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


Note that we drop the precision/recall/f1 computed for each category and only focus on the overall precision/recall/f1/accuracy.

Then we just need to pass all of this along with our datasets to the `Trainer`:

In [17]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

We can now finetune our model by just calling the `train` method:

In [18]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.060097,0.707547,0.757576,0.731707,0.979132




TrainOutput(global_step=237, training_loss=0.08531076294460377, metrics={'train_runtime': 641.2396, 'train_samples_per_second': 5.895, 'train_steps_per_second': 0.37, 'total_flos': 80060749226640.0, 'train_loss': 0.08531076294460377, 'epoch': 1.0})

The `evaluate` method allows you to evaluate again on the evaluation dataset or on another dataset:

In [19]:
trainer.evaluate()

{'eval_loss': 0.06009732931852341,
 'eval_precision': 0.7075471698113207,
 'eval_recall': 0.7575757575757576,
 'eval_f1': 0.7317073170731707,
 'eval_accuracy': 0.9791321426379427,
 'eval_runtime': 37.1309,
 'eval_samples_per_second': 21.976,
 'eval_steps_per_second': 1.374,
 'epoch': 1.0}

To get the precision/recall/f1 computed for each category now that we have finished training, we can apply the same function as before on the result of the `predict` method:

In [20]:
# Generate word_id_lists
word_id_lists = []
for example in tokenized_datasets["validation"]:
    word_ids = tokenizer(example["tokens"], truncation=True, is_split_into_words=True).word_ids()
    word_id_lists.append(word_ids)

# Perform prediction
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# Process predictions and labels
updated_true_predictions = []
updated_true_labels = []

for prediction, label, word_ids in zip(predictions, labels, word_id_lists):
    new_prediction = []
    new_label = []
    current_label = None
    previous_word_id = None

    for p, l, word_id in zip(prediction, label, word_ids):
        if l != -100:  # Only consider non-special tokens
            if word_id is None or word_id != previous_word_id:
                current_label = label_list[p]
            new_prediction.append(current_label)
            new_label.append(label_list[l])
            previous_word_id = word_id

    updated_true_predictions.append(new_prediction)
    updated_true_labels.append(new_label)

# Compute results with the updated predictions and labels
results = metric.compute(predictions=updated_true_predictions, references=updated_true_labels)
results


{'S': {'precision': 0.7356687898089171,
  'recall': 0.7777777777777778,
  'f1': 0.7561374795417348,
  'number': 297},
 'overall_precision': 0.7356687898089171,
 'overall_recall': 0.7777777777777778,
 'overall_f1': 0.7561374795417348,
 'overall_accuracy': 0.9797459031485914}

In [21]:
token_list = []
pred_list = []
label_list = []
sentence_id_list = []

# This block saves all tokens, predictions and their gold labels with their sentence ids into lists 
# and writes them into a tsv file to use as input for our evaluation script.

for i in range(len(hf_dev_dataset)):
    sentence = hf_dev_dataset[i]['tokens']
    sentence_id = hf_dev_dataset[i]['id']
    tokenized_input = tokenizer(sentence, is_split_into_words=True)
    tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
    no_cls_tokens = tokens[1:-1]
    for token in no_cls_tokens:
        token_list.append(token)
        sentence_id_list.append(sentence_id)
    token_list.append(' ')
    sentence_id_list.append(' ')


for pred, lab in zip(updated_true_predictions, updated_true_labels):
    # Iterate through the tokens in each sentence pair.
    for left_token, right_token in zip(pred, lab):
        pred_list.append(left_token)
        label_list.append(right_token)
    pred_list.append('\n')
    label_list.append('\n')
    
# Define the file name where you want to save the TSV file.
output_file = "../bert/data/predictions_260124_last.tsv"

# Open the file for writing in TSV format.
with open(output_file, "w") as tsv_file:
    for sent_id, token, lab, pred in zip(sentence_id_list,token_list,label_list,pred_list):
        if sent_id != ' ' and token != ' ' and lab != ' ' and pred != ' ':

            line = f"{sent_id}\t{token}\t{lab}\t{pred}"
            tsv_file.write(line + "\n")       
        else:
            tsv_file.write("\n")