In [104]:
# !pip install transformers datasets evaluate seqeval
from datasets import load_dataset
import numpy as np
import evaluate
from transformers import DataCollatorForTokenClassification
from transformers import TFAutoModelForTokenClassification
from transformers import create_optimizer
from transformers.keras_callbacks import KerasMetricCallback
from transformers import pipeline
from tensorflow.keras.models import load_model

In [105]:
# Define the file paths for the training and evaluation
PATH_TO_TRAIN_DATA = "../tokenaized_data/train_data.json"
PATH_TO_EVAL_DATA = "../tokenaized_data/eval_data.json"

# Load the dataset from JSON files using the datasets library
dataset = load_dataset(
    "json",
    data_files={
        "train": PATH_TO_TRAIN_DATA,
        "evaluation": PATH_TO_EVAL_DATA,
    },
    field="data"  # Indicate the field in the JSON files where the actual data is located
)

In [106]:
# Import the AutoTokenizer class from the transformers library
from transformers import AutoTokenizer

# Create an instance of AutoTokenizer and load the tokenizer for the "distilbert-base-uncased" model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [107]:
# # Retrieves the first example from the training split of the dataset
# example = dataset["train"][0]
#
# # Tokenizes the input text using the previously loaded tokenizer.
# # is_split_into_words=True indicates that the input is already split into words.
# tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
#
# # Converts the token IDs back to tokens (words or subwords)
# tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

# Define a function to tokenize inputs and align labels for NER
def tokenize_and_align_labels(examples):
    """
    Tokenizes the inputs using the tokenizer, specifying truncation
    and that the input is already split into words.

    Args:
        examples (dict): Dictionary containing data for tokenization.

    Returns:
        dict: Tokenized data with aligned labels.
    """
    # Tokenizes the inputs using the tokenizer, specifying truncation and that the input is already split into words
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # Initialize an empty list to store aligned labels
    labels = []

    # Iterate through examples and their NER tags
    for i, label in enumerate(examples[f"ner_tags"]):
        # Map tokens to their respective word
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        # Align labels with tokenized inputs
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Set special tokens to -100
            elif word_idx != previous_word_idx:  # Only label the first token of a given word
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)  # Append the label IDs for this example

    tokenized_inputs["labels"] = labels  # Add aligned label IDs to tokenized inputs
    return tokenized_inputs  # Return the tokenized inputs with labels aligned

# Apply the tokenize_and_align_labels function to the entire dataset in a batched manner
# This processes the dataset for NER by tokenizing inputs and aligning labels
tokenized_data = dataset.map(tokenize_and_align_labels, batched=True)
print(tokenized_data)

Map:   0%|          | 0/642 [00:00<?, ? examples/s]

Map:   0%|          | 0/161 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ner_tags', 'id', 'tokens', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 642
    })
    evaluation: Dataset({
        features: ['ner_tags', 'id', 'tokens', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 161
    })
})


In [108]:
# Create an instance of DataCollatorForTokenClassification
# This class prepares batches of tokenized inputs and labels for token classification
# The tokenizer argument specifies the tokenizer to use for tokenization
# return_tensors="tf" configures the output to be TensorFlow tensors
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [109]:
# Conduct sequence labeling evaluation
seqeval = evaluate.load("seqeval")

In [110]:
# List of labels used in Named Entity Recognition (NER)
label_list = ["0", "B-geo", "I-geo"]

def compute_metrics(p):
    """
    Computes evaluation metrics.

    Args:
        p (tuple): A tuple containing predictions and true labels.

    Returns:
        dict: A dictionary containing precision, recall, F1-score, and accuracy metrics.
    """
    # Extract predictions and true labels from the input tuple
    predictions, labels = p

    # Convert predicted labels to indices with maximum probability along the last axis
    predictions = np.argmax(predictions, axis=2)

    # Convert predicted and true labels to label strings from label_list, excluding padding tokens (-100)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Define a checkpoint path for saving model weights
    checkpoint_path = "runs/cp-{epoch:04d}.ckpt"

    # Save model weights at epoch 0
    model.save_weights(checkpoint_path.format(epoch=0))

    # Compute NER evaluation metrics using seqeval library
    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    # Return a dictionary containing precision, recall, F1-score, and accuracy metrics
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [111]:
# Maps numerical IDs to their respective labels. Provides the reverse mapping, associating labels with their corresponding numerical id.
id2label = {
    0: "O",
    1: "B-geo",
    2: "I-geo",
}
label2id = {
    "O": 0,
    "B-geo": 1,
    "I-geo": 2,
}

In [112]:
# Load a pre-trained model ("distilbert-base-uncased") for token classification
# Set the number of labels to 3 (assuming 3 labels in this case: "O", "B-geo", "I-geo")
# Provide mappings between numerical id and labels using id2label and label2id dictionaries
model = TFAutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [113]:
# Prepare a TensorFlow dataset for training
tf_train_set = model.prepare_tf_dataset(
    tokenized_data["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

# Prepare a TensorFlow dataset for validation
tf_validation_set = model.prepare_tf_dataset(
    tokenized_data["evaluation"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [114]:
# Number of samples processed in each training batch
batch_size = 4

# Number of training epochs
num_train_epochs = 10

# Calculate the total number of training steps
num_train_steps = (len(tokenized_data["train"]) // batch_size) * num_train_epochs

# Create optimizer and learning rate scheduler
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,  # Initial learning rate
    num_train_steps=num_train_steps,  # Total number of training steps
    weight_decay_rate=0.01,  # Rate of weight decay for regularization
    num_warmup_steps=0,  # Number of warm-up steps for learning rate warm-up
)

In [115]:
# model.compile(optimizer='adam', metrics=['accuracy'])
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# model.compile(optimizer=optimizer, metrics=['accuracy'])  # No loss argument!
# Compiles the model
model.compile(optimizer=optimizer)  # No loss argument!
#
# model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01), loss=tf.keras.losses.Loss(),
#               metrics=[tf.keras.metrics.BinaryAccuracy(),
#                        tf.keras.metrics.FalseNegatives()])

In [116]:
# Create a KerasMetricCallback for tracking custom evaluation metrics during training
# Uses the compute_metrics function for evaluation on the validation dataset
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [117]:
# Create a list of callbacks containing the KerasMetricCallback
callbacks = [metric_callback]

In [118]:
# Train the model using the prepared TensorFlow datasets for training and validation
# The x parameter refers to the training dataset, `validation_data` to the validation dataset
# epochs specifies the number of training epochs
# callbacks contains the list of callbacks
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_train_epochs, callbacks=callbacks)

Epoch 1/10
Epoch 2/10




Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x26f01d43700>

In [None]:
# The path to the folder with model
model_path = "../runs"

# Saving the model to the "runs" directory
model.save("runs")
# model.summary()

In [120]:
text = "In the heart of every mountain range lies a story as ancient as time itself, inscribed within the very fabric of stone and ice. These narratives weave tales of human endeavor, courage, and the relentless pursuit of conquering the unconquerable. From the serene and awe-inspiring Sierra Nevada to the rugged expanse of the Swiss Alps these colossal peaks stand as testaments to human resilience and the enduring power of nature. Their timeless grandeur bears witness to the indomitable spirit of exploration that courses through humanity’s veins, a testament to our unwavering quest for discovery amidst the vast and formidable landscapes of this world."

In [121]:
# Create a Named Entity Recognition (NER) pipeline using the provided model and tokenizer
pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Process the 'text' input using the NER pipeline with aggregation strategy set to "max"
print(pipeline(text, aggregation_strategy="max"))

[{'entity_group': 'geo', 'score': 0.985183, 'word': 'sierra nevada', 'start': 279, 'end': 292}, {'entity_group': 'geo', 'score': 0.9863523, 'word': 'swiss alps', 'start': 322, 'end': 332}]


In [122]:
# # Use the loaded model for making predictions
# model = load_model("runs")
# predictions = model.predict(text)