In [1]:
!pip install evaluate
!pip install seqeval
!pip install datasets


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0

In [2]:
!pip install seqeval



In [3]:
import random
from datasets import load_dataset, Dataset

def preprocess_mountain_data():
    # Load the dataset
    all_data = load_dataset("DFKI-SLT/few-nerd", "supervised")

    # Define the mountain label
    mountain_label = 24

    def extract_mountain_data(dataset):
        mountain_data = []
        non_mountain_data = []

        for example in dataset:
            if mountain_label in example["fine_ner_tags"]:
                mountain_data.append(example)
            else:
                non_mountain_data.append(example)

        return mountain_data, non_mountain_data

    def balance_data(mountain_data, non_mountain_data, total_size):
        non_mountain_sample_size = total_size - len(mountain_data)
        balanced_data = mountain_data + random.sample(non_mountain_data, non_mountain_sample_size)
        random.shuffle(balanced_data)

        for example in balanced_data:
            example["ner_tags"] = [1 if tag == mountain_label else 0 for tag in example["fine_ner_tags"]]
            del example["fine_ner_tags"]

        return Dataset.from_dict({key: [example[key] for example in balanced_data] for key in balanced_data[0]})

    # Process train data
    train_mountain, train_non_mountain = extract_mountain_data(all_data["train"])
    balanced_train = balance_data(train_mountain, train_non_mountain, 2000)
    print(len(train_mountain))
    # Process validation data
    val_mountain, val_non_mountain = extract_mountain_data(all_data["validation"])
    balanced_val = balance_data(val_mountain, val_non_mountain, 500)

    # Process test data
    test_mountain, test_non_mountain = extract_mountain_data(all_data["test"])
    balanced_test = balance_data(test_mountain, test_non_mountain, 500)

    return balanced_train, balanced_val, balanced_test

# Run the preprocessing
train_data, val_data, test_data = preprocess_mountain_data()

# Print dataset sizes
print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


0000.parquet:   0%|          | 0.00/16.9M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/2.43M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/4.84M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/131767 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/18824 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/37648 [00:00<?, ? examples/s]

1502
Train set size: 2000
Validation set size: 500
Test set size: 500


In [4]:
!PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [6]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer
from sklearn.utils.class_weight import compute_class_weight
from torch import nn
from datasets import load_from_disk
import evaluate
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import torch


# Load the pre-trained model and tokenizer
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER", num_labels=2, ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")

# Define a function to tokenize the datasets
def tokenize_adjust_labels(all_samples_per_split):
    tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"], is_split_into_words=True)
    total_adjusted_labels = []
    for k in range(0, len(tokenized_samples["input_ids"])):
        prev_wid = -1
        word_ids_list = tokenized_samples.word_ids(batch_index=k)
        existing_label_ids = all_samples_per_split["ner_tags"][k]
        i = -1
        adjusted_label_ids = []
        for wid in word_ids_list:
            if wid is None:
                adjusted_label_ids.append(-100)
            elif wid != prev_wid:
                i = i + 1
                adjusted_label_ids.append(existing_label_ids[i])
                prev_wid = wid
            else:
                adjusted_label_ids.append(existing_label_ids[i])
        total_adjusted_labels.append(adjusted_label_ids)
    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

# Tokenize the datasets
tokenized_dataset_train = train_data.map(tokenize_adjust_labels, batched=True)
tokenized_dataset_val = val_data.map(tokenize_adjust_labels, batched=True)
tokenized_dataset_test = test_data.map(tokenize_adjust_labels, batched=True)


# Assuming your labels are defined as before
labels = [0] * (len(train_data) - 1502) + [1] * 1502

# Convert the classes list to a numpy array
classes = np.array([0, 1])

# Calculate class weights
class_weights = compute_class_weight('balanced', classes=classes, y=labels)
class_weights_dict = {i: weight for i, weight in zip(classes, class_weights)}

# Define a custom data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    predictions, labels = p
    predictions = torch.tensor(predictions)  # Convert NumPy array to PyTorch tensor
    predictions = torch.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    metric = evaluate.load("seqeval")
    results = metric.compute(predictions=true_predictions, references=true_labels)

    class_1_tp = 0
    class_1_fp = 0
    class_1_fn = 0
    for i in range(len(true_predictions)):
        class_1_tp += sum([(true_predictions[i][j] == 1 and true_labels[i][j] == 1) for j in range(len(true_predictions[i]))])
        class_1_fp += sum([(true_predictions[i][j] == 1 and true_labels[i][j] != 1) for j in range(len(true_predictions[i]))])
        class_1_fn += sum([(true_predictions[i][j] != 1 and true_labels[i][j] == 1) for j in range(len(true_predictions[i]))])

    class_1_precision = class_1_tp / max(float(class_1_tp + class_1_fp), 1e-9)
    class_1_recall = class_1_tp / max(float(class_1_tp + class_1_fn), 1e-9)
    class_1_f1 = 2 * (class_1_precision * class_1_recall) / max(class_1_precision + class_1_recall, 1e-9)

    return {
        "class_1_f1": class_1_f1,
        "class_1_precision": class_1_precision,
        "class_1_recall": class_1_recall,
        "overall_accuracy": results["overall_accuracy"]
    }

# Define a custom training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="class_1_f1",
    greater_is_better=True,
    save_total_limit=2,
    save_on_each_node=True,
)

# Define a custom trainer
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.57, 4.19], device=logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Initialize the trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

evaluation_results = trainer.evaluate(eval_dataset=tokenized_dataset_test)

print(evaluation_results)

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-large-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 1024

Epoch,Training Loss,Validation Loss


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

ValueError: Predictions and/or references don't match the expected format.
Expected format: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')},
Input predictions: [[tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0)], [tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0)], [tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0)], ..., [tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(1), tensor(1), tensor(1), tensor(1), tensor(0), tensor(0), tensor(1), tensor(1), tensor(1), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0)], [tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(1), tensor(1), tensor(1), tensor(0), tensor(0), tensor(0), tensor(0)], [tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0)]],
Input references: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ..., [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]