In [2]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
dataset = load_dataset("json", data_files="/home/s27mhusa_hpc/Master-Thesis/ner_dataset.json")
# Load each split from local JSON files
train_dataset = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_train.json")
val_dataset   = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_val.json")
test_dataset  = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_test.json")

# Combine into a single DatasetDict (optional, but useful)
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})


model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [3]:
print(dataset)
print(dataset["train"][0])


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 155
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 19
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 20
    })
})
{'tokens': ['Title', ':', '\n', 'Continuous', 'thermosalinograph', 'oceanography', 'along', 'RV', 'Meteor', 'cruise', 'track', 'M185', '\n\n', 'Abstract', ':', '\n', 'Underway', 'temperature', 'and', 'salinity', 'data', 'was', 'collected', 'along', 'the', 'cruise', 'track', 'with', 'two', 'autonomous', 'thermosalinograph', '(', 'TSG', ')', 'systems', ',', 'each', 'consisting', 'of', 'a', 'SBE21', 'TSG', 'together', 'with', 'a', 'SBE38', 'Thermometer', '.', 'Both', 'systems', 'worked', 'independent', 'from', 'each', 'other', 'throughout', 'the', 'cruise', '.', 'While', 'temperature', 'is', 'taken', 'at', 'the', 'water', 'inlet', 'in', 'about', '5', 'm', 'depth', ',', 'salinity', 'is', 'estimated',

In [4]:
from transformers import AutoTokenizer

label_list = ["O", "B-soilOrganicCarbon", "I-soilOrganicCarbon", "B-startTime", "I-startTime", "B-endTime", "I-endTime", "B-city", "I-city", "B-duration", "I-duration", "B-cropSpecies", "I-cropSpecies", "B-soilAvailableNitrogen", "I-soilAvailableNitrogen", "B-soilDepth", "I-soilDepth", "B-region", "I-region", "B-country", "I-country", "B-longitude", "I-longitude", "B-latitude", "I-latitude", "B-cropVariety", "I-cropVariety", "B-soilPH", "I-soilPH", "B-soilBulkDensity", "I-soilBulkDensity"]
label_to_id = {l: i for i, l in enumerate(label_list)}


def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        is_split_into_words=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    
    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])
        else:
            labels.append(example["ner_tags"][word_idx])  # or -100 to ignore subwords
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)


In [15]:
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
import numpy as np

# Load model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased", num_labels=len(label_list)
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Load metric using evaluate
seqeval = evaluate.load("seqeval")

# Align predictions with labels
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape

    out_label_list = [[] for _ in range(batch_size)]
    out_pred_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i][j] != -100:
                out_label_list[i].append(label_list[label_ids[i][j]])
                out_pred_list[i].append(label_list[preds[i][j]])

    return out_pred_list, out_label_list

# Compute metrics
def compute_metrics(p):
    predictions, label_ids = p
    preds, labels = align_predictions(predictions, label_ids)
    results = seqeval.compute(predictions=preds, references=labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

training_args = TrainingArguments(
    output_dir="./tmp_eval",  # just a dummy dir
    per_device_eval_batch_size=8,
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

results = trainer.evaluate()
print(results)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


{'eval_loss': 3.6846370697021484, 'eval_model_preparation_time': 0.0111, 'eval_precision': 0.0013097576948264572, 'eval_recall': 0.029850746268656716, 'eval_f1': 0.0025094102885821834, 'eval_accuracy': 0.002959127057518032, 'eval_runtime': 2.5128, 'eval_samples_per_second': 7.959, 'eval_steps_per_second': 1.194}


In [16]:
outputs = trainer.predict(tokenized_dataset["test"])
preds, labels = align_predictions(outputs.predictions, outputs.label_ids)

# Print a few predictions
for i in range(3):
    print("Pred:", preds[i])
    print("Gold:", labels[i])
    print()


Pred: ['I-soilOrganicCarbon', 'B-soilOrganicCarbon', 'I-soilPH', 'I-soilPH', 'B-soilDepth', 'I-latitude', 'I-latitude', 'I-latitude', 'I-soilPH', 'I-latitude', 'I-latitude', 'I-region', 'B-cropVariety', 'I-latitude', 'B-soilPH', 'I-cropSpecies', 'I-startTime', 'I-region', 'I-startTime', 'B-soilDepth', 'I-soilPH', 'I-soilPH', 'I-soilPH', 'I-latitude', 'I-soilPH', 'I-latitude', 'I-latitude', 'I-latitude', 'B-region', 'I-soilPH', 'I-latitude', 'B-cropVariety', 'B-cropVariety', 'I-soilAvailableNitrogen', 'I-region', 'B-soilBulkDensity', 'I-region', 'I-latitude', 'I-latitude', 'I-soilAvailableNitrogen', 'B-cropVariety', 'B-cropVariety', 'I-soilAvailableNitrogen', 'I-startTime', 'B-soilBulkDensity', 'I-latitude', 'I-latitude', 'I-latitude', 'I-latitude', 'B-cropVariety', 'I-latitude', 'I-soilPH', 'I-latitude', 'I-latitude', 'I-soilPH', 'B-soilBulkDensity', 'I-latitude', 'I-region', 'I-region', 'I-soilPH', 'I-soilPH', 'B-cropVariety', 'I-soilPH', 'I-region', 'I-soilAvailableNitrogen', 'I-soil

In [17]:
test_dataset = tokenized_dataset["test"]
results = trainer.predict(test_dataset)
print(results.metrics)

{'test_loss': 3.6846370697021484, 'test_model_preparation_time': 0.0111, 'test_precision': 0.0013097576948264572, 'test_recall': 0.029850746268656716, 'test_f1': 0.0025094102885821834, 'test_accuracy': 0.002959127057518032, 'test_runtime': 1.7851, 'test_samples_per_second': 11.204, 'test_steps_per_second': 1.681}
