In [1]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("json", data_files="/home/s27mhusa_hpc/Master-Thesis/ner_dataset.json")
# Load each split from local JSON files
train_dataset = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_train.json")
val_dataset   = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_val.json")
test_dataset  = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_test.json")

# Combine into a single DatasetDict (optional, but useful)
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})


model_checkpoint = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [3]:
print(dataset)
print(dataset["train"][0])


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 155
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 19
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 20
    })
})
{'tokens': ['Title', ':', '\n', 'Continuous', 'thermosalinograph', 'oceanography', 'along', 'RV', 'Meteor', 'cruise', 'track', 'M185', '\n\n', 'Abstract', ':', '\n', 'Underway', 'temperature', 'and', 'salinity', 'data', 'was', 'collected', 'along', 'the', 'cruise', 'track', 'with', 'two', 'autonomous', 'thermosalinograph', '(', 'TSG', ')', 'systems', ',', 'each', 'consisting', 'of', 'a', 'SBE21', 'TSG', 'together', 'with', 'a', 'SBE38', 'Thermometer', '.', 'Both', 'systems', 'worked', 'independent', 'from', 'each', 'other', 'throughout', 'the', 'cruise', '.', 'While', 'temperature', 'is', 'taken', 'at', 'the', 'water', 'inlet', 'in', 'about', '5', 'm', 'depth', ',', 'salinity', 'is', 'estimated',

In [4]:
from transformers import AutoTokenizer

label_list = ["O", "B-soilOrganicCarbon", "I-soilOrganicCarbon", "B-startTime", "I-startTime", "B-endTime", "I-endTime", "B-city", "I-city", "B-duration", "I-duration", "B-cropSpecies", "I-cropSpecies", "B-soilAvailableNitrogen", "I-soilAvailableNitrogen", "B-soilDepth", "I-soilDepth", "B-region", "I-region", "B-country", "I-country", "B-longitude", "I-longitude", "B-latitude", "I-latitude", "B-cropVariety", "I-cropVariety", "B-soilPH", "I-soilPH", "B-soilBulkDensity", "I-soilBulkDensity"]
label_to_id = {l: i for i, l in enumerate(label_list)}


def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        is_split_into_words=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    
    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])
        else:
            labels.append(example["ner_tags"][word_idx])  # or -100 to ignore subwords
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)


Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map: 100%|██████████| 19/19 [00:00<00:00, 347.09 examples/s]


In [None]:
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
import numpy as np

# Load model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-large", num_labels=len(label_list)
)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

# Load metric using evaluate
seqeval = evaluate.load("seqeval")

# Align predictions with labels
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape

    out_label_list = [[] for _ in range(batch_size)]
    out_pred_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i][j] != -100:
                out_label_list[i].append(label_list[label_ids[i][j]])
                out_pred_list[i].append(label_list[preds[i][j]])

    return out_pred_list, out_label_list

# Compute metrics
def compute_metrics(p):
    predictions, label_ids = p
    preds, labels = align_predictions(predictions, label_ids)
    results = seqeval.compute(predictions=preds, references=labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./ner_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=100,
    run_name = "xlm-roberta-large-100-epochs"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokeni

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.321103,0.0,0.0,0.0,0.953341
2,No log,0.308413,0.0,0.0,0.0,0.953341
3,No log,0.316438,0.0,0.0,0.0,0.953341
4,No log,0.307753,0.0,0.0,0.0,0.953341
5,No log,0.24973,0.0,0.0,0.0,0.953341
6,No log,0.161383,0.424658,0.183432,0.256198,0.960603
7,No log,0.156734,0.333333,0.201183,0.250923,0.964779
8,No log,0.124516,0.627586,0.538462,0.579618,0.976398
9,No log,0.114699,0.726744,0.739645,0.733138,0.983297
10,No log,0.173026,0.848921,0.698225,0.766234,0.981663


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
outputs = trainer.predict(tokenized_dataset["validation"])
preds, labels = align_predictions(outputs.predictions, outputs.label_ids)

# Print a few predictions
for i in range(3):
    print("Pred:", preds[i])
    print("Gold:", labels[i])
    print()


Pred: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-region', 'B-region', 'B-region', 'O', 'O', 'B-region', 'I-region', 'I-region', 'O', 'O', 'B-region', 'I-region', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-region', 'B-region', 'B-region', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-startTime', 'O', 'B-startTime', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

In [8]:
test_dataset = tokenized_dataset["test"]
results = trainer.predict(test_dataset)
print(results.metrics)

{'test_loss': 0.1467079222202301, 'test_precision': 0.391025641025641, 'test_recall': 0.34269662921348315, 'test_f1': 0.36526946107784436, 'test_accuracy': 0.9671857619577308, 'test_runtime': 0.5377, 'test_samples_per_second': 37.194, 'test_steps_per_second': 5.579}
