In [16]:
import json
import numpy as np
from sklearn.model_selection import train_test_split

In [17]:
with open("/home/s27mhusa_hpc/Master-Thesis/ner_dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [18]:
label_list = ["O", "B-soilOrganicCarbon", "I-soilOrganicCarbon", "B-startTime", "I-startTime", "B-endTime", "I-endTime", "B-city", "I-city", "B-duration", "I-duration", "B-cropSpecies", "I-cropSpecies", "B-soilAvailableNitrogen", "I-soilAvailableNitrogen", "B-soilDepth", "I-soilDepth", "B-region", "I-region", "B-country", "I-country", "B-longitude", "I-longitude", "B-latitude", "I-latitude", "B-cropVariety", "I-cropVariety", "B-soilPH", "I-soilPH", "B-soilBulkDensity", "I-soilBulkDensity"]
len(label_list)

31

In [25]:
from collections import Counter

def get_dominant_label(ner_tags):
    tags = [tag for tag in ner_tags if tag != 0]  # ignore 'O'
    return max(set(tags), key=tags.count) if tags else 0

dominant_labels = [get_dominant_label(item["ner_tags"]) for item in data]

dominant_counts = Counter(dominant_labels)

# Print
for label_id, count in sorted(dominant_counts.items(), key=lambda x: x[1]):
    print(f"Dominant Label {label_list[label_id]} (ID {label_id}): {count} sentences")


Dominant Label I-soilOrganicCarbon (ID 2): 1 sentences
Dominant Label B-cropVariety (ID 25): 1 sentences
Dominant Label I-endTime (ID 6): 1 sentences
Dominant Label I-cropVariety (ID 26): 1 sentences
Dominant Label B-duration (ID 9): 2 sentences
Dominant Label I-cropSpecies (ID 12): 2 sentences
Dominant Label B-endTime (ID 5): 3 sentences
Dominant Label I-startTime (ID 4): 3 sentences
Dominant Label B-city (ID 7): 5 sentences
Dominant Label I-duration (ID 10): 6 sentences
Dominant Label I-region (ID 18): 7 sentences
Dominant Label B-soilOrganicCarbon (ID 1): 8 sentences
Dominant Label B-region (ID 17): 18 sentences
Dominant Label B-country (ID 19): 22 sentences
Dominant Label B-startTime (ID 3): 25 sentences
Dominant Label B-cropSpecies (ID 11): 34 sentences
Dominant Label O (ID 0): 55 sentences


In [26]:
from collections import Counter

# First, count how many times each label is the dominant label
dominant_labels = [get_dominant_label(item["ner_tags"]) for item in data]
label_counts = Counter(dominant_labels)

# Identify labels that appear less than 2 times
rare_labels = {label for label, count in label_counts.items() if count < 2}
print("Rare dominant labels:", rare_labels)

# Re-map rare dominant labels to 0 (or another safe fallback)
def filtered_dominant_label(ner_tags):
    label = get_dominant_label(ner_tags)
    return 0 if label in rare_labels else label

stratify_labels = [filtered_dominant_label(item["ner_tags"]) for item in data]


Rare dominant labels: {25, 2, 26, 6}


In [27]:
from collections import Counter

c = Counter(stratify_labels)
for k, v in c.items():
    if v < 2:
        print(f"Label group {k} only occurs {v} time(s)")

In [30]:
# First split: train + temp
train_data, temp_data, y_train, y_temp = train_test_split(
    data, stratify_labels, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Split temp into val and test
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=42
)

In [31]:
# Save splits
base_path = "/home/s27mhusa_hpc/Master-Thesis"

for split_name, split_data in [("train", train_data), ("val", val_data), ("test", test_data)]:
    out_path = f"{base_path}/ner_dataset_{split_name}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(split_data, f, indent=2, ensure_ascii=False)
    print(f"Saved {split_name} split with {len(split_data)} examples to {out_path}")

Saved train split with 155 examples to /home/s27mhusa_hpc/Master-Thesis/ner_dataset_train.json
Saved val split with 19 examples to /home/s27mhusa_hpc/Master-Thesis/ner_dataset_val.json
Saved test split with 20 examples to /home/s27mhusa_hpc/Master-Thesis/ner_dataset_test.json


In [3]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict, load_dataset

In [4]:
dataset = load_dataset("json", data_files="/home/s27mhusa_hpc/Master-Thesis/ner_dataset.json")
# Load each split from local JSON files
train_dataset = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_train.json")
val_dataset   = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_val.json")
test_dataset  = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_test.json")

# Combine into a single DatasetDict (optional, but useful)
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})


model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [3]:
print(dataset)
print(dataset["train"][0])


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 155
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 19
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 20
    })
})
{'tokens': ['Title', ':', '\n', 'Continuous', 'thermosalinograph', 'oceanography', 'along', 'RV', 'Meteor', 'cruise', 'track', 'M185', '\n\n', 'Abstract', ':', '\n', 'Underway', 'temperature', 'and', 'salinity', 'data', 'was', 'collected', 'along', 'the', 'cruise', 'track', 'with', 'two', 'autonomous', 'thermosalinograph', '(', 'TSG', ')', 'systems', ',', 'each', 'consisting', 'of', 'a', 'SBE21', 'TSG', 'together', 'with', 'a', 'SBE38', 'Thermometer', '.', 'Both', 'systems', 'worked', 'independent', 'from', 'each', 'other', 'throughout', 'the', 'cruise', '.', 'While', 'temperature', 'is', 'taken', 'at', 'the', 'water', 'inlet', 'in', 'about', '5', 'm', 'depth', ',', 'salinity', 'is', 'estimated',

In [5]:
from transformers import AutoTokenizer

label_list = ["O", "B-soilOrganicCarbon", "I-soilOrganicCarbon", "B-startTime", "I-startTime", "B-endTime", "I-endTime", "B-city", "I-city", "B-duration", "I-duration", "B-cropSpecies", "I-cropSpecies", "B-soilAvailableNitrogen", "I-soilAvailableNitrogen", "B-soilDepth", "I-soilDepth", "B-region", "I-region", "B-country", "I-country", "B-longitude", "I-longitude", "B-latitude", "I-latitude", "B-cropVariety", "I-cropVariety", "B-soilPH", "I-soilPH", "B-soilBulkDensity", "I-soilBulkDensity"]
label_to_id = {l: i for i, l in enumerate(label_list)}


def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        is_split_into_words=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    
    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])
        else:
            labels.append(example["ner_tags"][word_idx])  # or -100 to ignore subwords
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)


In [6]:
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
import numpy as np

# Load model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased", num_labels=len(label_list)
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Load metric using evaluate
seqeval = evaluate.load("seqeval")

# Align predictions with labels
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape

    out_label_list = [[] for _ in range(batch_size)]
    out_pred_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i][j] != -100:
                out_label_list[i].append(label_list[label_ids[i][j]])
                out_pred_list[i].append(label_list[preds[i][j]])

    return out_pred_list, out_label_list

# Compute metrics
def compute_metrics(p):
    predictions, label_ids = p
    preds, labels = align_predictions(predictions, label_ids)
    results = seqeval.compute(predictions=preds, references=labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./ner_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=100,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmurtuzanh[0m ([33mmurtuzanh-university-bonn[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.328538,0.0,0.0,0.0,0.950509
2,No log,0.284195,0.0,0.0,0.0,0.950509
3,No log,0.223886,0.423913,0.216667,0.286765,0.957835
4,No log,0.22438,0.424528,0.25,0.314685,0.959264
5,No log,0.205588,0.458824,0.433333,0.445714,0.966768
6,No log,0.185013,0.70922,0.555556,0.623053,0.973557
7,No log,0.239175,0.673077,0.388889,0.492958,0.968733
8,No log,0.224579,0.836207,0.538889,0.655405,0.973379
9,No log,0.227845,0.773438,0.55,0.642857,0.972128
10,No log,0.207303,0.834646,0.588889,0.690554,0.976059


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=2000, training_loss=0.014621016878634692, metrics={'train_runtime': 2172.0736, 'train_samples_per_second': 7.136, 'train_steps_per_second': 0.921, 'total_flos': 4051161613824000.0, 'train_loss': 0.014621016878634692, 'epoch': 100.0})

In [8]:
outputs = trainer.predict(tokenized_dataset["validation"])
preds, labels = align_predictions(outputs.predictions, outputs.label_ids)

# Print a few predictions
for i in range(3):
    print("Pred:", preds[i])
    print("Gold:", labels[i])
    print()


Pred: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-region', 'B-region', 'B-region', 'I-region', 'I-region', 'O', 'O', 'B-region', 'I-region', 'I-region', 'O', 'B-region', 'I-region', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-region', 'B-region', 'B-region', 'I-region', 'I-region', 'O', 'O', 'O', 'O', 'O', 'O', 'B-startTime', 'O', 'B-endTime', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

In [9]:
test_dataset = tokenized_dataset["test"]
results = trainer.predict(test_dataset)
print(results.metrics)

{'test_loss': 0.16118207573890686, 'test_precision': 0.7379679144385026, 'test_recall': 0.6865671641791045, 'test_f1': 0.7113402061855669, 'test_accuracy': 0.9785463288329943, 'test_runtime': 0.5883, 'test_samples_per_second': 33.996, 'test_steps_per_second': 5.099}
