In [1]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("json", data_files="/home/s27mhusa_hpc/Master-Thesis/ner_dataset_sentence.json")
# Load each split from local JSON files
train_dataset = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_sentence_train.json")
val_dataset   = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_sentence_val.json")
test_dataset  = Dataset.from_json("/home/s27mhusa_hpc/Master-Thesis/ner_dataset_sentence_test.json")

# Combine into a single DatasetDict (optional, but useful)
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})


model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Generating train split: 2067 examples [00:00, 46634.96 examples/s]
Generating train split: 1623 examples [00:00, 15923.82 examples/s]
Generating train split: 203 examples [00:00, 31534.95 examples/s]
Generating train split: 203 examples [00:00, 32261.43 examples/s]


In [3]:
print(dataset)
print(dataset["train"][0])


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1623
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 203
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 203
    })
})
{'tokens': ['6', 'Nr', '.'], 'ner_tags': [0, 0, 0]}


In [4]:
from transformers import AutoTokenizer

label_list = ["O", "B-soilOrganicCarbon", "I-soilOrganicCarbon", "B-startTime", "I-startTime", "B-endTime", "I-endTime", "B-city", "I-city", "B-duration", "I-duration", "B-cropSpecies", "I-cropSpecies", "B-soilAvailableNitrogen", "I-soilAvailableNitrogen", "B-soilDepth", "I-soilDepth", "B-region", "I-region", "B-country", "I-country", "B-longitude", "I-longitude", "B-latitude", "I-latitude", "B-cropVariety", "I-cropVariety", "B-soilPH", "I-soilPH", "B-soilBulkDensity", "I-soilBulkDensity"]
label_to_id = {l: i for i, l in enumerate(label_list)}


def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        is_split_into_words=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    
    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])
        else:
            labels.append(example["ner_tags"][word_idx])  # or -100 to ignore subwords
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels)


Map: 100%|██████████| 1623/1623 [00:01<00:00, 1575.08 examples/s]
Map: 100%|██████████| 203/203 [00:00<00:00, 1580.68 examples/s]
Map: 100%|██████████| 203/203 [00:00<00:00, 1565.24 examples/s]


In [None]:
import evaluate
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
import num
py as np

# Load model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased", num_labels=len(label_list)
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Load metric using evaluate
seqeval = evaluate.load("seqeval")
# Align predictions with labels
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape

    out_label_list = [[] for _ in range(batch_size)]
    out_pred_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i][j] != -100:
                out_label_list[i].append(label_list[label_ids[i][j]])
                out_pred_list[i].append(label_list[preds[i][j]])

    return out_pred_list, out_label_list

# Compute metrics
def compute_metrics(p):
    predictions, label_ids = p
    preds, labels = align_predictions(predictions, label_ids)
    results = seqeval.compute(predictions=preds, references=labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./ner_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=100,
    run_name = "bert_base_cased_sentence_100"

)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism t

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.054446,0.649573,0.520548,0.577947,0.98687
2,No log,0.040672,0.889706,0.828767,0.858156,0.993731
3,0.094000,0.057366,0.792208,0.835616,0.813333,0.991602


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

In [None]:
outputs = trainer.predict(tokenized_dataset["validation"])
preds, labels = align_predictions(outputs.predictions, outputs.label_ids)

# Print a few predictions
for i in range(3):
    print("Pred:", preds[i])
    print("Gold:", labels[i])
    print()


In [None]:
test_dataset = tokenized_dataset["test"]
results = trainer.predict(test_dataset)
print(results.metrics)