In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification, pipeline

from torch.utils.data import Dataset
import torch
import numpy as np
import random
from sklearn.metrics import precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


### NER_model train:

In [2]:
class NERDataset(Dataset):
    """Token-classification dataset aligning word-level BIO tags with subword tokens.

    Each item is produced by splitting the sentence on whitespace, tokenizing the
    resulting words with ``is_split_into_words=True`` and spreading every word's
    tag over the subword tokens the tokenizer generated for it.

    Args:
        texts: Sequence of sentences (plain strings, words separated by whitespace).
        labels: Sequence of per-sentence tag lists; ``labels[i][j]`` is the tag of
            the ``j``-th whitespace-separated word of ``texts[i]``.
        tokenizer: Callable tokenizer whose output exposes ``word_ids()`` and
            ``items()`` (e.g. a fast Hugging Face tokenizer).
        label2id: Mapping from tag string to integer class id.
        max_length: Length every encoded example is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, label2id, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.texts)

    def _continuation_label(self, label):
        """Return the tag used for the non-first subword tokens of a word tagged ``label``.

        ``B-X`` / ``I-X`` become ``I-X`` when that tag exists in ``label2id``;
        every other tag (e.g. ``"O"``) is returned unchanged.
        """
        if label.startswith(("B-", "I-")):
            inside = "I-" + label[2:]
            if inside in self.label2id:
                return inside
        return label

    def __getitem__(self, idx):
        """Encode sentence ``idx`` and return a dict of tensors including ``labels``.

        Special and padding tokens (``word_id is None``) receive ``-100`` so the
        loss ignores them.

        Raises:
            KeyError: If a word tag is missing from ``label2id``.
            IndexError: If the sentence has more words than tags.
        """
        text = self.texts[idx]
        word_labels = self.labels[idx]

        encoding = self.tokenizer(
            text.split(),
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_offsets_mapping=False
        )

        labels = []
        word_ids = encoding.word_ids()  # maps every token to its source word (or None)
        previous_word_id = None

        for word_id in word_ids:
            if word_id is None:
                labels.append(-100)
            else:
                label = word_labels[word_id]
                if word_id != previous_word_id:
                    # First subword keeps the word's own tag. Forcing "B-" here
                    # would split multi-word entities ("Mount Everest") in two.
                    labels.append(self.label2id[label])
                else:
                    # Remaining subwords of the same word continue the entity.
                    labels.append(self.label2id[self._continuation_label(label)])
                previous_word_id = word_id

        encoding = {k: torch.tensor(v) for k, v in encoding.items()}
        encoding["labels"] = torch.tensor(labels)
        return encoding

In [3]:
def load_synthetic_data(file_path):
    """Read a CoNLL-style file of ``<word> <label>`` lines into sentences and tags.

    Sentences are separated by blank lines. The final sentence is returned even
    when the file does not end with a blank line.

    Args:
        file_path: Path to a UTF-8 text file with one ``<word> <label>`` pair per line.

    Returns:
        A ``(texts, labels_list)`` tuple: ``texts[i]`` is the sentence with its
        words joined by single spaces and ``labels_list[i]`` is the list of
        labels for those words, in order.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            whitespace-separated fields.
    """
    texts, labels_list = [], []
    words, tags = [], []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:  # a blank line marks the end of a sentence
                if words:
                    texts.append(" ".join(words))
                    labels_list.append(tags)
                words, tags = [], []
                continue

            fields = line.split()
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_no}: expected '<word> <label>', got {line!r}"
                )
            words.append(fields[0])
            tags.append(fields[1])

    # Flush the last sentence when the file has no trailing blank line.
    if words:
        texts.append(" ".join(words))
        labels_list.append(tags)

    return texts, labels_list



# Load the training and validation splits from their text files
train_texts, train_labels = load_synthetic_data('train_data.txt')
val_texts, val_labels = load_synthetic_data('val_data.txt')

In [4]:
# Pair every sentence with its word-level tag list so both can be inspected together
train_data = list(zip(train_texts, train_labels))
val_data = list(zip(val_texts, val_labels))

# Preview the first five (sentence, tags) pairs of the training split
train_data[:5]

[('Fuji is visible from hundreds of kilometers away.',
  ['B-MOUNTAIN', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
 ('I wrote my name in the summit register of Elbrus.',
  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN']),
 ('The wind at the top of K2 was over 100 km/h.',
  ['O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN', 'O', 'O', 'O', 'O']),
 ('K2 has multiple named routes of varying difficulty.',
  ['B-MOUNTAIN', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
 ('The wind at the top of Everest was over 100 km/h.',
  ['O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN', 'O', 'O', 'O', 'O'])]

In [5]:
def compute_metrics(eval_pred):
    """Compute weighted token-level precision, recall and F1 for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The class axis of ``logits`` is
            the last one; label positions equal to ``-100`` are ignored.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    logits, label_ids = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    # Flatten to (gold, guess) pairs, skipping positions masked out with -100.
    scored_pairs = [
        (gold, guess)
        for guess_row, gold_row in zip(predicted_ids, label_ids)
        for guess, gold in zip(guess_row, gold_row)
        if gold != -100
    ]
    gold_flat = [gold for gold, _ in scored_pairs]
    guess_flat = [guess for _, guess in scored_pairs]

    scores = precision_recall_fscore_support(
        gold_flat, guess_flat, average='weighted', zero_division=0
    )
    # The fourth element (support) is dropped by zipping against three names.
    return dict(zip(("precision", "recall", "f1"), scores))

In [6]:
def ner_model_train(train_path="train_data.txt", val_path="val_data.txt", output_dir="./ner_model", epochs=3, batch_size=16, model_name="bert-base-uncased", seed=42):
    """Fine-tune a token-classification model for mountain-name NER and save it.

    Args:
        train_path: Path to the training file read by ``load_synthetic_data``.
        val_path: Path to the validation file read by ``load_synthetic_data``.
        output_dir: Directory the trained model and tokenizer are written to.
        epochs: Number of training epochs.
        batch_size: Per-device batch size used for both training and evaluation.
        model_name: Pretrained checkpoint to start from.
        seed: Random seed applied before the model is created and passed on to
            ``TrainingArguments``.

    Returns:
        None. The model and tokenizer are saved under ``output_dir``.
    """
    import inspect

    from transformers import set_seed

    # Seed before instantiating the model: the classification head is newly
    # initialised, so seeding only inside the Trainer would come too late.
    set_seed(seed)

    # Load the data from disk
    train_texts, train_labels = load_synthetic_data(train_path)
    val_texts, val_labels = load_synthetic_data(val_path)

    # Label vocabulary
    label_list = ["O", "B-MOUNTAIN", "I-MOUNTAIN"]
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )

    # Create datasets
    train_dataset = NERDataset(train_texts, train_labels, tokenizer, label2id)
    val_dataset = NERDataset(val_texts, val_labels, tokenizer, label2id)
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        eval_strategy="epoch",             # metrics after every epoch
        save_strategy="no",                # do not save checkpoints
        logging_strategy="epoch",
        logging_dir="./logs",
        load_best_model_at_end=False,
        report_to="none",
        seed=seed,
    )

    # `tokenizer=` is deprecated on newer Trainer versions in favour of
    # `processing_class=`; pick whichever keyword the installed version accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    # Trainer with metrics
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer}
    )

    # Model training
    trainer.train()

    # Model saving
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

In [13]:
# Fine-tune with batch size 32; output_dir and epochs are left at the function defaults
ner_model_train(train_path="train_data.txt", val_path="val_data.txt", batch_size=32)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.2484,0.009294,0.99733,0.997321,0.997323
2,0.0029,0.000581,1.0,1.0,1.0
3,0.001,0.000462,1.0,1.0,1.0


### NER_model inference:

In [14]:
def extract_mountains(model_dir, text):
    """Return the mountain names the fine-tuned NER model finds in ``text``.

    Args:
        model_dir: Directory containing the saved model and tokenizer.
        text: A single input string to run the NER pipeline on.

    Returns:
        List of entity strings, in the order the pipeline reports them. When the
        pipeline provides character offsets, each string is the span as written
        in ``text``; otherwise the pipeline's ``word`` field is used. Leading and
        trailing ``,.!?`` are stripped, and entities left empty are dropped.
    """
    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    # Create ner pipeline
    nlp = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple"  # merge B/I tokens into one entity
    )

    mountains = []
    for entity in nlp(text):
        if "MOUNTAIN" not in entity["entity_group"]:
            continue

        # Prefer the original span: `word` is rebuilt from tokenizer output, so
        # an uncased model returns it lowercased.
        start, end = entity.get("start"), entity.get("end")
        if start is not None and end is not None:
            surface = text[start:end]
        else:
            surface = entity["word"]

        surface = surface.strip(',.!?')
        if surface:  # skip entities that were nothing but punctuation
            mountains.append(surface)
    return mountains

In [17]:
def _capitalize_words(name):
    """Upper-case the first character of each space-separated word, leaving the rest untouched."""
    return " ".join(word[:1].upper() + word[1:] for word in name.split(" "))


def main(model_dir, text):
    """Print the mountain names found in ``text`` by the model stored in ``model_dir``.

    Args:
        model_dir: Directory containing the saved model and tokenizer.
        text: Input string to search for mountain names.
    """
    mountains = extract_mountains(model_dir, text)
    if mountains:
        # str.capitalize() would lowercase everything after the first character
        # ("mount everest" -> "Mount everest"), so capitalise word by word.
        names = ", ".join(_capitalize_words(mountain) for mountain in mountains)
        print(f'Mountains found: {names}')
    else:
        print("Mountains not found!")

In [19]:
# Run inference on a two-sentence example using the model directory ./ner_model
main(
    model_dir="./ner_model", 
    text="The Everest is the highest mountain. The K2 is also very tall."
    )

Device set to use cuda:0


Mountains found: Everest, K2
