<a href="https://colab.research.google.com/github/mathewpolonsky/RuTube-NER/blob/main/wikineural_multilingual_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!gdown 162_OpNEW5GXTqF72o3a5P4VjoJaqjsC8

Downloading...
From: https://drive.google.com/uc?id=162_OpNEW5GXTqF72o3a5P4VjoJaqjsC8
To: /content/new_ner_data.csv
100% 9.35M/9.35M [00:00<00:00, 48.6MB/s]


In [None]:
!pip install transformers seqeval razdel datasets accelerate

In [19]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# import os
import json
import numpy as np
import pandas as pd
import razdel
from tqdm.notebook import tqdm

import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from datasets import Dataset, DatasetDict

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_dataset, load_metric

# import logging

In [2]:
data = pd.read_csv("new_ner_data.csv")
data.head(5)

Unnamed: 0,video_info,entities
0,<НАЗВАНИЕ:> Агент 117: Из Африки с любовью — Р...,"{""label"":""локация""\,""offset"":26\,""length"":6\,""..."
1,"<НАЗВАНИЕ:> ""Спартаку"" помогли судьи? Локомоти...","{""label"":""команда""\,""offset"":13\,""length"":8\,""..."
2,<НАЗВАНИЕ:> 6 марта 2022 г. Каратэ шотокан. Кр...,"{""label"":""Дата""\,""offset"":12\,""length"":14\,""se..."
3,<НАЗВАНИЕ:> Международный Проект «Победа без г...,"{""label"":""название проекта""\,""offset"":34\,""len..."
4,<НАЗВАНИЕ:> Подготовка к отопительному сезону:...,"{""label"":""локация""\,""offset"":49\,""length"":10\,..."


In [3]:
# данные спарсены с Толоки, поэтому могут иметь проблемы с символами и их нужно избежать,
# удалить лишние '\' например, преобразовать из str в список dict-ов
df = data.copy()
df['entities'] = df['entities'].apply(lambda l: l.replace('\,', ',')if isinstance(l, str) else l)
df['entities'] = df['entities'].apply(lambda l: l.replace('\\\\', '\\')if isinstance(l, str) else l)
df['entities'] = df['entities'].apply(lambda l: '[' + l + ']'if isinstance(l, str) else l)
df['entities'] = df['entities'].apply(lambda l: json.loads(l)if isinstance(l, str) else l)

df.head(5)

Unnamed: 0,video_info,entities
0,<НАЗВАНИЕ:> Агент 117: Из Африки с любовью — Р...,"[{'label': 'локация', 'offset': 26, 'length': ..."
1,"<НАЗВАНИЕ:> ""Спартаку"" помогли судьи? Локомоти...","[{'label': 'команда', 'offset': 13, 'length': ..."
2,<НАЗВАНИЕ:> 6 марта 2022 г. Каратэ шотокан. Кр...,"[{'label': 'Дата', 'offset': 12, 'length': 14,..."
3,<НАЗВАНИЕ:> Международный Проект «Победа без г...,"[{'label': 'название проекта', 'offset': 34, '..."
4,<НАЗВАНИЕ:> Подготовка к отопительному сезону:...,"[{'label': 'локация', 'offset': 49, 'length': ..."


In [4]:
# Теперь из наших данных нам нужно извлечь для каждого слова (токена) его
# тег (label) из разметки, чтобы потом предать в модель классификации токенов

def extract_labels(item):

    # воспользуемся удобным токенайзером из библиотеки razdel,
    # она помимо разбиения на слова, сохраняет важные
    # для нас числа - начало и конец слова в токенах

    raw_toks = list(razdel.tokenize(item['video_info']))
    words = [tok.text for tok in raw_toks]
    # присвоим для начала каждому слову тег 'О' - тег, означающий отсутствие NER-а
    word_labels = ['O'] * len(raw_toks)
    char2word = [None] * len(item['video_info'])
    # так как NER можем состаять из нескольких слов, то нам нужно сохранить эту инфорцию
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)

    labels = item['entities']
    if isinstance(labels, dict):
        labels = [labels]
    if labels is not None:
        for e in labels:
            if e['label'] != 'не найдено':
                e_words = sorted({idx for idx in char2word[e['offset']:e['offset']+e['length']] if idx is not None})
                if e_words:
                    word_labels[e_words[0]] = 'B-' + e['label']
                    for idx in e_words[1:]:
                        word_labels[idx] = 'I-' + e['label']
                else:
                    continue
            else:
                continue
        return {'tokens': words, 'tags': word_labels}
    else: return {'tokens': words, 'tags': word_labels}

In [5]:
ner_data = [extract_labels(item) for i, item in df.iterrows()]
ner_train, ner_test = train_test_split(ner_data, test_size=0.01, random_state=1)

In [6]:
label_list = sorted({label for item in ner_train for label in item['tags']})
if 'O' in label_list:
    label_list.remove('O')
    label_list = ['O'] + label_list
label_list

['O',
 'B-Дата',
 'B-бренд',
 'B-вид спорта',
 'B-видеоигра',
 'B-команда',
 'B-лига',
 'B-локация',
 'B-модель',
 'B-название проекта',
 'B-организация',
 'B-персона',
 'B-сезон',
 'B-серия',
 'I-Дата',
 'I-бренд',
 'I-вид спорта',
 'I-видеоигра',
 'I-команда',
 'I-лига',
 'I-локация',
 'I-модель',
 'I-название проекта',
 'I-организация',
 'I-персона',
 'I-сезон',
 'I-серия']

In [7]:
ner_data = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(ner_train)),
    'test': Dataset.from_pandas(pd.DataFrame(ner_test))
})
ner_data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5698
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 58
    })
})

---

In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner", device=device)
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")

In [9]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [10]:
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/5698 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

In [11]:
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

data_collator = DataCollatorForTokenClassification(tokenizer)

In [12]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [12]:
model.classifier = torch.nn.Linear(768, len(label_list), bias=True).cuda()
model.num_labels = len(label_list)
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [13]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [14]:
args = TrainingArguments(
    evaluation_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none',
    output_dir="/content/drive/MyDrive/models/ner/xlm_roberta"
)

In [15]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 3.467250347137451,
 'eval_precision': 0.006689684937419077,
 'eval_recall': 0.03205791106514995,
 'eval_f1': 0.011069451883592218,
 'eval_accuracy': 0.008191925102399064,
 'eval_runtime': 2.3362,
 'eval_samples_per_second': 24.827,
 'eval_steps_per_second': 1.712}

In [16]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.40658,0.513123,0.404343,0.452285,0.889827
2,0.462800,0.353268,0.47957,0.46122,0.470216,0.899941
3,0.304400,0.333194,0.528747,0.532575,0.530654,0.901446
4,0.304400,0.343797,0.53125,0.509824,0.520317,0.901697
5,0.277500,0.340676,0.559471,0.525336,0.541867,0.905124
6,0.253500,0.344233,0.551687,0.524302,0.537646,0.904372


TrainOutput(global_step=2142, training_loss=0.3199743187171516, metrics={'train_runtime': 2381.1472, 'train_samples_per_second': 14.358, 'train_steps_per_second': 0.9, 'total_flos': 6636150861384960.0, 'train_loss': 0.3199743187171516, 'epoch': 6.0})

In [20]:
model.save_pretrained('/content/drive/MyDrive/models/wiki_1000+5000')