In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

In [3]:
df = pd.read_json('/content/drive/MyDrive/ML_Data/term_labeled_texts_train.json')
df.head()

Unnamed: 0,tokenized_text,token_labels,new_terms
0,"[ABBYY, Retrieval, &, Morphology, Engine, В, с...","[B, I, I, I, E, O, O, O, O, B, E, O, O, O, O, ...","[ABBYY Retrieval & Morphology Engine, программ..."
1,"[Речевые, формулы, в, диалоге, Предложенная, к...","[B, E, O, S, O, O, O, O, O, O, O, B, E, O, O, ...","[Речевые формулы, диалоге, лингвистические тип..."
2,"[Географические, названия, и, полнотекстовые, ...","[B, E, O, B, E, O, B, I, E, O, O, O, O, O, O, ...","[Географические названия, полнотекстовые докум..."
3,"[Методы, автоматического, построения, специали...","[O, B, I, I, E, O, O, O, O, B, I, E, O, O, B, ...",[автоматического построения специализированног...
4,"[К, проблеме, понимания, несегментированного, ...","[O, O, O, B, E, O, O, O, B, E, O, O, O, O, O, ...","[несегментированного текста, метеорологических..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750 entries, 0 to 749
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tokenized_text  750 non-null    object
 1   token_labels    750 non-null    object
 2   new_terms       750 non-null    object
dtypes: object(3)
memory usage: 23.4+ KB


In [5]:
texts = df['tokenized_text'].to_list()

In [6]:
tags = df['token_labels'].to_list()

In [7]:
print(texts[1])

['Речевые', 'формулы', 'в', 'диалоге', 'Предложенная', 'классификация', ',', 'как', 'и', 'многие', 'другие', 'лингвистические', 'типологии', ',', 'допускает', 'пересечения', '.', 'Например', ',', 'идиома', 'врать', 'готово', 'в', 'последовательности', 'реплик', '[', '-', 'Честное', 'слово', ']', '-', 'Врать', 'готово', ',', 'с', 'одной', 'стороны', ',', 'является', 'комментарием', ',', 'а', 'с', 'другой', '–', 'обладает', 'некоторыми', 'характеристиками', 'формул', 'ответа', ':', 'иллокутивно', 'вынуждается', 'предшествующей', 'репликой', 'и', 'повторяет', 'ее', 'некоторые', 'фонетические', 'особенности', '.', 'Кроме', 'того', ',', 'поскольку', 'оценивается', 'искренность', 'предшествующей', 'клятвы', ',', 'данную', 'идиому', 'можно', 'рассматривать', 'и', 'как', 'формулу', 'эпистемической', 'модальности', '.']


In [8]:
print(tags[1])

['B', 'E', 'O', 'S', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'E', 'O', 'O', 'O', 'O', 'O', 'O', 'S', 'O', 'O', 'O', 'O', 'S', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'S', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'E', 'O', 'S', 'O', 'O', 'S', 'O', 'O', 'O', 'O', 'B', 'E', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'S', 'O', 'O', 'O', 'O', 'B', 'I', 'E', 'O']


In [9]:
train_texts = texts[:570]
train_tags = tags[:570]
val_texts = texts[570:]
val_tags = tags[570:]

In [10]:
unique_tags = ['O', 'S', 'B', 'I', 'E']
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [11]:
tag2id

{'O': 0, 'S': 1, 'B': 2, 'I': 3, 'E': 4}

In [12]:
id2tag

{0: 'O', 1: 'S', 2: 'B', 3: 'I', 4: 'E'}

In [13]:
! pip install datasets transformers[torch] seqeval accelerate -U



In [14]:
model_checkpoint = "xlm-roberta-base"
batch_size = 4

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [16]:
encoded_tags_train = [[tag2id[tag] for tag in doc] for doc in train_tags]

In [17]:
encoded_tags_val = [[tag2id[tag] for tag in doc] for doc in val_tags]

In [18]:
label_all_tokens = True

def tokenize_and_align_labels(texts, tags):

    tokenized_inputs = tokenizer(texts,
                                 truncation=True, # обрезка слишком длинных последовательностей
                                 padding=True,
                                 is_split_into_words=True # предупреждаем, что вход поступит в виде списков токенов
                                 )

    labels = []
    for i, label in enumerate(tags):

        # достаем 1 текст
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        # идем по всем словам
        for word_idx in word_ids:

            # Некоторые специальные токены имеют id None. Мы даем им лейбл -100, чтобы модель их игнорировала
            if word_idx is None:
                label_ids.append(-100)

            # Логично, что если слово разделилось на subword-токены, их лейблы в пределах слова должны быть одинаковыми.
            # Если мы перешли на новое слово, добавляем его лейбл в список
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])

            # Всем следующим частям одного и того же слова мы даем или тот же лейбл, или -100, если label_all_tokens=False
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    return tokenized_inputs, labels

In [19]:
tokenized_train, train_labels = tokenize_and_align_labels(train_texts, encoded_tags_train)

In [20]:
print(tokenized_train[2])

Encoding(num_tokens=373, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [21]:
print(train_labels[2])

[-100, 2, 2, 2, 4, 4, 0, 2, 2, 2, 4, 0, 2, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 4, 4, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 4, 4, 0, 2, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 2, 2, 2, 4, 4, 4, 0, 0, 0, 0, 0, 0, 2, 2, 3, 4, 2, 2, 4, 4, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 4, 4, 0, 2, 2, 2, 3, 3, 3, 3, 4, 4, 0, 0, 0, 0, 0, 2, 2, 2, 4, 2, 2, 2, 2, 4, 0, 0, 0, 0, 1, 1, 0, 0, 0, 2, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 4, 4, 0, 0, 0, 2, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 4, 4, 0, 0, 0, 0, 2, 2, 2, 4, 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -10

In [22]:
len(train_labels[2])

373

In [23]:
tokenized_val, val_labels = tokenize_and_align_labels(val_texts, encoded_tags_val)

In [24]:
len(val_labels[12])

417

In [25]:
len(train_labels[67])

373

In [26]:
import torch

In [27]:
class TermLabDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
      # этот метод вызывается моделью, когда она учится
      # он определяет, в каком виде данные подаются в модель
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [28]:
train_dataset = TermLabDataset(tokenized_train, train_labels)
val_dataset = TermLabDataset(tokenized_val, val_labels)

In [29]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=5)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"//content/sample_data/{model_name}-finetuned-1-term-tag",
    overwrite_output_dir=True, # записываем каждый раз в один и тот же файл
    evaluation_strategy = "epoch", # оцениваем каждую эпоху
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size, # размер тренировочного батча на каждый процессор
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01, # регуляризация функции потерь
    push_to_hub=False, # не публиковать на Huggingface
)



In [31]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [32]:
from datasets import load_metric

In [33]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

The repository for seqeval contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/seqeval.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [34]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Удаляем индексы специальных токенов
    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [35]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [36]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.397781,0.772054,0.764915,0.768468,0.853633
2,No log,0.346562,0.802453,0.843336,0.822387,0.882721
3,No log,0.344982,0.798753,0.866009,0.831023,0.883197
4,0.404400,0.366607,0.803297,0.875078,0.837653,0.88636
5,0.404400,0.394227,0.822052,0.837912,0.829906,0.882921
6,0.404400,0.400342,0.828994,0.85774,0.843122,0.893236
7,0.133700,0.431267,0.823474,0.856495,0.83966,0.888367
8,0.133700,0.43668,0.827734,0.860852,0.843968,0.89296
9,0.133700,0.472918,0.817952,0.862097,0.839444,0.888518
10,0.133700,0.469328,0.815368,0.868943,0.841303,0.890149


TrainOutput(global_step=1430, training_loss=0.20876756681428923, metrics={'train_runtime': 675.7219, 'train_samples_per_second': 8.435, 'train_steps_per_second': 2.116, 'total_flos': 1085074418727000.0, 'train_loss': 0.20876756681428923, 'epoch': 10.0})

In [37]:
predictions, labels, _ = trainer.predict(val_dataset)
predictions = np.argmax(predictions, axis=2)

# Уберем игнорируемые токены и декодируем предсказанные токены
ids_predictions = [
    [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=ids_predictions, references=true_labels)
results

{'_': {'precision': 0.8153679292507926,
  'recall': 0.8689428291988975,
  'f1': 0.841303318555503,
  'number': 11247},
 'overall_precision': 0.8153679292507926,
 'overall_recall': 0.8689428291988975,
 'overall_f1': 0.841303318555503,
 'overall_accuracy': 0.890149328648513}

In [38]:
from transformers import TokenClassificationPipeline

In [45]:
def postprocess_output(output):

  string = ''
  tags = []
  last_token_end = 0

  for token in output:
    # сначала обработаем тег
    if '_' in token['entity']:
      tag = id2tag[int(token['entity'].split('_')[1])]
    else:
      tag = token['entity']

    if token['word'][0] == '▁':
      # если токен - или начало слова, или полное слово
      if last_token_end != token['start']:
        string += ' '

      string += token['word']
      tags.append(tag)

    else:
      # если токен - середина слова или конец
      string += token['word'].replace('#', '')

    last_token_end = token['end']

  return list(zip(string.split(), tags))

In [46]:
checkpoint = '/content/sample_data/xlm-roberta-base-finetuned-1-term-tag/checkpoint-1000'

In [47]:
tokenizer_term = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [48]:
model_term = AutoModelForTokenClassification.from_pretrained(checkpoint)

In [49]:
termbert = TokenClassificationPipeline(model=model_term, tokenizer=tokenizer_term, task="term_tagging")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [52]:
postprocess_output(termbert('В данной статье описываются основные принципы работы синтаксического парсера русского языка LPaRus , разработанного на основе лингвистических технологий компании Megaputer Intelligence .'))

[('▁В', 'O'),
 ('▁данной', 'O'),
 ('▁статье', 'O'),
 ('▁описываются', 'O'),
 ('▁основные', 'O'),
 ('▁принципы', 'O'),
 ('▁работы', 'O'),
 ('▁синтаксического', 'B'),
 ('▁парсера', 'E'),
 ('▁русского', 'B'),
 ('▁языка', 'E'),
 ('▁LPaRus', 'O'),
 ('▁,', 'O'),
 ('▁разработанного', 'O'),
 ('▁на', 'O'),
 ('▁основе', 'O'),
 ('▁лингвистических', 'B'),
 ('▁технологий', 'E'),
 ('▁компании', 'O'),
 ('▁Megaputer', 'O'),
 ('▁Intelligence', 'O'),
 ('▁.', 'O')]