In [4]:
import os
import gc
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

from pathlib import Path
import re
import numpy as np
import pandas as pd
import pickle

# Pick the data directory: Google Drive when running on Colab, otherwise the
# local datasets folder.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable, i.e. we are not on Colab: use the local path.
    data_path='../../DATASETS/IT_TEXTS/'
else:
    # On Colab a failed mount should surface as an error instead of silently
    # falling back to a local path that does not exist there.
    drive.mount('/content/gdrive')
    data_path='/content/gdrive/My Drive/Colab Notebooks/title/data'

In [3]:
# Read the Feather file from the META folder of the data directory.
articles_file = f'{data_path}/META/it_articles_ru.feather'
Xy = pd.read_feather(articles_file)

In [3]:
# _ = Xy.sort_values(by='views_num', ascending=False)[['url', 'title']]
# _.reset_index(inplace=True, drop=True)
# #_.to_feather(f'{data_path}/titles.csv')

In [12]:
# all_titles_str = '\n'.join(_.title.to_list()[:10000])
# with open(f'{data_path}/all_titles.txt', "w") as text_file:
#     text_file.write(all_titles_str)

We use the [CoNLL format](https://www.signll.org/conll/) ([StackOverflow discussion](https://stackoverflow.com/questions/27416164/what-is-conll-data-format)), which is also used by Label Studio and by the example in the HuggingFace documentation.

In [54]:
def read_conll(file_path):
    """Parse a CoNLL-style text file into parallel token and tag lists.

    Documents are separated by one or more blank (or whitespace-only) lines.
    Inside a document every line is expected to look like
    ``<token> -X- _ <tag>``. Lines that do not contain the ``-X- _``
    separator exactly once (for example a ``-DOCSTART- -X- O`` header) are
    skipped, and documents that end up with no tokens are dropped so that
    no empty samples are returned.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) to a UTF-8 encoded file.

    Returns:
        A tuple ``(token_docs, tag_docs)`` of two lists of equal length;
        ``token_docs[i]`` and ``tag_docs[i]`` are the tokens and tags of
        document ``i`` and have the same number of items.
    """
    separator = '-X- _'
    file_path = Path(file_path)

    raw_text = file_path.read_text(encoding='utf-8').strip()
    # A document boundary is a newline followed by at least one line that is
    # empty or contains only spaces/tabs.
    raw_docs = re.split(r'\n(?:[ \t]*\n)+', raw_text)

    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            parts = line.split(separator)
            if len(parts) != 2:
                # Header, blank or malformed line: not a token/tag pair.
                continue
            token, tag = parts
            tokens.append(token.strip())
            tags.append(tag.strip())
        if tokens:
            token_docs.append(tokens)
            tag_docs.append(tags)

    return token_docs, tag_docs
        
# Parse the CoNLL file with the titles into per-document token and tag lists.
conll_file = f'{data_path}/PREPROCESSING/titles.conll'
texts, tags = read_conll(conll_file)

Now that we’ve read the data in, let’s create a train/validation split:

In [56]:
from sklearn.model_selection import train_test_split
# Fix the seed so the train/validation split is the same on every run.
train_texts, val_texts, train_tags, val_tags = train_test_split(
    texts, tags, test_size=.2, random_state=42
)

Next, let’s create encodings for our tokens and tags. For the tags, we can start by just creating a simple mapping, which we’ll use in a moment:

In [57]:
# Collect every distinct tag, then number the tags in sorted order. Iterating
# a set of strings directly has no stable order between interpreter sessions,
# which would assign different label ids on every kernel restart.
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: idx for idx, tag in enumerate(sorted(unique_tags))}
id2tag = {idx: tag for tag, idx in tag2id.items()}

To encode the tokens, we’ll use a pre-trained fast tokenizer (the cell below loads `distilbert-base-cased`; `DeepPavlov/rubert-base-cased` is the Russian-language checkpoint that appears later in this notebook). We can tell the tokenizer that we’re dealing with ready-split tokens rather than full sentence strings by passing `is_split_into_words=True`. We’ll also pass `padding=True` and `truncation=True` to pad the sequences to the same length. Lastly, we can tell the tokenizer to return offset information about the tokens that are split by the wordpiece tokenization process (`return_offsets_mapping=True`), which we will need in a moment.

In [64]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

# Both splits are encoded with identical options: the inputs are already split
# into words, offsets are requested for the later label alignment, and the
# sequences are padded and truncated.
encode_options = dict(
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
)
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=213450.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=435797.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=29.0), HTML(value='')))




Great, so now our tokens are nicely encoded in the format that they need to be in to feed them into our model below.

In [65]:
import numpy as np

def encode_tags(tags, encodings, tag_to_id=None):
    """Align word-level tags with the word-piece positions of each document.

    For every document a label list as long as its offset mapping is built.
    Positions whose offset starts at 0 and has a non-zero end (the first
    piece of a word) receive the id of the next tag; every other position
    (special tokens, padding, continuation pieces) is set to -100, the
    default ``ignore_index`` of PyTorch's cross-entropy loss.

    Args:
        tags: List of documents, each a list of tag strings (one per word).
        encodings: Object exposing ``offset_mapping``: for each document a
            sequence of ``(start, end)`` pairs, one per encoded position.
        tag_to_id: Optional mapping from tag string to integer id. Defaults
            to the notebook-level ``tag2id`` mapping.

    Returns:
        A list with one list of integer labels per document.

    Raises:
        KeyError: If a tag is missing from the mapping.
        ValueError: If a document has more first-piece positions than tags.
    """
    if tag_to_id is None:
        tag_to_id = tag2id

    labels = [[tag_to_id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # reshape keeps the array two-dimensional even for an empty document
        arr_offset = np.array(doc_offset, dtype=int).reshape(-1, 2)
        doc_enc_labels = np.full(len(arr_offset), -100, dtype=int)

        # first piece of a word: offset starts at 0 and is not the (0, 0)
        # used for special/padding positions
        first_piece = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)
        n_slots = int(first_piece.sum())
        if n_slots > len(doc_labels):
            raise ValueError(
                f'document has {n_slots} word positions but only '
                f'{len(doc_labels)} tags'
            )
        # When truncation dropped trailing words there are fewer positions
        # than tags; keep only the tags of the words that survived.
        doc_enc_labels[first_piece] = np.asarray(doc_labels[:n_slots], dtype=int)
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

# Build the aligned label lists for the training split, then the validation split.
train_labels, val_labels = (
    encode_tags(split_tags, split_encodings)
    for split_tags, split_encodings in (
        (train_tags, train_encodings),
        (val_tags, val_encodings),
    )
)

In [66]:
import torch

class CONLLDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with per-position labels.

    ``encodings`` is a mapping from field name to a per-sample sequence and
    ``labels`` is a sequence with one label list per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# we don't want to pass the offsets to the model; the default makes the pop a
# no-op when the key is already gone, so re-running this cell does not raise
# KeyError
train_encodings.pop("offset_mapping", None)
val_encodings.pop("offset_mapping", None)
train_dataset = CONLLDataset(train_encodings, train_labels)
val_dataset = CONLLDataset(val_encodings, val_labels)

In [None]:
from transformers import DistilBertForTokenClassification

# The classification head gets one output per distinct tag in the corpus.
num_tag_classes = len(unique_tags)
model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased',
    num_labels=num_tag_classes,
)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=411.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=263273408.0), HTML(value='')))

In [15]:
# NOTE(review): `BertForSequenceClassification` and `model_name` are not
# imported/defined in any cell visible in this notebook, and this assignment
# replaces the token-classification `model` created in the previous cell with
# a 2-label sequence classifier. Looks like a leftover from another
# experiment — confirm whether this cell should stay.
model = BertForSequenceClassification.from_pretrained(model_name,
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)





HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=642.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=711456796.0), HTML(value='')))




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(119547, 768, padding_idx=0)

texts
tags

# Sources

* [Token Classification with W-NUT Emerging Entities](https://huggingface.co/transformers/custom_datasets.html?highlight=named%20entity%20recognition#token-classification-with-w-nut-emerging-entities)

In [19]:
# NOTE(review): `labeled` is not defined in the visible cells — presumably the
# parsed Label Studio export; confirm where it is loaded.
labeled[0]

{'id': 1477,
 'annotations': [{'id': 1501,
   'completed_by': {'id': 1,
    'email': 'leva.matyushkin@gmail.com',
    'first_name': 'Leo',
    'last_name': 'Matyushkin'},
   'state': {},
   'result': [{'value': {'start': 16,
      'end': 36,
      'text': 'платежные технологии',
      'labels': ['TECH']},
     'id': 'CFSQHFanLE',
     'from_name': 'label',
     'to_name': 'text',
     'type': 'labels'},
    {'value': {'start': 0, 'end': 3, 'text': 'Как', 'labels': ['PROFIT']},
     'id': '1GeN7olSvI',
     'from_name': 'label',
     'to_name': 'text',
     'type': 'labels'},
    {'value': {'start': 50,
      'end': 85,
      'text': 'предприятиями общественного питания',
      'labels': ['TECH']},
     'id': '6k_PJzBp6a',
     'from_name': 'label',
     'to_name': 'text',
     'type': 'labels'},
    {'value': {'start': 86,
      'end': 99,
      'text': 'по всему миру',
      'labels': ['AUD']},
     'id': 'rxrvBPO0W_',
     'from_name': 'label',
     'to_name': 'text',
     'type': 'l