In [1]:
!pip install russian-tagsets
!pip install sklearn-crfsuite
!pip install pymorphy2
!pip install conllu



In [2]:
from collections import Counter

import sklearn_crfsuite
from sklearn_crfsuite import metrics as crf_metrics

from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

from russian_tagsets import converters
from pymorphy2 import MorphAnalyzer
import conllu

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Single shared pymorphy2 analyzer; process_data() calls analyzer.parse() on every word form.
analyzer = MorphAnalyzer()

In [4]:
# Tag converter from the 'opencorpora-int' tagset to 'ud20'; process_data() applies it to
# the string form of each pymorphy2 tag and splits the result into a UPOS and a FEATS part.
ud_converter = converters.converter('opencorpora-int', 'ud20')

#### Загрузка и подготовка данных

In [5]:
# Open the train and test CoNLL-U files as UTF-8 text.
# NOTE(review): these handles are never closed in the visible cells. They presumably have to
# stay open while the parsers created in the next cell are being consumed — confirm, then
# close them explicitly once feature extraction is done.
train_data_conllu = open('ru_syntagrus-ud-train.conllu', encoding='utf-8')
test_data_conllu = open('ru_syntagrus-ud_test.conllu', encoding='utf-8')

In [6]:
# Create incremental sentence parsers over the two open files.
# NOTE(review): presumably these are one-shot iterators — re-running a later cell that
# consumes them without re-running this cell (and the open() cell) would then see no
# sentences. Confirm against the conllu documentation.
train_data = conllu.parse_incr(train_data_conllu)
test_data = conllu.parse_incr(test_data_conllu)

In [7]:
def process_data(dataset, window=2, top_n=3, morph=None, to_ud=None):
    """Turn parsed sentences into per-token feature dicts and gold UPOS labels.

    Every token gets one feature dict describing the token itself and its
    neighbours at offsets ``-window .. +window``. For each position the
    ``top_n`` first analyses returned by the morphological analyzer are
    stored under the keys ``'lemma<offset><rank>'``, ``'upos<offset><rank>'``
    and ``'feats<offset><rank>'`` (e.g. ``'lemma00'``, ``'upos-11'``).
    Positions outside the sentence, and ranks for which the analyzer returned
    no analysis, are filled with the string ``'none'``.

    Each token is analysed exactly once per sentence and the result is reused
    for every window it appears in.

    Args:
        dataset: Iterable of sentences. A sentence must support ``len()`` and
            iteration and yield tokens indexable by ``'form'`` and ``'upos'``.
        window: Number of neighbours to look at on each side (default 2,
            i.e. two tokens back and two tokens ahead).
        top_n: Number of analyses kept per token (default 3).
        morph: Object with a ``parse(form)`` method returning a list of
            analyses exposing ``.normal_form`` and ``.tag``. Defaults to the
            notebook-level ``analyzer``.
        to_ud: Callable mapping a tag string to ``'<UPOS> <FEATS>'``.
            Defaults to the notebook-level ``ud_converter``.

    Returns:
        ``(data, pos)``: ``data`` is a list of sentences, each a list of
        feature dicts; ``pos`` is the parallel list of lists of ``'upos'``
        values taken from the tokens.

    Note:
        The offset and the rank are concatenated without a separator, so the
        keys are only guaranteed to be unambiguous while both stay single-digit,
        as they do with the defaults.
    """
    # Fall back to the notebook-level objects only when nothing was injected.
    if morph is None:
        morph = analyzer
    if to_ud is None:
        to_ud = ud_converter

    # (lemma, upos, feats) placeholder for missing analyses / out-of-sentence positions.
    padding = ('none', 'none', 'none')

    data = []
    pos = []

    for sent in dataset:
        # Analyse every token once; each entry holds exactly top_n triples.
        analyses = []
        for token in sent:
            candidates = []
            for parse in morph.parse(token['form'])[:top_n]:
                # The converter output is expected to be "<UPOS> <FEATS>".
                upos, feats = to_ud(str(parse.tag)).split(' ')
                candidates.append((parse.normal_form, upos, feats))
            candidates.extend([padding] * (top_n - len(candidates)))
            analyses.append(candidates)

        sent_len = len(sent)
        outside = [padding] * top_n

        sentence = []
        sentence_pos = []
        for i, token in enumerate(sent):
            features = {}
            for offset in range(-window, window + 1):
                j = i + offset
                candidates = analyses[j] if 0 <= j < sent_len else outside
                for rank, (lemma, upos, feats) in enumerate(candidates):
                    suffix = str(offset) + str(rank)
                    features['lemma' + suffix] = lemma
                    features['upos' + suffix] = upos
                    features['feats' + suffix] = feats

            sentence.append(features)
            sentence_pos.append(token['upos'])  # gold UPOS label of the i-th token

        data.append(sentence)
        pos.append(sentence_pos)

    return data, pos

In [8]:
%%time
# Build feature dicts (data) and gold UPOS labels (pos) for the training sentences.
data, pos = process_data(train_data)

Wall time: 25min 56s


#### Обучение модели

In [9]:
# CRF sequence tagger; hyper-parameters are listed one per line so they are easy to tune.
# See the sklearn-crfsuite CRF documentation for the meaning of each option.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=200,
    all_possible_transitions=True,
)

In [10]:
%%time
# Train the CRF on the training feature dicts and their gold UPOS labels.
crf.fit(data, pos)

Wall time: 21min 10s


CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=200)

#### Тест модели

Подготовка данных для теста:

In [11]:
%%time
# Build feature dicts and gold UPOS labels for the test sentences with the same extractor.
data_test, pos_test = process_data(test_data)

Wall time: 3min 4s


Предсказываем значения с помощью модели:

In [12]:
# Predict a label sequence for every test sentence.
predicted_values = crf.predict(data_test)

In [13]:
# Label inventory exposed by the fitted CRF; it is passed to the report below.
# X stands for "other".
labels = list(crf.classes_)
labels

['NOUN',
 'PUNCT',
 'ADJ',
 'PROPN',
 'AUX',
 'VERB',
 'ADP',
 'ADV',
 'CCONJ',
 'PART',
 'PRON',
 'DET',
 'SCONJ',
 'NUM',
 '_',
 'INTJ',
 'X',
 'SYM']

Метрики:

In [14]:
# Per-label precision / recall / F1 of the predictions against the gold test labels,
# restricted to `labels` and printed with three decimal digits.
print(crf_metrics.flat_classification_report(pos_test, predicted_values, labels=labels, digits=3))

              precision    recall  f1-score   support

        NOUN      0.973     0.986     0.980     36568
       PUNCT      1.000     1.000     1.000     29463
         ADJ      0.953     0.967     0.960     14471
       PROPN      0.927     0.818     0.869      5883
         AUX      0.940     0.953     0.946      1518
        VERB      0.987     0.985     0.986     18146
         ADP      0.999     0.999     0.999     15062
         ADV      0.947     0.948     0.947      8085
       CCONJ      0.963     0.979     0.971      5736
        PART      0.930     0.908     0.919      4921
        PRON      0.963     0.963     0.963      8015
         DET      0.926     0.912     0.919      4094
       SCONJ      0.921     0.942     0.932      2992
         NUM      0.943     0.945     0.944      2528
           _      1.000     1.000     1.000       271
        INTJ      0.875     0.609     0.718        23
           X      0.246     0.333     0.283        48
         SYM      0.994    

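Модель плохо распознаёт класс X (other): F1 = 0.283. Однако этот класс очень редкий (48 токенов в тестовой выборке), поэтому на общее качество он почти не влияет. Междометия (INTJ) тоже распознаются заметно хуже остальных классов (recall = 0.609 при 23 примерах) — по-видимому, в датасете недостаточно примеров этого класса.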

In [15]:
def print_state_features(state_features):
    """Print one line per state feature: weight, label, attribute.

    Args:
        state_features: Iterable of ``((attr, label), weight)`` pairs.

    Each line shows the weight with six decimals, the label left-aligned
    in a column of width 8, and then the attribute name.
    """
    row_template = "{:.6f} {!s:<8} {!s}"
    for feature_key, weight in state_features:
        attr, label = feature_key
        print(row_template.format(weight, label, attr))

# Rank all state features by weight once (highest first) and reuse the ranking
# for both listings: the head holds the largest weights, the tail the smallest.
ranked_features = Counter(crf.state_features_).most_common()

print("Top positive:")
print_state_features(ranked_features[:30])

print("\nTop negative:")
print_state_features(ranked_features[-30:])

Top positive:
15.402754 AUX      lemma00:бы
14.668442 PRON     lemma00:который
13.806176 CCONJ    lemma00:или
13.490235 SCONJ    lemma00:если
13.361143 ADP      upos00:ADP
12.984190 _        lemma00:_
12.053161 PUNCT    upos00:PUNCT
11.556374 AUX      lemma00:быть
10.931335 CCONJ    lemma00:либо
10.724052 ADV      feats00:Tense=Pres
10.586029 PROPN    lemma00:гарус
10.465793 ADP      upos01:ADP
10.159273 PROPN    lemma00:цик
10.023963 NUM      lemma00:один
9.894987 SYM      lemma00:%
9.783239 PART     lemma00:еле
9.336678 PRON     lemma00:весь
9.199847 PROPN    lemma00:кремль
8.491245 ADJ      lemma00:крестовский
8.432884 SCONJ    lemma00:поскольку
8.420548 NOUN     lemma00:многое
8.286275 ADP      lemma00:порядок
8.254027 PROPN    lemma00:крэк
8.243982 NOUN     lemma00:им.
8.053294 PART     lemma00:прямо-таки
7.672967 NUM      upos00:NUM
7.647299 PART     lemma00:не
7.509956 SCONJ    lemma00:ибо
7.488617 VERB     lemma00:см.
7.386516 CCONJ    lemma00:причём

Top negative:
-3.580421 CC