In [None]:
!git clone https://github.com/UniversalDependencies/UD_Russian-SynTagRus.git
!pip install -qq pyconll

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pyconll
import numpy as np

import nltk
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [None]:
full_train = pyconll.load_from_file('UD_Russian-SynTagRus/ru_syntagrus-ud-train-a.conllu')
full_test = pyconll.load_from_file('UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu')

In [None]:
fdata_train = []
for sent in full_train[:]:
    fdata_train.append([(token.form, token.upos) for token in sent])
    
fdata_test = []
for sent in full_test[:]:
    fdata_test.append([(token.form, token.upos) for token in sent])
    
fdata_sent_test = []
for sent in full_test[:]:
    fdata_sent_test.append([token.form for token in sent])

In [None]:
unigram_tagger = UnigramTagger(fdata_train)
unigram_accuracy = unigram_tagger.accuracy(fdata_test)
unigram_tagger.tag(fdata_sent_test[1]), unigram_accuracy

([('Доставшийся', None),
  ('в', 'ADP'),
  ('наследство', 'NOUN'),
  ('от', 'ADP'),
  ('советского', 'ADJ'),
  ('периода', 'NOUN'),
  ('времени', 'NOUN'),
  ('промышленный', 'ADJ'),
  ('и', 'CCONJ'),
  ('интеллектуальный', 'ADJ'),
  ('потенциал', 'NOUN'),
  ('оказался', 'VERB'),
  ('благом', 'NOUN'),
  ('и', 'CCONJ'),
  ('горем', None),
  ('страны', 'NOUN'),
  ('.', 'PUNCT')],
 0.8369316851173183)

In [None]:
bigram_tagger = BigramTagger(fdata_train)
bigram_accuracy = bigram_tagger.accuracy(fdata_test)
bigram_tagger.tag(fdata_sent_test[1]), bigram_accuracy

([('Доставшийся', None),
  ('в', 'ADP'),
  ('наследство', 'NOUN'),
  ('от', 'ADP'),
  ('советского', 'ADJ'),
  ('периода', 'NOUN'),
  ('времени', 'NOUN'),
  ('промышленный', 'ADJ'),
  ('и', 'CCONJ'),
  ('интеллектуальный', None),
  ('потенциал', None),
  ('оказался', None),
  ('благом', None),
  ('и', 'CCONJ'),
  ('горем', None),
  ('страны', None),
  ('.', 'PUNCT')],
 0.6249422428143732)

In [None]:
trigram_tagger = TrigramTagger(fdata_train)
trigram_accuracy = trigram_tagger.accuracy(fdata_test)
trigram_tagger.tag(fdata_sent_test[1]), trigram_accuracy

([('Доставшийся', None),
  ('в', None),
  ('наследство', None),
  ('от', None),
  ('советского', None),
  ('периода', None),
  ('времени', None),
  ('промышленный', None),
  ('и', None),
  ('интеллектуальный', None),
  ('потенциал', None),
  ('оказался', None),
  ('благом', None),
  ('и', None),
  ('горем', None),
  ('страны', None),
  ('.', None)],
 0.18803840773724753)

In [None]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff

backoff = DefaultTagger('NN') 
tag = backoff_tagger(fdata_train,  
                     [UnigramTagger, BigramTagger, TrigramTagger],  
                     backoff = backoff) 
  
tag.accuracy(fdata_test) 

0.840590167669901

In [None]:
train_tok = []
train_label = []
for sent in fdata_train[:]:
    for tok in sent:
        train_tok.append(tok[0])
        train_label.append('NO_TAG' if tok[1] is None else tok[1])
        
test_tok = []
test_label = []
for sent in fdata_test[:]:
    for tok in sent:
        test_tok.append(tok[0])
        test_label.append('NO_TAG' if tok[1] is None else tok[1])

In [None]:
encoder = LabelEncoder()
train_label_enc = encoder.fit_transform(train_label)
test_label_enc = encoder.transform(test_label)

In [None]:
encoder.classes_

array(['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN',
       'NO_TAG', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
       'VERB', 'X'], dtype='<U6')

In [None]:
not_nan_ind = [i for i, e in enumerate(test_tok) if e is not None]

In [None]:
test_label_enc = test_label_enc[not_nan_ind]

In [None]:
test_tok = [token for token in test_tok if token is not None]

In [None]:
tfidf = TfidfVectorizer(ngram_range=(1, 3), analyzer='char', max_features=300)
count = CountVectorizer(ngram_range=(1, 3), analyzer='char', max_features=300)
hashing = HashingVectorizer(ngram_range=(1, 3), analyzer='char', n_features=300)

for vectorizer in (tfidf, count, hashing):
    pipe = make_pipeline(vectorizer,
                         LogisticRegression(max_iter=100, random_state=42, n_jobs=-1)).fit(train_tok, train_label_enc)
    print(f"{vectorizer.__class__.__name__}")
    print(classification_report(test_label_enc, pipe.predict(test_tok)))
    print("="*50)

TfidfVectorizer


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.79      0.79      0.79     14471
           1       0.97      0.98      0.97     15062
           2       0.75      0.68      0.71      8085
           3       0.85      0.97      0.91      1518
           4       0.88      0.99      0.93      5736
           5       0.82      0.69      0.75      4094
           6       0.00      0.00      0.00        23
           7       0.80      0.87      0.83     36568
           8       0.00      0.00      0.00       194
           9       0.74      0.48      0.58      2528
          10       0.91      0.74      0.82      4921
          11       0.83      0.90      0.87      8015
          12       0.57      0.26      0.35      5883
          13       0.93      1.00      0.97     29463
          14       0.77      0.89      0.82      2992
          15       0.00      0.00      0.00       165
          16       0.86      0.82      0.84     18146
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.81      0.79      0.80     14471
           1       0.97      0.99      0.98     15062
           2       0.76      0.70      0.73      8085
           3       0.86      0.96      0.91      1518
           4       0.85      1.00      0.92      5736
           5       0.80      0.69      0.74      4094
           6       0.00      0.00      0.00        23
           7       0.79      0.87      0.83     36568
           8       0.00      0.00      0.00       194
           9       0.72      0.47      0.57      2528
          10       0.92      0.72      0.81      4921
          11       0.84      0.89      0.86      8015
          12       0.56      0.27      0.36      5883
          13       0.93      1.00      0.97     29463
          14       0.76      0.89      0.82      2992
          15       0.00      0.00      0.00       165
          16       0.86      0.81      0.84     18146
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.70      0.66      0.68     14471
           1       0.93      0.97      0.95     15062
           2       0.67      0.59      0.63      8085
           3       0.85      0.97      0.91      1518
           4       0.86      0.98      0.92      5736
           5       0.82      0.67      0.74      4094
           6       0.00      0.00      0.00        23
           7       0.72      0.81      0.76     36568
           8       1.00      1.00      1.00       194
           9       0.76      0.83      0.80      2528
          10       0.93      0.72      0.81      4921
          11       0.79      0.89      0.84      8015
          12       0.52      0.24      0.33      5883
          13       1.00      1.00      1.00     29463
          14       0.79      0.89      0.84      2992
          15       1.00      0.95      0.98       165
          16       0.78      0.73      0.76     18146
          17       0.07    

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
tfidf = TfidfVectorizer(ngram_range=(1, 3), analyzer='word', max_features=500)
count = CountVectorizer(ngram_range=(1, 3), analyzer='word', max_features=500)
hashing = HashingVectorizer(ngram_range=(1, 3), analyzer='word', n_features=500)

for vectorizer in (tfidf, count, hashing):
    pipe = make_pipeline(vectorizer,
                         LogisticRegression(max_iter=100, random_state=42, n_jobs=-1)).fit(train_tok, train_label_enc)
    print(f"{vectorizer.__class__.__name__}")
    print(classification_report(test_label_enc, pipe.predict(test_tok)))
    print("="*50)

TfidfVectorizer
              precision    recall  f1-score   support

           0       0.89      0.08      0.15     14471
           1       0.98      0.48      0.64     15062
           2       0.93      0.59      0.72      8085
           3       0.87      0.94      0.90      1518
           4       0.89      0.21      0.34      5736
           5       0.77      0.59      0.67      4094
           6       0.00      0.00      0.00        23
           7       0.30      1.00      0.46     36568
           8       0.00      0.00      0.00       194
           9       0.87      0.33      0.48      2528
          10       0.97      0.73      0.83      4921
          11       0.91      0.71      0.80      8015
          12       0.93      0.07      0.13      5883
          13       0.00      0.00      0.00     29463
          14       0.73      0.94      0.82      2992
          15       0.00      0.00      0.00       165
          16       0.96      0.10      0.17     18146
          1

In [None]:
union = make_union(CountVectorizer(ngram_range=(1, 3), analyzer='char', max_features=200),
                   CountVectorizer(ngram_range=(1, 1), analyzer='word', max_features=200))

pipe = make_pipeline(union,
                     LogisticRegression(max_iter=100, random_state=42, n_jobs=-1)).fit(train_tok, train_label_enc)
print(classification_report(test_label_enc, pipe.predict(test_tok)))

              precision    recall  f1-score   support

           0       0.76      0.74      0.75     14471
           1       0.97      0.99      0.98     15062
           2       0.78      0.70      0.73      8085
           3       0.86      0.96      0.91      1518
           4       0.87      0.98      0.92      5736
           5       0.77      0.75      0.76      4094
           6       0.00      0.00      0.00        23
           7       0.77      0.86      0.81     36568
           8       0.00      0.00      0.00       194
           9       0.62      0.30      0.41      2528
          10       0.94      0.75      0.83      4921
          11       0.86      0.85      0.86      8015
          12       0.57      0.20      0.29      5883
          13       0.92      1.00      0.96     29463
          14       0.81      0.89      0.85      2992
          15       0.00      0.00      0.00       165
          16       0.83      0.79      0.81     18146
          17       0.00    

In [None]:
# the best is tfidf and count with char level

In [None]:
!pip install -qq corus razdel
!wget -qq http://www.labinform.ru/pub/named_entities/collection5.zip
!unzip collection5.zip

In [None]:
import corus
import pandas as pd

from corus import load_ne5
from razdel import tokenize
from sklearn.model_selection import train_test_split
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import (Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D, LSTM, Dropout, Input, 
                                     TextVectorization, TimeDistributed, Bidirectional, SpatialDropout1D)

In [270]:
records = load_ne5('Collection5/')

In [None]:
words_docs = []
for ix, rec in enumerate(records):
    words = []
    for token in tokenize(rec.text):
        type_ent = 'OUT'
        for ent in rec.spans:
            if (token.start >= ent.start) and (token.stop <= ent.stop):
                type_ent = ent.type
                break
        words.append([token.text, type_ent])
    words_docs.extend(words)

In [None]:
df_words = pd.DataFrame(words_docs, columns=['word', 'tag'])

In [None]:
df_words["three"] = df_words["word"].shift(-1) + " " + df_words["word"] + " " + df_words["word"].shift(1)
df_words["tag_tree"] = df_words["tag"].shift(-1) + " " + df_words["tag"] + " " + df_words["tag"].shift(1)

df_words.dropna(inplace=True)

In [None]:
encoder = LabelEncoder().fit(df_words["tag"])

df_words["tag_tree"] = df_words["tag_tree"].apply(lambda x: encoder.transform(x.split()).tolist())

In [271]:
# full texts

words_docs = []
spans_docs = []

for ix, rec in enumerate(records):
    words, tags = [], []
    for token in tokenize(rec.text):
        type_ent = 'OUT'
        for ent in rec.spans:
            if (token.start >= ent.start) and (token.stop <= ent.stop):
                type_ent = ent.type
                break
        words.append(token.text)
        tags.append(type_ent)
    words_docs.extend([words])
    spans_docs.append(tags)

In [276]:
# full sequences
df = pd.DataFrame({"word": words_docs, "tag": spans_docs})
df["word"] = df["word"].apply(" ".join)
df.head()

Unnamed: 0,word,tag
0,Совмин Белоруссии надеется разгрузить склады в...,"[ORG, GEOPOLIT, OUT, OUT, OUT, OUT, LOC, OUT, ..."
1,Продолжаются увольнения энергетиков после крит...,"[OUT, OUT, OUT, OUT, OUT, PER, PER, PER, OUT, ..."
2,Керри может через неделю вернуться на Ближний ...,"[PER, OUT, OUT, OUT, OUT, OUT, LOC, LOC, OUT, ..."
3,Экс-Президенту Египта Мурси продлили срок арес...,"[OUT, GEOPOLIT, PER, OUT, OUT, OUT, OUT, OUT, ..."
4,Медведев уволил Суходольского с должности глав...,"[PER, OUT, PER, OUT, OUT, OUT, ORG, ORG, LOC, ..."


In [279]:
df["word"].str.split().str.len().describe()

count    1000.000000
mean      265.471000
std       186.292163
min        21.000000
25%       171.000000
50%       226.000000
75%       307.500000
max      2005.000000
Name: word, dtype: float64

In [280]:
tags2id = {}
for i, tag in enumerate(df_words["tag"].unique()):
    tags2id[tag] = i
    
id2tags = {v: k for k, v in tags2id.items()}

In [281]:
max_length = 128

def preprocess_tags(tags2id, tag_sequences, maxlen=max_length):
    final_encoded_tags = []
    for list_tags in tag_sequences:
        encoded_tags = []
        for tag in list_tags:
            encoded_tags.append(tags2id[tag])
        current_length = len(encoded_tags)
        if current_length > maxlen:
            encoded_tags = encoded_tags[:maxlen]
        else:
            padding_size = maxlen - current_length
            padding = [tags2id["OUT"]] * padding_size
            encoded_tags = encoded_tags + padding
        final_encoded_tags.append(encoded_tags)
        
    return final_encoded_tags 

In [282]:
y = preprocess_tags(tags2id, df["tag"])

In [None]:
df_words.sample(5)

Unnamed: 0,word,tag
685,и,OUT
161996,Домодедово,ORG
102097,",",OUT
205462,был,OUT
98266,сменой,OUT


In [None]:
df_words['tag'].value_counts()

OUT         219214
PER          21200
ORG          13651
LOC           4568
GEOPOLIT      4356
MEDIA         2482
Name: tag, dtype: int64

In [None]:
# single token
x_train, x_valid, y_train, y_valid = train_test_split(df_words['word'], df_words['tag'])

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.fit_transform(y_valid)

In [None]:
# three tokens
x_train, x_valid, y_train, y_valid = train_test_split(df_words['word'], df_words['tag_tree'])

In [288]:
# large sequence

x_train, x_valid, y_train, y_valid = train_test_split(df['word'], y)

In [289]:
def prepare_dataset(x, y, batch_size=16, cache=True, shuffle=True):
    ds = tf.data.Dataset.from_tensor_slices((x, y))
    if cache:
        ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(buffer_size=len(x))
    ds = ds.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
    return ds

In [290]:
# single token / large seq
train_ds = prepare_dataset(x_train, y_train, batch_size=16)
valid_ds = prepare_dataset(x_valid, y_valid, batch_size=16, cache=False, shuffle=False)

In [None]:
# three tokens
train_ds = prepare_dataset(x_train, y_train.tolist(), batch_size=16)
valid_ds = prepare_dataset(x_valid, y_valid.tolist(), batch_size=16, cache=False, shuffle=False)

In [291]:
vocab_size = 30000
seq_len = max_length  # 5 single / 3 3words, max_size for full sentence

vectorize_layer = TextVectorization(
    standardize=None,  # do not apply lower to preserve PER tags more obvious
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=seq_len)

text_data = tf.data.Dataset.from_tensor_slices((x_train, y_train)).map(lambda x, y: x)  # y_train.tolist() for 3 tokens
vectorize_layer.adapt(text_data)

In [None]:
class modelNER(tf.keras.Model):
    """ single token """
    def __init__(self, embedding_dim = 64):
        super(modelNER, self).__init__()
        self.emb = Embedding(vocab_size+1, embedding_dim)
        self.gPool = GlobalMaxPooling1D()
        self.fc1 = Dense(300, activation='relu')
        self.fc2 = Dense(50, activation='relu')
        self.fc3 = Dense(6, activation='softmax')

    def call(self, x):
        x = vectorize_layer(x)
        x = self.emb(x)
        pool_x = self.gPool(x)
        
        fc_x = self.fc1(pool_x)
        fc_x = self.fc2(fc_x)
        
        concat_x = tf.concat([pool_x, fc_x], axis=1)
        prob = self.fc3(concat_x)
        return prob

In [None]:
class modelNER(tf.keras.Model):
    """ three tokens """
    def __init__(self, embedding_dim = 64):
        super(modelNER, self).__init__()
        self.emb = Embedding(vocab_size+1, embedding_dim)
        self.conv1 = Conv1D(100, 1, 1, padding="same", activation='relu')
        self.conv2 = Conv1D(100, 2, 1, padding="same", activation='relu')
        self.conv3 = Conv1D(100, 3, 1, padding="same", activation='relu')
        self.conv = Conv1D(50, 1, 1, activation='relu')
        self.fc = TimeDistributed(Dense(6, activation='softmax'))

    def call(self, x):
        x = vectorize_layer(x)
        x = self.emb(x)
        cx1 = self.conv1(x)
        cx2 = self.conv2(x)
        cx3 = self.conv3(x)
        x = tf.concat([cx1, cx2, cx3], axis=-1)
        
        fc_x = self.conv(x)

        prob = self.fc(fc_x)
        return prob

In [292]:
model = tf.keras.Sequential([
    vectorize_layer,
    Embedding(vectorize_layer.vocabulary_size()+1, 64, mask_zero=False),
    Bidirectional(LSTM(units=128, return_sequences=True)),
    SpatialDropout1D(rate=0.3),
    TimeDistributed(Dense(units=6, activation="softmax"))
])

In [309]:
model = modelNER()

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

In [None]:
model.fit(train_ds, validation_data=valid_ds, epochs=3)  # single token

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7efbdbfbfd60>

In [None]:
# single token report
preds = model.predict(valid_ds)
print(classification_report(y_valid, preds.argmax(-1), target_names=encoder.classes_))

              precision    recall  f1-score   support

    GEOPOLIT       0.90      0.91      0.91      1081
         LOC       0.86      0.81      0.83      1128
       MEDIA       0.93      0.78      0.85       683
         ORG       0.88      0.56      0.68      3422
         OUT       0.94      0.99      0.97     54758
         PER       0.98      0.72      0.83      5296

    accuracy                           0.94     66368
   macro avg       0.92      0.80      0.85     66368
weighted avg       0.94      0.94      0.94     66368



In [None]:
model.fit(train_ds, validation_data=valid_ds, epochs=3)  # three token

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7efb5623fd30>

In [None]:
preds = model.predict(valid_ds)
preds = pd.DataFrame(preds.argmax(-1))
preds["preds"] = preds.apply(lambda row: [i for i in row], axis=1)



In [None]:
predictions = pd.DataFrame({"y_true": y_valid.explode().reset_index(drop=True), "pred": preds["preds"].explode().reset_index(drop=True)})
predictions["y_true"] = predictions["y_true"].astype("int")
predictions["pred"] = predictions["pred"].astype("int")

In [None]:
print(classification_report(predictions["y_true"].values, predictions["pred"].values, target_names=encoder.classes_))

              precision    recall  f1-score   support

    GEOPOLIT       0.79      0.38      0.52      3239
         LOC       0.79      0.39      0.52      3413
       MEDIA       0.86      0.36      0.51      1855
         ORG       0.84      0.26      0.40     10364
         OUT       0.88      0.99      0.93    164255
         PER       0.95      0.35      0.52     15978

    accuracy                           0.88    199104
   macro avg       0.85      0.46      0.57    199104
weighted avg       0.88      0.88      0.85    199104



In [310]:
model.fit(train_ds, validation_data=valid_ds, epochs=9)  # large

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<keras.callbacks.History at 0x7efbbca45f10>

In [311]:
preds = model.predict(valid_ds)
preds = pd.DataFrame(preds.argmax(-1))
preds["preds"] = preds.apply(lambda row: [i for i in row], axis=1)



In [312]:
predictions = pd.DataFrame({"y_true": [item for sublist in y_valid for item in sublist], "pred": preds["preds"].explode().reset_index(drop=True)})
predictions["y_true"] = predictions["y_true"].astype("int")
predictions["pred"] = predictions["pred"].astype("int")

In [313]:
print(classification_report(predictions["y_true"].values, predictions["pred"].values, target_names=encoder.classes_))

              precision    recall  f1-score   support

    GEOPOLIT       0.96      0.89      0.92       559
         LOC       0.95      0.98      0.96     25614
       MEDIA       0.88      0.68      0.77       621
         ORG       0.93      0.76      0.84      2951
         OUT       0.71      0.70      0.71      1783
         PER       0.95      0.76      0.85       472

    accuracy                           0.93     32000
   macro avg       0.90      0.79      0.84     32000
weighted avg       0.93      0.93      0.93     32000



In [None]:
# the easiest way to predict ner with given architectures is by using single token. With larger corpus lstm model is more suitable