# Инициализация

In [1]:
import os
import numpy as np
import pymorphy2
from tqdm import tqdm_notebook

In [2]:
token_filenames = [f for f in os.listdir('./devset/') if '.tokens' in f]

# Функции для работы с файлами

In [3]:
class Token:
    def __init__(self, position, length, text):
        self._position = position
        self._length = length
        self._text = text
        self._pos = None
        self._tag = None

In [4]:
class Span:
    def __init__(self, token_id):
        self._token_id = token_id

In [5]:
def load_tokens(token_filename, path='./devset/'):
    tokens = dict()
    with open(path + token_filename, 'r+', encoding='utf8') as f:
        for line in f:
            split = line.split()
            if split:
                t = Token(split[1], split[2], split[3])
                tokens[split[0]] = t
    return tokens

In [6]:
def load_spans(token_filename, path='./devset/'):
    spans = dict()
    with open(path + token_filename.split('.')[0] + '.spans', 'r+', encoding='utf8') as f:
        for line in f:
            split = line.split()
            s = Span(split[4])
            spans[split[0]] = s
    return spans

In [7]:
def transform_base_tag(base_tag):
    if base_tag == 'Location':
        return 'LOC'
    if base_tag == 'LocOrg':
        return 'LOCORG'
    else:
        return 'MISC'

In [8]:
def load_objects(token_filename, tokens, spans, path='./devset/'):
    with open(path + token_filename.split('.')[0] + '.objects', 'r+', encoding='utf8') as f:
        for line in f:
            line = line.split(' # ')[0]
            split = line.split()
            base_tag = transform_base_tag(split[1])
            span_ids = split[2:]
            if len(span_ids) == 1:
                tokens[spans[span_ids[0]]._token_id]._tag = 'S-' + base_tag
            else:
                for i, span_id in enumerate(span_ids):
                    if i == 0:
                        tokens[spans[span_ids[i]]._token_id]._tag = 'B-' + base_tag
                    if i == len(span_ids) - 1:
                        tokens[spans[span_ids[i]]._token_id]._tag = 'E-' + base_tag
                    else:
                        tokens[spans[span_ids[i]]._token_id]._tag = 'I-' + base_tag
    return tokens

In [9]:
morph = pymorphy2.MorphAnalyzer()

In [10]:
def fill_pos(tokens):
    for id, token in tokens.items():
        pos = morph.parse(token._text)[0].tag.POS
        if pos is None:
            pos = 'None'
        token._pos = pos
        if token._tag is None:
            token._tag = 'O'
    return tokens

In [11]:
def word2features(sent, i):
    word = sent[i]._text
    postag = sent[i]._pos

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
    }
    if i > 0:
        word1 = sent[i-1]._text
        postag1 = sent[i-1]._pos
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1]._text
        postag1 = sent[i+1]._pos
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [token._tag for token in sent]

In [12]:
def split_tokens_by_sents(tokens):
    sents = []
    sent = []
    for id, token in tokens.items():
        if token._text != '.':
            sent.append(token)
        else:
            sents.append(sent)
            sent = []
    return sents

# Готовим тренировочную выборку

In [13]:
sents = []
for token_filename in tqdm_notebook(token_filenames):
    tokens = load_tokens(token_filename)
    spans = load_spans(token_filename)
    tokens = load_objects(token_filename, tokens, spans)
    tokens = fill_pos(tokens)
    sents += split_tokens_by_sents(tokens)

HBox(children=(IntProgress(value=0, max=122), HTML(value='')))




In [14]:
len(sents)

1524

In [15]:
from sklearn.model_selection import train_test_split
import numpy as np

train_ids, test_ids = train_test_split(np.arange(len(sents)))

In [16]:
train_sents = np.array(sents)[train_ids]
test_sents = np.array(sents)[test_ids]

In [17]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

# Тренируем модель

In [18]:
from sklearn_crfsuite import CRF

crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [19]:
from sklearn_crfsuite.metrics import flat_classification_report
predict = crf.predict(X_test)
print(flat_classification_report(y_test, predict))

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

      E-LOC       0.33      0.06      0.10        18
   E-LOCORG       0.00      0.00      0.00         5
     E-MISC       0.71      0.62      0.66       181
      I-LOC       0.33      0.05      0.09        20
   I-LOCORG       0.00      0.00      0.00         6
     I-MISC       0.70      0.62      0.65       195
          O       0.98      0.99      0.98      6540
      S-LOC       0.71      0.76      0.73        92
   S-LOCORG       0.70      0.73      0.71        99
     S-MISC       0.69      0.56      0.62       181

avg / total       0.94      0.95      0.95      7337



# Применяем модель

In [20]:
def get_entities(tokens):
    rows = []

    buffer = []
    for id, token in tokens.items():
        tag = token._tag
        if tag.startswith('S'):
            rows.append('%s %d %d\n' % (tag.split('-')[1], int(token._position), int(token._length)))
        elif tag.startswith('B') or tag.startswith('I'):
            buffer.append(token)
        elif tag.startswith('E'):
            buffer.append(token)
            start = int(buffer[0]._position)
            length = int(buffer[-1]._position) + int(buffer[-1]._length) - int(start)
            rows.append('%s %d %d\n' % (tag.split('-')[1], start, length))
            buffer = []
    return rows

In [21]:
test_token_filenames = [filename for filename in os.listdir('./testset') if '.tokens' in filename]

In [22]:
for token_filename in tqdm_notebook(test_token_filenames):
    tokens = load_tokens(token_filename, path = './testset/')
    tokens = fill_pos(tokens)
    sents = split_tokens_by_sents(tokens)
    X = [sent2features(s) for s in sents]
    y_pred = crf.predict(X)
    for i in range(len(y_pred)):
        for j in range(len(y_pred[i])):
            sents[i][j]._tag = y_pred[i][j]
    rows = get_entities(tokens)
    with open('./results_crf/' + token_filename.split('.')[0] + '.task1', 'w') as f:
        f.writelines(rows)

HBox(children=(IntProgress(value=0, max=132), HTML(value='')))




# Проверяем результаты

In [23]:
!python scripts\t1_eval.py -s .\testset -t .\results_crf -l -o .\output\

Failed to load ".\results_crf\book_3884.task1"
Error: ".\results_crf\book_3884.task1", line 0.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 19 15
Details: 
Failed to load ".\results_crf\book_3702.task1"
Error: ".\results_crf\book_3702.task1", line 0.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 0 9
Details: 
Failed to load ".\results_crf\book_3804.task1"
Error: ".\results_crf\book_3804.task1", line 0.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 7 8
Details: 
Failed to load ".\results_crf\book_3573.task1"
Error: ".\results_crf\book_3573.task1", line 0.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 131 11
Details: 
Failed to load ".\results_crf\book_3615.task1"
Error: ".\results_crf\book_3615.task1", line 0.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 5 12
Details: 
Failed to load ".\results_crf\book_3767.task1"
Error: ".\results_crf\

Received: MISC 1 12
Details: 
Failed to load ".\results_crf\book_3965.task1"
Error: ".\results_crf\book_3965.task1", line 1.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 34 5
Details: 
Failed to load ".\results_crf\book_3877.task1"
Error: ".\results_crf\book_3877.task1", line 2.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 176 19
Details: 
Failed to load ".\results_crf\book_3832.task1"
Error: ".\results_crf\book_3832.task1", line 1.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 344 2
Details: 
Failed to load ".\results_crf\book_3966.task1"
Error: ".\results_crf\book_3966.task1", line 0.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 299 18
Details: 
Failed to load ".\results_crf\book_3894.task1"
Error: ".\results_crf\book_3894.task1", line 1.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 78 25
Details: 
Failed to load ".\results_crf\book

Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 0 14
Details: 
Failed to load ".\results_crf\book_3937.task1"
Error: ".\results_crf\book_3937.task1", line 0.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 46 10
Details: 
Failed to load ".\results_crf\book_3667.task1"
Error: ".\results_crf\book_3667.task1", line 0.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 18 9
Details: 
Failed to load ".\results_crf\book_3680.task1"
Error: ".\results_crf\book_3680.task1", line 0.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 147 11
Details: 
Failed to load ".\results_crf\book_3700.task1"
Error: ".\results_crf\book_3700.task1", line 1.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 21 5
Details: 
Failed to load ".\results_crf\book_3632.task1"
Error: ".\results_crf\book_3632.task1", line 0.
Expected: [per/loc/locorg/org] [START_SYMBOL_INDEX] [LENGTH]
Receiv

per        1.0000   0.0000   0.0000     0.00     0.00     1347        0
loc        0.8944   0.0518   0.0978    63.50    63.50     1227       71
org        1.0000   0.0000   0.0000     0.00     0.00     1595        0
overall    0.8944   0.0152   0.0300    63.50    63.50     4169       71


In [24]:
!python scripts\t1_eval.py -s .\testset -t .\results_crf

Failed to load ".\results_crf\book_3734.task1"
Error: ".\results_crf\book_3734.task1", line 0.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 0 14
Details: 
Failed to load ".\results_crf\book_3771.task1"
Error: ".\results_crf\book_3771.task1", line 0.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 111 9
Details: 
Failed to load ".\results_crf\book_3591.task1"
Error: ".\results_crf\book_3591.task1", line 1.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 138 19
Details: 
Failed to load ".\results_crf\book_3974.task1"
Error: ".\results_crf\book_3974.task1", line 0.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 124 2
Details: 
Failed to load ".\results_crf\book_3647.task1"
Error: ".\results_crf\book_3647.task1", line 0.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 21 9
Details: 
Failed to load ".\results_crf\book_3574.task1"
Error: ".\results_c

Received: MISC 93 11
Details: 
Failed to load ".\results_crf\book_3886.task1"
Error: ".\results_crf\book_3886.task1", line 1.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 1 54
Details: 
Failed to load ".\results_crf\book_3790.task1"
Error: ".\results_crf\book_3790.task1", line 0.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 10 9
Details: 
Failed to load ".\results_crf\book_3882.task1"
Error: ".\results_crf\book_3882.task1", line 0.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 0 3
Details: 
Failed to load ".\results_crf\book_3797.task1"
Error: ".\results_crf\book_3797.task1", line 0.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 44 5
Details: 
Failed to load ".\results_crf\book_3877.task1"
Error: ".\results_crf\book_3877.task1", line 2.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 176 19
Details: 
Failed to load ".\results_crf\book_379

Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 97 23
Details: 
Failed to load ".\results_crf\book_3764.task1"
Error: ".\results_crf\book_3764.task1", line 0.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 55 27
Details: 
Failed to load ".\results_crf\book_3806.task1"
Error: ".\results_crf\book_3806.task1", line 0.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 87 14
Details: 
Failed to load ".\results_crf\book_3828.task1"
Error: ".\results_crf\book_3828.task1", line 0.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 62 19
Details: 
Failed to load ".\results_crf\book_3842.task1"
Error: ".\results_crf\book_3842.task1", line 0.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Received: MISC 0 5
Details: 
Failed to load ".\results_crf\book_3562.task1"
Error: ".\results_crf\book_3562.task1", line 4.
Expected: [per/locorg/org/loc] [START_SYMBOL_INDEX] [LENGTH]
Receiv

per        1.0000   0.0000   0.0000     0.00     0.00     1347        0
loc        0.7353   0.0419   0.0792    25.00    25.00      597       34
org        1.0000   0.0000   0.0000     0.00     0.00     1595        0
locorg     0.6842   0.0411   0.0775    26.00    26.00      633       38
overall    0.7083   0.0122   0.0240    51.00    51.00     4172       72
