# Инициализация

In [170]:
import os
import numpy as np
import pymorphy2
from tqdm import tqdm_notebook
from sklearn.feature_extraction import DictVectorizer

In [171]:
token_filenames = [f for f in os.listdir('./devset/') if '.tokens' in f]

# Функции для работы с файлами

In [172]:
class Token:
    def __init__(self, position, length, text):
        self._position = position
        self._length = length
        self._text = text
        self._pos = None
        self._tag = None

In [173]:
class Span:
    def __init__(self, token_id):
        self._token_id = token_id

In [174]:
def load_tokens(token_filename, path='./devset/'):
    tokens = dict()
    with open(path + token_filename, 'r+', encoding='utf8') as f:
        for line in f:
            split = line.split()
            if split:
                t = Token(split[1], split[2], split[3])
                tokens[split[0]] = t
    return tokens

In [175]:
def load_spans(token_filename, path='./devset/'):
    spans = dict()
    with open(path + token_filename.split('.')[0] + '.spans', 'r+', encoding='utf8') as f:
        for line in f:
            split = line.split()
            s = Span(split[4])
            spans[split[0]] = s
    return spans

In [176]:
def transform_base_tag(base_tag):
    if base_tag == 'Person':
        return 'PER'
    if base_tag == 'Location':
        return 'LOC'
    if base_tag == 'LocOrg':
        return 'LOCORG'
    if base_tag == 'Org':
        return 'ORG'
    else:
        return 'MISC'

In [177]:
def load_objects(token_filename, tokens, spans, path='./devset/'):
    with open(path + token_filename.split('.')[0] + '.objects', 'r+', encoding='utf8') as f:
        for line in f:
            line = line.split(' # ')[0]
            split = line.split()
            base_tag = transform_base_tag(split[1])
            span_ids = split[2:]
            if len(span_ids) == 1:
                tokens[spans[span_ids[0]]._token_id]._tag = 'S-' + base_tag
            else:
                for i, span_id in enumerate(span_ids):
                    if i == 0:
                        tokens[spans[span_ids[i]]._token_id]._tag = 'B-' + base_tag
                    if i == len(span_ids) - 1:
                        tokens[spans[span_ids[i]]._token_id]._tag = 'E-' + base_tag
                    else:
                        tokens[spans[span_ids[i]]._token_id]._tag = 'I-' + base_tag
    return tokens

In [178]:
morph = pymorphy2.MorphAnalyzer()

In [179]:
def fill_pos(tokens):
    for id, token in tokens.items():
        pos = morph.parse(token._text)[0].tag.POS
        if pos is None:
            pos = 'None'
        token._pos = pos
        if token._tag is None:
            token._tag = 'O'
    return tokens

In [180]:
def word2features(sent, i):
    word = sent[i]._text
    postag = sent[i]._pos

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
    }
    '''
    if i > 0:
        word1 = sent[i-1]._text
        postag1 = sent[i-1]._pos
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1]._text
        postag1 = sent[i+1]._pos
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
        })
    else:
        features['EOS'] = True
    '''
    vec = DictVectorizer()
    return vec.fit_transform(features)


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [token._tag for token in sent]

In [181]:
def split_tokens_by_sents(tokens):
    sents = []
    sent = []
    for id, token in tokens.items():
        if token._text != '.':
            sent.append(token)
        else:
            sents.append(sent)
            sent = []
    return sents

# Готовим тренировочную выборку

In [182]:
sents = []
for token_filename in tqdm_notebook(token_filenames):
    tokens = load_tokens(token_filename)
    spans = load_spans(token_filename)
    tokens = load_objects(token_filename, tokens, spans)
    tokens = fill_pos(tokens)
    sents += split_tokens_by_sents(tokens)

HBox(children=(IntProgress(value=0, max=122), HTML(value='')))




In [183]:
len(sents)

1524

In [184]:
from sklearn.model_selection import train_test_split
import numpy as np

train_ids, test_ids = train_test_split(np.arange(len(sents)))

In [185]:
train_sents = np.array(sents)[train_ids]
test_sents = np.array(sents)[test_ids]

In [186]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [191]:
X_train

[[<1x8 sparse matrix of type '<class 'numpy.float64'>'
  	with 6 stored elements in Compressed Sparse Row format>,
  <1x8 sparse matrix of type '<class 'numpy.float64'>'
  	with 5 stored elements in Compressed Sparse Row format>,
  <1x8 sparse matrix of type '<class 'numpy.float64'>'
  	with 5 stored elements in Compressed Sparse Row format>,
  <1x8 sparse matrix of type '<class 'numpy.float64'>'
  	with 5 stored elements in Compressed Sparse Row format>,
  <1x8 sparse matrix of type '<class 'numpy.float64'>'
  	with 6 stored elements in Compressed Sparse Row format>,
  <1x8 sparse matrix of type '<class 'numpy.float64'>'
  	with 5 stored elements in Compressed Sparse Row format>,
  <1x8 sparse matrix of type '<class 'numpy.float64'>'
  	with 5 stored elements in Compressed Sparse Row format>,
  <1x8 sparse matrix of type '<class 'numpy.float64'>'
  	with 5 stored elements in Compressed Sparse Row format>,
  <1x8 sparse matrix of type '<class 'numpy.float64'>'
  	with 5 stored elements

In [189]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
clf = RandomForestClassifier()

In [190]:
clf.fit(X_train, y_train)

ValueError: setting an array element with a sequence.

# Тренируем модель

In [188]:
from sklearn_crfsuite import CRF

crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
crf.fit(X_train, y_train)

SystemError: <class 'TypeError'> returned a result with an error set

In [None]:
from sklearn_crfsuite.metrics import flat_classification_report
predict = crf.predict(X_test)
print(flat_classification_report(y_test, predict))

# Применяем модель

In [None]:
def get_entities(tokens):
    rows = []

    buffer = []
    for id, token in tokens.items():
        tag = token._tag
        if tag.startswith('S'):
                rows.append('%s %d %d\n' % ( tag.split('-')[1], int(token._position), int(token._length)))
        elif tag.startswith('B') or tag.startswith('I'):
            buffer.append(token)
        elif tag.startswith('E'):
            buffer.append(token)
            start = int(buffer[0]._position)
            length = int(buffer[-1]._position) + int(buffer[-1]._length) - int(start)
            rows.append('%s %d %d\n' % (tag.split('-')[1], start, length))
            buffer = []
    return rows

In [None]:
test_token_filenames = [filename for filename in os.listdir('./testset') if '.tokens' in filename]

In [None]:
for token_filename in tqdm_notebook(test_token_filenames):
    tokens = load_tokens(token_filename, path = './testset/')
    tokens = fill_pos(tokens)
    sents = split_tokens_by_sents(tokens)
    X = [sent2features(s) for s in sents]
    y_pred = crf.predict(X)
    for i in range(len(y_pred)):
        for j in range(len(y_pred[i])):
            sents[i][j]._tag = y_pred[i][j]
    rows = get_entities(tokens)
    with open('./results_crf/' + token_filename.split('.')[0] + '.task1', 'w') as f:
        f.writelines(rows)

# Проверяем результаты

In [None]:
!python scripts\t1_eval.py -s .\testset -t .\results_crf -l -o .\output\

In [None]:
!python scripts\t1_eval.py -s .\testset -t .\results_crf