In [1]:
import os
import pymorphy2
import re
import string
import numpy as np
from collections import Counter
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

In [2]:
DEV_PATH  = './devset/'
TEST_PATH = './testset/'

In [3]:
filenames_dev  = [f.split('.')[0] for f in os.listdir(DEV_PATH) if '.tokens' in f]
filenames_test = [f.split('.')[0] for f in os.listdir(TEST_PATH) if '.tokens' in f]
#filenames_dev = ['book_100']
#filenames_test = ['book_3539']

In [4]:
def load_tokens(path = DEV_PATH, filenames = filenames_dev):
    tokens = dict()
    for file in filenames:
        with open(path + file + '.tokens', 'r+', encoding='utf8') as f:
            for line in f:
                split = line.split()
                if split:
                    tokens[split[0]] = split[1:]
    return tokens

In [5]:
def load_spans(path = DEV_PATH, filenames = filenames_dev):
    spans = dict()
    for file in filenames:
        with open(path + file + '.spans', 'r+', encoding='utf8') as f:
            for line in f:
                split = line.split()
                spans[split[0]] = split[1:]
    return spans

In [6]:
def load_objects(path = DEV_PATH, filenames = filenames_dev, train=True):
    objects = dict()
    for file in filenames:
        with open(path + file + '.objects', 'r+', encoding='utf8') as f:
            for line in f:
                part = line.split(' # ')[0]
                split = part.split()
                if train:
                    if split[1] == 'Location' or split[1] == 'LocOrg':
                        objects[split[0]] = split[1:]
                else:
                    objects[split[0]] = split[1:]
    return objects

In [7]:
def transform_base_tag(base_tag):
    if base_tag == 'Person':
        return 'PER'
    if base_tag == 'Location':
        return 'LOC'
    if base_tag == 'LocOrg':
        return 'LOCORG'
    if base_tag == 'Org':
        return 'ORG'
    else:
        return 'MISC'

In [8]:
def parse(path = DEV_PATH, filenames = filenames_dev, train=True):
    tokens = load_tokens(path, filenames)
    spans  = load_spans(path, filenames)
    objects = load_objects(path, filenames, train)
    for key, value in objects.items():
        ne = transform_base_tag(value[0])
        span_ids = value[1:]
        if train:
            if len(span_ids) == 1:
                tokens[spans[span_ids[0]][3]].append('S-' + ne)
            else:
                for i, span_id in enumerate(span_ids):
                    if i == 0:
                        tokens[spans[span_ids[i]][3]].append('B-' + ne)
                    elif i == len(span_ids) - 1:
                        tokens[spans[span_ids[i]][3]].append('E-' + ne)
                    else:
                        tokens[spans[span_ids[i]][3]].append('I-' + ne)
        else:
            for i, span_id in enumerate(span_ids):
                tokens[spans[span_ids[i]][3]].append(ne)

    token_list = []         
    for key, value in tokens.items():
        if len(value) == 3:
            value.append('O')
        token_list.append(value)
    return token_list

In [9]:
POS  = 0
LEN  = 1
WORD = 2
NE   = 3
CTX_LEN = 2

In [10]:
feature_train = parse()
feature_test = parse(TEST_PATH, filenames_test, False)

In [11]:
# POS-тег слова #
morph = pymorphy2.MorphAnalyzer()
def get_pos(token):
    pos = morph.parse(token)[0].tag.POS
    if pos == 'NOUN' or pos == 'ADJF':
        return pos
    return "none"

In [12]:
# Тип регистра слова #
def get_capital(token):
    pattern = re.compile("[{}]+$".format(re.escape(string.punctuation)))
    if pattern.match(token):
        return "none"
    if len(token) == 0:
        return "none"
    if token.islower():
        return "lower"
    elif token.isupper():
        return "upper"
    elif token[0].isupper() and len(token) == 1:
        return "proper"
    elif token[0].isupper() and token[1:].islower():
        return "proper"
    else:
        return "camel"

In [13]:
# Возвращает начальную форму слова #
def get_initial(token):
    init = morph.parse(token)[0].normal_form
    if init:
        return init
    else:
        return "none"

In [14]:
# Заменяет лейбл на "*", если он "редкий" #
NUMBER_OF_OCC = 2
def get_feature(f, feature, counters):
    if feature in counters[f].keys() and counters[f][feature] > NUMBER_OF_OCC:
        return feature
    else:
        return "*"

In [15]:
# Переводит категории в числовое представление #
class ColumnApplier:
    def __init__(self, column_stages):
        self._column_stages = column_stages

    def fit(self, x, y):
        for i, k in self._column_stages.items():
            k.fit(x[:, i])
        return self

    def transform(self, x):
        x = x.copy()
        for i, k in self._column_stages.items():
            x[:, i] = k.transform(x[:, i])
        return x

In [16]:
#Выделим только слова
def get_data(feature):
    data = []
    ans  = []
    for f in feature:
        data.append(f[WORD])
        ans.append(f[NE])

    # Добавим пустые слова для контекстной информации
    data = ["" for i in range(CTX_LEN)] + data
    data = data + ["" for i in range(CTX_LEN)]
    return data, ans

In [17]:
def get_features_new(data):
    features_list = []
    ans = []
    for i in range(len(data)):
        if data[i][3] != 'O':
            arr = []
            arr.append(int(data[i][0]))
            arr.append(int(data[i][1]))
            if i > CTX_LEN:
                for j in range(1, CTX_LEN + 1):
                    arr.append(int(data[i-j][0]))
                    arr.append(int(data[i-j][1]))
            else:
                for j in range(2*CTX_LEN):
                    arr.append(0)
            if i < len(data) - CTX_LEN:
                for j in range(1, CTX_LEN + 1):
                    arr.append(int(data[i+j][0]))
                    arr.append(int(data[i+j][1]))
            else:
                for j in range(2*CTX_LEN):
                    arr.append(0)
            ans.append(data[i][3])
            features_list.append(arr)
    features_list = np.array([np.array(line) for line in features_list])
    return features_list, ans

In [18]:
WEIGHT_PERCENTAGE = 0.9
columns_to_keep = []
multi_encoder = None
enc = None
def get_features(data, ans, clf=ExtraTreesClassifier()):    
    features_list = []
    for k in range(len(data) - 2 * CTX_LEN):
        arr = []
        i = k + CTX_LEN

        pos_arr = [get_pos(data[i])]
        #capital_arr = [get_capital(data[i])]
        #initial_arr = [get_initial(data[i])]

        for j in range(1, CTX_LEN + 1):
            pos_arr.append(get_pos(data[i - j]))
            pos_arr.append(get_pos(data[i + j]))

            #capital_arr.append(get_capital(data[i - j]))
            #capital_arr.append(get_capital(data[i + j]))

            #initial_arr.append(get_initial(data[i - j]))
            #initial_arr.append(get_initial(data[i + j]))
        
        #arr += [int(len(data[i]))]
        arr += pos_arr
        #arr += capital_arr
        #arr += initial_arr

        features_list.append(arr)
    
    features_list = np.array([np.array(line) for line in features_list])
    return features_list

def get_features_bin(features_list, ans, clf=ExtraTreesClassifier()):
    # Выкинем из этого массива классы, встретившиеся менее NUMBER_OF_OCCURENCES раз #
    # Посчитаем частоту лейблов в столбце #
    number_of_columns = features_list.shape[1]
    counters = []
    for u in range(number_of_columns):
        arr = features_list[:, u]
        counter = Counter(arr)
        counters.append(counter)
        
    # Избавимся от редких лейблов (частота < NUMBER_OF_OCC) #
    for y in range(len(features_list)):
        for x in range(number_of_columns):
            features_list[y][x] = get_feature(x, features_list[y][x], counters)
    
    global multi_encoder
    if multi_encoder:
        print("TEST")
        features_list = multi_encoder.transform(features_list)
    else:
        print("TRAIN")
        multi_encoder = ColumnApplier(dict([(i, preprocessing.LabelEncoder()) for i in range(len(features_list[0]))]))
        features_list = multi_encoder.fit(features_list, None).transform(features_list)
    
    global enc
    if not enc:
        enc = preprocessing.OneHotEncoder(dtype=np.bool_, sparse=True)
        enc.fit(features_list)
    features_list = enc.transform(features_list)
    
    clf.fit(features_list, ans)
    features_importances = [(i, el) for i, el in enumerate(clf.feature_importances_)]
    
    
    features_importances = sorted(features_importances, key=lambda el: -el[1])
    current_weight = 0.0
    
    global columns_to_keep
    if columns_to_keep == []:
        for el in features_importances:
            columns_to_keep.append(el[0])
            current_weight += el[1]
            if current_weight > WEIGHT_PERCENTAGE:
                break

    features_list = features_list[:, columns_to_keep]
            
    return features_list

In [19]:
X_tr, Y_train = get_data(feature_train)
X_te, Y_test = get_data(feature_test)

In [20]:
X_train_new = get_features(X_tr, Y_train)
X_test_new = get_features(X_te, Y_test)

In [21]:
X_train = get_features_bin(X_train_new, Y_train)
X_test  = get_features_bin(X_test_new, Y_test)

TRAIN
TEST


In [22]:
clf = LogisticRegression()

In [23]:
clf = RandomForestClassifier() #0.19 - лучший результат

In [24]:
clf = MLPClassifier()

In [25]:
clf = GradientBoostingClassifier()

In [26]:
clf.fit(X_train, Y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [27]:
predict = clf.predict(X_test)

In [28]:
Y_test_np = np.array(Y_test)
predict_np = np.array(predict)
X_test_np = np.array(X_te)
Y_test_i = np.array([Y_test_np != 'O'])
indexes = Y_test_i.reshape(Y_test_np.shape)

In [29]:
Y_test_fixed = Y_test_np[indexes]
predict_fixed = predict_np[indexes]


In [30]:
f1_score(Y_test_fixed, predict_fixed, average="weighted", labels=['LOC', 'LOCORG'])

  'precision', 'predicted', average, warn_for)


0.0

In [31]:
Y_test

['PER',
 'PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'PER',
 'PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'LOC',
 'O',
 'LOC',
 'LOC',
 'O',
 'LOC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',


In [32]:
X_test_np = np.array(feature_test)
X_test_fixed = X_test_np[indexes]

In [33]:
def get_entities(tokens):
    rows = []

    buffer = []
    for token in tokens:
        features = get_features([token[:3]], [token[3]])
        predict = clf.predict(features)
        tag = token[3]
        if tag[0] == 'S':
            rows.append('%s %d %d\n' % (predict, int(token[POS]), int(token[LEN])))
        elif tag[0] == 'B' or tag[0] == 'I':
            buffer.append(token)
        elif tag[0] == 'E':
            buffer.append(token)
            start = int(buffer[0][POS])
            length = int(buffer[-1][POS]) + int(buffer[-1][LEN]) - int(start)
            rows.append('%s %d %d\n' % (predict, start, length))
            buffer = []
    return rows

In [34]:
for file in filenames_test:
    rows = []
    data = parse(TEST_PATH, [file], False)
    #print(data)
    rows = get_entities(data)
    with open('./result/' + file + '.task1', 'w') as f:
        f.writelines(rows)        

ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
!python scripts/t1_eval.py -s ./testset -t ./result