In [31]:
import os
import pymorphy2
import re
import string
import numpy as np
from collections import Counter
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

In [2]:
# Directories holding the development and test corpora; the loaders below read
# the per-document `.tokens`, `.spans` and `.objects` files from them.
DEV_PATH  = './devset/'
TEST_PATH = './testset/'

In [3]:
def _document_names(path):
    """Return the sorted base names of every ``*.tokens`` file in ``path``.

    Only files whose name ends with ``.tokens`` are considered, and only that
    final extension is stripped, so names containing other dots stay intact.
    Sorting makes the document order independent of the order in which
    ``os.listdir`` happens to return the entries.
    """
    return sorted(
        os.path.splitext(entry)[0]
        for entry in os.listdir(path)
        if entry.endswith('.tokens')
    )

filenames_dev  = _document_names(DEV_PATH)
filenames_test = _document_names(TEST_PATH)
# Debug override: uncomment to run on a single document.
#filenames_dev = ['book_100']

In [4]:
def load_tokens(path = DEV_PATH, filenames = filenames_dev):
    """Read the ``.tokens`` file of every listed document into one dictionary.

    Args:
        path: Directory containing the ``<name>.tokens`` files.
        filenames: Document base names (without extension) to load.

    Returns:
        dict mapping the first whitespace-separated field of each non-blank
        line to the list of the remaining fields. Later lines overwrite
        earlier ones that share the same first field.
    """
    tokens = dict()
    for file in filenames:
        # The files are only read, so open them read-only ('r+' would also
        # demand write permission).
        with open(os.path.join(path, file + '.tokens'), 'r', encoding='utf8') as f:
            for line in f:
                split = line.split()
                if split:
                    tokens[split[0]] = split[1:]
    return tokens

In [5]:
def load_spans(path = DEV_PATH, filenames = filenames_dev):
    """Read the ``.spans`` file of every listed document into one dictionary.

    Args:
        path: Directory containing the ``<name>.spans`` files.
        filenames: Document base names (without extension) to load.

    Returns:
        dict mapping the first whitespace-separated field of each non-blank
        line to the list of the remaining fields.
    """
    spans = dict()
    for file in filenames:
        # Read-only access is sufficient here.
        with open(os.path.join(path, file + '.spans'), 'r', encoding='utf8') as f:
            for line in f:
                split = line.split()
                # Skip blank lines instead of failing on split[0], matching
                # the behaviour of load_tokens.
                if split:
                    spans[split[0]] = split[1:]
    return spans

In [6]:
def load_objects(path = DEV_PATH, filenames = filenames_dev):
    """Read location-type records from the ``.objects`` file of every document.

    Each line is cut at the first ``' # '`` and the part before it is split on
    whitespace. Only records whose second field is ``'Location'`` or
    ``'LocOrg'`` are kept.

    Args:
        path: Directory containing the ``<name>.objects`` files.
        filenames: Document base names (without extension) to load.

    Returns:
        dict mapping the first field of each kept record to the list of the
        remaining fields (the entity type followed by whatever ids follow it).
    """
    objects = dict()
    for file in filenames:
        # Read-only access is sufficient here.
        with open(os.path.join(path, file + '.objects'), 'r', encoding='utf8') as f:
            for line in f:
                part = line.split(' # ')[0]
                split = part.split()
                # Blank or one-field lines have no type field; skip them
                # rather than raising IndexError.
                if len(split) > 1 and split[1] in ('Location', 'LocOrg'):
                    objects[split[0]] = split[1:]
    return objects

In [7]:
def parse(path = DEV_PATH, filenames = filenames_dev):
    """Load a corpus and return one labelled record per token.

    For every object returned by load_objects, its entity type is appended to
    the token record referenced by field index 3 of each of the object's
    spans (presumably the span's first token id; verify against the
    ``.spans`` format). Token records that still have exactly three fields
    afterwards receive the label ``'O'``.

    Returns:
        list of token records (lists), in the iteration order of the token
        dictionary.
    """
    token_table = load_tokens(path, filenames)
    span_table = load_spans(path, filenames)
    location_objects = load_objects(path, filenames)

    for record in location_objects.values():
        label, span_ids = record[0], record[1:]
        for span_id in span_ids:
            token_id = span_table[span_id][3]
            token_table[token_id].append(label)

    token_list = []
    for fields in token_table.values():
        # Untouched records carry no entity label yet.
        if len(fields) == 3:
            fields.append('O')
        token_list.append(fields)
    return token_list

In [8]:
# Field indices into a token record returned by parse().
POS  = 0  # presumably the token's position/offset field; TODO confirm against the .tokens format
LEN  = 1  # presumably the token's length field; TODO confirm
WORD = 2  # token text (read by get_data)
NE   = 3  # entity label appended by parse(): 'Location', 'LocOrg' or 'O'
CTX_LEN = 2  # number of neighbouring tokens taken on each side as context

In [9]:
# Parse both corpora into lists of labelled token records
# (the dev set comes from parse()'s default arguments).
feature_train = parse()
feature_test = parse(TEST_PATH, filenames_test)

In [10]:
# Part-of-speech tag of a word #
morph = pymorphy2.MorphAnalyzer()
def get_pos(token):
    """Return the POS of the first pymorphy2 parse of ``token``.

    Falls back to the string "none" when the analyzer reports no POS.
    """
    first_parse = morph.parse(token)[0]
    return first_parse.tag.POS or "none"

In [11]:
# Letter-case pattern of a word #
# Matches tokens made up solely of ASCII punctuation. Built once at import
# time instead of on every call.
_PUNCTUATION_ONLY = re.compile("[{}]+$".format(re.escape(string.punctuation)))

def get_capital(token):
    """Classify the letter-case pattern of ``token``.

    Returns:
        "none"   for an empty token or one consisting only of ASCII punctuation;
        "lower"  if ``token.islower()``;
        "upper"  if ``token.isupper()`` (this includes a single capital letter);
        "proper" if the first character is upper case and the rest is lower case;
        "camel"  for everything else, which also covers tokens without any
                 cased characters such as digits.
    """
    if len(token) == 0 or _PUNCTUATION_ONLY.match(token):
        return "none"
    if token.islower():
        return "lower"
    if token.isupper():
        return "upper"
    # A one-character upper-case token was already reported as "upper" above,
    # so no separate single-letter case is needed here.
    if token[0].isupper() and token[1:].islower():
        return "proper"
    return "camel"

In [12]:
# Returns the initial (normal) form of a word #
def get_initial(token):
    """Return the normal form of the first pymorphy2 parse of ``token``.

    Falls back to the string "none" when the normal form is empty.
    """
    lemma = morph.parse(token)[0].normal_form
    return lemma if lemma else "none"

In [13]:
# Replaces a label with "*" when it is "rare" #
NUMBER_OF_OCC = 5
def get_feature(f, feature, counters):
    """Return ``feature`` if it is frequent enough in column ``f``, else "*".

    ``counters[f]`` holds the occurrence counts for column ``f``. A label is
    kept only when it is present there with a count strictly greater than
    NUMBER_OF_OCC.
    """
    column_counts = counters[f]
    is_frequent = feature in column_counts and column_counts[feature] > NUMBER_OF_OCC
    return feature if is_frequent else "*"

In [14]:
# Converts categorical columns to a numeric representation #
class ColumnApplier:
    """Apply one stage per column of a 2-D array.

    ``column_stages`` maps a column index to an object exposing
    ``fit(values)`` and ``transform(values)``.
    """

    def __init__(self, column_stages):
        self._column_stages = column_stages

    def fit(self, x, y):
        """Fit every stage on its own column of ``x``; ``y`` is not used. Returns ``self``."""
        for column, stage in self._column_stages.items():
            column_values = x[:, column]
            stage.fit(column_values)
        return self

    def transform(self, x):
        """Return a copy of ``x`` in which each configured column holds its stage's output."""
        encoded = x.copy()
        for column, stage in self._column_stages.items():
            encoded[:, column] = stage.transform(encoded[:, column])
        return encoded

In [15]:
# Keep only the words
def get_data(feature):
    """Split token records into a padded word list and a label list.

    Returns:
        (data, ans): ``data`` holds the WORD field of every record, with
        CTX_LEN empty strings added at each end; ``ans`` holds the NE field
        of every record and is not padded.
    """
    pairs = [(record[WORD], record[NE]) for record in feature]
    words = [word for word, _ in pairs]
    labels = [label for _, label in pairs]

    # Pad both ends with empty words so the first and last tokens also have
    # a full context window.
    padding = [""] * CTX_LEN
    return padding + words + padding, labels

In [16]:
# Padded word sequences (X_*) and per-token gold labels (Y_*) for both corpora.
X_train, Y_train = get_data(feature_train)
X_test, Y_test = get_data(feature_test)

In [17]:
# Cumulative feature-importance mass at which get_features() stops adding columns.
WEIGHT_PERCENTAGE = 0.9
# One-hot column indices selected by get_features() while this list is empty;
# later calls reuse whatever is stored here.
columns_to_keep = []

In [18]:
def get_features(data, ans, clf=None):
    """Turn a padded word sequence into a sparse one-hot feature matrix.

    For every non-padding position the row contains, in this order, the
    get_pos, get_capital and get_initial values of the token itself and of
    its CTX_LEN neighbours on each side (offsets 0, -1, +1, -2, +2, ...).
    Labels that get_feature considers rare within their column are replaced
    by "*", the columns are label-encoded and then one-hot encoded.

    Column selection: while the global ``columns_to_keep`` is empty, ``clf``
    is fitted on the one-hot matrix and columns are added in decreasing
    order of importance until their cumulative importance exceeds
    WEIGHT_PERCENTAGE. Later calls reuse the stored indices and do not fit
    ``clf`` at all.

    NOTE(review): the rare-label counts, the label encoders and the one-hot
    encoder are all fitted on the ``data`` of the current call, whereas
    ``columns_to_keep`` comes from the first call. Nothing here guarantees
    that a column index denotes the same category in both calls, or even
    that it exists. Consider fitting the encoders once and reusing them.

    Args:
        data: Words padded with CTX_LEN entries at each end (see get_data).
        ans: One label per non-padding word; only used to fit ``clf``.
        clf: Estimator exposing ``fit`` and ``feature_importances_``.
            Defaults to a fresh ``ExtraTreesClassifier()``.

    Returns:
        The one-hot matrix restricted to ``columns_to_keep``.
    """
    global columns_to_keep

    # Evaluate each extractor once per position. Every token takes part in up
    # to 2 * CTX_LEN + 1 windows, so per-window extraction repeats the work.
    per_extractor = [
        [extract(token) for token in data]
        for extract in (get_pos, get_capital, get_initial)
    ]

    # Window offsets in column order: 0, -1, +1, -2, +2, ...
    offsets = [0]
    for distance in range(1, CTX_LEN + 1):
        offsets.extend((-distance, distance))

    features_list = np.array([
        [values[center + offset] for values in per_extractor for offset in offsets]
        for center in range(CTX_LEN, len(data) - CTX_LEN)
    ])

    # Count label frequencies per column, then replace the labels that
    # get_feature does not consider frequent enough with "*".
    number_of_columns = features_list.shape[1]
    counters = [Counter(features_list[:, u]) for u in range(number_of_columns)]
    for y in range(len(features_list)):
        for x in range(number_of_columns):
            features_list[y][x] = get_feature(x, features_list[y][x], counters)

    multi_encoder = ColumnApplier(
        {i: preprocessing.LabelEncoder() for i in range(number_of_columns)}
    )
    features_list = multi_encoder.fit(features_list, None).transform(features_list)

    # Sparse output is the encoder's default. The explicit `sparse=True`
    # keyword is no longer accepted by recent scikit-learn releases.
    enc = preprocessing.OneHotEncoder(dtype=np.bool_)
    enc.fit(features_list)
    features_list = enc.transform(features_list)

    if not columns_to_keep:
        if clf is None:
            # Create the default estimator per call rather than once at
            # function-definition time.
            clf = ExtraTreesClassifier()
        clf.fit(features_list, ans)

        ranked = sorted(enumerate(clf.feature_importances_), key=lambda pair: -pair[1])
        current_weight = 0.0
        for column, importance in ranked:
            columns_to_keep.append(column)
            current_weight += importance
            if current_weight > WEIGHT_PERCENTAGE:
                break

    return features_list[:, columns_to_keep]

In [19]:
# Order matters: get_features() fills the global `columns_to_keep` while it is
# empty, so the training call must run first and the test call reuses its result.
# NOTE: X_train / X_test are rebound from word lists to feature matrices here;
# re-running this cell without re-running the get_data() cell feeds the
# matrices back in.
X_train = get_features(X_train, Y_train)
X_test = get_features(X_test, Y_test)

In [33]:
# Candidate model: logistic regression. This rebinds `clf`; the fit cell
# trains whichever object `clf` refers to when it runs.
clf = LogisticRegression()

In [41]:
# Candidate model: random forest. This rebinds `clf`; the fit cell trains
# whichever object `clf` refers to when it runs.
clf = RandomForestClassifier()

In [47]:
# Candidate model: multi-layer perceptron. This rebinds `clf`; the fit cell
# trains whichever object `clf` refers to when it runs.
clf = MLPClassifier()

In [23]:
# Candidate model: gradient boosting. This rebinds `clf`; the fit cell trains
# whichever object `clf` refers to when it runs.
clf = GradientBoostingClassifier()

In [None]:
# Train the currently bound classifier on the selected training features.
clf.fit(X_train, Y_train)

In [None]:
# Predict one label per row of the test feature matrix.
predict = clf.predict(X_test)

In [None]:
Y_test_np = np.array(Y_test)
predict_np = np.array(predict)
# Boolean mask of the tokens whose gold label is not 'O'. It is built with an
# extra leading axis and then reshaped back to the shape of Y_test_np.
Y_test_i = np.array([Y_test_np != 'O'])
indexes = Y_test_i.reshape(Y_test_np.shape)

In [None]:
# Keep only the positions selected by `indexes` (gold label other than 'O').
# NOTE(review): tokens whose gold label is 'O' are dropped from both arrays,
# so entity labels predicted for them cannot influence any score computed
# from these arrays. Confirm that this is intended.
Y_test_fixed = Y_test_np[indexes]
predict_fixed = predict_np[indexes]

In [None]:
# F1 over the two location labels, averaged with average="weighted",
# computed on the filtered arrays.
f1_score(Y_test_fixed, predict_fixed, average="weighted", labels=['Location', 'LocOrg'])

In [40]:
Y_test_fixed

array(['Location', 'Location', 'Location', ..., 'Location', 'Location',
       'Location'],
      dtype='<U8')