In [1]:
import rdflib

In [2]:
import os
import sys
import glob

In [3]:
g = rdflib.Graph()
g.load('data/_mvd.gov.by_16 (nest_id = 48521669).xml')

In [4]:
Aux = rdflib.Namespace("http://www.abbyy.com/ns/Aux#")
ML = rdflib.Namespace("http://www.abbyy.com/ns/ML#")

In [5]:
set(list(g.predicates()))

{rdflib.term.URIRef('http://www.abbyy.com/ns/Aux#annotation'),
 rdflib.term.URIRef('http://www.abbyy.com/ns/Aux#annotation_end'),
 rdflib.term.URIRef('http://www.abbyy.com/ns/Aux#annotation_start'),
 rdflib.term.URIRef('http://www.abbyy.com/ns/Aux#document_text'),
 rdflib.term.URIRef('http://www.abbyy.com/ns/Aux#instance'),
 rdflib.term.URIRef('http://www.abbyy.com/ns/Aux#property_name'),
 rdflib.term.URIRef('http://www.abbyy.com/ns/Aux#property_value'),
 rdflib.term.URIRef('http://www.abbyy.com/ns/ML#GenericTerm'),
 rdflib.term.URIRef('http://www.abbyy.com/ns/ML#Name'),
 rdflib.term.URIRef('http://www.abbyy.com/ns/ML#Type'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')}

In [6]:
for a in list(g.objects(None, Aux.annotation))[:10]:
    inst = g.value(a, Aux.instance)
    print(
        g.value(inst, ML.Type),
        g.value(inst, ML.GenericTerm),
        g.value(inst, ML.Name),
        g.value(a, Aux.annotation_start),
        g.value(a, Aux.annotation_end),
        sep='\t',
    )

PUBLIC_ROAD	ул.	Грушевской	134	144
PUBLIC_ROAD	ул.	Грушевской	130	133
CITY_TOWN	None	Минск	2	8


In [10]:
import nltk.tokenize
import html
import re
toktok = nltk.tokenize.ToktokTokenizer()

def tokenize(paragraph, shift=0):
    paragraph = paragraph.translate(str.maketrans('«»', '""'))
    
    begin = end = 0
    for sent in nltk.tokenize.sent_tokenize(paragraph):
        for ww in toktok.tokenize(sent):
            ww = html.unescape(ww)
            for w in re.findall('[^.]+\.?', ww):
                try:
                    begin = paragraph.index(w, end)
                except:
                    print(sent)
                    print(w)
                    raise
                end = begin + len(w)
                yield {
                    'word': w,
                    'begin': begin+shift,
                    'end': end+shift,
                }

In [11]:
list(tokenize('в 2015 г. мы, мы всех (всех) & лучше. А вы — нет. Ул.Б.Хмельницкого'))

[{'begin': 0, 'end': 1, 'word': 'в'},
 {'begin': 2, 'end': 6, 'word': '2015'},
 {'begin': 7, 'end': 9, 'word': 'г.'},
 {'begin': 10, 'end': 12, 'word': 'мы'},
 {'begin': 12, 'end': 13, 'word': ','},
 {'begin': 14, 'end': 16, 'word': 'мы'},
 {'begin': 17, 'end': 21, 'word': 'всех'},
 {'begin': 22, 'end': 23, 'word': '('},
 {'begin': 23, 'end': 27, 'word': 'всех'},
 {'begin': 27, 'end': 28, 'word': ')'},
 {'begin': 29, 'end': 30, 'word': '&'},
 {'begin': 31, 'end': 36, 'word': 'лучше'},
 {'begin': 38, 'end': 39, 'word': 'А'},
 {'begin': 40, 'end': 42, 'word': 'вы'},
 {'begin': 43, 'end': 44, 'word': '—'},
 {'begin': 45, 'end': 48, 'word': 'нет'},
 {'begin': 50, 'end': 53, 'word': 'Ул.'},
 {'begin': 53, 'end': 55, 'word': 'Б.'},
 {'begin': 55, 'end': 67, 'word': 'Хмельницкого'}]

In [12]:
import bisect

class Annotator:
    def __init__(self, annotations):
        self.ordered = sorted(annotations)
        
    def __call__(self, token):
        i = bisect.bisect_left(self.ordered, (token['begin'], token['end']))
        for a_start, a_end, a in self.ordered[max(0, i-3):i+3]:
            if a_start <= token['begin'] and token['end'] <= a_end:
                return a
            
class RdfAnnotator(Annotator):
    def __init__(self, g):
        super(RdfAnnotator, self).__init__(
            [
                (
                    int(g.value(a, Aux.annotation_start)),
                    int(g.value(a, Aux.annotation_end)),
                    a
                )
                for a in g.objects(None, Aux.annotation)
            ]
        )

In [13]:
import pymorphy2
morph_analyzer = pymorphy2.MorphAnalyzer()

In [14]:
from functools import lru_cache
import collections

@lru_cache()
def morph_parses(word):
    return morph_analyzer.parse(word.strip('.'))

@lru_cache()
def get_morpho(word):
    parsed = morph_parses(word)
    features = collections.defaultdict(lambda: 0)
    for p in parsed:
        for grammeme in p.tag.grammemes:
            features[grammeme] += p.score
    return features

In [15]:
import pandas as pd

freq_dict = pd.read_csv('freq/5000lemma.utf8.num', sep=' ', names=['i','score','lem','pos'], index_col='i')
freq_dict.head()

Unnamed: 0_level_0,score,lem,pos
i,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,36358.94,и,misc
2,27792.36,в,prep
3,20689.51,не,misc
4,18942.62,он,pron
5,16588.14,на,prep


In [16]:
class WordList:
    def __init__(self, li, fmt):
        self.set = set(li)
        self.fmt = fmt
        
    def word_freq_score(self, word):
        ret = collections.defaultdict(lambda: 0.0)
        for p in morph_parses(word):
            if p.normal_form in self.set:
                ret[self.fmt.format(p.normal_form)] += p.score
        return ret

In [17]:
freq_word_list = WordList(freq_dict.lem, 'is_freq_{}')

In [18]:
key_word_list = WordList(    
    (
        'ш шоссе ул улица пр пр-т проспект пр-д проезд наб набережная пл площадь линия'
        ' п пос пгт п.г.т поселок дер деревня с село селение поселение ст станица г город селение х хутор'
        ' д дом обл область р-н район столица округ р река'
    ).split(),
    'is_key_{}'
)

In [19]:
class PhraseList:
    def __init__(self, lines, fmt):
        self.positions = collections.defaultdict(set)
        for line in lines:
            for i, tok in enumerate(tokenize(line)):
                if tok['word']:
                    self.positions[i].add(tok['word'])
        self.positions = {
            i: WordList(toks, fmt=(fmt + '_pos_{pos}').format(pos=i))
            for i, toks in self.positions.items()
        }
    
    def word_freq_score(self, word):
        ret = {}
        for wlist in self.positions.values():
            ret.update(wlist.word_freq_score(word))
        return ret

In [20]:
with open('osmmm/local.places.list') as places_file:
    places_gazetteer = PhraseList(places_file, 'places')

In [21]:
with open('osmmm/local.ways.list') as ways_file:
    ways_gazetteer = PhraseList(ways_file, 'ways')

In [22]:
import gensim

w2v_news = gensim.models.KeyedVectors.load_word2vec_format('w2v/news_0_300_2.bin', unicode_errors='ignore', binary=True)

len(w2v_news.vocab)

194058

In [23]:
class W2VAdder:
    def __init__(self, w2v, fmt):
        self.w2v = w2v
        self.fmt = fmt
    
    def get_features(self, word):
        ret = collections.defaultdict(lambda: 0.0)
        for p in morph_parses(word):
            k = '{}_{}'.format(p.normal_form, p.tag.POS)
            if k in self.w2v:
                ret.update({self.fmt.format(i): v*p.score for i, v in enumerate(self.w2v[k])})
        return ret

In [24]:
w2v_news_adder = W2VAdder(w2v_news, 'w2v_{}')

In [105]:
def raw_stream(paragraph, shift):
    for token in tokenize(paragraph, shift):
        token['word'] = token['word'].translate(str.maketrans('[]', '()', '<>%'))
        if not token['word']:
            continue
            
        token.update(get_morpho(token['word']))
        
        token['capitalized'] = token['word'][0].istitle()
        token['isupper'] = token['word'][0].isupper()
        token['islower'] = token['word'][0].islower()

        if token.get('PNCT'):
            token['punct_value'] = token['word']
            
        token['endswith_dot'] = token['word'].endswith('.')

        yield token

def raw_streams(text):
    shift = 0
    for paragraph in text.split(u"\u2028\u2028"):
        yield raw_stream(paragraph, shift)
        shift += len(paragraph) + len(u"\u2028\u2028")

def add_annotations(token, annotator, g, manual_annotator):
    annotation = annotator(token)
    if annotation:
        inst = g.value(annotation, Aux.instance)
        token.update({
            'ML_Type': str(g.value(inst, ML.Type)),
#             'ML_GenericTerm': str(g.value(inst, ML.GenericTerm)),
#             'ML_Name': str(g.value(inst, ML.Name)),
        })
        for li in [freq_word_list, key_word_list, places_gazetteer, ways_gazetteer]:
            token.update(li.word_freq_score(token['word']))
        token.update(w2v_news_adder.get_features(token['word']))
    manu = manual_annotator(token)
    if manu:
        token['manual'] = manu
    return token

def annotated_stream(raw, annotator, g, man_ann):
    for token in raw:
        add_annotations(token, annotator, g, man_ann)
        yield token
        
def parse_manual_annotations(text):
    result = ''
    res = []
    for part in text.split('{{{'):
        if '}}}' not in part:
            result += part
            continue
            
        clo = part.index('}}}')
        ins, right = part[:clo], part[clo+len('}}}'):]
        res.append((
            len(result),
            len(result) + len(ins),
            'cool'
        ))
        result += ins + right
    assert '{{{' not in result
    assert '}}}' not in result
    return res, result

def annotated_streams(g):
    annotator = RdfAnnotator(g)
    texts = [o.value for o in g.objects(None, Aux.document_text)]
    assert len(texts) == 1
    text = texts[0]
    manual, text = parse_manual_annotations(text)
    for raw in raw_streams(text):
        yield annotated_stream(raw, annotator, g, Annotator(manual))

In [26]:
import pandas

In [27]:
def window_col(i, f):
    return 'w{}_{}'.format(i, f)

def windowize_cols(i, df):
    return pandas.DataFrame(df, columns=[window_col(i, c) for c in df.columns])

In [28]:
def rolling_reshape(df, window):
    step = len(df)
    values = df.values
    
    values = numpy.concatenate([
        numpy.full((window, len(df.columns)), 0),
        values,
        numpy.full((window, len(df.columns)), 0),
    ])

    values = values.reshape(values.size,)
    size = step * len(df.columns)
    index = df.index

    frame = pandas.DataFrame(index=index)
#     for i, pos in enumerate(range(len(df.columns) * (window // 2), values.size - size + 1, len(df.columns))):
    for i in range(window):
        pos = (i + (window + 1) // 2) * len(df.columns)
        cols = [window_col(i, x) for x in df.columns]
        new_frame = pandas.DataFrame(
            values[pos:pos+size].reshape((step, len(df.columns))),
            index=index,
            columns=cols,
        )
        frame = pandas.concat([frame, new_frame], axis=1)
    return frame

In [357]:
xgb1.fit(X, Y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=90,
       n_jobs=1, nthread=31, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.5)

In [358]:
len(X.columns)

3951

In [363]:
sum(xgb1.feature_importances_ > 0.)

523

In [29]:
import numpy

In [30]:
g.load('manual-data/Музей истории ГУЛАГа открылся в новом здании (nest_id = 77344277).xml')

In [31]:
text = [o.value for o in g.objects(None, Aux.document_text)][0]

In [32]:
parse_manual_annotations(text)

([(57, 63, 'cool'),
  (145, 168, 'cool'),
  (216, 222, 'cool'),
  (328, 351, 'cool'),
  (645, 657, 'cool')],
 'В пятницу, в День памяти жертв политических репрессий, в Москве открылся государственный музей истории ГУЛАГа, который переехал в новое здание в 1-м Самотечном переулке.\u2028\u2028Ранее сообщалось, что музей истории ГУЛАГа в Москве\xa0 возобновит работу в пятницу, 30 октября, в День памяти жертв политических репрессий, в новом здании - в 1-м Самотечном переулке.\u2028\u2028Государственный музей истории ГУЛАГа был основан в 2001 году Антоном Владимировичем Антоновым-Овсеенко – известным историком, публицистом, общественным деятелем, в свое время прошедшим через сталинские лагеря как сын «врага народа». С 2004 по 2015 год Музей располагался в здании по адресу: ул. Петровка, д.16.')

In [33]:
rdfa = RdfAnnotator(g)
rdfa({'begin': 146, 'end': 147})

In [34]:
def make_features(df, window):
    drop_cols = [
        col
        for col in ['begin', 'end', 'word', 'ML_Type', 'Geox', 'manual']#, 'Abbr', 'Fixd', 'Sgtm']
        if col in df.columns
    ]
    return rolling_reshape(df.drop(drop_cols, axis=1), window)

def make_target(df, window):
    if 'ML_Type' in df.columns:
        return df['ML_Type'].fillna('')
    else:
        return numpy.asarray([''] * len(df))

In [35]:
def all_annotated_streams(pattern):
    datasets = glob.glob(pattern)
    for ds in datasets:
        g = rdflib.Graph()
        g.load(ds)
        yield from annotated_streams(g)

In [36]:
WINDOW_SIZE = 5

In [37]:
import pandas as pd

In [38]:
import numpy as np

In [129]:
import itertools

def read_datasets(pattern, texts_number):
    Xs = []
    Ys = []
    heres = []
    sz = 0
    for i, astr in enumerate(itertools.islice(all_annotated_streams(pattern), texts_number)):
        entries = list(astr)
        here = pandas.DataFrame(
            entries,
            index=np.arange(sz, sz + len(entries))
        )
        heres.append(here)
        Xs.append(make_features(here, WINDOW_SIZE))
        Ys.append(make_target(here, WINDOW_SIZE))
        sz += len(entries)
        print('\r{:5}/{} {}'.format(i+1, texts_number, sz), end='')
        assert len(Xs[-1]) == len(Ys[-1])
    print()
    
    tokens = pandas.concat(heres)
    del heres
    
    X = pandas.concat(Xs)
    del Xs
    print('X concated')
    
    X = pandas.get_dummies(X, columns=[window_col(i, 'punct_value') for i in range(WINDOW_SIZE)])
    X.fillna(0.0, inplace=True)
    print('X fillnad')
    
    cap = [
        window_col(i, bool_feat)
        for i in range(WINDOW_SIZE)
        for bool_feat in tokens.columns[tokens.dtypes == np.dtype('bool')]
    ]
    X[cap] = X[cap].astype(bool)
    print('X astyped')
    
    Y = numpy.concatenate(Ys)
    del Ys
    print('Y concated')

    Y = Y.astype(bool)
    print('Y astyped')
    
    return tokens, X, Y

In [130]:
%%time
tokens, X, Y = read_datasets('data/*.xml', None)

 6911/None 398071
X concated
X fillnad
X astyped
Y concated
Y astyped
CPU times: user 22min 25s, sys: 59.9 s, total: 23min 25s
Wall time: 23min 31s


In [131]:
X.shape

(398071, 3951)

In [132]:
man_tokens, man_X, man_Y_abbyy = read_datasets('manual-data/*.xml', None)
man_Y_man = (man_tokens.manual == 'cool')
man_tokens = man_tokens.reindex(columns=tokens.columns)
man_X = man_X.reindex(columns=X.columns).fillna(0.0)

  807/None 41335
X concated
X fillnad
X astyped
Y concated
Y astyped


In [374]:
len(X)

398071

In [375]:
len(man_X)

41335

In [191]:
compare = join_outputs(dropouts['>.<'])

In [192]:
print(classification_report(man_Y_man, man_Y_abbyy))

             precision    recall  f1-score   support

      False       0.98      0.99      0.99     38588
       True       0.85      0.73      0.79      2747

avg / total       0.97      0.97      0.97     41335



In [328]:
len(compare)

41335

In [346]:
len(compare[compare.out != compare.abbyy])

255

In [194]:
metrics.confusion_matrix(compare.man, compare.abbyy)

array([[38243,   345],
       [  736,  2011]])

In [345]:
list(tokenize('Волгоград, ул. Грановитая'))

[{'begin': 0, 'end': 9, 'word': 'Волгоград'},
 {'begin': 9, 'end': 10, 'word': ','},
 {'begin': 11, 'end': 13, 'word': 'ул'},
 {'begin': 15, 'end': 25, 'word': 'Грановитая'}]

In [134]:
def join_outputs(out):
    compare = rolling_reshape(man_tokens[['word']], 7)
    compare['man'] = man_Y_man
    compare['abbyy'] = man_Y_abbyy
    compare['out'] = out
    return compare

In [222]:
import re
def drop_by_re(df, r, invert):
    return df.drop([c for c in df.columns if bool(re.search(r, c)) != invert], axis=1)

In [223]:
def run_dropped(drop_re, alg_src, invert):
    xre = drop_by_re(X, drop_re, invert)
    manx = drop_by_re(man_X, drop_re, invert)
    
    alg = XGBClassifier(**alg_src.get_params())
    alg.fit(xre, Y)
    return alg, alg.predict(manx)

In [137]:
from sklearn.metrics import classification_report
def eval_out(out):
    print('man:' + classification_report(man_Y_man, out))
    print('abbyy:' + classification_report(man_Y_abbyy, out))

In [224]:
def eval_dropped(drop_re, alg_src, invert=False):
    alg, out = run_dropped(drop_re, alg_src, invert)
    eval_out(out)
    return out

In [139]:
feature_res = ['_ways_', '_places_', '_w2v_', 'is_freq', 'is_key_', 'capitalized|islower|isupper']

In [140]:
dropouts = {}

In [150]:
%%time
import itertools

for i in [0,1,2]:
    for drops in itertools.combinations(feature_res, i):
        k = '|'.join(sorted(drops)) or '>.<'
        if k not in dropouts:
            print(k)
            dropouts[k] = eval_dropped(k, xgb1)

capitalized|islower|isupper|is_freq
man:             precision    recall  f1-score   support

      False       0.98      0.99      0.99     38588
       True       0.87      0.72      0.79      2747

avg / total       0.97      0.97      0.97     41335

abbyy:             precision    recall  f1-score   support

      False       1.00      1.00      1.00     38979
       True       0.96      0.92      0.94      2356

avg / total       0.99      0.99      0.99     41335

capitalized|islower|isupper|is_key_
man:             precision    recall  f1-score   support

      False       0.98      0.99      0.99     38588
       True       0.87      0.72      0.79      2747

avg / total       0.97      0.97      0.97     41335

abbyy:             precision    recall  f1-score   support

      False       1.00      1.00      1.00     38979
       True       0.96      0.93      0.94      2356

avg / total       0.99      0.99      0.99     41335

CPU times: user 2h 43min 29s, sys: 1min 49s, tot

In [225]:
dropouts['not w2v,places,ways']= eval_dropped('_places_|_w2v_|_ways', xgb1, invert=True)

man:             precision    recall  f1-score   support

      False       0.98      0.99      0.99     38588
       True       0.85      0.72      0.78      2747

avg / total       0.97      0.97      0.97     41335

abbyy:             precision    recall  f1-score   support

      False       0.99      0.99      0.99     38979
       True       0.91      0.90      0.90      2356

avg / total       0.99      0.99      0.99     41335



In [317]:
dropouts['morpho_only'] = eval_dropped(
    'places|w2v|ways|freq|is_key|capital|lower|upper|punct_value',
    xgb1,
)

man:             precision    recall  f1-score   support

      False       0.96      1.00      0.98     38588
       True       0.95      0.40      0.56      2747

avg / total       0.96      0.96      0.95     41335

abbyy:             precision    recall  f1-score   support

      False       0.96      0.99      0.98     38979
       True       0.71      0.35      0.47      2356

avg / total       0.95      0.95      0.95     41335



In [318]:
dropouts['but_morpho'] = eval_dropped(
    'places|w2v|ways|freq|is_key|capital|lower|upper|punct_value',
    xgb1,
    invert=True,
)

man:             precision    recall  f1-score   support

      False       0.98      0.99      0.99     38588
       True       0.87      0.74      0.80      2747

avg / total       0.97      0.98      0.97     41335

abbyy:             precision    recall  f1-score   support

      False       1.00      1.00      1.00     38979
       True       0.94      0.93      0.93      2356

avg / total       0.99      0.99      0.99     41335



In [261]:
dropouts['gaz'] = eval_dropped('places|ways', xgb1, invert=True)

man:             precision    recall  f1-score   support

      False       0.97      0.99      0.98     38588
       True       0.81      0.60      0.69      2747

avg / total       0.96      0.96      0.96     41335

abbyy:             precision    recall  f1-score   support

      False       0.98      0.99      0.99     38979
       True       0.82      0.70      0.75      2356

avg / total       0.97      0.97      0.97     41335



In [302]:
dropouts['w2v_only'] = eval_dropped('w2v', xgb1, invert=True)

man:             precision    recall  f1-score   support

      False       0.97      1.00      0.99     38588
       True       0.93      0.63      0.75      2747

avg / total       0.97      0.97      0.97     41335

abbyy:             precision    recall  f1-score   support

      False       0.99      1.00      0.99     38979
       True       0.98      0.78      0.87      2356

avg / total       0.99      0.99      0.99     41335



In [355]:
dropouts

{'>.<': array([False,  True, False, ..., False, False, False], dtype=bool),
 '_places_': array([False,  True, False, ..., False, False, False], dtype=bool),
 '_places_|_w2v_': array([False,  True, False, ..., False, False, False], dtype=bool),
 '_places_|_w2v_|_ways_': array([False,  True, False, ..., False, False, False], dtype=bool),
 '_places_|_ways_': array([False,  True, False, ..., False, False, False], dtype=bool),
 '_places_|capitalized|islower|isupper': array([False,  True, False, ..., False, False, False], dtype=bool),
 '_places_|is_freq': array([False,  True, False, ..., False, False, False], dtype=bool),
 '_places_|is_key_': array([False,  True, False, ..., False, False, False], dtype=bool),
 '_w2v_': array([False,  True, False, ..., False, False, False], dtype=bool),
 '_w2v_|_ways_': array([False,  True, False, ..., False, False, False], dtype=bool),
 '_w2v_|capitalized|islower|isupper': array([False,  True, False, ..., False, False, False], dtype=bool),
 '_w2v_|is_freq': 

In [329]:
fsets = [
    '>.<',
    'but_morpho',
    'capitalized|islower|isupper',
    'is_freq|is_key_',
    '_places_|_ways_',
    '_w2v_',
    'morpho_only',
    'gaz',
    '_w2v_|capitalized|islower|isupper',
    '_places_|_w2v_|_ways_',
    'not w2v,places,ways',
    'w2v_only',
]

dropcomp = pd.DataFrame(
    [
        {
            'pred': pred,

            'precision': metrics.precision_score(man_Y_man, pred),
            'recall': metrics.recall_score(man_Y_man, pred),
            'f1': metrics.f1_score(man_Y_man, pred),
            
#             'a_precision': metrics.precision_score(man_Y_abbyy, pred),
#             'a_recall': metrics.recall_score(man_Y_abbyy, pred),
#             'a_f1': metrics.f1_score(man_Y_abbyy, pred),
        } 
        for k, pred in [(kk, dropouts[kk]) for kk in fsets]
    ],
    index=['F{}'.format(i) for i in range(1, len(fsets) + 1)],
)

In [364]:
acc = dropcomp.drop(['pred'], axis=1)

In [332]:
metrics.confusion_matrix(compare.man, compare.out)

array([[38303,   285],
       [  755,  1992]])

In [333]:
metrics.confusion_matrix(compare.man, compare.abbyy)

array([[38243,   345],
       [  736,  2011]])

In [334]:
print(dropcomp.drop(['pred'], axis=1).transpose().to_latex(
    float_format=lambda f: str(round(f,2))
))

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
{} &   F1 &   F2 &   F3 &   F4 &   F5 &   F6 &   F7 &   F8 &   F9 &  F10 &  F11 &  F12 \\
\midrule
f1        & 0.79 &  0.8 & 0.79 & 0.79 & 0.79 & 0.78 & 0.56 & 0.69 & 0.76 & 0.71 & 0.78 & 0.75 \\
precision & 0.87 & 0.87 & 0.87 & 0.88 & 0.93 & 0.88 & 0.95 & 0.81 & 0.87 & 0.97 & 0.85 & 0.93 \\
recall    & 0.73 & 0.74 & 0.72 & 0.73 & 0.68 & 0.71 &  0.4 &  0.6 & 0.68 & 0.57 & 0.72 & 0.63 \\
\bottomrule
\end{tabular}



## Классификаторы

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [180]:
importances = pd.Series(xgb1.feature_importances_, index=X.columns)

In [382]:
importances.sort_values(ascending=False).head(50)

NameError: name 'importances' is not defined

In [126]:
def reduced(x, number):
    return x[importances.sort_values(ascending=False).head(number).index]

In [604]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [444]:
from sklearn.model_selection import cross_val_score

## XGBoost!

In [50]:
from sklearn import metrics
from sklearn.grid_search import GridSearchCV

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

import xgboost as xgb
from xgboost import XGBClassifier



In [41]:
def fit_xgb(alg, dtrain, ytrain, dtest, ytest, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain.values, label=ytrain)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, stratified=True)
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(dtrain, ytrain, eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain)
    dtrain_predprob = alg.predict_proba(dtrain)[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(ytrain, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(ytrain, dtrain_predprob))
    
#     Predict on testing data:
    results = alg.predict_proba(dtest)[:,1]
    print('AUC Score (Test): %f' % metrics.roc_auc_score(ytest, results))

    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False).head(100)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [143]:
xgb1 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=90,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.5,
    colsample_bytree=0.5,
    objective='binary:logistic',
    nthread=31,
    scale_pos_weight=1,
    seed=27
)

In [111]:
%%time
fit_xgb(xgb1, X, Y, man_X, man_Y_man)

NameError: name 'fit_xgb' is not defined

In [52]:
xgb1

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=322,
       n_jobs=1, nthread=30, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8)

In [40]:
backup = pd.HDFStore('backup.h5')

In [41]:
X, Y, tokens = backup.X, backup.Y, backup.tokens

In [222]:
backup['X'], backup['Y'], backup['tokens'] = X, Y, tokens

TypeError: cannot properly create the storer for: [_TYPE_MAP] [group->/Y (Group) '',value-><class 'numpy.ndarray'>,format->fixed,append->False,kwargs->{'encoding': None}]

In [42]:
import sklearn_crfsuite as crfsuite

In [43]:
crf = crfsuite.CRF(c1=0.1, c2=0.1)

In [274]:
cap = [
    window_col(i, bool_feat)
    for i in range(WINDOW_SIZE)
    for bool_feat in tokens.columns[tokens.dtypes == np.dtype('bool')]
]

In [277]:
X[cap] = X[cap].astype(int)

In [295]:
next(iter(X.head(1000).values)).dtype

dtype('float64')

In [331]:
%%time
crf.fit([X.values], [[str(y) for y in Y]])

CPU times: user 6.76 s, sys: 5.68 s, total: 12.4 s
Wall time: 12.5 s


CRF(algorithm=None, all_possible_states=None, all_possible_transitions=None,
  averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
  calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=None,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [332]:
crf.classes_

['False', 'True']

In [None]:
%%time
y = [s == 'True' for ss in crf.predict(man_X.head(1000).values) for s in ss]

In [340]:
len(y)

3941000

In [257]:
ro = X.iloc[0]

In [268]:
pd.Series(c.dtype for c in ro).unique()

array([dtype('float64'), dtype('bool'), dtype('uint8')], dtype=object)

In [249]:
pd.Series(X.head(1000).values[0]

dtype('O')

In [114]:
def read_rfc_input(pat, num):
    Xs = []
    Ys = []
    manys = []
    for entries in itertools.islice(all_annotated_streams(pat), num):
        Xs.append([])
        Ys.append([])
        manys.append([])
        for e in entries:
            Xs[-1].append({
                k: v for k, v in e.items()
                if k not in {'ML_Type', 'manual', 'begin', 'end', 'Geox'} and '_w2v_' not in k
            })
            pa = morph_parses(e['word'])
            if pa:
                Xs[-1][-1]['lem'] = pa[0].normal_form
            Ys[-1].append(e.get('ML_Type', 'no'))
            manys[-1].append(e.get('manual', 'no'))
    return Xs, Ys, manys

In [117]:
%%time
Xs, Ys, _ = read_rfc_input('data/*.xml', None)

CPU times: user 1min 55s, sys: 207 ms, total: 1min 55s
Wall time: 1min 56s


In [118]:
%%time
crf.fit(Xs, Ys)

CPU times: user 4min 33s, sys: 1.02 ms, total: 4min 33s
Wall time: 4min 34s


CRF(algorithm=None, all_possible_states=None, all_possible_transitions=None,
  averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
  calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=None,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [119]:
%%time
manxs, manabbyys, manys = read_rfc_input('manual-data/*.xml', None)

CPU times: user 12.9 s, sys: 40.2 ms, total: 12.9 s
Wall time: 13 s


In [120]:
%%time
manpreds = crf.predict(manxs)

CPU times: user 420 ms, sys: 2.02 ms, total: 422 ms
Wall time: 423 ms


In [121]:
manysflat = [m != 'no' for m in itertools.chain(*manys)]

In [122]:
manpredsflat = [m != 'no' for m in itertools.chain(*manpreds)]

In [123]:
from sklearn.metrics import classification_report
print(classification_report(manysflat, manpredsflat))

             precision    recall  f1-score   support

      False       0.98      0.99      0.99     38588
       True       0.87      0.72      0.79      2747

avg / total       0.97      0.97      0.97     41335



In [128]:
crf.attributes_

['PREP',
 'endswith_dot',
 'islower',
 'capitalized',
 'Vpre',
 'isupper',
 'lem:в',
 'nomn',
 'masc',
 'accs',
 'sing',
 'NOUN',
 'inan',
 'Abbr',
 'Fixd',
 'word:в',
 'plur',
 'loct',
 'ablt',
 'datv',
 'gent',
 'lem:офис',
 'femn',
 'ADJF',
 'lem:социальный',
 'Qual',
 'loc2',
 'lem:сеть',
 'lem:"',
 'word:"',
 'PNCT',
 'punct_value:"',
 'Sgtm',
 'lem:вконтакте',
 'lem:дом',
 'Surn',
 'anim',
 'PRCL',
 'INTJ',
 'lem:на',
 'w2v_88',
 'w2v_288',
 'w2v_293',
 'w2v_37',
 'w2v_260',
 'w2v_268',
 'w2v_291',
 'w2v_48',
 'w2v_26',
 'w2v_1',
 'w2v_170',
 'w2v_52',
 'w2v_86',
 'w2v_213',
 'w2v_97',
 'w2v_29',
 'w2v_136',
 'w2v_276',
 'w2v_69',
 'w2v_248',
 'w2v_258',
 'w2v_215',
 'w2v_296',
 'w2v_39',
 'w2v_221',
 'w2v_216',
 'w2v_149',
 'w2v_189',
 'w2v_102',
 'w2v_5',
 'w2v_63',
 'w2v_46',
 'w2v_236',
 'w2v_204',
 'w2v_141',
 'w2v_65',
 'w2v_4',
 'w2v_148',
 'w2v_145',
 'w2v_90',
 'w2v_19',
 'w2v_197',
 'w2v_75',
 'w2v_40',
 'w2v_251',
 'w2v_180',
 'w2v_68',
 'w2v_24',
 'w2v_274',
 'w2v_169

In [125]:
collections.Counter(crf.state_features_).most_common(30)

[(('PNCT', 'no'), 9.914438),
 (('is_freq_московский', 'CITY_TOWN'), 9.10397),
 (('w2v_19', 'CITY_TOWN'), 8.313187),
 (('w2v_291', 'CITY_TOWN'), 8.289856),
 (('PREP', 'no'), 7.796155),
 (('w2v_170', 'CITY_TOWN'), 7.528315),
 (('is_freq_московский', 'PUBLIC_ROAD'), 7.342685),
 (('CONJ', 'no'), 7.330513),
 (('w2v_291', 'PUBLIC_ROAD'), 6.825587),
 (('w2v_64', 'CITY_TOWN'), 6.606673),
 (('w2v_46', 'PUBLIC_ROAD'), 6.470935),
 (('w2v_122', 'CITY_TOWN'), 6.357505),
 (('w2v_116', 'CITY_TOWN'), 6.241968),
 (('w2v_34', 'CITY_TOWN'), 6.124438),
 (('w2v_49', 'CITY_TOWN'), 6.091447),
 (('is_freq_кремль', 'HOUSE_BLOCK'), 6.046871),
 (('w2v_74', 'CITY_TOWN'), 6.023019),
 (('ADVB', 'no'), 6.00771),
 (('ways_pos_2', 'HOUSE_BLOCK'), 5.916516),
 (('w2v_37', 'CITY_TOWN'), 5.890586),
 (('w2v_209', 'CITY_TOWN'), 5.860156),
 (('w2v_271', 'CITY_TOWN'), 5.857976),
 (('is_freq_ленинградский', 'CITY_TOWN'), 5.853622),
 (('w2v_46', 'CITY_TOWN'), 5.763661),
 (('w2v_10', 'CITY_TOWN'), 5.746648),
 (('lem:уфимский', '