# Assignment 4: Named entity recognition

Построить модель для обнаружения и классификации именованных сущностей (named entities). На базе корпуса CoNLL 2002.  

Используйте в своем решении ансамбли над решающими деревьями: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost) 
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 


Чем больше baseline'ов вы превзойдете, тем выше ваша оценка
Метрика качества f1 (f1_macro) (чем выше, тем лучше)
 
baseline 1: 0.0604      random labels  
baseline 2: 0.3966      PoS features + logistic regression  
baseline 3: 0.8122      word2vec cbow embedding + baseline 2 + svm    

! Your results must be reproducible. Если ваша модель - стохастическая, то вы явно должны задавать все seed и random_state в параметрах моделей   

bonus, think about:  
1. How can you exploit that words belong to some sentence?
2. Why we selected f1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


SEED=1337

In [2]:
df = pd.read_csv('ner_short.csv', index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [3]:
# number of sentences
df.sentence_idx.max()

1500.0

In [4]:
# class distribution
df.tag.value_counts(normalize=True )

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [5]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').tag.count()
df = tdf.reset_index(drop=False)

In [6]:
# encode categorial variables

le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [7]:
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


In [8]:
# splitting
y = LabelEncoder().fit_transform(df.tag)

df_train, df_test, y_train, y_test = model_selection.train_test_split(df, y, stratify=y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


In [9]:
# some wrappers to work with word2vec
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from collections import defaultdict

   
class Word2VecWrapper(TransformerMixin):
    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None
        self.random_state = random_state
        
    def get_size(self):
        return self.size_

    def fit(self, X, y=None):
        """
        X: list of strings
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list, 
                            window=self.window_,
                            negative=self.negative_, 
                            size=self.size_, 
                            iter=self.iter_,
                            sg=not self.is_cbow_, seed=self.random_state)

        return self
    
    def has(self, word):
        return word in self.w2v

    def transform(self, X):
        """
        X: a list of words
        """
        if self.w2v is None:
            raise Exception('model not fitted')
        return np.array([self.w2v[w] if w in self.w2v else np.zeros(self.size_) for w in X ])
    


In [10]:
%%time
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion)

sentences_list = [x.strip() for x in ' '.join(df.word).split('.')]

w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=SEED)
w2v_cbow.fit(sentences_list)

CPU times: user 41.8 s, sys: 535 ms, total: 42.4 s
Wall time: 17.6 s


In [None]:
%%time
# baseline 1 
# random labels
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyClassifier


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', DummyClassifier(random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))


In [None]:
%%time
# baseline 2 
# pos features + one hot encoding + logistic regression
from sklearn.preprocessing import OneHotEncoder


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', LogisticRegressionCV(Cs=5, cv=5, n_jobs=-1, scoring='f1_macro', 
                             penalty='l2', solver='newton-cg', multi_class='multinomial', random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

In [None]:
%%time
# baseline 3
# use word2vec cbow embedding + baseline 2 + svm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
import scipy.sparse as sp

embeding = w2v_cbow
encoder_pos = OneHotEncoder()
X_train = sp.hstack([
    embeding.transform(df_train.word),
    embeding.transform(df_train['next-word']),
    embeding.transform(df_train['next-next-word']),
    embeding.transform(df_train['prev-word']),
    embeding.transform(df_train['prev-prev-word']),
    encoder_pos.fit_transform(df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])
X_test = sp.hstack([
    embeding.transform(df_test.word),
    embeding.transform(df_test['next-word']),
    embeding.transform(df_test['next-next-word']),
    embeding.transform(df_test['prev-word']),
    embeding.transform(df_test['prev-prev-word']),
    encoder_pos.transform(df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])

model = model_selection.GridSearchCV(LinearSVC(penalty='l2', multi_class='ovr', random_state=SEED), 
                                    {'C': np.logspace(-4, 0, 5)}, 
                                    cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

# My model

In [17]:
from catboost import Pool, CatBoostClassifier
import numpy as np
from catboost import CatBoostClassifier, FeaturesData
from sklearn.model_selection import StratifiedKFold

In [18]:
from sklearn.model_selection import GridSearchCV

In [13]:
columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos', 'sentence_idx']

In [15]:
model = CatBoostClassifier(iterations=1000,
                           random_state=SEED,
                           learning_rate=1,
                           depth=10,
                           loss_function='MultiClassOneVsAll', 
                           custom_loss='F1')
model.fit(df_train[columns], y_train)
print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

0:	learn: -0.1520296	total: 339ms	remaining: 5m 39s
1:	learn: -0.0711681	total: 674ms	remaining: 5m 36s
2:	learn: -0.0425666	total: 1s	remaining: 5m 33s
3:	learn: -0.0331851	total: 1.37s	remaining: 5m 41s
4:	learn: -0.0283574	total: 1.7s	remaining: 5m 39s
5:	learn: -0.0261516	total: 2.04s	remaining: 5m 38s
6:	learn: -0.0249479	total: 2.4s	remaining: 5m 40s
7:	learn: -0.0237545	total: 2.74s	remaining: 5m 39s
8:	learn: -0.0229641	total: 3.07s	remaining: 5m 37s
9:	learn: -0.0224826	total: 3.39s	remaining: 5m 35s
10:	learn: -0.0216834	total: 3.74s	remaining: 5m 36s
11:	learn: -0.0209366	total: 4.07s	remaining: 5m 35s
12:	learn: -0.0204668	total: 4.41s	remaining: 5m 34s
13:	learn: -0.0199539	total: 4.74s	remaining: 5m 33s
14:	learn: -0.0195303	total: 5.08s	remaining: 5m 33s
15:	learn: -0.0191501	total: 5.41s	remaining: 5m 32s
16:	learn: -0.0188818	total: 5.74s	remaining: 5m 32s
17:	learn: -0.0185861	total: 6.07s	remaining: 5m 30s
18:	learn: -0.0182668	total: 6.41s	remaining: 5m 30s
19:	lear

155:	learn: -0.0058288	total: 55.3s	remaining: 4m 59s
156:	learn: -0.0058119	total: 55.6s	remaining: 4m 58s
157:	learn: -0.0057891	total: 56s	remaining: 4m 58s
158:	learn: -0.0057500	total: 56.3s	remaining: 4m 57s
159:	learn: -0.0057181	total: 56.6s	remaining: 4m 57s
160:	learn: -0.0056843	total: 57s	remaining: 4m 56s
161:	learn: -0.0056526	total: 57.3s	remaining: 4m 56s
162:	learn: -0.0056183	total: 57.6s	remaining: 4m 55s
163:	learn: -0.0055753	total: 57.9s	remaining: 4m 55s
164:	learn: -0.0055562	total: 58.3s	remaining: 4m 54s
165:	learn: -0.0055375	total: 58.6s	remaining: 4m 54s
166:	learn: -0.0055095	total: 58.9s	remaining: 4m 53s
167:	learn: -0.0054661	total: 59.2s	remaining: 4m 53s
168:	learn: -0.0054496	total: 59.6s	remaining: 4m 52s
169:	learn: -0.0054290	total: 1m	remaining: 4m 52s
170:	learn: -0.0054118	total: 1m	remaining: 4m 52s
171:	learn: -0.0053891	total: 1m	remaining: 4m 52s
172:	learn: -0.0053634	total: 1m 1s	remaining: 4m 52s
173:	learn: -0.0053304	total: 1m 1s	remai

306:	learn: -0.0031907	total: 1m 47s	remaining: 4m 1s
307:	learn: -0.0031792	total: 1m 47s	remaining: 4m 1s
308:	learn: -0.0031731	total: 1m 47s	remaining: 4m
309:	learn: -0.0031617	total: 1m 48s	remaining: 4m
310:	learn: -0.0031554	total: 1m 48s	remaining: 4m
311:	learn: -0.0031504	total: 1m 48s	remaining: 3m 59s
312:	learn: -0.0031393	total: 1m 49s	remaining: 3m 59s
313:	learn: -0.0031322	total: 1m 49s	remaining: 3m 59s
314:	learn: -0.0031194	total: 1m 49s	remaining: 3m 58s
315:	learn: -0.0031129	total: 1m 50s	remaining: 3m 58s
316:	learn: -0.0031047	total: 1m 50s	remaining: 3m 57s
317:	learn: -0.0030978	total: 1m 50s	remaining: 3m 57s
318:	learn: -0.0030855	total: 1m 51s	remaining: 3m 57s
319:	learn: -0.0030742	total: 1m 51s	remaining: 3m 56s
320:	learn: -0.0030626	total: 1m 51s	remaining: 3m 56s
321:	learn: -0.0030539	total: 1m 52s	remaining: 3m 55s
322:	learn: -0.0030390	total: 1m 52s	remaining: 3m 55s
323:	learn: -0.0030329	total: 1m 52s	remaining: 3m 55s
324:	learn: -0.0030206	t

456:	learn: -0.0021635	total: 2m 37s	remaining: 3m 7s
457:	learn: -0.0021595	total: 2m 38s	remaining: 3m 7s
458:	learn: -0.0021558	total: 2m 38s	remaining: 3m 6s
459:	learn: -0.0021531	total: 2m 38s	remaining: 3m 6s
460:	learn: -0.0021491	total: 2m 39s	remaining: 3m 5s
461:	learn: -0.0021462	total: 2m 39s	remaining: 3m 5s
462:	learn: -0.0021401	total: 2m 39s	remaining: 3m 5s
463:	learn: -0.0021347	total: 2m 40s	remaining: 3m 4s
464:	learn: -0.0021325	total: 2m 40s	remaining: 3m 4s
465:	learn: -0.0021269	total: 2m 40s	remaining: 3m 4s
466:	learn: -0.0021238	total: 2m 41s	remaining: 3m 3s
467:	learn: -0.0021207	total: 2m 41s	remaining: 3m 3s
468:	learn: -0.0021172	total: 2m 41s	remaining: 3m 3s
469:	learn: -0.0021140	total: 2m 41s	remaining: 3m 2s
470:	learn: -0.0021105	total: 2m 42s	remaining: 3m 2s
471:	learn: -0.0021086	total: 2m 42s	remaining: 3m 1s
472:	learn: -0.0021035	total: 2m 42s	remaining: 3m 1s
473:	learn: -0.0020975	total: 2m 43s	remaining: 3m 1s
474:	learn: -0.0020936	total

607:	learn: -0.0016471	total: 3m 28s	remaining: 2m 14s
608:	learn: -0.0016451	total: 3m 28s	remaining: 2m 14s
609:	learn: -0.0016428	total: 3m 29s	remaining: 2m 13s
610:	learn: -0.0016396	total: 3m 29s	remaining: 2m 13s
611:	learn: -0.0016375	total: 3m 29s	remaining: 2m 13s
612:	learn: -0.0016343	total: 3m 30s	remaining: 2m 12s
613:	learn: -0.0016328	total: 3m 30s	remaining: 2m 12s
614:	learn: -0.0016309	total: 3m 30s	remaining: 2m 11s
615:	learn: -0.0016284	total: 3m 31s	remaining: 2m 11s
616:	learn: -0.0016267	total: 3m 31s	remaining: 2m 11s
617:	learn: -0.0016256	total: 3m 31s	remaining: 2m 10s
618:	learn: -0.0016213	total: 3m 32s	remaining: 2m 10s
619:	learn: -0.0016189	total: 3m 32s	remaining: 2m 10s
620:	learn: -0.0016167	total: 3m 33s	remaining: 2m 10s
621:	learn: -0.0016148	total: 3m 33s	remaining: 2m 9s
622:	learn: -0.0016111	total: 3m 33s	remaining: 2m 9s
623:	learn: -0.0016095	total: 3m 34s	remaining: 2m 9s
624:	learn: -0.0016069	total: 3m 34s	remaining: 2m 8s
625:	learn: -0

758:	learn: -0.0013434	total: 4m 20s	remaining: 1m 22s
759:	learn: -0.0013421	total: 4m 20s	remaining: 1m 22s
760:	learn: -0.0013410	total: 4m 21s	remaining: 1m 22s
761:	learn: -0.0013395	total: 4m 21s	remaining: 1m 21s
762:	learn: -0.0013371	total: 4m 21s	remaining: 1m 21s
763:	learn: -0.0013351	total: 4m 22s	remaining: 1m 20s
764:	learn: -0.0013324	total: 4m 22s	remaining: 1m 20s
765:	learn: -0.0013315	total: 4m 22s	remaining: 1m 20s
766:	learn: -0.0013306	total: 4m 23s	remaining: 1m 19s
767:	learn: -0.0013294	total: 4m 23s	remaining: 1m 19s
768:	learn: -0.0013280	total: 4m 23s	remaining: 1m 19s
769:	learn: -0.0013267	total: 4m 24s	remaining: 1m 18s
770:	learn: -0.0013250	total: 4m 24s	remaining: 1m 18s
771:	learn: -0.0013241	total: 4m 24s	remaining: 1m 18s
772:	learn: -0.0013224	total: 4m 25s	remaining: 1m 17s
773:	learn: -0.0013211	total: 4m 25s	remaining: 1m 17s
774:	learn: -0.0013197	total: 4m 25s	remaining: 1m 17s
775:	learn: -0.0013182	total: 4m 26s	remaining: 1m 16s
776:	learn

911:	learn: -0.0011446	total: 5m 12s	remaining: 30.1s
912:	learn: -0.0011428	total: 5m 12s	remaining: 29.8s
913:	learn: -0.0011413	total: 5m 13s	remaining: 29.5s
914:	learn: -0.0011400	total: 5m 13s	remaining: 29.1s
915:	learn: -0.0011395	total: 5m 13s	remaining: 28.8s
916:	learn: -0.0011390	total: 5m 14s	remaining: 28.4s
917:	learn: -0.0011381	total: 5m 14s	remaining: 28.1s
918:	learn: -0.0011368	total: 5m 14s	remaining: 27.8s
919:	learn: -0.0011362	total: 5m 15s	remaining: 27.4s
920:	learn: -0.0011354	total: 5m 15s	remaining: 27.1s
921:	learn: -0.0011347	total: 5m 15s	remaining: 26.7s
922:	learn: -0.0011337	total: 5m 16s	remaining: 26.4s
923:	learn: -0.0011328	total: 5m 16s	remaining: 26.1s
924:	learn: -0.0011319	total: 5m 17s	remaining: 25.7s
925:	learn: -0.0011312	total: 5m 17s	remaining: 25.4s
926:	learn: -0.0011299	total: 5m 17s	remaining: 25s
927:	learn: -0.0011293	total: 5m 18s	remaining: 24.7s
928:	learn: -0.0011283	total: 5m 18s	remaining: 24.4s
929:	learn: -0.0011271	total: 

```
learning_rate = [0.001, 1]
depth = [5, 10]
model = CatBoostClassifier(iterations=1000,
                           random_state=SEED,
                           loss_function='MultiClassOneVsAll', 
                           custom_loss='F1')
param_grid = dict(learning_rate=learning_rate, depth=depth)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
grid_search = GridSearchCV(model, param_grid, scoring="f1_macro", cv=kfold)
grid_result = grid_search.fit(df_train[columns], y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
```

In [20]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.781086 using {'depth': 10, 'learning_rate': 1}


In [23]:
best = grid_search.best_estimator_.fit(df_train[columns], y_train)

0:	learn: -0.1520296	total: 494ms	remaining: 8m 13s
1:	learn: -0.0711681	total: 997ms	remaining: 8m 17s
2:	learn: -0.0425666	total: 1.43s	remaining: 7m 56s
3:	learn: -0.0331851	total: 1.95s	remaining: 8m 5s
4:	learn: -0.0283574	total: 2.43s	remaining: 8m 3s
5:	learn: -0.0261516	total: 2.84s	remaining: 7m 50s
6:	learn: -0.0249479	total: 3.23s	remaining: 7m 38s
7:	learn: -0.0237545	total: 3.64s	remaining: 7m 31s
8:	learn: -0.0229641	total: 4.07s	remaining: 7m 28s
9:	learn: -0.0224826	total: 4.48s	remaining: 7m 23s
10:	learn: -0.0216834	total: 4.93s	remaining: 7m 23s
11:	learn: -0.0209366	total: 5.32s	remaining: 7m 17s
12:	learn: -0.0204668	total: 5.84s	remaining: 7m 23s
13:	learn: -0.0199539	total: 6.43s	remaining: 7m 33s
14:	learn: -0.0195303	total: 6.89s	remaining: 7m 32s
15:	learn: -0.0191501	total: 7.29s	remaining: 7m 28s
16:	learn: -0.0188818	total: 7.7s	remaining: 7m 25s
17:	learn: -0.0185861	total: 8.18s	remaining: 7m 26s
18:	learn: -0.0182668	total: 8.6s	remaining: 7m 24s
19:	lea

155:	learn: -0.0058288	total: 1m 5s	remaining: 5m 54s
156:	learn: -0.0058119	total: 1m 5s	remaining: 5m 53s
157:	learn: -0.0057891	total: 1m 6s	remaining: 5m 53s
158:	learn: -0.0057500	total: 1m 6s	remaining: 5m 52s
159:	learn: -0.0057181	total: 1m 7s	remaining: 5m 51s
160:	learn: -0.0056843	total: 1m 7s	remaining: 5m 51s
161:	learn: -0.0056526	total: 1m 7s	remaining: 5m 50s
162:	learn: -0.0056183	total: 1m 8s	remaining: 5m 49s
163:	learn: -0.0055753	total: 1m 8s	remaining: 5m 49s
164:	learn: -0.0055562	total: 1m 8s	remaining: 5m 48s
165:	learn: -0.0055375	total: 1m 9s	remaining: 5m 47s
166:	learn: -0.0055095	total: 1m 9s	remaining: 5m 47s
167:	learn: -0.0054661	total: 1m 10s	remaining: 5m 46s
168:	learn: -0.0054496	total: 1m 10s	remaining: 5m 46s
169:	learn: -0.0054290	total: 1m 10s	remaining: 5m 45s
170:	learn: -0.0054118	total: 1m 11s	remaining: 5m 45s
171:	learn: -0.0053891	total: 1m 11s	remaining: 5m 45s
172:	learn: -0.0053634	total: 1m 12s	remaining: 5m 45s
173:	learn: -0.0053304

306:	learn: -0.0031907	total: 2m 6s	remaining: 4m 44s
307:	learn: -0.0031792	total: 2m 6s	remaining: 4m 44s
308:	learn: -0.0031731	total: 2m 7s	remaining: 4m 44s
309:	learn: -0.0031617	total: 2m 7s	remaining: 4m 43s
310:	learn: -0.0031554	total: 2m 7s	remaining: 4m 43s
311:	learn: -0.0031504	total: 2m 8s	remaining: 4m 43s
312:	learn: -0.0031393	total: 2m 8s	remaining: 4m 42s
313:	learn: -0.0031322	total: 2m 9s	remaining: 4m 42s
314:	learn: -0.0031194	total: 2m 9s	remaining: 4m 41s
315:	learn: -0.0031129	total: 2m 10s	remaining: 4m 41s
316:	learn: -0.0031047	total: 2m 10s	remaining: 4m 41s
317:	learn: -0.0030978	total: 2m 10s	remaining: 4m 40s
318:	learn: -0.0030855	total: 2m 11s	remaining: 4m 40s
319:	learn: -0.0030742	total: 2m 11s	remaining: 4m 39s
320:	learn: -0.0030626	total: 2m 12s	remaining: 4m 39s
321:	learn: -0.0030539	total: 2m 12s	remaining: 4m 39s
322:	learn: -0.0030390	total: 2m 12s	remaining: 4m 38s
323:	learn: -0.0030329	total: 2m 13s	remaining: 4m 38s
324:	learn: -0.0030

457:	learn: -0.0021595	total: 3m 6s	remaining: 3m 40s
458:	learn: -0.0021558	total: 3m 6s	remaining: 3m 40s
459:	learn: -0.0021531	total: 3m 7s	remaining: 3m 39s
460:	learn: -0.0021491	total: 3m 7s	remaining: 3m 39s
461:	learn: -0.0021462	total: 3m 7s	remaining: 3m 38s
462:	learn: -0.0021401	total: 3m 8s	remaining: 3m 38s
463:	learn: -0.0021347	total: 3m 8s	remaining: 3m 37s
464:	learn: -0.0021325	total: 3m 8s	remaining: 3m 37s
465:	learn: -0.0021269	total: 3m 9s	remaining: 3m 36s
466:	learn: -0.0021238	total: 3m 9s	remaining: 3m 36s
467:	learn: -0.0021207	total: 3m 10s	remaining: 3m 36s
468:	learn: -0.0021172	total: 3m 10s	remaining: 3m 35s
469:	learn: -0.0021140	total: 3m 10s	remaining: 3m 35s
470:	learn: -0.0021105	total: 3m 11s	remaining: 3m 34s
471:	learn: -0.0021086	total: 3m 11s	remaining: 3m 34s
472:	learn: -0.0021035	total: 3m 11s	remaining: 3m 33s
473:	learn: -0.0020975	total: 3m 12s	remaining: 3m 33s
474:	learn: -0.0020936	total: 3m 12s	remaining: 3m 33s
475:	learn: -0.00208


Iteration with suspicious time 107 sec ignored in overall statistics.


581:	learn: -0.0017131	total: 3m 54s	remaining: 2m 49s
582:	learn: -0.0017095	total: 3m 55s	remaining: 2m 48s
583:	learn: -0.0017078	total: 3m 55s	remaining: 2m 48s
584:	learn: -0.0017065	total: 3m 56s	remaining: 2m 47s
585:	learn: -0.0017041	total: 3m 56s	remaining: 2m 47s
586:	learn: -0.0017012	total: 3m 57s	remaining: 2m 47s
587:	learn: -0.0016994	total: 3m 57s	remaining: 2m 46s
588:	learn: -0.0016969	total: 3m 57s	remaining: 2m 46s
589:	learn: -0.0016947	total: 3m 58s	remaining: 2m 45s
590:	learn: -0.0016926	total: 3m 58s	remaining: 2m 45s
591:	learn: -0.0016893	total: 3m 59s	remaining: 2m 45s
592:	learn: -0.0016862	total: 3m 59s	remaining: 2m 44s
593:	learn: -0.0016830	total: 4m	remaining: 2m 44s
594:	learn: -0.0016819	total: 4m	remaining: 2m 44s
595:	learn: -0.0016774	total: 4m	remaining: 2m 43s
596:	learn: -0.0016757	total: 4m 1s	remaining: 2m 43s
597:	learn: -0.0016719	total: 4m 1s	remaining: 2m 42s
598:	learn: -0.0016689	total: 4m 2s	remaining: 2m 42s
599:	learn: -0.0016664	to

732:	learn: -0.0013854	total: 4m 59s	remaining: 1m 49s
733:	learn: -0.0013841	total: 4m 59s	remaining: 1m 48s
734:	learn: -0.0013828	total: 4m 59s	remaining: 1m 48s
735:	learn: -0.0013805	total: 5m	remaining: 1m 47s
736:	learn: -0.0013778	total: 5m	remaining: 1m 47s
737:	learn: -0.0013771	total: 5m 1s	remaining: 1m 47s
738:	learn: -0.0013761	total: 5m 1s	remaining: 1m 46s
739:	learn: -0.0013745	total: 5m 2s	remaining: 1m 46s
740:	learn: -0.0013721	total: 5m 2s	remaining: 1m 45s
741:	learn: -0.0013695	total: 5m 2s	remaining: 1m 45s
742:	learn: -0.0013683	total: 5m 3s	remaining: 1m 45s
743:	learn: -0.0013660	total: 5m 3s	remaining: 1m 44s
744:	learn: -0.0013644	total: 5m 4s	remaining: 1m 44s
745:	learn: -0.0013629	total: 5m 4s	remaining: 1m 43s
746:	learn: -0.0013622	total: 5m 4s	remaining: 1m 43s
747:	learn: -0.0013607	total: 5m 5s	remaining: 1m 43s
748:	learn: -0.0013596	total: 5m 5s	remaining: 1m 42s
749:	learn: -0.0013583	total: 5m 6s	remaining: 1m 42s
750:	learn: -0.0013566	total: 5

883:	learn: -0.0011816	total: 6m 2s	remaining: 47.7s
884:	learn: -0.0011787	total: 6m 3s	remaining: 47.2s
885:	learn: -0.0011773	total: 6m 3s	remaining: 46.8s
886:	learn: -0.0011758	total: 6m 3s	remaining: 46.4s
887:	learn: -0.0011736	total: 6m 4s	remaining: 46s
888:	learn: -0.0011724	total: 6m 4s	remaining: 45.6s
889:	learn: -0.0011717	total: 6m 5s	remaining: 45.2s
890:	learn: -0.0011708	total: 6m 5s	remaining: 44.8s
891:	learn: -0.0011695	total: 6m 5s	remaining: 44.3s
892:	learn: -0.0011686	total: 6m 6s	remaining: 43.9s
893:	learn: -0.0011672	total: 6m 6s	remaining: 43.5s
894:	learn: -0.0011663	total: 6m 7s	remaining: 43.1s
895:	learn: -0.0011650	total: 6m 7s	remaining: 42.7s
896:	learn: -0.0011643	total: 6m 7s	remaining: 42.3s
897:	learn: -0.0011621	total: 6m 8s	remaining: 41.9s
898:	learn: -0.0011613	total: 6m 8s	remaining: 41.5s
899:	learn: -0.0011595	total: 6m 9s	remaining: 41.1s
900:	learn: -0.0011588	total: 6m 9s	remaining: 40.6s
901:	learn: -0.0011579	total: 6m 9s	remaining: 4

In [24]:
print('train', metrics.f1_score(y_train, best.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, best.predict(df_test[columns]), average='macro'))

train 0.992994419700279
test 0.837898088113896


### Answers

    1) How can you exploit that words belong to some sentence?
    Фича, которая показывает, к какому предлодению принадлежит та или иная сущность важна, так как модель учится, какие сущности могут встречаться в одном предложении, а какие нет, например.
    
    2) Why we selected f1 score with macro averaging as our classification quality measure? What other metrics are suitable?
    https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin:
    In a multi-class classification setup, micro-average is preferable if you suspect there might be class imbalance, as we have in our dataset.