# Assignment 4: Named entity recognition

Построить модель для обнаружения и классификации именованных сущностей (named entities). На базе корпуса CoNLL 2002.  

Используйте в своем решении ансамбли над решающими деревьями: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost) 
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 


Чем больше baseline'ов вы превзойдете, тем выше ваша оценка
Метрика качества f1 (f1_macro) (чем выше, тем лучше)
 
baseline 1: 0.0604      random labels  
baseline 2: 0.3966      PoS features + logistic regression  
baseline 3: 0.8122      word2vec cbow embedding + baseline 2 + svm    

! Your results must be reproducible. Если ваша модель - стохастическая, то вы явно должны задавать все seed и random_state в параметрах моделей   

bonus, think about:  
1. How can you exploit that words belong to some sentence?
2. Why we selected f1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


SEED=1337

In [2]:
df = pd.read_csv('ner_short.csv', index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [3]:
# number of sentences
df.sentence_idx.max()

1500.0

In [4]:
# class distribution
df.tag.value_counts(normalize=True )

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [5]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').tag.count()
df = tdf.reset_index(drop=False)

In [6]:
# encode categorial variables

le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [7]:
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


In [8]:
# splitting
y = LabelEncoder().fit_transform(df.tag)

df_train, df_test, y_train, y_test = model_selection.train_test_split(df, y, stratify=y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


In [9]:
# some wrappers to work with word2vec
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from collections import defaultdict

   
class Word2VecWrapper(TransformerMixin):
    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None
        self.random_state = random_state
        
    def get_size(self):
        return self.size_

    def fit(self, X, y=None):
        """
        X: list of strings
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list, 
                            window=self.window_,
                            negative=self.negative_, 
                            size=self.size_, 
                            iter=self.iter_,
                            sg=not self.is_cbow_, seed=self.random_state)

        return self
    
    def has(self, word):
        return word in self.w2v

    def transform(self, X):
        """
        X: a list of words
        """
        if self.w2v is None:
            raise Exception('model not fitted')
        return np.array([self.w2v[w] if w in self.w2v else np.zeros(self.size_) for w in X ])
    


In [10]:
%%time
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion)

sentences_list = [x.strip() for x in ' '.join(df.word).split('.')]

w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=SEED)
w2v_cbow.fit(sentences_list)

CPU times: user 29.5 s, sys: 351 ms, total: 29.8 s
Wall time: 12.5 s


In [11]:
%%time
# baseline 1 
# random labels
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyClassifier


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', DummyClassifier(random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))


train 0.05887736725599869
test 0.060439542712750365
CPU times: user 98.7 ms, sys: 17.1 ms, total: 116 ms
Wall time: 122 ms


In [12]:
%%time
# baseline 2 
# pos features + one hot encoding + logistic regression
from sklearn.preprocessing import OneHotEncoder


columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('est', LogisticRegressionCV(Cs=5, cv=5, n_jobs=-1, scoring='f1_macro', 
                             penalty='l2', solver='newton-cg', multi_class='multinomial', random_state=SEED)),
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.46639500282346874
test 0.39660981421559566
CPU times: user 2min 40s, sys: 9.99 s, total: 2min 50s
Wall time: 11min 32s


In [13]:
%%time
## baseline 3
# use word2vec cbow embedding + baseline 2 + svm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
import scipy.sparse as sp

embeding = w2v_cbow
encoder_pos = OneHotEncoder()
X_train = sp.hstack([
    embeding.transform(df_train.word),
    embeding.transform(df_train['next-word']),
    embeding.transform(df_train['next-next-word']),
    embeding.transform(df_train['prev-word']),
    embeding.transform(df_train['prev-prev-word']),
    encoder_pos.fit_transform(df_train[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])
X_test = sp.hstack([
    embeding.transform(df_test.word),
    embeding.transform(df_test['next-word']),
    embeding.transform(df_test['next-next-word']),
    embeding.transform(df_test['prev-word']),
    embeding.transform(df_test['prev-prev-word']),
    encoder_pos.transform(df_test[['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])

model = model_selection.GridSearchCV(LinearSVC(penalty='l2', multi_class='ovr', random_state=SEED), 
                                    {'C': np.logspace(-4, 0, 5)}, 
                                    cv=3, scoring='f1_macro', n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  6.1min finished


train 0.9550564339022908
test 0.8020799802744993
CPU times: user 1min 59s, sys: 6.46 s, total: 2min 5s
Wall time: 8min 13s


In [14]:
df_train.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
36858,153.0,35,that,18,camps,30,16,16,June,Soh,found,O,54
39120,259.0,7,the,28,to,32,30,5,and,was,known,O,58
53612,910.0,15,dictator,10,former,7,30,1,",",showed,the,O,50
6150,280.0,9,of,15,risk,7,9,1,",",despite,the,O,44
3771,169.0,10,Mexican,1,",",16,15,16,Saltillo,plant,Friday,B-tim,52


**Будем также учитывать `sentence_idx` и `length`**

In [15]:
idxs = [0, 1, 3, 5, 6, 7, 12]
cols = list(df_train)
train_cols = list(np.array(cols)[idxs])

In [16]:
train_cols

['sentence_idx',
 'next-next-pos',
 'next-pos',
 'pos',
 'prev-pos',
 'prev-prev-pos',
 'length']

In [29]:
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold

In [30]:
params = {
    'iterations' : [500, 1000],
    'learning_rate' : [0.03, 1],
    'depth' : [8, 10]
}

In [31]:
model = CatBoostClassifier(
    loss_function='MultiClass',
    custom_loss=['F1'],
    random_seed=SEED,
    logging_level='Verbose'
)

In [32]:
%%time
kf = KFold(n_splits=5, shuffle=True)
clf = model_selection.GridSearchCV(model, params, scoring='f1_macro', n_jobs=-1, cv=kf)
clf.fit(df_train[train_cols], y_train)

0:	learn: -0.6013347	total: 211ms	remaining: 3m 31s
1:	learn: -5.6726363	total: 359ms	remaining: 2m 59s
2:	learn: -6.9616574	total: 499ms	remaining: 2m 45s
3:	learn: -6.6089993	total: 665ms	remaining: 2m 45s
4:	learn: -5.5586517	total: 825ms	remaining: 2m 44s
5:	learn: -5.3501627	total: 967ms	remaining: 2m 40s
6:	learn: -4.9516412	total: 1.11s	remaining: 2m 38s
7:	learn: -4.8114729	total: 1.26s	remaining: 2m 36s
8:	learn: -4.3101047	total: 1.41s	remaining: 2m 35s
9:	learn: -4.3394741	total: 1.55s	remaining: 2m 33s
10:	learn: -4.1811566	total: 1.68s	remaining: 2m 31s
11:	learn: -3.8724359	total: 1.82s	remaining: 2m 29s
12:	learn: -3.6842454	total: 1.96s	remaining: 2m 28s
13:	learn: -3.5410602	total: 2.09s	remaining: 2m 27s
14:	learn: -3.3792883	total: 2.23s	remaining: 2m 26s
15:	learn: -3.4562716	total: 2.36s	remaining: 2m 25s
16:	learn: -3.1474633	total: 2.5s	remaining: 2m 24s
17:	learn: -2.9723337	total: 2.64s	remaining: 2m 24s
18:	learn: -2.8639973	total: 2.78s	remaining: 2m 23s
19:	

157:	learn: -0.8987479	total: 22.1s	remaining: 1m 57s
158:	learn: -0.8966638	total: 22.2s	remaining: 1m 57s
159:	learn: -0.8943604	total: 22.4s	remaining: 1m 57s
160:	learn: -0.8928115	total: 22.5s	remaining: 1m 57s
161:	learn: -0.8820512	total: 22.7s	remaining: 1m 57s
162:	learn: -0.8733953	total: 22.8s	remaining: 1m 57s
163:	learn: -0.8702703	total: 22.9s	remaining: 1m 56s
164:	learn: -0.8681344	total: 23.1s	remaining: 1m 56s
165:	learn: -0.8637435	total: 23.2s	remaining: 1m 56s
166:	learn: -0.8619572	total: 23.3s	remaining: 1m 56s
167:	learn: -0.8563944	total: 23.5s	remaining: 1m 56s
168:	learn: -0.8543506	total: 23.6s	remaining: 1m 56s
169:	learn: -0.8525150	total: 23.7s	remaining: 1m 55s
170:	learn: -0.8501946	total: 23.9s	remaining: 1m 55s
171:	learn: -0.8478084	total: 24s	remaining: 1m 55s
172:	learn: -0.8464536	total: 24.2s	remaining: 1m 55s
173:	learn: -0.8376387	total: 24.3s	remaining: 1m 55s
174:	learn: -0.8361681	total: 24.4s	remaining: 1m 55s
175:	learn: -0.8353089	total: 

311:	learn: -0.5110774	total: 43.2s	remaining: 1m 35s
312:	learn: -0.5091102	total: 43.4s	remaining: 1m 35s
313:	learn: -0.5082062	total: 43.5s	remaining: 1m 35s
314:	learn: -0.5073750	total: 43.6s	remaining: 1m 34s
315:	learn: -0.5065820	total: 43.8s	remaining: 1m 34s
316:	learn: -0.5056279	total: 43.9s	remaining: 1m 34s
317:	learn: -0.4998137	total: 44s	remaining: 1m 34s
318:	learn: -0.4988749	total: 44.2s	remaining: 1m 34s
319:	learn: -0.4962480	total: 44.3s	remaining: 1m 34s
320:	learn: -0.4952569	total: 44.5s	remaining: 1m 34s
321:	learn: -0.4947426	total: 44.6s	remaining: 1m 33s
322:	learn: -0.4930848	total: 44.8s	remaining: 1m 33s
323:	learn: -0.4892802	total: 44.9s	remaining: 1m 33s
324:	learn: -0.4883930	total: 45s	remaining: 1m 33s
325:	learn: -0.4847426	total: 45.2s	remaining: 1m 33s
326:	learn: -0.4792466	total: 45.3s	remaining: 1m 33s
327:	learn: -0.4771525	total: 45.4s	remaining: 1m 33s
328:	learn: -0.4726499	total: 45.6s	remaining: 1m 32s
329:	learn: -0.4716750	total: 45

465:	learn: -0.2562517	total: 1m 4s	remaining: 1m 13s
466:	learn: -0.2559421	total: 1m 4s	remaining: 1m 13s
467:	learn: -0.2553027	total: 1m 4s	remaining: 1m 13s
468:	learn: -0.2546087	total: 1m 4s	remaining: 1m 13s
469:	learn: -0.2518467	total: 1m 5s	remaining: 1m 13s
470:	learn: -0.2515093	total: 1m 5s	remaining: 1m 13s
471:	learn: -0.2510769	total: 1m 5s	remaining: 1m 13s
472:	learn: -0.2506028	total: 1m 5s	remaining: 1m 13s
473:	learn: -0.2499486	total: 1m 5s	remaining: 1m 12s
474:	learn: -0.2497052	total: 1m 5s	remaining: 1m 12s
475:	learn: -0.2493299	total: 1m 5s	remaining: 1m 12s
476:	learn: -0.2490095	total: 1m 6s	remaining: 1m 12s
477:	learn: -0.2485224	total: 1m 6s	remaining: 1m 12s
478:	learn: -0.2483323	total: 1m 6s	remaining: 1m 12s
479:	learn: -0.2479304	total: 1m 6s	remaining: 1m 12s
480:	learn: -0.2472594	total: 1m 6s	remaining: 1m 11s
481:	learn: -0.2469025	total: 1m 6s	remaining: 1m 11s
482:	learn: -0.2458305	total: 1m 6s	remaining: 1m 11s
483:	learn: -0.2450662	total

619:	learn: -0.1789204	total: 1m 25s	remaining: 52.7s
620:	learn: -0.1785394	total: 1m 26s	remaining: 52.5s
621:	learn: -0.1783314	total: 1m 26s	remaining: 52.4s
622:	learn: -0.1781201	total: 1m 26s	remaining: 52.2s
623:	learn: -0.1778549	total: 1m 26s	remaining: 52.1s
624:	learn: -0.1777472	total: 1m 26s	remaining: 52s
625:	learn: -0.1776298	total: 1m 26s	remaining: 51.8s
626:	learn: -0.1766179	total: 1m 26s	remaining: 51.7s
627:	learn: -0.1761802	total: 1m 27s	remaining: 51.5s
628:	learn: -0.1759733	total: 1m 27s	remaining: 51.4s
629:	learn: -0.1758039	total: 1m 27s	remaining: 51.3s
630:	learn: -0.1757117	total: 1m 27s	remaining: 51.1s
631:	learn: -0.1744827	total: 1m 27s	remaining: 51s
632:	learn: -0.1743344	total: 1m 27s	remaining: 50.9s
633:	learn: -0.1738601	total: 1m 27s	remaining: 50.7s
634:	learn: -0.1734716	total: 1m 28s	remaining: 50.6s
635:	learn: -0.1733106	total: 1m 28s	remaining: 50.4s
636:	learn: -0.1731197	total: 1m 28s	remaining: 50.3s
637:	learn: -0.1721227	total: 1m

773:	learn: -0.1439578	total: 1m 47s	remaining: 31.4s
774:	learn: -0.1438343	total: 1m 47s	remaining: 31.2s
775:	learn: -0.1436659	total: 1m 47s	remaining: 31.1s
776:	learn: -0.1435440	total: 1m 47s	remaining: 30.9s
777:	learn: -0.1434507	total: 1m 47s	remaining: 30.8s
778:	learn: -0.1433711	total: 1m 48s	remaining: 30.7s
779:	learn: -0.1432270	total: 1m 48s	remaining: 30.5s
780:	learn: -0.1431002	total: 1m 48s	remaining: 30.4s
781:	learn: -0.1427831	total: 1m 48s	remaining: 30.2s
782:	learn: -0.1427316	total: 1m 48s	remaining: 30.1s
783:	learn: -0.1426674	total: 1m 48s	remaining: 30s
784:	learn: -0.1421900	total: 1m 48s	remaining: 29.8s
785:	learn: -0.1420772	total: 1m 49s	remaining: 29.7s
786:	learn: -0.1419874	total: 1m 49s	remaining: 29.6s
787:	learn: -0.1418864	total: 1m 49s	remaining: 29.4s
788:	learn: -0.1418127	total: 1m 49s	remaining: 29.3s
789:	learn: -0.1417498	total: 1m 49s	remaining: 29.1s
790:	learn: -0.1417026	total: 1m 49s	remaining: 29s
791:	learn: -0.1416542	total: 1m

927:	learn: -0.1293651	total: 2m 8s	remaining: 10s
928:	learn: -0.1293536	total: 2m 9s	remaining: 9.86s
929:	learn: -0.1292286	total: 2m 9s	remaining: 9.73s
930:	learn: -0.1291387	total: 2m 9s	remaining: 9.59s
931:	learn: -0.1288314	total: 2m 9s	remaining: 9.45s
932:	learn: -0.1287962	total: 2m 9s	remaining: 9.31s
933:	learn: -0.1287529	total: 2m 9s	remaining: 9.17s
934:	learn: -0.1285576	total: 2m 9s	remaining: 9.03s
935:	learn: -0.1285077	total: 2m 10s	remaining: 8.89s
936:	learn: -0.1283599	total: 2m 10s	remaining: 8.76s
937:	learn: -0.1282628	total: 2m 10s	remaining: 8.62s
938:	learn: -0.1282353	total: 2m 10s	remaining: 8.48s
939:	learn: -0.1282124	total: 2m 10s	remaining: 8.34s
940:	learn: -0.1281377	total: 2m 10s	remaining: 8.2s
941:	learn: -0.1280338	total: 2m 10s	remaining: 8.06s
942:	learn: -0.1279995	total: 2m 11s	remaining: 7.92s
943:	learn: -0.1279676	total: 2m 11s	remaining: 7.78s
944:	learn: -0.1276577	total: 2m 11s	remaining: 7.64s
945:	learn: -0.1276260	total: 2m 11s	re

In [33]:
clf.best_params_

{'depth': 10, 'iterations': 1000, 'learning_rate': 1}

In [36]:
model = CatBoostClassifier(
    iterations=1000,
    depth=10,
    learning_rate=1,
    loss_function='MultiClass',
    custom_loss=['F1'],
    random_seed=SEED,
    logging_level='Silent'
)

In [37]:
model.fit(df_train[train_cols], y_train)

<catboost.core.CatBoostClassifier at 0x117ba0240>

In [38]:
print('train', metrics.f1_score(y_train, model.predict(df_train[train_cols]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[train_cols]), average='macro'))

train 0.9996512534279905
test 0.8274820993917573


`RandomForestClassifier`

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
rf = RandomForestClassifier(
    random_state=SEED,
    max_depth=100,
    n_estimators=500,
    class_weight='balanced'
)

In [41]:
rf.fit(df_train[train_cols], y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=100, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=1337,
            verbose=0, warm_start=False)

In [42]:
print('train', metrics.f1_score(y_train, rf.predict(df_train[train_cols]), average='macro'))
print('test', metrics.f1_score(y_test, rf.predict(df_test[train_cols]), average='macro'))

train 0.9998132730662732
test 0.8519797533974067
