# Assignment 4: Named entity recognition

Построить модель для обнаружения и классификации именованных сущностей (named entities) на базе корпуса CoNLL 2002.  

Используйте в своем решении ансамбли над решающими деревьями: RandomForest, Gradient Boosting (xgboost, lightgbm, catboost) 
Tutorials:  
1. https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
1. https://github.com/catboost/tutorials 


Чем больше baseline'ов вы превзойдете, тем выше ваша оценка
Метрика качества f1 (f1_macro) (чем выше, тем лучше)
 
baseline 1: 0.0604      random labels  
baseline 2: 0.3966      PoS features + logistic regression  
baseline 3: 0.8122      word2vec cbow embedding + baseline 2 + svm    

! Your results must be reproducible. Если ваша модель - стохастическая, то вы явно должны задавать все seed и random_state в параметрах моделей   

bonus, think about:  
1. How can you exploit that words belong to some sentence?
2. Why did we select the F1 score with macro averaging as our classification quality measure? What other metrics are suitable?   

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# Single seed reused as `random_state` by the train/test split and the random
# forests in this notebook, so that results are reproducible.
SEED=1337

In [2]:
# Load the word-level NER table; the first CSV column is used as the row index.
DATA_PATH = 'ner_short.csv'
df = pd.read_csv(DATA_PATH, index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [3]:
# Largest sentence id in the data; this equals the number of sentences only if
# the ids run 1..N without gaps (TODO confirm).
df['sentence_idx'].max()

1500.0

In [4]:
# Relative frequency of each tag over all rows (the classes are heavily imbalanced).
df['tag'].value_counts(normalize=True)

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [44]:
# Sentence length feature: number of (non-null) tags per sentence, broadcast
# back to every word of that sentence by aligning on the sentence_idx index.
words_per_sentence = df.groupby('sentence_idx')['tag'].count()
tdf = df.set_index('sentence_idx').assign(length=words_per_sentence)
# Restore sentence_idx as a regular column (it becomes the first one).
df = tdf.reset_index()

In [45]:
# Encode categorical variables: each part-of-speech column is label-encoded
# independently (the encoder is re-fitted per column), then the character
# length of the current word is added as a numeric feature.

le = LabelEncoder()
pos_columns = ['pos', 'next-pos', 'next-next-pos', 'prev-pos', 'prev-prev-pos']
for pos_column in pos_columns:
    df[pos_column] = le.fit_transform(df[pos_column])

df['word_length'] = df['word'].map(len)

In [46]:
# Preview the first five rows of the frame at this point in the notebook.
df.head(n=5)

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length,word_length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48,9
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48,2
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48,13
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48,4
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48,7


In [49]:
# Encode the string tags as integer class ids, then hold out 25% of the rows
# for evaluation, stratified by tag so rare classes appear in both parts.
# NOTE(review): rows are individual words, so words of one sentence end up in
# both train and test; sentence-level features may then leak information.
# Consider a split grouped by sentence_idx — to be confirmed.
tag_encoder = LabelEncoder()
y = tag_encoder.fit_transform(df['tag'])

df_train, df_test, y_train, y_test = model_selection.train_test_split(
    df, y,
    test_size=0.25,
    stratify=y,
    shuffle=True,
    random_state=SEED,
)
print('train', len(df_train))
print('test', len(df_test))

train 50155
test 16719


#### Пробуем побить бейзлайны с помощью RandomForest

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [52]:
%time
rf = RandomForestClassifier(criterion='gini', n_estimators=200, max_depth=50,
                             random_state=SEED)

columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

rf.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, rf.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, rf.predict(df_test[columns]), average='macro'))

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 24.1 µs
train 0.7508884417518296
test 0.6249571350554785


In [53]:
# Try adding the sentence length as a feature.
# Guarded so that re-running this cell does not append the column twice
# (a duplicated name would feed the same feature to the model twice).
if 'length' not in columns:
    columns.append('length')

In [25]:
%time

rf.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, rf.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, rf.predict(df_test[columns]), average='macro'))

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 11 µs
train 0.9634465223650243
test 0.806251614055832


#### Хорошо, но до третьего бейзлайна немного не хватает

In [35]:
# Grid-search the number of trees and the maximum tree depth.
# NOTE(review): every candidate is scored on the held-out test split, so the
# best score is optimistically biased — consider cross-validating on df_train.

estimators = [50, 100, 150, 200, 250, 300, 350]
max_depth = [10, 20, 30, 40, 50, 60, 70]

# Candidates are enumerated n_estimators-major, so the score of
# (estimators[i], max_depth[j]) is stored at F1[i * len(max_depth) + j].
param_grid = [(n_trees, depth) for n_trees in estimators for depth in max_depth]

F1 = []
for n_trees, depth in param_grid:
    rf = RandomForestClassifier(n_estimators=n_trees, max_depth=depth,
                                criterion='gini', random_state=SEED)
    rf.fit(df_train[columns], y_train)
    test_pred = rf.predict(df_test[columns])
    F1.append(metrics.f1_score(y_test, test_pred, average='macro'))

In [36]:
# Flat position of the first best score in the grid-search results.
max(range(len(F1)), key=F1.__getitem__)

17

#### Получается, максимум достигается при n_estimators = 150 и max_depth = 40 (индекс 17 = 2·7 + 3, то есть estimators[2] и max_depth[3])

In [37]:
# Refit with the best grid-search setting and report macro-F1 on both splits.
rf = RandomForestClassifier(n_estimators=150, max_depth=40,
                            criterion='gini', random_state=SEED)
rf.fit(df_train[columns], y_train)

for split_name, split_df, split_y in (('train', df_train, y_train),
                                      ('test', df_test, y_test)):
    split_pred = rf.predict(split_df[columns])
    print(split_name, metrics.f1_score(split_y, split_pred, average='macro'))

train 0.964132768426875
test 0.807066121649574


#### Всё равно не получилось побить третий бейзлайн. Попробуем добавить длину слов:

In [54]:
# Add the word length feature; guarded so that re-running this cell does not
# append the same column name twice.
if 'word_length' not in columns:
    columns.append('word_length')

In [56]:
# Same forest configuration, trained on the current `columns` list.
forest_params = dict(criterion='gini', n_estimators=150, max_depth=40, random_state=SEED)
rf = RandomForestClassifier(**forest_params)

X_train, X_test = df_train[columns], df_test[columns]
rf.fit(X_train, y_train)

# Macro-averaged F1 on the training and the held-out rows.
train_f1 = metrics.f1_score(y_train, rf.predict(X_train), average='macro')
test_f1 = metrics.f1_score(y_test, rf.predict(X_test), average='macro')
print('train', train_f1)
print('test', test_f1)

train 0.9923830838076908
test 0.8684126780836225
