In [0]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

# Shared random seed so the stochastic steps in this notebook are repeatable.
SEED = 1337

In [0]:
from google.colab import drive

In [27]:
# Mount Google Drive at /content/gdrive/ (relies on the google.colab `drive` helper).
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [0]:
#import os
#os.chdir('gdrive/My Drive/Colab Notebooks')

In [29]:
# Load the token-level NER table; the first CSV column becomes the row index.
#
# The working-directory change earlier in the notebook is commented out, so the
# bare relative file name only resolves when the kernel already sits in the data
# folder. Fall back to the Drive folder named in that commented-out chdir so the
# cell also works after Restart & Run All.
import os

DATA_FILE = 'ner_short.csv'
# NOTE(review): folder taken from the commented-out os.chdir call — confirm it is
# where the CSV actually lives.
DRIVE_DATA_DIR = '/content/gdrive/My Drive/Colab Notebooks'

data_path = DATA_FILE if os.path.exists(DATA_FILE) else os.path.join(DRIVE_DATA_DIR, DATA_FILE)

df = pd.read_csv(data_path, index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [30]:
# Largest sentence id in the data, used here as the number of sentences.
df['sentence_idx'].max()

1500.0

In [31]:
# Relative frequency of every NER tag (fractions sum to 1).
df['tag'].value_counts(normalize=True)

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [0]:
# Add a `length` column: the number of tagged tokens in each row's sentence.
tokens_per_sentence = df.groupby('sentence_idx')['tag'].count()

# Index by sentence id so the per-sentence counts align onto every token row,
# then turn the sentence id back into an ordinary column.
tdf = df.set_index('sentence_idx')
tdf['length'] = tokens_per_sentence
df = tdf.reset_index()

In [33]:
# Preview the frame after the per-sentence `length` column has been added.
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,Thousands,O,48
1,1.0,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,of,O,48
2,1.0,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,demonstrators,O,48
3,1.0,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,have,O,48
4,1.0,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,marched,O,48


In [0]:
# Encode the categorical POS columns as integer codes.
# A single encoder object is refit for each column in turn, so after the loop
# `le` holds only the mapping of the last column processed.

le = LabelEncoder()
for pos_col in ('pos', 'next-pos', 'next-next-pos', 'prev-pos', 'prev-prev-pos'):
    df[pos_col] = le.fit_transform(df[pos_col])

In [35]:
# Split token rows into train and test sets.
# Tags are turned into integer class ids first; the split is shuffled, seeded
# with SEED and stratified on those ids.
y = LabelEncoder().fit_transform(df['tag'])

df_train, df_test, y_train, y_test = model_selection.train_test_split(
    df,
    y,
    test_size=0.25,
    stratify=y,
    shuffle=True,
    random_state=SEED,
)

for split_name, split_frame in (('train', df_train), ('test', df_test)):
    print(split_name, split_frame.shape[0])

train 50155
test 16719


In [36]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
36858,153.0,35,that,18,camps,30,16,16,June,Soh,found,O,54
39120,259.0,7,the,28,to,32,30,5,and,was,known,O,58
53612,910.0,15,dictator,10,former,7,30,1,",",showed,the,O,50
6150,280.0,9,of,15,risk,7,9,1,",",despite,the,O,44
3771,169.0,10,Mexican,1,",",16,15,16,Saltillo,plant,Friday,B-tim,52


In [37]:
%%time
# baseline 1, baseline 2  

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

columns = ['pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']

model = Pipeline([
    ('enc', OneHotEncoder()),
    ('classif', RandomForestClassifier(max_depth=None, random_state=0, n_estimators=100))
])

model.fit(df_train[columns], y_train)

print('train', metrics.f1_score(y_train, model.predict(df_train[columns]), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(df_test[columns]), average='macro'))

train 0.7408211521199051
test 0.5936476949449351
CPU times: user 22.7 s, sys: 51.9 ms, total: 22.8 s
Wall time: 22.8 s


In [38]:
%%time
#baseline 3  

from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(random_state=SEED)
model = model_selection.GridSearchCV(RFC, {'n_estimators': [10, 30, 50]}, 
                                    cv=5, scoring='f1_macro', n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print('train', metrics.f1_score(y_train, model.predict(X_train), average='macro'))
print('test', metrics.f1_score(y_test, model.predict(X_test), average='macro'))

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 28.1min finished


train 0.9941642765978723
test 0.8611734425854218
CPU times: user 5min 51s, sys: 753 ms, total: 5min 52s
Wall time: 33min 57s


Why did we select f1 score with macro averaging as our classification quality measure? What other metrics are suitable?


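У нас несбалансированные классы тегов: тег "O" встречается значительно чаще остальных (около 85 % всех токенов). Поэтому нам выгоднее использовать macro-усреднение: F-мера считается отдельно для каждого класса, а затем берётся их невзвешенное среднее, так что каждый класс вносит одинаковый вклад независимо от своей частоты, и качество на редких тегах не маскируется доминирующим классом "O".
Также можно было бы использовать weighted-усреднение F-меры (классы взвешиваются по частоте, поэтому результат сильнее зависит от класса "O") или fbeta_score.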