In [1]:
# Silence warnings for the rest of the session by replacing `warnings.warn`
# with a function that does nothing.
# NOTE(review): this hides every warning issued through `warnings.warn`,
# including deprecation notices — confirm the monkey-patch is intended rather
# than a targeted `warnings.filterwarnings(...)`.
def warn(*args, **kwargs):
    """No-op stand-in for `warnings.warn`: accepts any arguments, does nothing."""
    pass
import warnings
warnings.warn = warn

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
import eli5

In [3]:
# Load the token-per-row NER corpus; the displayed tail shows the columns
# "Sentence #", "Word" and "Tag".
data = pd.read_csv("data/ner_dataset.csv", encoding="utf-8")
# Forward-fill missing cells from the row above. `DataFrame.ffill()` is the
# non-deprecated spelling of `fillna(method="ffill")` and gives the same result.
# NOTE(review): the fill applies to every column, so a missing Word or Tag is
# also copied from the previous row — confirm that is intended.
data = data.ffill()
data.tail(10)

Unnamed: 0,Sentence #,Word,Tag
369382,Sentence: 16858,vùng,O
369383,Sentence: 16858,chôn_cất,O
369384,Sentence: 16858,người_thân,O
369385,Sentence: 16858,khi,O
369386,Sentence: 16858,qua_đời,O
369387,Sentence: 16858,",",O
369388,Sentence: 16858,gọi,O
369389,Sentence: 16858,là,O
369390,Sentence: 16858,nhị_tì,O
369391,Sentence: 16858,...,O


In [4]:
class SentenceGetter(object):
    """Group a token-per-row DataFrame into sentences of (word, tag) pairs.

    The frame must provide the columns "Sentence #", "Word" and "Tag".
    Rows sharing the same "Sentence #" value form one sentence.

    Attributes:
        n_sent: 1-based number of the sentence `get_next` will return next.
        data: the DataFrame passed to the constructor.
        empty: always False; not read by this class, kept for compatibility.
        grouped: result of the groupby/apply, indexed by "Sentence #" value,
            each entry a list of (word, tag) tuples.
        sentences: the entries of `grouped` as a plain list, in the order the
            groupby yields them.
    """

    def __init__(self, data):
        """Build the per-sentence (word, tag) lists from `data`."""
        self.n_sent = 1
        self.data = data
        self.empty = False

        def to_pairs(group):
            # Pair every word with its tag, preserving row order in the group.
            words = group["Word"].values.tolist()
            tags = group["Tag"].values.tolist()
            return list(zip(words, tags))

        # NOTE(review): groupby orders groups by the "Sentence #" string, so
        # `sentences` is not necessarily in numeric sentence order — confirm
        # that downstream code does not rely on file order.
        self.grouped = self.data.groupby("Sentence #").apply(to_pairs)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or None when no such sentence exists.

        Sentences are looked up under the key "Sentence: <n_sent>". The
        counter only advances after a successful lookup. Only a missing key
        is treated as "no more sentences"; any other error propagates instead
        of being silently swallowed.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            return None
        self.n_sent += 1
        return s

In [5]:
# Smoke-test the getter: build it from the loaded frame, fetch the first
# sentence ("Sentence: 1") and print its (word, tag) pairs.
getter = SentenceGetter(data)
sent = getter.get_next()
print(sent)

[('Đó', 'O'), ('là', 'O'), ('con', 'O'), ('đường', 'O'), ('biển', 'O'), ('ngắn', 'O'), ('nhất', 'O'), ('để', 'O'), ('đi', 'O'), ('từ', 'O'), ('Ấn_Độ_Dương', 'LOCATION'), ('sang', 'O'), ('Thái_Bình_Dương', 'LOCATION'), (',', 'O'), ('chiếm', 'O'), ('đến', 'O'), ('lượng', 'O'), ('hàng_hoá', 'O'), ('lưu_thông', 'O'), ('đường_biển', 'O'), ('của', 'O'), ('thế_giới', 'O'), (',', 'O'), ('đó', 'O'), ('là', 'O'), ('hải_trình', 'O'), ('lớn', 'O'), ('nhất', 'O'), ('từ', 'O'), ('tây', 'O'), ('sang', 'O'), ('đông', 'O'), ('với', 'O'), ('50.000', 'O'), ('lượt', 'O'), ('tàu_bè', 'O'), ('qua_lại', 'O'), ('mỗi', 'O'), ('năm', 'O'), ('...', 'O')]


In [6]:
# Every sentence as a list of (word, tag) pairs, in the order SentenceGetter's groupby produced them.
sentences = getter.sentences

In [7]:
def word2features(sent, i):
    """Build the feature dict for the token at position `i` of `sent`.

    `sent` is a sequence whose items carry the word at index 0. The result
    describes the word itself (lower-cased form, 3- and 2-character suffixes,
    casing and digit flags) plus the lower-cased form and casing flags of the
    neighbouring words. 'BOS' / 'EOS' mark a token with no previous / next
    neighbour.
    """
    current = sent[i][0]

    features = {}
    features['bias'] = 1.0
    features['word.lower()'] = current.lower()
    features['word[-3:]'] = current[-3:]
    features['word[-2:]'] = current[-2:]
    features['word.isupper()'] = current.isupper()
    features['word.istitle()'] = current.istitle()
    features['word.isdigit()'] = current.isdigit()

    # Left context: either the previous word's features or the BOS marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sent[i - 1][0]
        features['-1:word.lower()'] = previous.lower()
        features['-1:word.istitle()'] = previous.istitle()
        features['-1:word.isupper()'] = previous.isupper()

    # Right context: either the next word's features or the EOS marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        following = sent[i + 1][0]
        features['+1:word.lower()'] = following.lower()
        features['+1:word.istitle()'] = following.istitle()
        features['+1:word.isupper()'] = following.isupper()

    return features


def sent2features(sent):
    """Return one `word2features` dict per position of `sent`, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label of every (token, label) pair in `sent`, in order."""
    labels = []
    for _token, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token of every (token, label) pair in `sent`, in order."""
    tokens = []
    for word, _tag in sent:
        tokens.append(word)
    return tokens

In [9]:
# Build the parallel model inputs: for every sentence, a list of per-token
# feature dicts (X) and the matching list of tags (y).
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))
print(X[0])  # feature dicts of the first sentence
print(y[0])  # tags of the first sentence

[{'BOS': True, 'word.lower()': 'đó', 'word.istitle()': True, 'bias': 1.0, 'word[-3:]': 'Đó', 'word[-2:]': 'Đó', '+1:word.istitle()': False, 'word.isupper()': False, '+1:word.lower()': 'là', '+1:word.isupper()': False, 'word.isdigit()': False}, {'-1:word.istitle()': True, '-1:word.lower()': 'đó', 'word[-2:]': 'là', '-1:word.isupper()': False, '+1:word.lower()': 'con', 'word.lower()': 'là', 'word.istitle()': False, 'bias': 1.0, 'word[-3:]': 'là', '+1:word.istitle()': False, 'word.isupper()': False, '+1:word.isupper()': False, 'word.isdigit()': False}, {'-1:word.istitle()': False, '-1:word.lower()': 'là', 'word[-2:]': 'on', '-1:word.isupper()': False, '+1:word.lower()': 'đường', 'word.lower()': 'con', 'word.istitle()': False, 'bias': 1.0, 'word[-3:]': 'con', '+1:word.istitle()': False, 'word.isupper()': False, '+1:word.isupper()': False, 'word.isdigit()': False}, {'-1:word.istitle()': False, '-1:word.lower()': 'con', 'word[-2:]': 'ng', '-1:word.isupper()': False, '+1:word.lower()': 'biển'


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOCATION', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [12]:
# First model configuration: L-BFGS training, c1 = c2 = 0.1, at most 100 iterations.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation coefficients
# described in the sklearn-crfsuite docs — confirm.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [13]:
# Out-of-fold predictions from 5-fold cross-validation, then a per-tag
# precision / recall / F1 report over all tokens.
# NOTE(review): `cross_val_predict` is imported from `sklearn.cross_validation`,
# which was removed in scikit-learn 0.20; on current versions import it from
# `sklearn.model_selection` instead.
pred = cross_val_predict(crf, X, y, cv=5)
report = flat_classification_report(y_true=y, y_pred=pred)
print(report)

              precision    recall  f1-score   support

    LOCATION       0.85      0.85      0.85      9028
        MISC       0.91      0.92      0.91       561
           O       0.99      1.00      0.99    345533
ORGANIZATION       0.76      0.59      0.66      3268
      PERSON       0.93      0.85      0.89     11002

 avg / total       0.98      0.99      0.98    369392



In [14]:
# Fit on the full dataset so the learned weights can be inspected in the next cell.
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [17]:
# Render eli5's view of the fitted CRF: the tag-to-tag transition weights and
# the highest-weighted features per tag (see the tables in the output).
eli5.show_weights(crf, top=30)

From \ To,LOCATION,MISC,O,ORGANIZATION,PERSON
LOCATION,4.199,0.0,-0.74,-2.367,-4.924
MISC,0.0,6.349,-0.012,0.001,0.0
O,1.022,-1.361,3.073,0.074,0.249
ORGANIZATION,-2.142,0.0,-1.736,5.47,-2.699
PERSON,-2.803,0.0,-1.093,0.0,3.305

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+6.315,word.lower():tây_nguyên,,,
+5.323,word.lower():biển_đông,,,
+5.280,word.lower():nam_bộ,,,
+5.079,word.lower():phương_tây,,,
+4.987,word.lower():tiên_sơn_động,,,
+4.427,word.lower():trà_buông,,,
+3.656,word.lower():bom_loọng,,,
+3.621,word.lower():hương_rừng,,,
+3.497,word[-2:]:ĐL,,,
+3.497,word.lower():đl,,,

Weight?,Feature
+6.315,word.lower():tây_nguyên
+5.323,word.lower():biển_đông
+5.280,word.lower():nam_bộ
+5.079,word.lower():phương_tây
+4.987,word.lower():tiên_sơn_động
+4.427,word.lower():trà_buông
+3.656,word.lower():bom_loọng
+3.621,word.lower():hương_rừng
+3.497,word[-2:]:ĐL
+3.497,word.lower():đl

Weight?,Feature
+3.611,-1:word.lower():người
+3.318,-1:word.lower():tiếng
+3.165,word.lower():tiếng
+3.148,word[-3:]:ếng
+3.067,+1:word.lower():anh
+2.956,+1:word.lower():chung_thuỷ
+2.821,+1:word.lower():pháp
+2.585,word[-3:]:Anh
+2.319,-1:word.lower():gái
+2.162,+1:word.lower():đl

Weight?,Feature
+5.131,word.lower():sang
+4.774,+1:word.lower():frederic
+4.404,BOS
+4.238,word.lower():trong
+4.147,word.lower():không
+4.114,bias
+3.858,-1:word.lower():chiếc
+3.803,word.lower():ở
+3.801,word.lower():mình
+3.796,word.lower():muay_thái

Weight?,Feature
+4.982,word.lower():khai_minh
+4.479,word.lower():phong_phú
+3.971,word[-2:]:ex
+3.928,-1:word.lower():hãng
+3.782,-1:word.lower():dntn
+3.394,word.lower():vksnd_tối_cao
+3.273,word.lower():đảng
+3.083,word.lower():thành_công
+3.070,word[-2:]:co
+3.043,-1:word.lower():xe

Weight?,Feature
+8.043,word.lower():đông-gioăng
+5.983,word.lower():hải
+5.427,word.lower():hoàng
+5.175,word.lower():fred
+4.939,word.lower():hùng
+4.830,word.lower():tùng
+4.787,word.lower():hiền
+4.642,word.istitle()
+4.642,word.lower():bình
+4.546,word.lower():lan


In [10]:
# Second model configuration: same settings as the first CRF except c1=10
# (was 0.1). Rebinds `crf`, so the previously fitted model is no longer reachable.
# NOTE(review): c1 is presumably the L1 regularisation coefficient per the
# sklearn-crfsuite docs — confirm.
crf = CRF(algorithm='lbfgs',
          c1=10,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [11]:
# Out-of-fold predictions from 5-fold cross-validation for the c1=10 configuration.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

In [12]:
# Per-tag precision / recall / F1 over all tokens for the c1=10 predictions.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

              precision    recall  f1-score   support

    LOCATION       0.78      0.73      0.75      9028
        MISC       0.67      0.75      0.71       561
           O       0.99      1.00      0.99    345533
ORGANIZATION       0.63      0.37      0.47      3268
      PERSON       0.82      0.80      0.81     11002

 avg / total       0.98      0.98      0.98    369392



In [13]:
# Fit the c1=10 model on the full dataset so its weights can be inspected in the next cell.
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=10, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [14]:
# Render eli5's view of the c1=10 model: transition weights and the
# highest-weighted features per tag (see the tables in the output).
eli5.show_weights(crf, top=30)

From \ To,LOCATION,MISC,O,ORGANIZATION,PERSON
LOCATION,3.774,0.0,0.145,-0.728,-3.442
MISC,0.0,5.633,0.0,0.0,0.0
O,0.627,-0.258,2.136,0.163,-0.086
ORGANIZATION,-0.438,0.0,-0.549,5.115,-1.378
PERSON,-0.47,0.0,-0.426,0.0,3.427

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+3.648,word[-2:]:lộ,,,
+2.941,word.lower():sông,,,
+2.698,word.lower():miền,,,
+2.310,word[-2:]:Mỹ,,,
+2.160,word[-2:]:óm,,,
+2.048,word.lower():làng,,,
+2.045,word.lower():huyện,,,
+2.017,-1:word.lower():tại,,,
+1.978,-1:word.lower():đường,,,
+1.974,word.lower():cầu,,,

Weight?,Feature
+3.648,word[-2:]:lộ
+2.941,word.lower():sông
+2.698,word.lower():miền
+2.310,word[-2:]:Mỹ
+2.160,word[-2:]:óm
+2.048,word.lower():làng
+2.045,word.lower():huyện
+2.017,-1:word.lower():tại
+1.978,-1:word.lower():đường
+1.974,word.lower():cầu

Weight?,Feature
3.459,-1:word.lower():người
3.0,-1:word.lower():tiếng
2.412,word.lower():tiếng
2.31,word[-3:]:ếng
1.919,word.lower():người
1.574,+1:word.lower():việt
1.446,+1:word.isupper()
1.359,+1:word.lower():anh
1.318,word[-2:]:ời
1.281,word[-3:]:ười

Weight?,Feature
+5.493,BOS
+4.199,word.lower():ở
+3.725,bias
+2.997,-1:word.lower():“
+2.776,"-1:word.lower():"""
+2.337,word.lower():muay_thái
+2.267,word.lower():anh
+2.118,word.lower():và
+2.067,word.lower():tại
+1.938,EOS

Weight?,Feature
+2.942,word[-3:]:Trẻ
+2.038,-1:word.lower():hãng
+2.021,word.lower():đảng
+1.845,-1:word.lower():bộ
+1.771,word[-3:]:oàn
+1.637,word[-3:]:học
+1.604,word[-2:]:ội
+1.484,-1:word.lower():công_ty
+1.432,word[-2:]:ục
+1.345,-1:word.lower():trường

Weight?,Feature
+4.462,word.lower():hải
+3.831,word.istitle()
+3.466,-1:word.lower():ông
+2.958,-1:word.lower():chị
+2.948,word.lower():hoàng
+2.874,-1:word.lower():bà
+2.725,word.lower():phú
+2.707,word.lower():lan
+2.610,word.lower():fred
+2.521,word.lower():hùng
