In [21]:
# Silence every warning by swapping `warnings.warn` for a do-nothing stand-in.
import warnings


def warn(*args, **kwargs):
    '''Accept any arguments and do nothing; used as a replacement for `warnings.warn`.'''
    return None


warnings.warn = warn

In [22]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer its
# replacement and only fall back to the legacy module on very old installs.
try:
    from sklearn.model_selection import cross_val_predict
except ImportError:
    from sklearn.cross_validation import cross_val_predict

In [23]:
# Map a word -> feature vector
def feature_map(word):
    '''Build the orthographic feature vector of a single word.

    The returned numpy array holds, in order: is title-cased, is lower-case,
    is upper-case, length of the word, is all digits, is all alphabetic.
    '''
    features = [
        word.istitle(),
        word.islower(),
        word.isupper(),
        len(word),
        word.isdigit(),
        word.isalpha(),
    ]
    return np.array(features)

In [24]:
class MemoryTagger(BaseEstimator, TransformerMixin):
    '''Baseline tagger that memorises the most frequent tag of each training word.

    Attributes set by `fit`:
        tags:   distinct tags, in order of first appearance in `y`.
        memory: dict mapping every training word to its most frequent tag.
    '''

    def fit(self, X, y):
        '''
        Learn the most frequent tag of every word.

        Expects a list of words as X and a list of tags as y (paired by position).
        Returns self so calls can be chained, as scikit-learn estimators do.
        '''
        tag_counts = {}  # word -> {tag: number of times the word carried that tag}
        self.tags = []
        seen_tags = set()  # constant-time membership test; self.tags keeps the order
        for word, tag in zip(X, y):
            if tag not in seen_tags:
                # first time this tag shows up: record it
                seen_tags.add(tag)
                self.tags.append(tag)
            counts_for_word = tag_counts.setdefault(word, {})
            counts_for_word[tag] = counts_for_word.get(tag, 0) + 1
        # The memory of a word is its most common tag; on a tie, `max` keeps the
        # tag that was counted first.
        self.memory = {
            word: max(counts, key=counts.get) for word, counts in tag_counts.items()
        }
        return self

    def predict(self, X, y=None):
        '''
        Predict the tag of each word in X from memory. If a word is unknown, predict 'O'.
        '''
        return [self.memory.get(x, 'O') for x in X]

In [25]:
# Load the token-level NER dataset (the cells below read its "Word" and "Tag" columns).
data = pd.read_csv("data/ner_dataset.csv", encoding="utf-8")
# Forward-fill missing cells from the row above. `DataFrame.ffill()` is the
# supported spelling; `fillna(method="ffill")` is deprecated in recent pandas.
# NOTE(review): presumably only "Sentence #" has gaps - confirm against the CSV.
data = data.ffill()
data.tail(10)

Unnamed: 0,Sentence #,Word,Tag
369382,Sentence: 16858,vùng,O
369383,Sentence: 16858,chôn_cất,O
369384,Sentence: 16858,người_thân,O
369385,Sentence: 16858,khi,O
369386,Sentence: 16858,qua_đời,O
369387,Sentence: 16858,",",O
369388,Sentence: 16858,gọi,O
369389,Sentence: 16858,là,O
369390,Sentence: 16858,nhị_tì,O
369391,Sentence: 16858,...,O


In [26]:
# Baseline: classify every token from its own orthographic features only.
# NOTE: despite its name, `words` holds the feature vectors, not the raw strings.
words = [feature_map(w) for w in data["Word"].values.tolist()]
tags = data["Tag"].values.tolist()
# A fixed random_state makes the forest - and therefore the report - reproducible.
pred = cross_val_predict(RandomForestClassifier(n_estimators=20, random_state=0),
                         X=words, y=tags, cv=5)

report = classification_report(y_pred=pred, y_true=tags)
print(report)

              precision    recall  f1-score   support

    LOCATION       0.71      0.49      0.58      9028
        MISC       0.00      0.00      0.00       561
           O       0.97      0.99      0.98    345533
ORGANIZATION       0.19      0.00      0.01      3268
      PERSON       0.57      0.45      0.50     11002

 avg / total       0.94      0.95      0.94    369392



In [27]:
class FeatureTransformer(BaseEstimator, TransformerMixin):
    '''Turn a frame of tokens into per-token feature vectors.

    Each vector holds the six orthographic features of the word followed by
    the encoded memory-tagger predictions for the next and the previous word.
    '''

    def __init__(self):
        self.memory_tagger = MemoryTagger()
        self.tag_encoder = LabelEncoder()

    def fit(self, X, y):
        '''
        Fit the memory tagger and the tag encoder.

        X must provide "Word" and "Tag" columns; the tags are read from
        X["Tag"] and the `y` argument is not used. Returns self.
        '''
        words = X["Word"].values.tolist()
        tags = X["Tag"].values.tolist()
        self.memory_tagger.fit(words, tags)
        self.tag_encoder.fit(tags)
        return self

    def transform(self, X, y=None):
        '''
        Return a list with one numpy feature vector per row of X["Word"].

        Layout: [istitle, islower, isupper, len, isdigit, isalpha,
                 encoded memory tag of the next word,
                 encoded memory tag of the previous word].
        The encoded 'O' tag is used when there is no next word, when there is
        no previous word, or when the previous word is ".". The look-ahead
        applies no such "." check.
        '''
        words = X["Word"].values.tolist()
        if not words:
            return []

        # Encode once instead of calling the encoder on one-element lists
        # twice per token inside the loop.
        outside = self.tag_encoder.transform(['O'])[0]
        memory_codes = self.tag_encoder.transform(self.memory_tagger.predict(words))

        last = len(words) - 1
        out = []
        for i, w in enumerate(words):
            wp = memory_codes[i + 1] if i < last else outside
            if i > 0 and words[i - 1] != ".":
                wm = memory_codes[i - 1]
            else:
                wm = outside
            out.append(np.array([w.istitle(), w.islower(), w.isupper(), len(w),
                                 w.isdigit(), w.isalpha(), wp, wm]))
        return out

In [28]:
# Same evaluation, with the memory-tagger context features added in front of the forest.
# A fixed random_state makes the forest - and therefore the report - reproducible.
pred = cross_val_predict(Pipeline([("feature_map", FeatureTransformer()),
                                   ("clf", RandomForestClassifier(n_estimators=20, n_jobs=3,
                                                                  random_state=0))]),
                         X=data, y=tags, cv=5)

report = classification_report(y_pred=pred, y_true=tags)
print(report)

              precision    recall  f1-score   support

    LOCATION       0.72      0.56      0.63      9028
        MISC       0.86      0.13      0.22       561
           O       0.97      1.00      0.98    345533
ORGANIZATION       0.66      0.31      0.42      3268
      PERSON       0.85      0.48      0.61     11002

 avg / total       0.96      0.96      0.96    369392

