# CRF based NER
- [참고 코드](https://lovit.github.io/nlp/2018/06/22/crf_based_ner/)
- data : [MSRA](https://github.com/OYE93/Chinese-NLP-Corpus/tree/master/NER/MSRA)

# 0. Settings

In [1]:
!pip install python-crfsuite



In [2]:
import nltk
import pycrfsuite
import warnings
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing
from itertools import chain
#warnings.filterwarnings('ignore')

# 1. Data

In [46]:
# Paths to the BIO-tagged MSRA train and test files.
train_set, test_set = (
    './data/msra_train_bio.txt',
    './data/msra_test_bio.txt',
)

###  데이터 형식
- 한 줄에 `token<TAB>NER 태그` 형식 (token과 태그는 탭 문자로 구분)
- 문장과 문장 사이는 빈 줄('\n' 문자만 있는 줄)로 구분되어 있음

In [52]:
# Peek at the first 10 raw lines of the training file.
# The context manager closes the handle (the original left it open), the
# encoding is pinned so the Chinese text decodes the same on every platform,
# and only the first 10 lines are read instead of the whole file.
with open(train_set, encoding='utf-8') as f:
    preview = [line for _, line in zip(range(10), f)]
preview

['当\tO\n',
 '希\tO\n',
 '望\tO\n',
 '工\tO\n',
 '程\tO\n',
 '救\tO\n',
 '助\tO\n',
 '的\tO\n',
 '百\tO\n',
 '万\tO\n']

### Preprocessing

In [44]:
def raw_data_preprocessing(file_name):
    """Parse a BIO-tagged text file into a list of tagged sentences.

    Every non-blank line is expected to hold ``token<TAB>tag``; a blank line
    ends the current sentence. The last sentence is kept even when the file
    does not end with a blank line.

    Args:
        file_name: Path of the tab-separated file; it is read as UTF-8.

    Returns:
        A list of sentences, each a list of ``(token, ner_tag)`` tuples.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    tagged_sentences = []
    sentence = []
    with open(file_name, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in f:
            if line == '\n':
                # A blank line closes the sentence collected so far.
                if sentence:
                    tagged_sentences.append(sentence)
                    sentence = []
                continue
            if line == '0\t\n':
                # Kept from the original: presumably a malformed row in the
                # corpus that carries no tag -- TODO confirm.
                continue
            word, ner_tag = line.strip().split('\t')
            sentence.append((word, ner_tag))  # keep only the token and its NER tag
    # Flush the trailing sentence when the file has no final blank line.
    if sentence:
        tagged_sentences.append(sentence)
    return tagged_sentences

In [54]:
# Parse both splits into lists of (token, tag) sentences.
train_sents, test_sents = map(raw_data_preprocessing, (train_set, test_set))

In [55]:
# Number of parsed sentences in the train and test splits.
len(train_sents), len(test_sents)

(45000, 3442)

In [53]:
# Inspect one parsed training sentence as (token, tag) pairs.
train_sents[40000]

[('胡', 'B-PER'),
 ('锦', 'I-PER'),
 ('涛', 'I-PER'),
 ('欢', 'O'),
 ('迎', 'O'),
 ('伊', 'B-PER'),
 ('利', 'I-PER'),
 ('埃', 'I-PER'),
 ('斯', 'I-PER'),
 ('库', 'I-PER'),
 ('率', 'O'),
 ('团', 'O'),
 ('访', 'O'),
 ('华', 'B-LOC'),
 ('，', 'O'),
 ('并', 'O'),
 ('表', 'O'),
 ('示', 'O'),
 ('相', 'O'),
 ('信', 'O'),
 ('此', 'O'),
 ('次', 'O'),
 ('来', 'O'),
 ('访', 'O'),
 ('将', 'O'),
 ('对', 'O'),
 ('两', 'O'),
 ('党', 'O'),
 ('、', 'O'),
 ('两', 'O'),
 ('国', 'O'),
 ('关', 'O'),
 ('系', 'O'),
 ('的', 'O'),
 ('发', 'O'),
 ('展', 'O'),
 ('起', 'O'),
 ('到', 'O'),
 ('促', 'O'),
 ('进', 'O'),
 ('作', 'O'),
 ('用', 'O'),
 ('。', 'O')]

# 2. using PyCRFSuite 
## PyCRFSuite란?
- c++로 구현된 CRFSuite 구현체를 Python 환경에서 이용할 수 있도록 도와주는 라이브러리
- 해당 라이브러리를 이용하기위해서는 potential function을 직접 디자인해야 함

### potential function
- word2features는 문장 sent의 시점 i에 대한 potential function임

In [64]:
def word2features(sent, i):
    """Build the CRF feature list for position ``i`` of ``sent``.

    Only the token text (element 0 of each item) is read. The original also
    read element 1 -- the gold NER tag -- into unused locals; those reads are
    removed so the label cannot leak into the features and so unlabeled
    sequences (items without a tag) can be featurized for prediction.

    Args:
        sent: Sequence of items whose first element is the token string,
            e.g. ``(token, tag)`` tuples.
        i: Index of the token to describe.

    Returns:
        A list of feature strings: a bias term, surface features of the
        current token, the same kind of features for the previous and next
        tokens, and ``'BOS'`` / ``'EOS'`` markers at the sentence edges.
    """
    word = sent[i][0]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
    ]

    if i > 0:
        prev_word = sent[i - 1][0]
        features.extend([
            '-1:word.lower=' + prev_word.lower(),
            '-1:word.istitle=%s' % prev_word.istitle(),
            '-1:word.isupper=%s' % prev_word.isupper(),
        ])
    else:
        # No left neighbour: mark the beginning of the sentence.
        features.append('BOS')

    if i < len(sent) - 1:
        next_word = sent[i + 1][0]
        features.extend([
            '+1:word.lower=' + next_word.lower(),
            '+1:word.istitle=%s' % next_word.istitle(),
            '+1:word.isupper=%s' % next_word.isupper(),
        ])
    else:
        # No right neighbour: mark the end of the sentence.
        features.append('EOS')

    return features

### Sentence to features, labels, tokens 

In [65]:
def sent2features(sent):
    """Return the feature list of every position in ``sent``, in order."""
    feature_seq = []
    for position in range(len(sent)):
        feature_seq.append(word2features(sent, position))
    return feature_seq
def sent2labels(sent):
    """Return the tag of each ``(token, tag)`` pair in ``sent``, in order."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags
def sent2tokens(sent):
    """Return the token of each ``(token, tag)`` pair in ``sent``, in order."""
    tokens = []
    for tok, _tag in sent:
        tokens.append(tok)
    return tokens

In [66]:
# Feature list produced for the first token of the first training sentence.
sent2features(train_sents[0])[0]

['bias',
 'word.lower=当',
 'word[-3:]=当',
 'word[-2:]=当',
 'word.isupper=False',
 'word.istitle=False',
 'word.isdigit=False',
 'BOS',
 '+1:word.lower=希',
 '+1:word.istitle=False',
 '+1:word.isupper=False']

In [67]:
# Convert every sentence into the feature / label sequences used for training.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

# 3. Train the model
## 3-1. 주어진 모든 feature를 다 가지고 학습

In [68]:
# Create the trainer and hand it every (features, labels) training sequence.
trainer = pycrfsuite.Trainer(verbose=False)

for features, labels in zip(X_train, y_train):
    trainer.append(features, labels)

In [69]:
# Training hyper-parameters, collected in one dict before being applied.
crf_params = {
    'c1': 1.0,             # coefficient for the L1 penalty
    'c2': 1e-3,            # coefficient for the L2 penalty
    'max_iterations': 50,  # stop earlier
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
    # minimum frequency: only features seen at least five times are used
    'feature.minfreq': 5,
}
trainer.set_params(crf_params)

In [73]:
# Train the model and write it to disk.
import os

model_name = './model/msra_cn.crfsuite'
# Make sure the output directory exists before training; presumably the
# trainer does not create missing directories itself (verify against the
# python-crfsuite docs).
os.makedirs(os.path.dirname(model_name), exist_ok=True)
trainer.train(model_name)

In [74]:
tagger = pycrfsuite.Tagger() # load the trained model into a tagger
tagger.open(model_name)

<contextlib.closing at 0x7f9e60479210>

In [77]:
# Tag one test sentence and print the prediction next to the gold labels.
ex_sent = test_sents[10]

ex_tokens = sent2tokens(ex_sent)
print(' '.join(ex_tokens), end='\n\n')

ex_predicted = tagger.tag(sent2features(ex_sent))
print("Predicted:", ', '.join(ex_predicted))

ex_gold = sent2labels(ex_sent)
print("Correct:", ', '.join(ex_gold))

在 跨 世 纪 的 征 途 上 ， 在 中 国 共 产 党 领 导 下 ， 我 们 要 努 力 实 现 包 括 各 民 主 党 派 、 各 人 民 团 体 、 无 党 派 人 士 在 内 的 全 体 中 国 人 民 的 大 团 结 ， 实 现 包 括 大 陆 同 胞 、 台 港 澳 同 胞 和 海 外 侨 胞 在 内 的 所 有 爱 国 的 中 华 儿 女 的 大 团 结 ， 从 而 战 胜 各 种 艰 难 险 阻 ， 实 现 跨 世 纪 的 宏 伟 蓝 图 。

Predicted: O, O, O, O, O, O, O, O, O, O, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC, O, O, O, B-LOC, I-LOC, I-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O
Correct: O, O, O, O, O, O, O, O, O, O, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC, O, O, O, B-LOC, B-LOC, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O, O, O

In [78]:
# for tagging performance
def bio_classification_report(y_true, y_pred):
    """Build a token-level classification report for BIO-tagged sequences.

    The nested label sequences are flattened and one-hot encoded with a
    ``LabelBinarizer`` fitted on the gold labels; every tag except "O" is
    then reported.

    Note (from the original author): requires scikit-learn 0.15+ (or a
    version from github master) to calculate averages properly.

    Args:
        y_true: Iterable of gold label sequences.
        y_pred: Iterable of predicted label sequences.

    Returns:
        Whatever ``classification_report`` returns for the binarized labels.
    """
    binarizer = preprocessing.LabelBinarizer()
    gold_flat = list(chain.from_iterable(y_true))
    y_true_combined = binarizer.fit_transform(gold_flat)
    pred_flat = list(chain.from_iterable(y_pred))
    y_pred_combined = binarizer.transform(pred_flat)

    # Column index of each class inside the binarized matrices.
    class_indices = {}
    for idx, cls in enumerate(binarizer.classes_):
        class_indices[cls] = idx

    # Drop "O", then order by entity type first and B/I prefix second.
    def _entity_then_prefix(tag):
        return tag.split('-', 1)[::-1]

    tagset = sorted(set(binarizer.classes_) - {'O'}, key=_entity_then_prefix)
    label_columns = [class_indices[tag] for tag in tagset]

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=label_columns,
        target_names=tagset,
    )

In [79]:
# Gold labels and the tagger's prediction for every test sentence.
y_true = y_test
y_pred = [tagger.tag(sent2features(sent)) for sent in test_sents]

In [80]:
# Print the report so its newlines are rendered as a table; as a bare
# expression the notebook shows the escaped repr with literal "\n".
print(bio_classification_report(y_true, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n       B-LOC       0.88      0.75      0.81      2886\n       I-LOC       0.83      0.66      0.74      4405\n       B-ORG       0.70      0.62      0.66      1331\n       I-ORG       0.70      0.72      0.71      5646\n       B-PER       0.92      0.64      0.76      1973\n       I-PER       0.83      0.83      0.83      3851\n\n   micro avg       0.80      0.72      0.75     20092\n   macro avg       0.81      0.70      0.75     20092\nweighted avg       0.80      0.72      0.75     20092\n samples avg       0.08      0.08      0.08     20092\n'

## 3-2. 한정된 feature만 가지고 학습
- bias, word[-3:], word[-2:], 앞/뒤 글자의 word.lower (그리고 문장 경계 표시 BOS/EOS)만 이용

In [81]:
def word2features(sent, i):
    """Build the reduced CRF feature list for position ``i`` of ``sent``.

    Reduced feature set: a bias term, the last three / last two characters
    of the current token, and the lowercased previous and next tokens.

    Only the token text (element 0 of each item) is read. The original also
    read element 1 -- the gold NER tag -- into unused locals; those reads are
    removed so the label cannot leak into the features and so unlabeled
    sequences (items without a tag) can be featurized for prediction.

    Args:
        sent: Sequence of items whose first element is the token string,
            e.g. ``(token, tag)`` tuples.
        i: Index of the token to describe.

    Returns:
        A list of feature strings, with ``'BOS'`` / ``'EOS'`` standing in
        for a missing left / right neighbour.
    """
    word = sent[i][0]
    features = [
        'bias',
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
    ]

    if i > 0:
        prev_word = sent[i - 1][0]
        features.append('-1:word.lower=' + prev_word.lower())
    else:
        # No left neighbour: mark the beginning of the sentence.
        features.append('BOS')

    if i < len(sent) - 1:
        next_word = sent[i + 1][0]
        features.append('+1:word.lower=' + next_word.lower())
    else:
        # No right neighbour: mark the end of the sentence.
        features.append('EOS')

    return features


In [82]:
# Rebuild the feature / label sequences with the reduced feature extractor.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

In [83]:
# Create a fresh trainer and hand it every (features, labels) training sequence.
trainer = pycrfsuite.Trainer(verbose=False)

for features, labels in zip(X_train, y_train):
    trainer.append(features, labels)

In [84]:
# Training hyper-parameters, collected in one dict before being applied.
crf_params = {
    'c1': 1.0,             # coefficient for the L1 penalty
    'c2': 1e-3,            # coefficient for the L2 penalty
    'max_iterations': 50,  # stop earlier
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
    # minimum frequency: only features seen at least five times are used
    'feature.minfreq': 5,
}
trainer.set_params(crf_params)

In [86]:
# Train the reduced-feature model, write it to disk, then load it into a tagger.
import os

model_name = './model/msra_cn_lower_features.crfsuite'
# Make sure the output directory exists before training; presumably the
# trainer does not create missing directories itself (verify against the
# python-crfsuite docs).
os.makedirs(os.path.dirname(model_name), exist_ok=True)
trainer.train(model_name)
tagger = pycrfsuite.Tagger()  # load the trained model into a tagger
tagger.open(model_name)

<contextlib.closing at 0x7f9d34bc5690>

In [87]:
# Gold labels and the reduced-feature tagger's prediction for every test sentence.
y_true = y_test
y_pred = [tagger.tag(sent2features(sent)) for sent in test_sents]

In [88]:
# Print the report so its newlines are rendered as a table; as a bare
# expression the notebook shows the escaped repr with literal "\n".
print(bio_classification_report(y_true, y_pred))

'              precision    recall  f1-score   support\n\n       B-LOC       0.86      0.74      0.79      2886\n       I-LOC       0.80      0.60      0.69      4405\n       B-ORG       0.69      0.59      0.64      1331\n       I-ORG       0.69      0.70      0.70      5646\n       B-PER       0.91      0.62      0.74      1973\n       I-PER       0.82      0.83      0.83      3851\n\n   micro avg       0.78      0.69      0.73     20092\n   macro avg       0.80      0.68      0.73     20092\nweighted avg       0.79      0.69      0.73     20092\n samples avg       0.08      0.08      0.08     20092\n'

# 4.모델 확인
- 영향력이 높은 features, 각각에 해당하는 weight확인
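- 이 시점의 `tagger`는 직전에 학습한 3-2 모델(한정된 feature)이므로, 아래 결과는 3-2 모델의 weight임 (3-1에서 모든 feature를 이용했던 모델을 확인하려면 `./model/msra_cn.crfsuite`를 다시 불러와야 함)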
- 해당 결과로 ner tagging에서 중요한 정보는 앞/뒤에 등장하는 단어임을 알 수 있음

In [89]:
# Collect the state features attached to LOC tags and print the 50 with the
# largest weights.
debugger = tagger.info()
weights = debugger.state_features

location_features = {}
for feature, weight in weights.items():
    # feature[1] is the tag part of the feature key.
    if 'LOC' in feature[1]:
        location_features[feature] = weight

top_location = sorted(
    location_features.items(),
    key=lambda item: item[1],
    reverse=True,
)[:50]
for feature, weight in top_location:
    print(f'{feature} : {weight}')

('+1:word.lower=堂', 'I-LOC') : 3.928522
('word[-3:]=淮', 'B-LOC') : 3.210983
('word[-2:]=淮', 'B-LOC') : 3.210983
('-1:word.lower=赴', 'B-LOC') : 3.190209
('-1:word.lower=℃', 'B-LOC') : 3.186736
('+1:word.lower=寺', 'I-LOC') : 3.005503
('+1:word.lower=运', 'B-LOC') : 2.944599
('+1:word.lower=两', 'B-LOC') : 2.904629
('-1:word.lower=抗', 'B-LOC') : 2.898436
('-1:word.lower=访', 'B-LOC') : 2.879258
('-1:word.lower=县', 'B-LOC') : 2.76594
('-1:word.lower=拿', 'I-LOC') : 2.662926
('-1:word.lower=省', 'B-LOC') : 2.625714
('-1:word.lower=报', 'B-LOC') : 2.531602
('+1:word.lower=友', 'B-LOC') : 2.464358
('+1:word.lower=关', 'B-LOC') : 2.429949
('+1:word.lower=球', 'B-LOC') : 2.414464
('+1:word.lower=举', 'I-LOC') : 2.401059
('-1:word.lower=社', 'B-LOC') : 2.383605
('+1:word.lower=村', 'I-LOC') : 2.337183
('+1:word.lower=晴', 'I-LOC') : 2.336281
('+1:word.lower=州', 'B-LOC') : 2.300002
('+1:word.lower=谷', 'B-LOC') : 2.288912
('+1:word.lower=双', 'B-LOC') : 2.23587
('+1:word.lower=街', 'I-LOC') : 2.203206
('-1:word.