# CRF based NER
- [참고 코드](https://lovit.github.io/nlp/2018/06/22/crf_based_ner/)
- data : [MSRA](https://github.com/OYE93/Chinese-NLP-Corpus/tree/master/NER/MSRA)

# 0. Settings

In [1]:
!pip install python-crfsuite



In [12]:
import nltk
import pycrfsuite
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing
from itertools import chain
from seqeval.metrics import classification_report
from sklearn.model_selection import train_test_split

# 1. Data preparation
### Load the data

In [4]:
# Relative path of the tagged corpus file that the cells below open and parse.
data_set = '../data/dataset'

###  Data format
- token '\t' ne-tagging
- 문장 구별은 '\n' 문자로 되어있음

In [5]:
# Peek at the first 10 raw lines of the corpus.
# The context manager closes the handle (the bare open() leaked it), the
# explicit encoding keeps the Chinese text from depending on the platform
# default, and zip(range(10), f) stops after 10 lines instead of reading the
# whole file into memory.
with open(data_set, encoding='utf-8') as f:
    head = [line for _, line in zip(range(10), f)]
head

['中\tB-ORG\n',
 '共\tI-ORG\n',
 '中\tI-ORG\n',
 '央\tI-ORG\n',
 '致\tO\n',
 '中\tB-ORG\n',
 '国\tI-ORG\n',
 '致\tI-ORG\n',
 '公\tI-ORG\n',
 '党\tI-ORG\n']

### Data Preprocessing

In [6]:
def raw_data_preprocessing(file_name):
    """Parse a tab-separated token/tag file into a list of tagged sentences.

    Each data line is expected to look like ``token<TAB>tag``. A line that
    consists only of a newline ends the current sentence.

    Args:
        file_name: Path of the corpus file. It is read as UTF-8.

    Returns:
        A list of sentences, each a list of ``(word, ner_tag)`` tuples.

    Raises:
        ValueError: If a data line does not split into exactly two
            tab-separated fields.
    """
    tagged_sentences = []
    sentence = []
    with open(file_name, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of loading every line with readlines().
        for line_no, line in enumerate(f, start=1):
            if line == '\n':
                # Blank line: close the sentence collected so far (if any).
                if sentence:
                    tagged_sentences.append(sentence)
                    sentence = []
                continue
            if line == '0\t\n':
                # Known malformed record (token "0" with an empty tag): skip.
                continue
            fields = line.strip().split('\t')
            if len(fields) != 2:
                raise ValueError(
                    '{}:{}: expected "token<TAB>tag", got {!r}'.format(
                        file_name, line_no, line))
            word, ner_tag = fields
            sentence.append((word, ner_tag))  # keep only the word and its NE tag
    # Flush the last sentence when the file does not end with a blank line;
    # without this the final sentence was silently dropped.
    if sentence:
        tagged_sentences.append(sentence)
    return tagged_sentences

In [8]:
# Parse the whole corpus into a list of sentences of (word, ner_tag) tuples.
sents = raw_data_preprocessing(data_set)

In [9]:
# Number of sentences parsed from the corpus.
len(sents)

78150

In [14]:
# Hold out 20% of the sentences for testing; random_state pins the shuffle.
train_sents, test_sents = train_test_split(sents, test_size=0.2, random_state=42)

In [15]:
# Report how many sentences ended up in each split.
print("train set :", len(train_sents))
print("test set: ", len(test_sents))

train set : 62520
test set:  15630


In [34]:
# Echo the held-out sentences (the notebook displays the value of the last expression).
test_sents

[[('在', 'O'),
  ('抓', 'O'),
  ('举', 'O'),
  ('比', 'O'),
  ('赛', 'O'),
  ('中', 'O'),
  ('，', 'O'),
  ('王', 'B-PER'),
  ('宏', 'I-PER'),
  ('宇', 'I-PER'),
  ('试', 'O'),
  ('举', 'O'),
  ('只', 'O'),
  ('成', 'O'),
  ('功', 'O'),
  ('了', 'O'),
  ('一', 'O'),
  ('次', 'O'),
  ('（', 'O'),
  ('1', 'O'),
  ('5', 'O'),
  ('0', 'O'),
  ('公', 'O'),
  ('斤', 'O'),
  ('）', 'O'),
  ('，', 'O'),
  ('落', 'O'),
  ('后', 'O'),
  ('马', 'B-PER'),
  ('尔', 'I-PER'),
  ('科', 'I-PER'),
  ('夫', 'I-PER'),
  ('7', 'O'),
  ('．', 'O'),
  ('5', 'O'),
  ('公', 'O'),
  ('斤', 'O'),
  ('。', 'O')],
 [('由', 'O'),
  ('“', 'B-ORG'),
  ('青', 'I-ORG'),
  ('岛', 'I-ORG'),
  ('”', 'I-ORG'),
  ('号', 'I-ORG'),
  ('导', 'I-ORG'),
  ('弹', 'I-ORG'),
  ('驱', 'I-ORG'),
  ('逐', 'I-ORG'),
  ('舰', 'I-ORG'),
  ('和', 'O'),
  ('“', 'B-ORG'),
  ('世', 'I-ORG'),
  ('昌', 'I-ORG'),
  ('”', 'I-ORG'),
  ('号', 'I-ORG'),
  ('综', 'I-ORG'),
  ('合', 'I-ORG'),
  ('训', 'I-ORG'),
  ('练', 'I-ORG'),
  ('舰', 'I-ORG'),
  ('组', 'O'),
  ('成', 'O'),
  ('的', 'O'),
  ('中', '

# 2. using PyCRFSuite 
## PyCRFSuite란?
- c++로 구현된 CRFSuite 구현체를 Python 환경에서 이용할 수 있도록 도와주는 라이브러리
- 해당 라이브러리를 이용하기 위해서는 potential function을 직접 디자인해야 함

### potential function
- word2features는 문장 sent의 시점 i에 대한 potential function임

In [16]:
def word2features(sent, i):
    """Build the list of string features for the token at position ``i``.

    Only the token text (element 0 of each item in ``sent``) is read, so the
    function also works on sequences whose items carry no label. The former
    unused ``postag1`` lookups read element 1 of the neighbours and made
    unlabeled input fail with ``IndexError``.

    Args:
        sent: Sequence of items whose first element is the token string,
            e.g. ``(word, ner_tag)`` tuples.
        i: Index of the token to describe.

    Returns:
        A list of feature strings: a bias term, the lowercased token and its
        upper/title/digit flags, the same lower/title/upper information for
        the previous and next tokens when they exist, and ``'BOS'`` / ``'EOS'``
        markers at the sentence boundaries.
    """
    word = sent[i][0]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
    ]

    if i > 0:
        prev_word = sent[i - 1][0]
        features.extend([
            '-1:word.lower=' + prev_word.lower(),
            '-1:word.istitle=%s' % prev_word.istitle(),
            '-1:word.isupper=%s' % prev_word.isupper(),
        ])
    else:
        # No left neighbour: mark beginning of sentence.
        features.append('BOS')

    if i < len(sent) - 1:
        next_word = sent[i + 1][0]
        features.extend([
            '+1:word.lower=' + next_word.lower(),
            '+1:word.istitle=%s' % next_word.istitle(),
            '+1:word.isupper=%s' % next_word.isupper(),
        ])
    else:
        # No right neighbour: mark end of sentence.
        features.append('EOS')

    return features

### Sentence to features, labels, tokens 

In [17]:
def sent2features(sent):
    """Return one feature list per token of ``sent``, in token order."""
    positions = range(len(sent))
    return [word2features(sent, position) for position in positions]
def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``."""
    return [tag for _word, tag in sent]
def sent2tokens(sent):
    """Return the token of every ``(token, label)`` pair in ``sent``."""
    return [word for word, _tag in sent]

In [18]:
# Inspect the features generated for the first token of the first training sentence.
sent2features(train_sents[0])[0]

['bias',
 'word.lower=原',
 'word.isupper=False',
 'word.istitle=False',
 'word.isdigit=False',
 'BOS',
 '+1:word.lower=来',
 '+1:word.istitle=False',
 '+1:word.isupper=False']

In [19]:
# Convert every sentence into the per-token feature lists and label lists
# used for training and evaluation.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

# 3. Train the model
## 3-1. 주어진 모든 feature를 다 가지고 학습

In [20]:
# Create the trainer and append every (feature sequence, label sequence)
# pair to it so that it is ready for training.
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq,yseq)

In [21]:
# parameter setting
# Only features that occur at least 3 times are used (see 'feature.minfreq'
# below; the previous comment said five, which did not match the value).
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
    
    # minimum frequency
    'feature.minfreq': 3
})

In [22]:
# Train the model; the output path is passed to trainer.train().
model_name = './model/chinese_sum.crfsuite'
trainer.train(model_name)

In [23]:
# Same model path as the training cell, repeated so this cell can run on its own.
model_name = './model/chinese_sum.crfsuite'
tagger = pycrfsuite.Tagger() # tagger that the trained model is loaded into
tagger.open(model_name)

<contextlib.closing at 0x7fdfc51f1810>

In [36]:
# Run NER tagging on one test sentence and print the predicted tags next to
# the gold tags for comparison.
ex_sent = test_sents[1]
print(' '.join(sent2tokens(ex_sent)), end='\n\n')
print("Predicted:",', '.join(tagger.tag(sent2features(ex_sent))))
print("Correct:",', '.join(sent2labels(ex_sent)))

由 “ 青 岛 ” 号 导 弹 驱 逐 舰 和 “ 世 昌 ” 号 综 合 训 练 舰 组 成 的 中 国 海 军 舰 艇 编 队 今 天 上 午 驶 抵 奥 克 兰 港 ， 开 始 对 新 西 兰 为 期 ３ 天 的 友 好 访 问 。

Predicted: O, O, B-LOC, I-LOC, I-LOC, I-LOC, O, O, O, O, O, O, O, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, O, O, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, O, O, O, O, O, O, B-LOC, I-LOC, I-LOC, I-LOC, O, O, O, O, B-LOC, I-LOC, I-LOC, O, O, O, O, O, O, O, O, O, O
Correct: O, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, O, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, O, O, O, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, I-ORG, O, O, O, O, O, O, B-LOC, I-LOC, I-LOC, I-LOC, O, O, O, O, B-LOC, I-LOC, I-LOC, O, O, O, O, O, O, O, O, O, O


In [25]:
# for tagging performance
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Args:
        y_true: Iterable of gold tag sequences (one list of tags per sentence).
        y_pred: Iterable of predicted tag sequences, aligned with ``y_true``.

    Returns:
        The report produced by scikit-learn's ``classification_report``,
        restricted to the non-"O" tags.
    """
    # The notebook's imports rebind the bare name `classification_report` to
    # seqeval's version (imported after scikit-learn's). That function has no
    # `labels` / `target_names` parameters, so calling it here raised a
    # TypeError. Import scikit-learn's implementation under an explicit name.
    from sklearn.metrics import classification_report as sk_classification_report

    lb = preprocessing.LabelBinarizer()
    # Flatten the per-sentence sequences into one token-level list each.
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    # Report every tag except "O", ordered by entity type first and then by
    # the B/I prefix (the split result is reversed to form the sort key).
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return sk_classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )

In [26]:
# Gold labels, plus the tagger's prediction for every test sentence.
y_true = y_test
y_pred = [tagger.tag(sent2features(test_sent)) for test_sent in test_sents]

In [27]:
# NOTE(review): `classification_report` here is presumably the seqeval one
# (it is imported after scikit-learn's and takes lists of tag sequences) — confirm.
print(classification_report(y_true, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         GPE       0.00      0.00      0.00        55
         LOC       0.79      0.71      0.75     12580
         ORG       0.73      0.55      0.63      6835
         PER       0.82      0.66      0.73      6380

   micro avg       0.78      0.66      0.72     25850
   macro avg       0.59      0.48      0.53     25850
weighted avg       0.78      0.66      0.71     25850



## 3-2. 한정된 feature만 가지고 학습
- bias, word lower, word[-3:], word[-2:]만 이용

In [173]:
def word2features(sent, i):
    """Build a reduced list of string features for the token at position ``i``.

    Only the token text (element 0 of each item in ``sent``) is read, so the
    function also works on sequences whose items carry no label. The former
    unused ``postag1`` lookups read element 1 of the neighbours and made
    unlabeled input fail with ``IndexError``.

    NOTE(review): the section heading lists "word lower" among the features,
    but the lowercased form of the *current* token is not emitted here; only
    the neighbours' lowercased forms are. Confirm which one is intended.

    Args:
        sent: Sequence of items whose first element is the token string,
            e.g. ``(word, ner_tag)`` tuples.
        i: Index of the token to describe.

    Returns:
        A list of feature strings: a bias term, the last three and last two
        characters of the token, the lowercased previous and next tokens when
        they exist, and ``'BOS'`` / ``'EOS'`` markers at the sentence
        boundaries.
    """
    word = sent[i][0]
    features = [
        'bias',
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
    ]

    if i > 0:
        prev_word = sent[i - 1][0]
        features.append('-1:word.lower=' + prev_word.lower())
    else:
        # No left neighbour: mark beginning of sentence.
        features.append('BOS')

    if i < len(sent) - 1:
        next_word = sent[i + 1][0]
        features.append('+1:word.lower=' + next_word.lower())
    else:
        # No right neighbour: mark end of sentence.
        features.append('EOS')

    return features


In [174]:
# Convert every sentence into the per-token feature lists and label lists
# used for training and evaluation.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

In [175]:
# Create a fresh trainer and append every (feature sequence, label sequence)
# pair to it so that it is ready for training.
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq,yseq)

KeyboardInterrupt: 

In [None]:
# parameter setting
# Only features that occur at least five times are used (see 'feature.minfreq' below).
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
    
    # minimum frequency
    'feature.minfreq': 5
})

In [None]:
# Train the model, then open the resulting file with a new tagger.
model_name = './model/msra_cn_lower_features.crfsuite'
trainer.train(model_name)
tagger = pycrfsuite.Tagger() # tagger that the trained model is loaded into
tagger.open(model_name)

In [None]:
# Gold labels, plus the tagger's prediction for every test sentence.
y_true = y_test
y_pred = [tagger.tag(sent2features(test_sent)) for test_sent in test_sents]

In [None]:
# NOTE(review): `classification_report` here is presumably the seqeval one
# (it is imported after scikit-learn's and takes lists of tag sequences) — confirm.
print(classification_report(y_true, y_pred))

# 4. 모델 확인
- 영향력이 높은 features, 각각에 해당하는 weight확인
- 3-1에서 모든 feature를 이용했던 모델로 평가
- 해당 결과로 ner tagging에서 중요한 정보는 앞/뒤에 등장하는 단어임을 알 수 있음

In [None]:
# Pull the learned state-feature weights out of the tagger and keep the
# entries whose key has 'LOC' in its second element.
debugger = tagger.info()
weights = debugger.state_features
location_features = {
    key: value
    for key, value in weights.items()
    if 'LOC' in key[1]
}

# Print the 50 entries with the largest weights, highest first.
for feature, weight in sorted(location_features.items(), key=lambda item: -item[1])[:50]:
    print(f'{feature} : {weight}')