# CRF based NER
- [참고 코드](https://lovit.github.io/nlp/2018/06/22/crf_based_ner/)
- data : CoNLL 2002 dataset

# 0. Settings

In [26]:
!pip install python-crfsuite



In [27]:
import nltk
import pycrfsuite
import warnings
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing
from itertools import chain
#warnings.filterwarnings('ignore')

# 1. Load the Data

In [28]:
# Download the CoNLL 2002 corpus and list the file ids it provides.
nltk.download('conll2002')
nltk.corpus.conll2002.fileids()

[nltk_data] Downloading package conll2002 to /Users/mac/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

In [29]:
# Materialise the 'esp.train' and 'esp.testb' files as lists of IOB-tagged
# sentences (the first is used for training, the second for evaluation).
data = nltk.corpus.conll2002
train_sents, test_sents = (
    list(data.iob_sents(fileid)) for fileid in ('esp.train', 'esp.testb')
)

In [30]:
# Each sentence is a list of (token, pos_tag, ner_tag) tuples.
# Evaluating the bare name displays the whole training list.
train_sents

[[('Melbourne', 'NP', 'B-LOC'),
  ('(', 'Fpa', 'O'),
  ('Australia', 'NP', 'B-LOC'),
  (')', 'Fpt', 'O'),
  (',', 'Fc', 'O'),
  ('25', 'Z', 'O'),
  ('may', 'NC', 'O'),
  ('(', 'Fpa', 'O'),
  ('EFE', 'NC', 'B-ORG'),
  (')', 'Fpt', 'O'),
  ('.', 'Fp', 'O')],
 [('-', 'Fg', 'O')],
 [('El', 'DA', 'O'),
  ('Abogado', 'NC', 'B-PER'),
  ('General', 'AQ', 'I-PER'),
  ('del', 'SP', 'I-PER'),
  ('Estado', 'NC', 'I-PER'),
  (',', 'Fc', 'O'),
  ('Daryl', 'VMI', 'B-PER'),
  ('Williams', 'NC', 'I-PER'),
  (',', 'Fc', 'O'),
  ('subrayó', 'VMI', 'O'),
  ('hoy', 'RG', 'O'),
  ('la', 'DA', 'O'),
  ('necesidad', 'NC', 'O'),
  ('de', 'SP', 'O'),
  ('tomar', 'VMN', 'O'),
  ('medidas', 'NC', 'O'),
  ('para', 'SP', 'O'),
  ('proteger', 'VMN', 'O'),
  ('al', 'SP', 'O'),
  ('sistema', 'NC', 'O'),
  ('judicial', 'AQ', 'O'),
  ('australiano', 'AQ', 'O'),
  ('frente', 'RG', 'O'),
  ('a', 'SP', 'O'),
  ('una', 'DI', 'O'),
  ('página', 'NC', 'O'),
  ('de', 'SP', 'O'),
  ('internet', 'NC', 'O'),
  ('que', 'PR', 'O'),

# 2. Using PyCRFSuite
## PyCRFSuite란?
- C++로 구현된 CRFSuite를 Python 환경에서 이용할 수 있도록 도와주는 라이브러리
- 해당 라이브러리를 이용하기 위해서는 potential function을 직접 디자인해야 함

### potential function
- word2features는 문장 sent의 시점 i에 대한 potential function임

In [31]:
def word2features(sent, i):
    """Return the CRF feature strings for the token at position ``i``.

    ``sent`` is a sequence of ``(token, pos_tag, ...)`` tuples; only the
    first two fields of each tuple are read.

    The result describes the token itself (lower-cased form, 3- and
    2-character suffixes, upper/title/digit flags, POS tag and its first two
    characters), followed by the lower-cased form, title/upper flags and POS
    tag of the previous and of the next token.  ``'BOS'`` stands in for the
    previous-token features when there is no previous token, and ``'EOS'``
    for the next-token features when there is no next token.
    """
    token, pos = sent[i][0], sent[i][1]

    def neighbour_features(offset, prefix):
        # Features of the token ``offset`` positions away, tagged with ``prefix``.
        near_token, near_pos = sent[i + offset][0], sent[i + offset][1]
        return [
            prefix + ':word.lower=' + near_token.lower(),
            prefix + ':word.istitle={}'.format(near_token.istitle()),
            prefix + ':word.isupper={}'.format(near_token.isupper()),
            prefix + ':postag=' + near_pos,
            prefix + ':postag[:2]=' + near_pos[:2],
        ]

    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper={}'.format(token.isupper()),
        'word.istitle={}'.format(token.istitle()),
        'word.isdigit={}'.format(token.isdigit()),
        'postag=' + pos,
        'postag[:2]=' + pos[:2],
    ]

    # Left context, or the beginning-of-sentence marker.
    feats += neighbour_features(-1, '-1') if i > 0 else ['BOS']
    # Right context, or the end-of-sentence marker.
    feats += neighbour_features(1, '+1') if i < len(sent) - 1 else ['EOS']

    return feats

### Sentence to features, labels, tokens 

In [32]:
def sent2features(sent):
    """Return one ``word2features`` feature list per token, in sentence order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token
def sent2labels(sent):
    """Return the NER label (third field) of every ``(token, postag, label)`` triple."""
    labels = []
    for _token, _postag, ner_tag in sent:
        labels.append(ner_tag)
    return labels
def sent2tokens(sent):
    """Return the surface token (first field) of every ``(token, postag, label)`` triple."""
    tokens = []
    for surface, _postag, _label in sent:
        tokens.append(surface)
    return tokens

In [33]:
# Features generated for the first token of the first training sentence.
sent2features(train_sents[0])[0]

['bias',
 'word.lower=melbourne',
 'word[-3:]=rne',
 'word[-2:]=ne',
 'word.isupper=False',
 'word.istitle=True',
 'word.isdigit=False',
 'postag=NP',
 'postag[:2]=NP',
 'BOS',
 '+1:word.lower=(',
 '+1:word.istitle=False',
 '+1:word.isupper=False',
 '+1:postag=Fpa',
 '+1:postag[:2]=Fp']

In [34]:
# Convert every sentence into the feature sequence / label sequence pair
# that the CRF trainer and tagger work with.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

# 3. Train the model
## 3-1. 주어진 모든 feature를 다 가지고 학습

In [35]:
# Create a trainer and register every training sentence with it as one
# (feature sequence, label sequence) pair, ready for training.
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq,yseq)

In [36]:
# Training hyper-parameters.
# With 'feature.minfreq': 5 only features that occur at least five times are used.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,

    # minimum frequency
    'feature.minfreq': 5
})

In [37]:
# Train the model and write it to `model_name`.
import os

model_name = './model/conll2002-esp.crfsuite'
# The model file lives in a sub-directory; make sure that directory exists
# before training tries to write into it (no-op when it is already there).
os.makedirs(os.path.dirname(model_name), exist_ok=True)
trainer.train(model_name)

In [13]:
tagger = pycrfsuite.Tagger() # tagger that will serve the trained model
# Load the model file written by the training cell.
tagger.open(model_name)

<contextlib.closing at 0x7fe730befe10>

In [14]:
# Run NER tagging on the first test sentence and print the tokens, the
# predicted labels and the gold labels for a side-by-side comparison.
ex_sent = test_sents[0]
print(' '.join(sent2tokens(ex_sent)), end='\n\n')
print("Predicted:",', '.join(tagger.tag(sent2features(ex_sent))))
print("Correct:",', '.join(sent2labels(ex_sent)))

La Coruña , 23 may ( EFECOM ) .

Predicted: B-LOC, I-LOC, O, O, O, O, B-ORG, O, O
Correct: B-LOC, I-LOC, O, O, O, O, B-ORG, O, O


In [15]:
# for tagging performance
def bio_classification_report(y_true, y_pred):
    """
    Build a token-level classification report for BIO-encoded sequences.

    ``y_true`` and ``y_pred`` are iterables of label sequences (one sequence
    per sentence).  Both are flattened and one-hot encoded with a
    ``LabelBinarizer`` fitted on the gold labels, then scored with
    ``classification_report``.  The "O" label is left out of the report and
    the remaining tags are ordered by entity type first, B/I prefix second.

    Returns whatever ``classification_report`` returns (a text report).

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    flat_true = list(chain.from_iterable(y_true))
    flat_pred = list(chain.from_iterable(y_pred))

    binarizer = preprocessing.LabelBinarizer()
    true_matrix = binarizer.fit_transform(flat_true)
    pred_matrix = binarizer.transform(flat_pred)

    def entity_then_prefix(tag):
        # 'B-LOC' -> ['LOC', 'B'], so tags group by entity type.
        return tag.split('-', 1)[::-1]

    # Column index of every class in the binarized matrices.
    column_of = {}
    for column, cls in enumerate(binarizer.classes_):
        column_of[cls] = column

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=entity_then_prefix)
    reported_columns = [column_of[tag] for tag in reported_tags]

    return classification_report(
        true_matrix,
        pred_matrix,
        labels=reported_columns,
        target_names=reported_tags,
    )

In [16]:
# Gold labels, and the tagger's predicted label sequence for every test sentence.
y_true = y_test
y_pred = [tagger.tag(sent2features(sent)) for sent in test_sents]

In [17]:
# The report is a multi-line string; print it so the table is rendered with
# real line breaks instead of being shown as an escaped repr.
print(bio_classification_report(y_true, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n       B-LOC       0.74      0.71      0.73      1084\n       I-LOC       0.57      0.51      0.54       325\n      B-MISC       0.61      0.37      0.46       339\n      I-MISC       0.59      0.43      0.50       557\n       B-ORG       0.76      0.78      0.77      1400\n       I-ORG       0.78      0.76      0.77      1104\n       B-PER       0.77      0.87      0.82       735\n       I-PER       0.83      0.94      0.88       634\n\n   micro avg       0.75      0.72      0.73      6178\n   macro avg       0.71      0.67      0.68      6178\nweighted avg       0.74      0.72      0.73      6178\n samples avg       0.09      0.09      0.09      6178\n'

## 3-2. 한정된 feature만 가지고 학습
- bias, word[-3:], word[-2:]와 앞/뒤 단어의 word lower(`-1:word.lower`, `+1:word.lower`), BOS/EOS만 이용

In [18]:
def word2features(sent, i):
    """Return the reduced CRF feature set for the token at position ``i``.

    ``sent`` is a sequence of tuples whose first field is the token; no other
    field is read.  The features are ``'bias'``, the token's 3- and
    2-character suffixes, and the lower-cased previous and next tokens.
    ``'BOS'`` is emitted instead of the previous-token feature when there is
    no previous token, and ``'EOS'`` instead of the next-token feature when
    there is no next token.
    """
    word = sent[i][0]
    features = [
        'bias',
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
    ]

    if i > 0:
        features.append('-1:word.lower=' + sent[i - 1][0].lower())
    else:
        features.append('BOS')

    if i < len(sent) - 1:
        features.append('+1:word.lower=' + sent[i + 1][0].lower())
    else:
        features.append('EOS')

    return features


In [19]:
# Rebuild the feature / label sequences so they reflect the `word2features`
# definition that is current when this cell runs.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

In [20]:
# Start from a fresh trainer and register every training sentence with it as
# one (feature sequence, label sequence) pair, ready for training.
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq,yseq)

In [21]:
# Training hyper-parameters.
# With 'feature.minfreq': 5 only features that occur at least five times are used.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,

    # minimum frequency
    'feature.minfreq': 5
})

In [22]:
# Train the model; `data_name` is the path of the model file that gets written.
data_name = 'conll2002-esp.crfsuite'
trainer.train(data_name)
# NOTE: this rebinds the global `tagger`, so every later cell that uses
# `tagger` works with the model trained in this cell.
tagger = pycrfsuite.Tagger() # load the trained model into a tagger
tagger.open(data_name)

<contextlib.closing at 0x7fe730b63410>

In [23]:
# Gold labels, and the tagger's predicted label sequence for every test sentence.
y_true = y_test
y_pred = [tagger.tag(sent2features(sent)) for sent in test_sents]

In [24]:
# The report is a multi-line string; print it so the table is rendered with
# real line breaks instead of being shown as an escaped repr.
print(bio_classification_report(y_true, y_pred))

'              precision    recall  f1-score   support\n\n       B-LOC       0.69      0.49      0.58      1084\n       I-LOC       0.60      0.47      0.52       325\n      B-MISC       0.52      0.20      0.29       339\n      I-MISC       0.52      0.36      0.43       557\n       B-ORG       0.74      0.55      0.63      1400\n       I-ORG       0.71      0.52      0.60      1104\n       B-PER       0.83      0.69      0.76       735\n       I-PER       0.86      0.86      0.86       634\n\n   micro avg       0.72      0.54      0.62      6178\n   macro avg       0.68      0.52      0.58      6178\nweighted avg       0.71      0.54      0.61      6178\n samples avg       0.07      0.07      0.07      6178\n'

# 4. 모델 확인
- 영향력이 높은 features, 각각에 해당하는 weight 확인
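- 주의: 현재 `tagger`에는 직전 3-2에서 학습한(한정된 feature) 모델이 로드되어 있으므로 아래 결과는 3-2 모델의 weight임. 3-1에서 모든 feature를 이용했던 모델로 평가하려면 `tagger.open(model_name)`으로 다시 불러온 뒤 실행해야 함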
- 해당 결과로 ner tagging에서 중요한 정보는 앞/뒤에 등장하는 단어임을 알 수 있음

In [25]:
# Inspect the state-feature weights of whichever model `tagger` currently has open.
debugger = tagger.info()
weights = debugger.state_features  # keyed by (attribute, label) pairs
# Keep only the entries whose label contains 'LOC'.
location_features = {
    key: value for key, value in weights.items() if 'LOC' in key[1]
}

# Print the 50 location entries with the largest weights, largest first.
for feature, weight in sorted(
    location_features.items(), key=lambda item: item[1], reverse=True
)[:50]:
    print('{} : {}'.format(feature, weight))

('-1:word.lower=despejado', 'B-LOC') : 6.919385
('-1:word.lower=efe-cantabria', 'B-LOC') : 6.274558
('word[-3:]=yun', 'B-LOC') : 5.874011
('-1:word.lower=palacio', 'I-LOC') : 5.86573
('-1:word.lower=puente', 'I-LOC') : 5.553516
('-1:word.lower=costa', 'I-LOC') : 5.458388
('-1:word.lower=avenida', 'I-LOC') : 5.372484
('word[-3:]=nón', 'B-LOC') : 5.322154
('word[-3:]=iés', 'B-LOC') : 5.147951
('-1:word.lower=nuboso', 'B-LOC') : 5.10912
('word[-3:]=ael', 'B-LOC') : 4.857369
('-1:word.lower=cantabria', 'B-LOC') : 4.785114
('-1:word.lower=santa', 'I-LOC') : 4.763376
('-1:word.lower=parque', 'I-LOC') : 4.587954
('word[-3:]=kio', 'B-LOC') : 4.379538
('+1:word.lower=cairo', 'B-LOC') : 4.342166
('+1:word.lower=coruña', 'B-LOC') : 4.315112
('+1:word.lower=unido', 'B-LOC') : 3.890058
('word[-3:]=lmo', 'B-LOC') : 3.739574
('-1:word.lower=paseo', 'I-LOC') : 3.709889
('-1:word.lower=bulevar', 'I-LOC') : 3.681638
('-1:word.lower=lluvioso', 'B-LOC') : 3.674013
('word[-3:]=uay', 'B-LOC') : 3.642079
('w