# NER using sklearn-crfsuite
- [sklearn-crfsuite tutorial](https://eli5.readthedocs.io/en/latest/_notebooks/debug-sklearn-crfsuite.html)

# 0. Settings

In [6]:
!pip install sklearn_crfsuite
!pip install eli5

In [52]:
import nltk
import sklearn_crfsuite
import eli5
from sklearn import preprocessing
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix

# 1. Training data
- MSRA train set

In [8]:
train_set = './data/msra_train_bio.txt'
test_set = './data/msra_test_bio.txt'

In [9]:
def raw_data_preprocessing(file_name):
    tagged_sentences = []
    sentence = []
    with open(file_name,'r') as f:
        for line in f.readlines():
            if len(line) == 0 or line[0] == '\n':
                if len(sentence) > 0:
                    tagged_sentences.append(sentence)
                    sentence = []
                continue
            if line =='0\t\n':
                continue
            word, ner_tag = line.strip().split('\t') 
            sentence.append((word, ner_tag)) # 단어와 개체명 태깅만 기록
    return tagged_sentences

In [10]:
train_sents = raw_data_preprocessing(train_set)
test_sents = raw_data_preprocessing(test_set)

# 2. Feature extraction

In [14]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        #'postag': postag,
        #'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            #'-1:postag': postag1,
            #'-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            #'+1:postag': postag1,
            #'+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [15]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [16]:
X_train[0][1]

{'bias': 1.0,
 'word.lower()': '希',
 'word[-3:]': '希',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 '-1:word.lower()': '当',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '+1:word.lower()': '望',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False}

# 3. Train a CRF model

In [17]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train);

# 4. Inspect model weights

In [19]:
eli5.show_weights(crf, top=10)



From \ To,O,B-LOC,I-LOC,B-ORG,I-ORG,B-PER,I-PER
O,3.182,0.468,0.0,0.434,0.0,0.252,0.0
B-LOC,-1.389,-0.017,4.692,-0.158,0.0,0.0,0.0
I-LOC,-0.404,-0.151,2.719,-0.19,0.0,-0.198,0.0
B-ORG,-4.611,0.0,0.0,-0.129,5.408,0.0,0.0
I-ORG,-0.515,-0.339,0.0,-0.251,4.389,-0.211,0.0
B-PER,-2.236,0.0,0.0,0.0,0.0,-0.104,4.555
I-PER,-0.35,-0.418,0.0,-0.164,0.0,-0.134,2.855

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6
+7.516,EOS,,,,,
+3.712,word[-3:]:。,,,,,
+3.712,word.lower():。,,,,,
+1.778,word[-3:]:，,,,,,
+1.778,word.lower():，,,,,,
+1.427,word.lower():的,,,,,
+1.427,word[-3:]:的,,,,,
+0.922,BOS,,,,,
+0.816,word[-3:]:在,,,,,
+0.816,word.lower():在,,,,,

Weight?,Feature
+7.516,EOS
+3.712,word[-3:]:。
+3.712,word.lower():。
+1.778,word[-3:]:，
+1.778,word.lower():，
+1.427,word.lower():的
+1.427,word[-3:]:的
+0.922,BOS
+0.816,word[-3:]:在
+0.816,word.lower():在

Weight?,Feature
+0.909,-1:word.lower():在
+0.726,word.lower():美
+0.726,word[-3:]:美
+0.670,word.lower():中
+0.670,word[-3:]:中
+0.668,-1:word.lower():、
+0.635,+1:word.lower():国
+0.471,+1:word.lower():京
+0.445,word[-3:]:北
+0.445,word.lower():北

Weight?,Feature
+0.521,word[-3:]:国
+0.521,word.lower():国
+0.486,-1:word.lower():北
+0.470,word.lower():京
+0.470,word[-3:]:京
+0.393,-1:word.lower():中
+0.385,word.lower():洲
+0.385,word[-3:]:洲
+0.346,-1:word.lower():西
… 2784 more positive …,… 2784 more positive …

Weight?,Feature
+0.767,+1:word.lower():国
+0.614,word.lower():中
+0.614,word[-3:]:中
+0.500,-1:word.lower():、
+0.289,word.lower():北
+0.289,word[-3:]:北
+0.284,word[-3:]:国
+0.284,word.lower():国
+0.205,-1:word.lower():和
… 1640 more positive …,… 1640 more positive …

Weight?,Feature
+0.824,+1:word.lower():队
+0.818,word[-3:]:队
+0.818,word.lower():队
+0.471,word[-3:]:委
+0.471,word.lower():委
+0.450,word[-3:]:会
+0.450,word.lower():会
+0.437,+1:word.lower():会
… 2629 more positive …,… 2629 more positive …
… 4232 more negative …,… 4232 more negative …

Weight?,Feature
+0.404,-1:word.lower():、
+0.363,-1:word.lower():长
+0.292,word[-3:]:李
+0.292,word.lower():李
+0.237,word[-3:]:王
+0.237,word.lower():王
+0.220,+1:word.lower():小
+0.215,word.lower():邓
… 2013 more positive …,… 2013 more positive …
… 1170 more negative …,… 1170 more negative …

Weight?,Feature
+0.386,+1:word.lower():、
+0.278,word[-3:]:尔
+0.278,word.lower():尔
+0.277,-1:word.lower():李
+0.255,word[-3:]:平
+0.255,word.lower():平
+0.247,word.lower():斯
+0.247,word[-3:]:斯
… 3669 more positive …,… 3669 more positive …
… 1905 more negative …,… 1905 more negative …


# 5. BIO classification report

In [20]:
# for tagging performance
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = preprocessing.LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
        
    )

In [22]:
help(crf)

Help on CRF in module sklearn_crfsuite.estimator object:

class CRF(sklearn.base.BaseEstimator)
 |  CRF(algorithm=None, min_freq=None, all_possible_states=None, all_possible_transitions=None, c1=None, c2=None, max_iterations=None, num_memories=None, epsilon=None, period=None, delta=None, linesearch=None, max_linesearch=None, calibration_eta=None, calibration_rate=None, calibration_samples=None, calibration_candidates=None, calibration_max_trials=None, pa_type=None, c=None, error_sensitive=None, averaging=None, variance=None, gamma=None, verbose=False, model_filename=None, keep_tempfiles=False, trainer_cls=None)
 |  
 |  python-crfsuite wrapper with interface siimlar to scikit-learn.
 |  It allows to use a familiar fit/predict interface and scikit-learn
 |  model selection utilities (cross-validation, hyperparameter optimization).
 |  
 |  Unlike pycrfsuite.Trainer / pycrfsuite.Tagger this object is picklable;
 |  on-disk files are managed automatically.
 |  
 |  Parameters
 |  --------

In [47]:
ex_sent = test_sents[1000]
print(sent2tokens(ex_sent))
print("predicted:",', '.join(crf.predict_single(sent2features(ex_sent))))
print("Correct:",', '.join(sent2labels(ex_sent)))

['电', '文', '说', '，', '邓', '小', '平', '一', '生', '致', '力', '于', '社', '会', '进', '步', '和', '人', '类', '福', '祉', '，', '为', '中', '国', '的', '现', '代', '化', '和', '发', '展', '贡', '献', '了', '毕', '生', '的', '精', '力', '。']
predicted: O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O
Correct: O, O, O, O, B-PER, I-PER, I-PER, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O


In [48]:
y_true = y_test
y_pred = []
for sent in test_sents:
    y_pred.append(crf.predict_single(sent2features(sent)))

In [53]:
bio_classification_report(y_true, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n       B-LOC       0.73      0.25      0.38      2886\n       I-LOC       0.56      0.18      0.27      4405\n       B-ORG       0.72      0.04      0.08      1331\n       I-ORG       0.64      0.08      0.14      5646\n       B-PER       0.60      0.00      0.00      1973\n       I-PER       0.56      0.01      0.01      3851\n\n   micro avg       0.64      0.10      0.17     20092\n   macro avg       0.64      0.09      0.15     20092\nweighted avg       0.62      0.10      0.16     20092\n samples avg       0.01      0.01      0.01     20092\n'