In [1]:
# Plotting setup: render matplotlib figures inline and use the 'ggplot' style
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
# Import the required packages
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
# `sklearn.cross_validation` and `sklearn.grid_search` were deprecated in
# scikit-learn 0.18 and removed in 0.20; both helpers now live in
# `sklearn.model_selection`. Fall back to the legacy modules only on very
# old installs so the notebook runs on either side of the move.
try:
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import cross_val_score
    from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics



# 1. CoNLL2002语料

In [3]:
# Inspect the training and test data; this notebook uses the Spanish part of the CoNLL2002 corpus shipped with NLTK
print("文件ID", nltk.corpus.conll2002.fileids())
# Read the training sentences
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
# Read the test sentences
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
# A sentence is a sequence of tokens; each token is a triple (word form, POS tag, entity label)
print("打印训练集第1个句子", train_sents[0])

文件ID ['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']
打印训练集第1个句子 [('Melbourne', 'NP', 'B-LOC'), ('(', 'Fpa', 'O'), ('Australia', 'NP', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('25', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFE', 'NC', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]


# 2. 抽取特征

In [4]:
# Extracting features through a plain function like this follows the usual NLTK style.
def _neighbor_features(token, prefix):
    """Return the context features contributed by a neighbouring token.

    Args:
        token: A token tuple whose first element is the word form and whose
            second element is the POS tag.
        prefix: String prepended to every feature name ('-1:' for the
            previous token, '+1:' for the next token).

    Returns:
        A dict with the lower-cased word, title-case and upper-case flags,
        the POS tag and its first two characters, all keyed with `prefix`.
    """
    word, postag = token[0], token[1]
    return {
        prefix + 'word.lower()': word.lower(),
        prefix + 'word.istitle()': word.istitle(),
        prefix + 'word.isupper()': word.isupper(),
        prefix + 'postag': postag,
        prefix + 'postag[:2]': postag[:2],
    }


def word2features(sent, i):
    """Build the feature dict for the token at position `i` of a sentence.

    Args:
        sent: Sequence of token tuples; element 0 of each tuple is the word
            form and element 1 is the POS tag (further elements are ignored).
        i: Index of the current token in `sent`.

    Returns:
        A dict of features for the current token: a constant bias, the
        lower-cased word, its last three and last two characters, case and
        digit flags, the POS tag and its two-character prefix, followed by
        the features of the previous token (or 'BOS' when there is none) and
        of the next token (or 'EOS' when there is none).
    """
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    # Left context: features of the previous token, or a beginning-of-sentence marker.
    if i > 0:
        features.update(_neighbor_features(sent[i - 1], '-1:'))
    else:
        features['BOS'] = True

    # Right context: features of the next token, or an end-of-sentence marker.
    if i < len(sent) - 1:
        features.update(_neighbor_features(sent[i + 1], '+1:'))
    else:
        features['EOS'] = True

    return features

# Extract one feature dict per token of a sentence and return them as a list, in token order.
def sent2features(sent):
    """Return the list of per-token feature dicts for `sent`."""
    feature_seq = []
    for position in range(len(sent)):
        feature_seq.append(word2features(sent, position))
    return feature_seq

# Return the entity-label sequence of a sentence (a sequence of token triples).
def sent2labels(sent):
    """Collect the third element (entity label) of every (word, POS tag, label) triple."""
    tags = []
    for _word, _pos, tag in sent:
        tags.append(tag)
    return tags

In [5]:
# Show the features extracted for the first word of the first training sentence
sent2features(train_sents[0])[0]

{'bias': 1.0,
 'word.lower()': 'melbourne',
 'word[-3:]': 'rne',
 'word[-2:]': 'ne',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'postag': 'NP',
 'postag[:2]': 'NP',
 'BOS': True,
 '+1:word.lower()': '(',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'Fpa',
 '+1:postag[:2]': 'Fp'}

In [6]:
%%time
# Extract a feature dict for every word of the training data: X_train is a list of per-sentence feature-dict sequences
X_train = [sent2features(s) for s in train_sents]
# The labels matching each training feature dict form the training label sequences y_train
y_train = [sent2labels(s) for s in train_sents]
# Extract a feature dict for every word of the test data: X_test is a list of per-sentence feature-dict sequences
X_test = [sent2features(s) for s in test_sents]
# The labels matching each test feature dict form the test label sequences y_test
y_test = [sent2labels(s) for s in test_sents]

Wall time: 1.04 s


# 3. 训练

In [7]:
%%time
# Train with the L-BFGS algorithm, using combined L1 + L2 regularization (coefficients c1 and c2).
# See the sklearn-crfsuite documentation for the other CRF parameters.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 35.2 s


# 4. 评测

In [8]:
y_pred = crf.predict(X_test)
labels = list(crf.classes_)
# Group the B- and I- labels of the same entity type together: sort by the
# label text after its first character, then by the first character itself
# (so 'O' comes first, then B-LOC, I-LOC, ...).
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
# As the arguments of the report function show, the gold label sequences y_test
# and the predicted label sequences y_pred must refer to the same token at each
# position, i.e. both must use the same tokenization.
# Note: 'O' is kept in the report, so the averaged row is dominated by that label's support.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

          O      0.992     0.997     0.994     45355
      B-LOC      0.810     0.784     0.797      1084
      I-LOC      0.690     0.637     0.662       325
     B-MISC      0.731     0.569     0.640       339
     I-MISC      0.699     0.589     0.639       557
      B-ORG      0.807     0.832     0.820      1400
      I-ORG      0.852     0.786     0.818      1104
      B-PER      0.850     0.884     0.867       735
      I-PER      0.893     0.943     0.917       634

avg / total      0.970     0.971     0.971     51533

