In [2]:
# encoding:utf8
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

# Show the installed scikit-learn version (the evaluation helper's docstring asks for 0.15+).
print(sklearn.__version__)

ImportError: No module named pycrfsuite

## 使用 CoNLL 2000 数据构建 NP 抽取（组块标注）系统

- 使用NLTK内部提供的CoNLL2000语料

In [2]:
# List the files shipped with NLTK's CoNLL-2000 corpus.
nltk.corpus.conll2000.fileids()

['train.txt', 'test.txt']

In [3]:
%%time
# Materialise both splits as lists of sentences; each sentence is a list of
# (word, POS tag, IOB chunk label) triples.
train_sents = list(nltk.corpus.conll2000.iob_sents('train.txt'))
test_sents = list(nltk.corpus.conll2000.iob_sents('test.txt'))

CPU times: user 936 ms, sys: 992 ms, total: 1.93 s
Wall time: 1.97 s


原始数据格式，方便提取特征

In [4]:
# Inspect the first training sentence to see the raw triple format.
train_sents[0]

[(u'Confidence', u'NN', u'B-NP'),
 (u'in', u'IN', u'B-PP'),
 (u'the', u'DT', u'B-NP'),
 (u'pound', u'NN', u'I-NP'),
 (u'is', u'VBZ', u'B-VP'),
 (u'widely', u'RB', u'I-VP'),
 (u'expected', u'VBN', u'I-VP'),
 (u'to', u'TO', u'I-VP'),
 (u'take', u'VB', u'I-VP'),
 (u'another', u'DT', u'B-NP'),
 (u'sharp', u'JJ', u'I-NP'),
 (u'dive', u'NN', u'I-NP'),
 (u'if', u'IN', u'B-SBAR'),
 (u'trade', u'NN', u'B-NP'),
 (u'figures', u'NNS', u'I-NP'),
 (u'for', u'IN', u'B-PP'),
 (u'September', u'NNP', u'B-NP'),
 (u',', u',', u'O'),
 (u'due', u'JJ', u'B-ADJP'),
 (u'for', u'IN', u'B-PP'),
 (u'release', u'NN', u'B-NP'),
 (u'tomorrow', u'NN', u'B-NP'),
 (u',', u',', u'O'),
 (u'fail', u'VB', u'B-VP'),
 (u'to', u'TO', u'I-VP'),
 (u'show', u'VB', u'I-VP'),
 (u'a', u'DT', u'B-NP'),
 (u'substantial', u'JJ', u'I-NP'),
 (u'improvement', u'NN', u'I-NP'),
 (u'from', u'IN', u'B-PP'),
 (u'July', u'NNP', u'B-NP'),
 (u'and', u'CC', u'I-NP'),
 (u'August', u'NNP', u'I-NP'),
 (u"'s", u'POS', u'B-NP'),
 (u'near-record', u'JJ

## 特征模板

In [5]:
def word2features(sent, i):
    """Build the list of string features describing token ``i`` of ``sent``.

    Args:
        sent: sequence of items whose first element is the word and whose
            second element is its POS tag.
        i: index of the token to describe.

    Returns:
        A list of ``name=value`` feature strings: a constant ``'bias'``,
        features of the token itself, then features of the previous token
        (or ``'BOS'`` when there is none) and of the next token (or ``'EOS'``
        when there is none).
    """
    def neighbour_features(offset, token, tag):
        # Both neighbours contribute the same five cues; only the offset
        # prefix ('-1' or '+1') differs.
        return [
            offset + ':word.lower=' + token.lower(),
            offset + ':word.istitle=' + str(token.istitle()),
            offset + ':word.isupper=' + str(token.isupper()),
            offset + ':postag=' + tag,
            offset + ':postag[:2]=' + tag[:2],
        ]

    current = sent[i]
    token, tag = current[0], current[1]

    # Features of the token itself: lowercase form, two suffixes, three shape
    # flags, and the full and two-character POS tag.
    feats = ['bias']
    feats.append('word.lower=' + token.lower())
    feats.append('word[-3:]=' + token[-3:])
    feats.append('word[-2:]=' + token[-2:])
    feats.append('word.isupper=' + str(token.isupper()))
    feats.append('word.istitle=' + str(token.istitle()))
    feats.append('word.isdigit=' + str(token.isdigit()))
    feats.append('postag=' + tag)
    feats.append('postag[:2]=' + tag[:2])

    # Left context, or a beginning-of-sentence marker.
    if i <= 0:
        feats.append('BOS')
    else:
        left = sent[i - 1]
        feats += neighbour_features('-1', left[0], left[1])

    # Right context, or an end-of-sentence marker.
    if i >= len(sent) - 1:
        feats.append('EOS')
    else:
        right = sent[i + 1]
        feats += neighbour_features('+1', right[0], right[1])

    return feats


def sent2features(sent):
    """Return one feature list per token of ``sent``, in sentence order."""
    rows = []
    for position in range(len(sent)):
        rows.append(word2features(sent, position))
    return rows

def sent2labels(sent):
    """Return the third element (the label) of every triple in ``sent``."""
    labels = []
    for _word, _pos, chunk_tag in sent:
        labels.append(chunk_tag)
    return labels

def sent2tokens(sent):
    """Return the first element (the word) of every triple in ``sent``."""
    words = []
    for word, _pos, _chunk_tag in sent:
        words.append(word)
    return words

特征示例

In [6]:
# Features generated for the first token of the first training sentence.
sent2features(train_sents[0])[0]

['bias',
 u'word.lower=confidence',
 u'word[-3:]=nce',
 u'word[-2:]=ce',
 'word.isupper=False',
 'word.istitle=True',
 'word.isdigit=False',
 u'postag=NN',
 u'postag[:2]=NN',
 'BOS',
 u'+1:word.lower=in',
 '+1:word.istitle=False',
 '+1:word.isupper=False',
 u'+1:postag=IN',
 u'+1:postag[:2]=IN']

抽取特征

In [7]:
%%time
# Per-sentence feature sequences (X) and label sequences (y) for training.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

# Same transformation for the held-out test split.
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 416 ms, sys: 3.81 s, total: 4.23 s
Wall time: 4.25 s


## 模型训练
- 构建模型
- 导入学习数据

In [8]:
%%time
# Create the trainer with verbose output turned off.
trainer = pycrfsuite.Trainer(verbose=False)

# Feed the trainer one (feature sequence, label sequence) pair per sentence.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 688 ms, sys: 2.82 s, total: 3.5 s
Wall time: 3.51 s


设置训练参数：使用 L-BFGS 算法（默认值），并用 Elastic Net 进行正则化（本质上就是同时使用 L1 和 L2）

In [9]:
# Regularisation and stopping settings; c1 and c2 together give the
# combined L1 + L2 penalty.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # cap the number of training iterations (stop earlier)

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [10]:
# Names of the parameters the trainer accepts
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

Train the model:

In [11]:
%%time
# Run training; the resulting model is stored under the given file name.
trainer.train('conll2000-np.crfsuite')

CPU times: user 35.4 s, sys: 652 ms, total: 36.1 s
Wall time: 36.2 s


训练好的模型保存到文件中

In [12]:
!ls -lh ./conll2000-np.crfsuite

-rw------- 1 dl dl 513K 9月  15 08:22 ./conll2000-np.crfsuite


查看模型训练情况：使用logparser查看。

In [13]:
# Statistics recorded by the log parser for the final training iteration.
trainer.logparser.last_iteration

{'active_features': 11099,
 'error_norm': 1002.760167,
 'feature_norm': 93.39015,
 'linesearch_step': 1.0,
 'linesearch_trials': 1,
 'loss': 21714.5293,
 'num': 50,
 'scores': {},
 'time': 0.662}

logparser记录的其他信息

In [14]:
# Number of logged iterations, followed by the record of the second-to-last one.
print len(trainer.logparser.iterations), trainer.logparser.iterations[-2]

50 {'loss': 21769.328496, 'error_norm': 835.229788, 'linesearch_trials': 1, 'active_features': 11241, 'num': 49, 'time': 0.677, 'scores': {}, 'linesearch_step': 1.0, 'feature_norm': 92.739679}


## 模型预测
- 创建模型
- 导入模型文件

In [15]:
# Create a tagger and load the model file written by the training step.
tagger = pycrfsuite.Tagger()
tagger.open('conll2000-np.crfsuite')

<contextlib.closing at 0x7f963cbbb550>

对一个句子进行标注

In [16]:
# Take the first test sentence and print its tokens as plain text.
example_sent = test_sents[0]
print(' '.join(sent2tokens(example_sent)))

# Predicted labels on the first line, gold labels on the second, for side-by-side comparison.
print "预测标签:", ' '.join(tagger.tag(sent2features(example_sent)))
print "真实标签:  ", ' '.join(sent2labels(example_sent))

Rockwell International Corp. 's Tulsa unit said it signed a tentative agreement extending its contract with Boeing Co. to provide structural parts for Boeing 's 747 jetliners .
预测标签: B-NP I-NP I-NP B-NP I-NP I-NP B-VP B-NP B-VP B-NP I-NP I-NP B-VP B-NP I-NP B-PP B-NP I-NP B-VP I-VP B-NP I-NP B-PP B-NP B-NP I-NP I-NP O
真实标签:   B-NP I-NP I-NP B-NP I-NP I-NP B-VP B-NP B-VP B-NP I-NP I-NP B-VP B-NP I-NP B-PP B-NP I-NP B-VP I-VP B-NP I-NP B-PP B-NP B-NP I-NP I-NP O


## 模型评估

In [17]:
def bio_classification_report(y_true, y_pred):
    """
    Build a classification report for BIO-encoded label sequences.

    Both arguments are sequences of per-sentence label sequences. They are
    flattened so that metrics are computed per token, and the "O" label is
    left out of the reported rows. Rows are ordered by the part of the tag
    after the first '-', then by the part before it.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    def _type_then_prefix(tag):
        # 'B-NP' -> ['NP', 'B'], so tags of the same chunk type sort together.
        return tag.split('-', 1)[::-1]

    binarizer = LabelBinarizer()

    # The binarizer is fitted on the gold labels only; predictions are then
    # encoded against that same set of classes.
    flat_true = list(chain.from_iterable(y_true))
    true_matrix = binarizer.fit_transform(flat_true)
    flat_pred = list(chain.from_iterable(y_pred))
    pred_matrix = binarizer.transform(flat_pred)

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=_type_then_prefix)

    # Map each class name to its position in the binarizer's class list.
    column_of = dict((cls, idx) for idx, cls in enumerate(binarizer.classes_))
    reported_columns = [column_of[tag] for tag in reported_tags]

    return classification_report(
        true_matrix,
        pred_matrix,
        labels=reported_columns,
        target_names=reported_tags,
    )

对测试集中所有数据进行预测

In [18]:
%%time
# Tag every test sentence; y_pred lines up one-to-one with X_test / y_test.
y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 488 ms, sys: 56 ms, total: 544 ms
Wall time: 559 ms


按照各类标签类型进行评价

In [19]:
# Per-tag precision / recall / F1 on the test set (the "O" tag is excluded by the helper).
print(bio_classification_report(y_test, y_pred))

             precision    recall  f1-score   support

     B-ADJP       0.78      0.74      0.76       438
     I-ADJP       0.76      0.66      0.71       167
     B-ADVP       0.84      0.81      0.82       866
     I-ADVP       0.65      0.58      0.62        89
    B-CONJP       0.56      0.56      0.56         9
    I-CONJP       0.71      0.77      0.74        13
     B-INTJ       1.00      0.50      0.67         2
      B-LST       0.00      0.00      0.00         5
      I-LST       0.00      0.00      0.00         2
       B-NP       0.96      0.96      0.96     12422
       I-NP       0.96      0.96      0.96     14376
       B-PP       0.96      0.98      0.97      4811
       I-PP       0.89      0.65      0.75        48
      B-PRT       0.75      0.76      0.76       106
     B-SBAR       0.88      0.83      0.85       535
     I-SBAR       0.17      0.75      0.27         4
       B-VP       0.96      0.95      0.96      4658
       I-VP       0.94      0.96      0.95   

  'precision', 'predicted', average, warn_for)
