In [1]:
! pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 4.6MB/s 
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [2]:
import nltk
import pickle
import sklearn_crfsuite
from nltk.corpus.reader import ConllCorpusReader
from sklearn_crfsuite import metrics

# Chuẩn bị dữ liệu

In [3]:
# Download the CoNLL-2002 corpus.
# `nltk` is already imported in the imports cell at the top of the notebook.
nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2002.zip.


True

In [4]:
# List the file ids that make up the downloaded CoNLL-2002 corpus.
nltk.corpus.conll2002.fileids()

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

## Kiểm tra dữ liệu

In [5]:
# Peek at the first 10 lines of the raw `esp.train` file. Reading through the
# NLTK corpus reader avoids hard-coding the download directory (/root/nltk_data
# is specific to the machine this was run on) and the Unix-only `head` command.
print('\n'.join(nltk.corpus.conll2002.raw('esp.train').splitlines()[:10]))

Melbourne NP B-LOC
( Fpa O
Australia NP B-LOC
) Fpt O
, Fc O
25 Z O
may NC O
( Fpa O
EFE NC B-ORG
) Fpt O


Đọc dữ liệu thành danh sách các câu ở định dạng IOB; mỗi token là một bộ ba (từ, nhãn từ loại, nhãn thực thể).

In [6]:
%%time
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))


CPU times: user 1.78 s, sys: 103 ms, total: 1.89 s
Wall time: 1.9 s


In [7]:
# Preview the first three training sentences.
train_sents[:3]

[[('Melbourne', 'NP', 'B-LOC'),
  ('(', 'Fpa', 'O'),
  ('Australia', 'NP', 'B-LOC'),
  (')', 'Fpt', 'O'),
  (',', 'Fc', 'O'),
  ('25', 'Z', 'O'),
  ('may', 'NC', 'O'),
  ('(', 'Fpa', 'O'),
  ('EFE', 'NC', 'B-ORG'),
  (')', 'Fpt', 'O'),
  ('.', 'Fp', 'O')],
 [('-', 'Fg', 'O')],
 [('El', 'DA', 'O'),
  ('Abogado', 'NC', 'B-PER'),
  ('General', 'AQ', 'I-PER'),
  ('del', 'SP', 'I-PER'),
  ('Estado', 'NC', 'I-PER'),
  (',', 'Fc', 'O'),
  ('Daryl', 'VMI', 'B-PER'),
  ('Williams', 'NC', 'I-PER'),
  (',', 'Fc', 'O'),
  ('subrayó', 'VMI', 'O'),
  ('hoy', 'RG', 'O'),
  ('la', 'DA', 'O'),
  ('necesidad', 'NC', 'O'),
  ('de', 'SP', 'O'),
  ('tomar', 'VMN', 'O'),
  ('medidas', 'NC', 'O'),
  ('para', 'SP', 'O'),
  ('proteger', 'VMN', 'O'),
  ('al', 'SP', 'O'),
  ('sistema', 'NC', 'O'),
  ('judicial', 'AQ', 'O'),
  ('australiano', 'AQ', 'O'),
  ('frente', 'RG', 'O'),
  ('a', 'SP', 'O'),
  ('una', 'DI', 'O'),
  ('página', 'NC', 'O'),
  ('de', 'SP', 'O'),
  ('internet', 'NC', 'O'),
  ('que', 'PR', 'O'),

# Rút trích đặc trưng của từ, biểu diễn câu thành vector các đặc trưng

In [8]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at index ``i`` of ``sent``.

    Args:
        sent: Sequence of tokens; element 0 of each token is the word and
            element 1 is its POS tag (further elements are not read).
        i: Index of the token to describe.

    Returns:
        dict mapping feature names to values: lexical, shape and POS features
        of the token itself, the same kind of features for the previous
        (``-1:``) and next (``+1:``) token when they exist, and the flags
        ``BOS`` / ``EOS`` when there is no previous / next token.
    """
    token, tag = sent[i][0], sent[i][1]

    def neighbour_features(prefix, position):
        # Lexical, shape and POS features of an adjacent token, keyed by its offset prefix.
        near_word, near_tag = sent[position][0], sent[position][1]
        return {
            prefix + ':word.lower()': near_word.lower(),
            prefix + ':word.istitle()': near_word.istitle(),
            prefix + ':word.isupper()': near_word.isupper(),
            prefix + ':postag': near_tag,
            prefix + ':postag[:2]': near_tag[:2],
        }

    # Features of the current token.
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        feats.update(neighbour_features('-1', i - 1))

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        feats.update(neighbour_features('+1', i + 1))

    return feats


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in sentence order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the label (third field) of every (token, postag, label) triple in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of every (token, postag, label) triple in ``sent``."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [9]:
# Inspect the extracted features for the first two tokens of the first training sentence
sent2features(train_sents[0])[:2]


[{'+1:postag': 'Fpa',
  '+1:postag[:2]': 'Fp',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': '(',
  'BOS': True,
  'bias': 1.0,
  'postag': 'NP',
  'postag[:2]': 'NP',
  'word.isdigit()': False,
  'word.istitle()': True,
  'word.isupper()': False,
  'word.lower()': 'melbourne',
  'word[-2:]': 'ne',
  'word[-3:]': 'rne'},
 {'+1:postag': 'NP',
  '+1:postag[:2]': 'NP',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'australia',
  '-1:postag': 'NP',
  '-1:postag[:2]': 'NP',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:word.lower()': 'melbourne',
  'bias': 1.0,
  'postag': 'Fpa',
  'postag[:2]': 'Fp',
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': '(',
  'word[-2:]': '(',
  'word[-3:]': '('}]

In [10]:
%%time

# Chuyển tất cả các câu trong tập huấn luyện sang dạng biểu diễn đặc trưng phù hợp với sklearn
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]


CPU times: user 953 ms, sys: 248 ms, total: 1.2 s
Wall time: 1.21 s


# Huấn luyện CRF

In [11]:
%%time
# Configure a CRF model and fit it on the training features/labels.
# NOTE(review): hyperparameter meanings below follow the sklearn-crfsuite API docs.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # optimise with L-BFGS (reported as "L-BFGS optimization" in the log)
    c1=0.1,  # L1 regularisation coefficient
    c2=0.1,  # L2 regularisation coefficient
    max_iterations=100,  # upper bound on optimiser iterations
    all_possible_transitions=True,  # also generate transition features for label pairs unseen in training
    verbose=True  # print the CRFsuite training log
)
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 8323/8323 [00:03<00:00, 2195.44it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 96120
Seconds required: 0.722

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.74  loss=272138.42 active=95793 feature_norm=1.00
Iter 2   time=0.38  loss=228680.28 active=93231 feature_norm=3.45
Iter 3   time=0.37  loss=186924.92 active=92489 feature_norm=3.00
Iter 4   time=1.46  loss=123941.39 active=92553 feature_norm=2.41
Iter 5   time=0.37  loss=110203.61 active=94714 feature_norm=2.85
Iter 6   time=0.37  loss=79755.81 active=88971 feature_norm=5.18
Iter 7   time=0.37  loss=70986.18 active=84396 feature_norm=6.26
Iter 8   time=0.36  loss=63757.55 active=79035 feature_norm=6.92
Iter 9   time=0.38  loss=49828.57 active=71091 feature_norm=9.48
Iter 10  

# Đánh giá mô hình

In [12]:
# Score only the entity labels: drop 'O' from the labels passed to the metric.
labels = [*crf.classes_]
labels.remove('O')
print(labels)

# Predict the test set and report the support-weighted F1 over the entity labels.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, labels=labels, average='weighted')

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']


0.7964686316443963

In [13]:
# Order labels by entity type first and B/I prefix second, so that the B- and
# I- rows of the same entity type sit next to each other in the report.
sorted_labels = list(labels)
sorted_labels.sort(key=lambda tag: (tag[1:], tag[0]))
print(
    metrics.flat_classification_report(
        y_test,
        y_pred,
        digits=3,
        labels=sorted_labels,
    )
)

              precision    recall  f1-score   support

       B-LOC      0.810     0.784     0.797      1084
       I-LOC      0.690     0.637     0.662       325
      B-MISC      0.731     0.569     0.640       339
      I-MISC      0.699     0.589     0.639       557
       B-ORG      0.807     0.832     0.820      1400
       I-ORG      0.852     0.786     0.818      1104
       B-PER      0.850     0.884     0.867       735
       I-PER      0.893     0.943     0.917       634

   micro avg      0.813     0.787     0.799      6178
   macro avg      0.791     0.753     0.770      6178
weighted avg      0.809     0.787     0.796      6178



In [14]:
# Print each gold (token, postag, label) triple of the first test sentence next
# to the label the model predicts for it.
first_sentence_pred = crf.predict([X_test[0]])[0]
for gold_token, predicted_label in zip(test_sents[0], first_sentence_pred):
    print(gold_token, predicted_label)


('La', 'DA', 'B-LOC') B-LOC
('Coruña', 'NC', 'I-LOC') I-LOC
(',', 'Fc', 'O') O
('23', 'Z', 'O') O
('may', 'NC', 'O') O
('(', 'Fpa', 'O') O
('EFECOM', 'NP', 'B-ORG') B-ORG
(')', 'Fpt', 'O') O
('.', 'Fp', 'O') O
