In [1]:
! pip install sklearn_crfsuite



In [2]:
import nltk
import os 
# Download the NLTK data packages 'punkt' and 'averaged_perceptron_tagger'.
# NOTE(review): presumably these back the nltk.word_tokenize / nltk.pos_tag
# calls made near the end of the notebook — confirm the resource names
# against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
import nltk
import pickle
import sklearn_crfsuite
from nltk.corpus.reader import ConllCorpusReader
from sklearn_crfsuite import metrics

In [4]:
! git clone https://github.com/ntson2002/text-mining-tutorials.git

fatal: destination path 'text-mining-tutorials' already exists and is not an empty directory.


In [5]:
from nltk.corpus.reader import ConllCorpusReader
# Both splits are read from the same directory with the same column layout.
# The third file column is declared 'ignore' and the fourth 'chunk';
# presumably this is what makes iob_sents() return the entity tags seen in the
# triples displayed further down — see the ConllCorpusReader docs to confirm.
conll_root = '/content/text-mining-tutorials/data/conll2003'
conll_columns = ['words', 'pos', 'ignore', 'chunk']

train_sents = ConllCorpusReader(conll_root, 'train.txt', conll_columns).iob_sents()
test_sents = ConllCorpusReader(conll_root, 'valid.txt', conll_columns).iob_sents()

In [6]:
# Drop empty sentences caused by data errors: filter(len, ...) keeps only the
# sentences whose length is non-zero.
train_sents = list(filter(len, train_sents))
test_sents = list(filter(len, test_sents))

In [7]:
# Inspect the first 3 sentences
train_sents[:3]

[[('EU', 'NNP', 'B-ORG'),
  ('rejects', 'VBZ', 'O'),
  ('German', 'JJ', 'B-MISC'),
  ('call', 'NN', 'O'),
  ('to', 'TO', 'O'),
  ('boycott', 'VB', 'O'),
  ('British', 'JJ', 'B-MISC'),
  ('lamb', 'NN', 'O'),
  ('.', '.', 'O')],
 [('Peter', 'NNP', 'B-PER'), ('Blackburn', 'NNP', 'I-PER')],
 [('BRUSSELS', 'NNP', 'B-LOC'), ('1996-08-22', 'CD', 'O')]]

# Rút trích đặc trưng của từ, biểu diễn câu thành vector các đặc trưng

In [8]:
def word2features(sent, i):
    """Build the feature dictionary for the token at position ``i`` of ``sent``.

    Args:
        sent: Indexable sequence of token records; only element 0 (the word)
            and element 1 (the POS tag) of each record are read.
        i: Index of the token to describe.

    Returns:
        dict: Features of the token itself (constant ``'bias'``, lower-cased
        form, 3- and 2-character suffixes, upper/title/digit flags, the POS
        tag and its first two characters), followed by word and POS features
        of the previous token (keys prefixed ``'-1:'``) and of the next token
        (keys prefixed ``'+1:'``).  When there is no previous / next token the
        key ``'BOS'`` / ``'EOS'`` is set to ``True`` instead.
    """
    token, tag = sent[i][0], sent[i][1]

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # One row per neighbour: (does it exist?, its index, key prefix,
    # flag to set when it does not exist).  Left neighbour is handled first so
    # the key insertion order is: own features, left context, right context.
    neighbours = (
        (i > 0, i - 1, '-1:', 'BOS'),
        (i < len(sent) - 1, i + 1, '+1:', 'EOS'),
    )
    for exists, j, prefix, edge_flag in neighbours:
        if not exists:
            feats[edge_flag] = True
            continue
        near_token, near_tag = sent[j][0], sent[j][1]
        feats[prefix + 'word.lower()'] = near_token.lower()
        feats[prefix + 'word.istitle()'] = near_token.istitle()
        feats[prefix + 'word.isupper()'] = near_token.isupper()
        feats[prefix + 'postag'] = near_tag
        feats[prefix + 'postag[:2]'] = near_tag[:2]

    return feats


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order.

    Each dict is produced by ``word2features(sent, position)``.
    """
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the labels (third field) of the 3-field records in ``sent``.

    Every record must unpack into exactly three values.
    """
    collected = []
    for _word, _pos, tag in sent:
        collected.append(tag)
    return collected

def sent2tokens(sent):
    """Return the words (first field) of the 3-field records in ``sent``.

    Every record must unpack into exactly three values.
    """
    words = []
    for word, _pos, _tag in sent:
        words.append(word)
    return words

In [9]:
# Inspect the extracted features for the first two tokens of the first training sentence
sent2features(train_sents[0])[0:2]


[{'+1:postag': 'VBZ',
  '+1:postag[:2]': 'VB',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'rejects',
  'BOS': True,
  'bias': 1.0,
  'postag': 'NNP',
  'postag[:2]': 'NN',
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': True,
  'word.lower()': 'eu',
  'word[-2:]': 'EU',
  'word[-3:]': 'EU'},
 {'+1:postag': 'JJ',
  '+1:postag[:2]': 'JJ',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:word.lower()': 'german',
  '-1:postag': 'NNP',
  '-1:postag[:2]': 'NN',
  '-1:word.istitle()': False,
  '-1:word.isupper()': True,
  '-1:word.lower()': 'eu',
  'bias': 1.0,
  'postag': 'VBZ',
  'postag[:2]': 'VB',
  'word.isdigit()': False,
  'word.istitle()': False,
  'word.isupper()': False,
  'word.lower()': 'rejects',
  'word[-2:]': 'ts',
  'word[-3:]': 'cts'}]

# Chuyển dataset thành dạng biểu diễn đặc trưng cho CRF


In [10]:
%%time
# Turn every sentence into its list of per-token feature dicts (X) and the
# matching list of labels (y), for the training and the test split.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))


CPU times: user 802 ms, sys: 282 ms, total: 1.08 s
Wall time: 1.09 s


# Huấn luyện CRF

In [11]:
%%time
# Settings handed to the sklearn_crfsuite.CRF estimator; see the
# sklearn-crfsuite CRF documentation for the meaning of c1 / c2 and
# all_possible_transitions.
crf_settings = {
    'algorithm': 'lbfgs',
    'c1': 0.1,
    'c2': 0.1,
    'max_iterations': 100,
    'all_possible_transitions': True,
    'verbose': True,  # the training log is printed below the cell
}
crf = sklearn_crfsuite.CRF(**crf_settings)
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 14041/14041 [00:02<00:00, 5094.65it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 86687
Seconds required: 0.542

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.56  loss=232367.34 active=86384 feature_norm=1.00
Iter 2   time=0.28  loss=217022.74 active=83848 feature_norm=3.45
Iter 3   time=0.29  loss=161383.49 active=83844 feature_norm=2.99
Iter 4   time=0.85  loss=119552.24 active=83579 feature_norm=2.74
Iter 5   time=0.29  loss=94630.26 active=86129 feature_norm=3.20
Iter 6   time=0.28  loss=88330.83 active=85472 feature_norm=3.56
Iter 7   time=0.28  loss=71522.54 active=80105 feature_norm=5.12
Iter 8   time=0.29  loss=61055.57 active=64864 feature_norm=6.38
Iter 9   time=0.29  loss=54560.88 active=61779 feature_norm=7.80
Iter 10  t

# Đánh giá mô hình

In [12]:
# Score on every label the model knows except 'O', which is removed from the
# list passed to the metric.
labels = [*crf.classes_]
labels.remove('O')
print(labels)

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, labels=labels, average='weighted')

['B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']


0.8811508100893499

In [13]:
# Order labels by the text after the first character (e.g. '-LOC') and then by
# that first character, so the B- and I- rows of each entity type are adjacent
# in the report.
sorted_labels = list(labels)
sorted_labels.sort(key=lambda tag: (tag[1:], tag[0]))

report = metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
)
print(report)

              precision    recall  f1-score   support

       B-LOC      0.914     0.876     0.894      1837
       I-LOC      0.878     0.786     0.830       257
      B-MISC      0.926     0.839     0.881       922
      I-MISC      0.885     0.737     0.804       346
       B-ORG      0.851     0.809     0.830      1341
       I-ORG      0.817     0.824     0.820       751
       B-PER      0.901     0.908     0.905      1842
       I-PER      0.941     0.955     0.948      1307

   micro avg      0.896     0.868     0.882      8603
   macro avg      0.889     0.842     0.864      8603
weighted avg      0.896     0.868     0.881      8603



In [14]:
# Print each annotated token of the first test sentence next to the label the
# model predicts for it.
first_prediction = crf.predict([X_test[0]])[0]
for gold_record, predicted_label in zip(test_sents[0], first_prediction):
    print(gold_record, predicted_label)


('CRICKET', 'NNP', 'O') O
('-', ':', 'O') O
('LEICESTERSHIRE', 'NNP', 'B-ORG') B-ORG
('TAKE', 'NNP', 'O') O
('OVER', 'IN', 'O') O
('AT', 'NNP', 'O') O
('TOP', 'NNP', 'O') O
('AFTER', 'NNP', 'O') O
('INNINGS', 'NNP', 'O') O
('VICTORY', 'NN', 'O') O
('.', '.', 'O') O


# Predict from text

In [15]:
# Convert the sentence to CoNLL format
import nltk
sentence = nltk.word_tokenize("He is a German who works at Google Inc.")
x = nltk.pos_tag(sentence)
# Append the placeholder label '-' so every record is a (word, pos, label)
# triple like the records of the training data.
test_sent = [(word, tag, '-') for word, tag in x]
test_sent

[('He', 'PRP', '-'),
 ('is', 'VBZ', '-'),
 ('a', 'DT', '-'),
 ('German', 'JJ', '-'),
 ('who', 'WP', '-'),
 ('works', 'VBZ', '-'),
 ('at', 'IN', '-'),
 ('Google', 'NNP', '-'),
 ('Inc', 'NNP', '-'),
 ('.', '.', '-')]

In [16]:
# Extract the features of the input sentence
test_sent_feature = sent2features(test_sent)

In [17]:
# Predict labels for the input sentence; it is wrapped in a list because
# predict is given a batch of sentences (here a batch of one).
crf.predict([test_sent_feature])

[['O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O']]