In [2]:
# Requires sklearn-crfsuite. If it is not installed, run once: %pip install sklearn-crfsuite

In [4]:
import json
import re
import numpy as np
import pickle

In [5]:
# Load the POS-tagging dataset from a pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'bert/session-pos.pkl' if it comes from a trusted source.
with open('bert/session-pos.pkl', 'rb') as fopen:
    data = pickle.load(fopen)
# Show which entries the loaded dict provides.
data.keys()

dict_keys(['train_X', 'test_X', 'train_Y', 'test_Y'])

In [14]:
# Bind the four dataset splits to top-level names, inputs first, then labels.
train_X, test_X = data['train_X'], data['test_X']
train_Y, test_Y = data['train_Y'], data['test_Y']

In [7]:
# Load the lookup tables from the JSON dictionary file.
# The encoding is pinned to UTF-8 (the encoding JSON is specified in) instead
# of relying on the platform's locale default, which differs between systems.
with open('bert/dictionary-pos.json', encoding='utf-8') as fopen:
    dictionary = json.load(fopen)
# Show which lookup tables the file provides.
dictionary.keys()

dict_keys(['word2idx', 'idx2word', 'tag2idx', 'idx2tag', 'char2idx'])

In [8]:
# The token -> index tables are used exactly as loaded.
word2idx, tag2idx, char2idx = (
    dictionary['word2idx'],
    dictionary['tag2idx'],
    dictionary['char2idx'],
)
# JSON object keys are always strings, so the index -> token tables get their
# keys converted back to int before use.
idx2word = {int(idx): word for idx, word in dictionary['idx2word'].items()}
idx2tag = {int(idx): tag for idx, tag in dictionary['idx2tag'].items()}

In [9]:
# Inspect the index -> tag table.
idx2tag

{0: 'PAD',
 1: 'X',
 2: 'PROPN',
 3: 'AUX',
 4: 'DET',
 5: 'NOUN',
 6: 'PRON',
 7: 'VERB',
 8: 'ADP',
 9: 'PUNCT',
 10: 'ADV',
 11: 'CCONJ',
 12: 'SCONJ',
 13: 'NUM',
 14: 'ADJ',
 15: 'PART',
 16: 'SYM'}

In [10]:
# Inspect the tag -> index table.
tag2idx

{'PAD': 0,
 'X': 1,
 'PROPN': 2,
 'AUX': 3,
 'DET': 4,
 'NOUN': 5,
 'PRON': 6,
 'VERB': 7,
 'ADP': 8,
 'PUNCT': 9,
 'ADV': 10,
 'CCONJ': 11,
 'SCONJ': 12,
 'NUM': 13,
 'ADJ': 14,
 'PART': 15,
 'SYM': 16}

In [11]:
# Peek at the first training sentence, still encoded as word indices.
train_X[0]

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 12, 15, 16, 12, 17,
       18, 19, 20, 21, 22, 23,  9, 24, 25, 26, 27,  5, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 18, 37, 38, 39, 40, 41, 42,  7, 43, 44, 45, 46])

In [15]:
# Decode every training sentence from word indices back to word strings.
train_sentences = [
    [idx2word[word_id] for word_id in sent]
    for sent in train_X
]

In [17]:
# Decode every test sentence from word indices back to word strings.
test_sentences = [
    [idx2word[word_id] for word_id in sent]
    for sent in test_X
]

In [21]:
# Decode the training tag sequences from tag indices back to tag names.
train_labels = [
    [idx2tag[tag_id] for tag_id in tags]
    for tags in train_Y
]

In [22]:
# Decode the test tag sequences from tag indices back to tag names.
test_labels = [
    [idx2tag[tag_id] for tag_id in tags]
    for tags in test_Y
]

In [25]:
from nltk.tag.util import untag

def features(sentence, index):
    """Build the feature dict describing one token of a sentence.

    Parameters
    ----------
    sentence : sequence of str
        The words of the sentence, ``[w1, w2, ...]``.
    index : int
        Position in ``sentence`` of the word to describe.

    Returns
    -------
    dict
        The word itself, first/last-position flags, the 1-3 character
        prefixes and suffixes of the word and of both neighbours, the
        neighbouring words, and hyphen / all-digit flags. A neighbour that
        does not exist (before the first word or after the last word)
        contributes ``''`` for every one of its entries.

    Raises
    ------
    IndexError
        If ``index`` is outside ``sentence``.

    Notes
    -----
    Affixes are taken by slicing (``w[:1]``, ``w[-1:]``) rather than by
    single-character indexing, so a zero-length token yields ``''`` instead of
    raising ``IndexError``. For non-empty tokens the values are unchanged.
    """
    word = sentence[index]
    last = len(sentence) - 1
    # '' stands in for the missing neighbour at either sentence boundary;
    # slicing '' gives '' again, so all neighbour affixes come out empty too.
    prev_word = '' if index == 0 else sentence[index - 1]
    next_word = '' if index == last else sentence[index + 1]

    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last,
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': prev_word,
        'prev_word-prefix-1': prev_word[:1],
        'prev_word-prefix-2': prev_word[:2],
        'prev_word-prefix-3': prev_word[:3],
        'prev_word-suffix-1': prev_word[-1:],
        'prev_word-suffix-2': prev_word[-2:],
        'prev_word-suffix-3': prev_word[-3:],
        'next_word-prefix-1': next_word[:1],
        'next_word-prefix-2': next_word[:2],
        'next_word-prefix-3': next_word[:3],
        'next_word-suffix-1': next_word[-1:],
        'next_word-suffix-2': next_word[-2:],
        'next_word-suffix-3': next_word[-3:],
        'next_word': next_word,
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
    }

def transform_to_dataset(tagged_sentences):
    """Turn tagged sentences into parallel feature and label sequences.

    Parameters
    ----------
    tagged_sentences : iterable
        Sentences, each a sequence of ``(word, tag)`` pairs.

    Returns
    -------
    tuple of (list, list)
        ``X`` holds one list of ``features(...)`` dicts per sentence and ``y``
        holds the matching list of tags, in the same sentence order.
    """
    X, y = [], []

    for tagged in tagged_sentences:
        # Strip the tags once per sentence. Calling untag() inside the
        # per-token comprehension rebuilt the same word list for every token,
        # which made the work quadratic in sentence length.
        words = untag(tagged)
        X.append([features(words, index) for index in range(len(words))])
        y.append([tag for _, tag in tagged])

    return X, y

In [30]:
# Pair each training word with its tag: the stacked array is
# (2, sentences, tokens) and the axis permutation makes it (sentences, tokens, 2).
train = np.array([train_sentences, train_labels]).transpose(1, 2, 0)
train.shape, train[0]

((97488, 50, 2), array([['Sembungan', 'PROPN'],
        ['adalah', 'AUX'],
        ['sebuah', 'DET'],
        ['desa', 'NOUN'],
        ['yang', 'PRON'],
        ['terletak', 'VERB'],
        ['di', 'ADP'],
        ['kecamatan', 'NOUN'],
        ['Kejajar', 'PROPN'],
        [',', 'PUNCT'],
        ['kabupaten', 'NOUN'],
        ['Wonosobo', 'PROPN'],
        [',', 'PUNCT'],
        ['Jawa', 'PROPN'],
        ['Tengah', 'PROPN'],
        [',', 'PUNCT'],
        ['Indonesia', 'PROPN'],
        ['.', 'PUNCT'],
        ['Sebuah', 'DET'],
        ['serangan', 'NOUN'],
        ['pengayauan', 'NOUN'],
        ['biasanya', 'ADV'],
        ['terjadi', 'VERB'],
        ['di', 'ADP'],
        ['ladang', 'NOUN'],
        ['atau', 'CCONJ'],
        ['dengan', 'ADP'],
        ['membakar', 'VERB'],
        ['sebuah', 'DET'],
        ['rumah', 'NOUN'],
        ['dan', 'CCONJ'],
        ['memenggal', 'VERB'],
        ['semua', 'DET'],
        ['penghuninya', 'NOUN'],
        ['ketika', 'SCONJ'],
     

In [31]:
# Pair each test word with its tag: the stacked array is
# (2, sentences, tokens) and the axis permutation makes it (sentences, tokens, 2).
test = np.array([test_sentences, test_labels]).transpose(1, 2, 0)
test.shape, test[0]

((24335, 50, 2), array([['Emma', 'PROPN'],
        ['Summerton', 'PROPN'],
        ['pada', 'ADP'],
        ['bulan', 'NOUN'],
        ['April', 'PROPN'],
        ['2010', 'NUM'],
        ['dan', 'CCONJ'],
        ['tiga', 'NUM'],
        ['gambar', 'NOUN'],
        ['lainnya', 'ADJ'],
        ['diambil', 'VERB'],
        ['oleh', 'ADP'],
        ['artis', 'ADJ'],
        ['yang', 'PRON'],
        ['dirilis', 'VERB'],
        ['untuk', 'ADP'],
        ['mempromosikan', 'VERB'],
        ['album', 'NOUN'],
        ['di', 'ADP'],
        ['bulan', 'NOUN'],
        ['Juli', 'PROPN'],
        ['.', 'PUNCT'],
        ['Sampul', 'PROPN'],
        ['resmi', 'ADJ'],
        ['album', 'NOUN'],
        ['menunjukkan', 'VERB'],
        ['Perry', 'PROPN'],
        ['sedang', 'ADV'],
        ['berbaring', 'VERB'],
        ['telanjang', 'ADJ'],
        ['di', 'ADP'],
        ['awan', 'NOUN'],
        ['kembang', 'NOUN'],
        ['gula', 'NOUN'],
        [',', 'PUNCT'],
        ['dilukis', 'VERB'],
 

In [32]:
%%time

# Build the per-token feature dicts and tag lists for the training split.
# NOTE: this rebinds train_X / train_Y, which until now held the index-encoded
# data from the pickle; re-running the earlier decoding cells after this one
# will fail because train_X no longer contains word indices.
train_X, train_Y = transform_to_dataset(train)

CPU times: user 5min 51s, sys: 4.52 s, total: 5min 56s
Wall time: 5min 56s


In [33]:
%%time

# Build the per-token feature dicts and tag lists for the test split.
# NOTE: this rebinds test_X / test_Y, which until now held the index-encoded
# data from the pickle; re-running the earlier decoding cells after this one
# will fail because test_X no longer contains word indices.
test_X, test_Y = transform_to_dataset(test)

CPU times: user 1min 27s, sys: 1.29 s, total: 1min 28s
Wall time: 1min 28s


In [34]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [35]:
%%time
# Configure the CRF tagger and train it on the extracted features.
# NOTE(review): c1 / c2 are presumably the L1 / L2 regularisation strengths and
# max_iterations the optimiser's iteration cap -- confirm against the
# sklearn-crfsuite CRF documentation before tuning these values.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=50,
    all_possible_transitions=True
)
# fit() is the cell's last expression, so its return value is displayed.
crf.fit(train_X, train_Y)

CPU times: user 18min 15s, sys: 1.58 s, total: 18min 16s
Wall time: 18min 15s


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=50,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [36]:
# Tag set exposed by the fitted CRF (crf.classes_); reused as the `labels`
# argument of the metric calls further down.
labels = list(crf.classes_)
labels

['PROPN',
 'AUX',
 'DET',
 'NOUN',
 'PRON',
 'VERB',
 'ADP',
 'PUNCT',
 'ADV',
 'CCONJ',
 'SCONJ',
 'NUM',
 'ADJ',
 'PART',
 'SYM',
 'X']

In [37]:
# Predict tags for the test sentences, then display the weighted-average F1
# computed over the tags listed in `labels`.
y_pred = crf.predict(test_X)
metrics.flat_f1_score(test_Y, y_pred, labels=labels, average='weighted')

0.9179662916673392

In [38]:
# Per-tag precision / recall / F1 on the test split, shown to five decimals.
# NOTE(review): the report is presumably a multi-line string, which is why it
# is printed rather than left as a bare last expression -- confirm.
print(metrics.flat_classification_report(
    test_Y, y_pred, labels=labels, digits=5
))

              precision    recall  f1-score   support

       PROPN    0.91484   0.93292   0.92379    227608
         AUX    0.99502   1.00000   0.99751     10000
         DET    0.92717   0.91874   0.92294     38839
        NOUN    0.88129   0.89442   0.88781    268329
        PRON    0.96846   0.93447   0.95117     48835
        VERB    0.93068   0.92037   0.92550    124518
         ADP    0.93504   0.94820   0.94157    119589
       PUNCT    0.99918   0.99891   0.99905    182824
         ADV    0.81648   0.81499   0.81573     47760
       CCONJ    0.96229   0.90277   0.93158     37171
       SCONJ    0.75337   0.73696   0.74508     15150
         NUM    0.94127   0.90583   0.92321     41211
         ADJ    0.77309   0.73989   0.75613     45666
        PART    0.87705   0.82745   0.85153      5500
         SYM    1.00000   0.97278   0.98620      3600
           X    0.00000   0.00000   0.00000       150

    accuracy                        0.91810   1216750
   macro avg    0.85470   

In [39]:
import pickle

# Persist the fitted CRF to disk in binary pickle format.
with open('crf-pos.pkl', 'wb') as model_file:
    pickle.dump(crf, model_file)