In [1]:
import numpy as np
import re

In [3]:
# Read the train, test and dev splits (in that order) into one flat list of
# raw lines. The files are decoded explicitly as UTF-8, the encoding CoNLL-U
# files use, instead of relying on the platform's default encoding.
corpus = []
for conllu_path in (
    'gsd-ud-train.conllu.txt',
    'gsd-ud-test.conllu.txt',
    'gsd-ud-dev.conllu.txt',
):
    with open(conllu_path, encoding='utf-8') as fopen:
        corpus.extend(fopen.read().split('\n'))

In [6]:
def process_string(string):
    """Split ``string`` into cleaned word tokens.

    Every run of characters other than ASCII letters, digits, ``-``, ``/``
    and the space character is replaced by a single space. The result is
    split on whitespace and each token is passed through ``to_title``.

    Returns a list of tokens; the list is empty when nothing survives the
    cleaning (for example for a string made only of punctuation).
    """
    # Raw string literal: the pattern handed to ``re`` is unchanged, but the
    # source no longer relies on invalid string escapes, which newer Python
    # versions warn about.
    tokens = re.sub(r'[^A-Za-z0-9\-\/ ]+', ' ', string).split()
    # split() never yields tokens with surrounding whitespace, so no strip().
    return [to_title(token) for token in tokens]

def to_title(string):
    """Return ``string`` title-cased when it is all upper-case, otherwise unchanged."""
    return string.title() if string.isupper() else string

def process_corpus(corpus, until = None):
    """Group raw CoNLL-U lines into per-sentence word / head / relation lists.

    Parameters
    ----------
    corpus : iterable of str
        Raw lines. A line starting with ``#`` is skipped and an empty line
        ends the current sentence. Any other line is split on tabs and read
        as: column 0 = token id, column 1 = word form, column 6 = head index,
        column 7 = dependency relation.
    until : optional
        Not used; kept so existing calls keep working.

    Returns
    -------
    tuple ``(sentences, words, depends, labels)``
        Four parallel lists with one entry per sentence. ``sentences`` and
        ``words`` both hold the cleaned word forms, ``depends`` the head
        indices as strings and ``labels`` the relation names.

    Raises
    ------
    IndexError
        If a token line has fewer than eight tab-separated columns.
    ValueError
        If the head column of a token line is not an integer.

    Notes
    -----
    * A sentence is stored only when it has at least one token, so repeated
      blank lines (for example where two files were concatenated) no longer
      produce empty sentences.
    * The last sentence is stored even when no blank line follows it, and
      nothing is sliced off the end of the result.
    * Rows whose id is a range (``1-2``) or a decimal (``1.1``) are skipped;
      in CoNLL-U those rows carry ``_`` instead of a head index.
    """
    sentences, words, depends, labels = [], [], [], []
    temp_word, temp_depend, temp_label = [], [], []

    def flush():
        # Store the sentence collected so far (if any) and start a new one.
        nonlocal temp_word, temp_depend, temp_label
        if temp_word:
            sentences.append(list(temp_word))
            words.append(temp_word)
            depends.append(temp_depend)
            labels.append(temp_label)
        temp_word, temp_depend, temp_label = [], [], []

    for line in corpus:
        if not line:
            flush()
            continue
        if line[0] == '#':
            continue
        columns = line.split('\t')
        if '-' in columns[0] or '.' in columns[0]:
            # Multiword-token range or empty node: no head / relation here.
            continue
        # A form that cleans to nothing (e.g. punctuation) is replaced by the
        # cleaned 'EMPTY' placeholder, exactly as before.
        tokens = process_string(columns[1]) or process_string('EMPTY')
        temp_word.append(tokens[0])
        temp_depend.append(str(int(columns[6])))
        temp_label.append(columns[7])

    flush()
    return sentences, words, depends, labels

# Parse the concatenated corpus lines into parallel per-sentence lists.
sentences, words, depends, labels = process_corpus(corpus)

In [33]:
from nltk.tag.util import untag

def features(sentence, index):
    """Build the feature dict for the word at position ``index``.

    ``sentence`` is a list of word strings ``[w1, w2, ...]``. The dict holds
    the word itself, first / last position flags, its 1-3 character prefixes
    and suffixes, the same information for the previous and the next word
    ('' where there is no such neighbour), and hyphen / all-digit flags.
    """
    token = sentence[index]
    is_first = index == 0
    is_last = index == len(sentence) - 1
    # Neighbours are only looked up when they exist.
    before = None if is_first else sentence[index - 1]
    after = None if is_last else sentence[index + 1]

    return {
        'word': token,
        'is_first': is_first,
        'is_last': is_last,
        'prefix-1': token[0],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'prev_word': '' if is_first else before,
        'prev_word-prefix-1': '' if is_first else before[0],
        'prev_word-prefix-2': '' if is_first else before[:2],
        'prev_word-prefix-3': '' if is_first else before[:3],
        'prev_word-suffix-1': '' if is_first else before[-1],
        'prev_word-suffix-2': '' if is_first else before[-2:],
        'prev_word-suffix-3': '' if is_first else before[-3:],
        'next_word-prefix-1': '' if is_last else after[0],
        'next_word-prefix-2': '' if is_last else after[:2],
        'next_word-prefix-3': '' if is_last else after[:3],
        'next_word-suffix-1': '' if is_last else after[-1],
        'next_word-suffix-2': '' if is_last else after[-2:],
        'next_word-suffix-3': '' if is_last else after[-3:],
        'next_word': '' if is_last else after,
        'has_hyphen': '-' in token,
        'is_numeric': token.isdigit(),
    }

def features_crf_dependency(sentence, tag, index):
    """Build the feature dict for position ``index``, including its tag.

    ``sentence`` is a list of word strings and ``tag`` a sequence indexed the
    same way; ``tag[index]`` is stored under ``'tag'``. All other entries are
    the same word, position, prefix / suffix and neighbour features that
    ``features`` produces ('' where there is no previous / next word).
    """
    current = sentence[index]
    at_start = index == 0
    at_end = index == len(sentence) - 1
    # Neighbours are only looked up when they exist.
    left = None if at_start else sentence[index - 1]
    right = None if at_end else sentence[index + 1]

    return {
        'word': current,
        'tag': tag[index],
        'is_first': at_start,
        'is_last': at_end,
        'prefix-1': current[0],
        'prefix-2': current[:2],
        'prefix-3': current[:3],
        'suffix-1': current[-1],
        'suffix-2': current[-2:],
        'suffix-3': current[-3:],
        'prev_word': '' if at_start else left,
        'prev_word-prefix-1': '' if at_start else left[0],
        'prev_word-prefix-2': '' if at_start else left[:2],
        'prev_word-prefix-3': '' if at_start else left[:3],
        'prev_word-suffix-1': '' if at_start else left[-1],
        'prev_word-suffix-2': '' if at_start else left[-2:],
        'prev_word-suffix-3': '' if at_start else left[-3:],
        'next_word-prefix-1': '' if at_end else right[0],
        'next_word-prefix-2': '' if at_end else right[:2],
        'next_word-prefix-3': '' if at_end else right[:3],
        'next_word-suffix-1': '' if at_end else right[-1],
        'next_word-suffix-2': '' if at_end else right[-2:],
        'next_word-suffix-3': '' if at_end else right[-3:],
        'next_word': '' if at_end else right,
        'has_hyphen': '-' in current,
        'is_numeric': current.isdigit(),
    }

def transform_to_dataset(words, labels):
    """Turn sentences and their tag sequences into CRF inputs.

    For every entry ``no`` of ``labels`` the matching sentence ``words[no]``
    is converted to a list of ``features`` dicts (one per word) and the tag
    sequence is copied into a list.

    Returns ``(X, y)``: the per-sentence feature lists and tag lists.
    """
    X, y = [], []
    for no, tagged in enumerate(labels):
        sentence = words[no]
        X.append([features(sentence, position) for position in range(len(sentence))])
        y.append(list(tagged))
    return X, y

def transform_to_dataset_depend(words, labels, depends):
    """Turn sentences into CRF inputs whose features include a per-word tag.

    Mind the roles of the arguments: ``labels`` supplies the target sequences
    (returned as ``y``), while ``depends`` supplies the sequences passed to
    ``features_crf_dependency`` as its ``tag`` argument.

    Returns ``(X, y)``: the per-sentence feature lists and target lists.
    """
    X, y = [], []
    for no, tagged in enumerate(labels):
        sentence, sentence_tags = words[no], depends[no]
        X.append([
            features_crf_dependency(sentence, sentence_tags, position)
            for position in range(len(sentence))
        ])
        y.append(list(tagged))
    return X, y

In [21]:
# Feature dicts (X) and relation-label targets (Y) for the label tagger.
X, Y = transform_to_dataset(words, labels)

In [24]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [26]:
%%time
# CRF that predicts one relation label per token from the feature dicts in X.
# Trained with L-BFGS for at most 100 iterations, with c1 = c2 = 0.1.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X, Y)

CPU times: user 2min 3s, sys: 347 ms, total: 2min 3s
Wall time: 2min 3s


In [27]:
# NOTE(review): X / Y are the same sequences the model was fitted on, so this
# is a training-set score, not a held-out estimate.
y_pred = crf.predict(X)
metrics.flat_f1_score(Y, y_pred,
                      average='weighted', labels = list(crf.classes_))

0.9541090910035434

In [28]:
# Per-label precision / recall / F1 for the same predictions (made on the
# data the model was fitted on).
print(metrics.flat_classification_report(
    Y, y_pred, labels=list(crf.classes_), digits=3
))

               precision    recall  f1-score   support

        nsubj      0.931     0.962     0.946      7125
          cop      0.998     1.000     0.999      1055
          det      0.969     0.968     0.969      4475
         root      0.869     0.953     0.909      5593
   nsubj:pass      0.929     0.872     0.900      2216
          acl      0.935     0.890     0.912      3346
         case      0.992     0.994     0.993     11897
          obl      0.876     0.906     0.891      6346
         flat      0.978     0.985     0.982     11402
        punct      0.994     1.000     0.997     18483
        appos      0.933     0.921     0.927      2662
         amod      0.932     0.935     0.934      4566
     compound      0.962     0.956     0.959      7432
       advmod      0.966     0.974     0.970      5288
           cc      0.993     0.987     0.990      3571
          obj      0.948     0.948     0.948      5795
         conj      0.952     0.931     0.941      4806
         

In [31]:
import pickle

# Persist the fitted label tagger to disk.
with open('crf-label.pkl','wb') as fopen:
    pickle.dump(crf,fopen)

In [36]:
# NOTE(review): the argument order is easy to misread and presumably
# intentional. The function's `labels` parameter (the target) receives the
# head indices in `depends`, and its `depends` parameter (the per-token 'tag'
# feature) receives the relation labels.
X_depend, Y_depend = transform_to_dataset_depend(words, depends, labels)

In [38]:
%%time
# Second CRF, same settings as the label tagger: predicts each token's head
# index (as a string class) from features that include the relation label.
crf_depend = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf_depend.fit(X_depend, Y_depend)

CPU times: user 26min 5s, sys: 2.18 s, total: 26min 7s
Wall time: 26min


In [44]:
# Rebinds y_pred to the head-index predictions.
# NOTE(review): scored on the data the model was fitted on (training-set score).
y_pred = crf_depend.predict(X_depend)
metrics.flat_f1_score(Y_depend, y_pred,
                      average='weighted', labels = list(crf_depend.classes_))

0.7909675818232262

In [64]:
# Example sentence used below to try both trained models.
string = 'Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar sekiranya mengantuk ketika memandu.'

In [65]:
# Tokenise the example and predict one relation label per token.
processed = process_string(string)
result = crf.predict_single([features(processed, index) for index in range(len(processed))])
# Show each token next to its predicted label.
for no, i in enumerate(result):
    print(processed[no],i)

Dr det
Mahathir nsubj
menasihati root
mereka obj
supaya mark
berhenti advcl
berehat xcomp
dan cc
tidur conj
sebentar case
sekiranya nmod
mengantuk acl
ketika mark
memandu advcl


In [66]:
# Predict head indices, feeding the labels predicted above in as the 'tag' feature.
result_d = crf_depend.predict_single([features_crf_dependency(processed, result, index) for index in range(len(processed))])
# The model's classes are strings; convert them to ints for the checks below.
result_d = [int(i) for i in result_d]

In [67]:
# Reconcile the two predictions token by token (both lists are edited in place).
for i in range(len(result)):
    # Head 0 but the label is not 'root': mark the label as unknown.
    if result_d[i] == 0 and result[i] != 'root':
        result[i] = 'UNK'
    # Label 'root' but a non-zero head: likewise mark the label as unknown.
    if result_d[i] != 0 and result[i] == 'root':
        result[i] = 'UNK'
    # Clamp heads larger than the number of tokens to the number of tokens.
    if result_d[i] > len(result):
        result_d[i] = len(result)

In [68]:
# Show token, predicted head index and (possibly adjusted) label side by side.
for no, i in enumerate(result):
    print(processed[no],result_d[no], i)

Dr 2 det
Mahathir 3 nsubj
menasihati 0 root
mereka 4 obj
supaya 9 mark
berhenti 9 advcl
berehat 9 xcomp
dan 9 cc
tidur 7 conj
sebentar 7 case
sekiranya 7 nmod
mengantuk 1 acl
ketika 3 mark
memandu 3 advcl


In [69]:
import pickle

# Persist the fitted head-index tagger to disk.
with open('crf-depend.pkl','wb') as fopen:
    pickle.dump(crf_depend,fopen)