In [1]:
import numpy as np
import re

In [2]:
with open('gsd-ud-train.conllu.txt') as fopen:
    corpus = fopen.read().split('\n')
    
with open('gsd-ud-test.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))
    
with open('gsd-ud-dev.conllu.txt') as fopen:
    corpus.extend(fopen.read().split('\n'))

In [3]:
def process_string(string):
    string = re.sub('[^A-Za-z0-9\-\/ ]+', ' ', string).split()
    return [to_title(y.strip()) for y in string]

def to_title(string):
    if string.isupper():
        string = string.title()
    return string

def process_corpus(corpus, until = None):
    sentences, words, depends, labels = [], [], [], []
    temp_sentence, temp_word, temp_depend, temp_label = [], [], [], []
    for sentence in corpus:
        if len(sentence):
            if sentence[0] == '#':
                continue
            sentence = sentence.split('\t')
            temp = process_string(sentence[1])
            if not len(temp):
                sentence[1] = 'EMPTY'
            sentence[1] = process_string(sentence[1])[0]
            temp_word.append(sentence[1])
            temp_depend.append(str(int(sentence[6])))
            temp_label.append(sentence[7])
            temp_sentence.append(sentence[1])
        else:
            words.append(temp_word)
            depends.append(temp_depend)
            labels.append(temp_label)
            sentences.append(temp_sentence)
            temp_word = []
            temp_depend = []
            temp_label = []
            temp_sentence = []
    return sentences[:-1], words[:-1], depends[:-1], labels[:-1]

sentences, words, depends, labels = process_corpus(corpus)

In [10]:
idx2tag = {0: 'PAD',
 1: 'nsubj',
 2: 'cop',
 3: 'det',
 4: 'root',
 5: 'nsubj:pass',
 6: 'acl',
 7: 'case',
 8: 'obl',
 9: 'flat',
 10: 'punct',
 11: 'appos',
 12: 'amod',
 13: 'compound',
 14: 'advmod',
 15: 'cc',
 16: 'obj',
 17: 'conj',
 18: 'mark',
 19: 'advcl',
 20: 'nmod',
 21: 'nummod',
 22: 'dep',
 23: 'xcomp',
 24: 'ccomp',
 25: 'parataxis',
 26: 'compound:plur',
 27: 'fixed',
 28: 'aux',
 29: 'csubj',
 30: 'iobj',
 31: 'csubj:pass'}

In [11]:
depends[0]

['4',
 '4',
 '4',
 '0',
 '6',
 '4',
 '8',
 '6',
 '8',
 '8',
 '8',
 '11',
 '11',
 '11',
 '14',
 '14',
 '14',
 '4']

In [5]:
words[0]

['Sembungan',
 'adalah',
 'sebuah',
 'desa',
 'yang',
 'terletak',
 'di',
 'kecamatan',
 'Kejajar',
 'Empty',
 'kabupaten',
 'Wonosobo',
 'Empty',
 'Jawa',
 'Tengah',
 'Empty',
 'Indonesia',
 'Empty']

In [6]:
import json

with open('augmented-dependency.json') as fopen:
    augmented = json.load(fopen)

In [7]:
def parse_X(texts):
    sentences = []
    for no, text in enumerate(texts):
        s = process_string(text)
        sentences.append(s)
    return sentences

In [19]:
text_augmented = []
for a in augmented:
    text_augmented.extend(a[0])
    depends.extend([list(map(str, i)) for i in a[1]])
    u = []
    for i in a[2]:
        u.append([idx2tag[a] for a in i])
    labels.extend(u)

In [20]:
new_sentences = parse_X(text_augmented)

In [21]:
words.extend(new_sentences)
sentences.extend(new_sentences)

In [22]:
len(words), len(depends), len(labels), len(sentences)

(50365, 50365, 50365, 50365)

In [23]:
from nltk.tag.util import untag

def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'prev_word-prefix-1': '' if index == 0 else sentence[index - 1][0],
        'prev_word-prefix-2': '' if index == 0 else sentence[index - 1][:2],
        'prev_word-prefix-3': '' if index == 0 else sentence[index - 1][:3],
        'prev_word-suffix-1': '' if index == 0 else sentence[index - 1][-1],
        'prev_word-suffix-2': '' if index == 0 else sentence[index - 1][-2:],
        'prev_word-suffix-3': '' if index == 0 else sentence[index - 1][-3:],
        'next_word-prefix-1': '' if index == len(sentence) - 1 else sentence[index + 1][0],
        'next_word-prefix-2': '' if index == len(sentence) - 1 else sentence[index + 1][:2],
        'next_word-prefix-3': '' if index == len(sentence) - 1 else sentence[index + 1][:3],
        'next_word-suffix-1': '' if index == len(sentence) - 1 else sentence[index + 1][-1],
        'next_word-suffix-2': '' if index == len(sentence) - 1 else sentence[index + 1][-2:],
        'next_word-suffix-3': '' if index == len(sentence) - 1 else sentence[index + 1][-3:],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
    }

def features_crf_dependency(sentence, tag, index):
    return {
        'word': sentence[index],
        'tag': tag[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'prev_word-prefix-1': '' if index == 0 else sentence[index - 1][0],
        'prev_word-prefix-2': '' if index == 0 else sentence[index - 1][:2],
        'prev_word-prefix-3': '' if index == 0 else sentence[index - 1][:3],
        'prev_word-suffix-1': '' if index == 0 else sentence[index - 1][-1],
        'prev_word-suffix-2': '' if index == 0 else sentence[index - 1][-2:],
        'prev_word-suffix-3': '' if index == 0 else sentence[index - 1][-3:],
        'next_word-prefix-1': ''
        if index == len(sentence) - 1
        else sentence[index + 1][0],
        'next_word-prefix-2': ''
        if index == len(sentence) - 1
        else sentence[index + 1][:2],
        'next_word-prefix-3': ''
        if index == len(sentence) - 1
        else sentence[index + 1][:3],
        'next_word-suffix-1': ''
        if index == len(sentence) - 1
        else sentence[index + 1][-1],
        'next_word-suffix-2': ''
        if index == len(sentence) - 1
        else sentence[index + 1][-2:],
        'next_word-suffix-3': ''
        if index == len(sentence) - 1
        else sentence[index + 1][-3:],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
    }

def transform_to_dataset(words, labels):
    X, y = [], []
 
    for no, tagged in enumerate(labels):
        X.append([features(words[no], index) for index in range(len(words[no]))])
        y.append([tag for tag in tagged])
 
    return X, y

def transform_to_dataset_depend(words, labels, depends):
    X, y = [], []
 
    for no, tagged in enumerate(labels):
        X.append([features_crf_dependency(words[no], depends[no], index) for index in range(len(words[no]))])
        y.append([tag for tag in tagged])
 
    return X, y

In [24]:
X, Y = transform_to_dataset(words, labels)

In [25]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [26]:
from sklearn.cross_validation import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(X, Y,test_size=0.1)



In [27]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(train_X, train_Y)

CPU times: user 13min 41s, sys: 328 ms, total: 13min 41s
Wall time: 13min 40s


In [29]:
y_pred = crf.predict(test_X)
metrics.flat_f1_score(test_Y, y_pred,
                      average='weighted', labels = list(crf.classes_))

0.8953045920285301

In [30]:
print(metrics.flat_classification_report(
    test_Y, y_pred, labels=list(crf.classes_), digits=4
))

               precision    recall  f1-score   support

         case     0.9584    0.9687    0.9635     11014
          obl     0.8045    0.8274    0.8158      5810
         flat     0.9469    0.9551    0.9510     10648
           cc     0.9538    0.9652    0.9595      3336
         conj     0.8684    0.8482    0.8582      4560
        punct     0.9848    0.9963    0.9905     17017
   nsubj:pass     0.8336    0.7640    0.7973      2059
         root     0.7960    0.8453    0.8199      5037
       nummod     0.9334    0.9359    0.9347      4088
         mark     0.8739    0.8865    0.8802      1392
        advcl     0.7649    0.6508    0.7033      1200
       advmod     0.8932    0.8924    0.8928      4769
         nmod     0.7762    0.7355    0.7553      4215
        nsubj     0.8600    0.8835    0.8716      6388
          det     0.9020    0.8868    0.8943      4142
     compound     0.8776    0.8974    0.8874      6869
         amod     0.8677    0.8530    0.8602      4128
         

In [31]:
import pickle

with open('crf-label.pkl','wb') as fopen:
    pickle.dump(crf,fopen)

In [32]:
X_depend, Y_depend = transform_to_dataset_depend(words, depends, labels)

In [33]:
from sklearn.cross_validation import train_test_split
train_X_depend, test_X_depend, train_Y_depend, test_Y_depend = train_test_split(X_depend, Y_depend,test_size=0.1)

In [35]:
%%time
crf_depend = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf_depend.fit(train_X_depend, train_Y_depend)

CPU times: user 2h 54min 38s, sys: 412 ms, total: 2h 54min 38s
Wall time: 2h 54min 18s


In [36]:
y_pred = crf_depend.predict(test_X_depend)
metrics.flat_f1_score(test_Y_depend, y_pred,
                      average='weighted', labels = list(crf_depend.classes_))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.5836390645242469

In [38]:
print(metrics.flat_classification_report(
    test_Y_depend, y_pred, labels=list(crf_depend.classes_), digits=4
))

             precision    recall  f1-score   support

          5     0.5452    0.5875    0.5656      5964
          2     0.6193    0.7164    0.6643      4365
          1     0.8839    0.9031    0.8934      4942
          7     0.5181    0.5460    0.5317      5505
          9     0.5569    0.5504    0.5536      4804
         12     0.5421    0.5309    0.5364      3760
         15     0.5556    0.5105    0.5321      3181
          4     0.5195    0.6219    0.5661      6241
          6     0.5346    0.5571    0.5456      5942
         11     0.5350    0.5581    0.5463      4150
         14     0.5425    0.5109    0.5262      3251
          8     0.5463    0.5414    0.5438      5395
         10     0.5705    0.5252    0.5469      4682
         13     0.5506    0.5199    0.5348      3537
          3     0.5871    0.6077    0.5972      5068
         18     0.5613    0.5232    0.5415      2504
         20     0.5772    0.5315    0.5534      2109
         23     0.6065    0.5814    0.5937   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [39]:
string = 'Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar sekiranya mengantuk ketika memandu.'

In [40]:
processed = process_string(string)
result = crf.predict_single([features(processed, index) for index in range(len(processed))])
for no, i in enumerate(result):
    print(processed[no],i)

Dr det
Mahathir nsubj
menasihati conj
mereka obj
supaya case
berhenti xcomp
berehat advcl
dan cc
tidur conj
sebentar advmod
sekiranya advmod
mengantuk root
ketika case
memandu xcomp


In [41]:
result_d = crf_depend.predict_single([features_crf_dependency(processed, result, index) for index in range(len(processed))])
result_d = [int(i) for i in result_d]

In [42]:
for i in range(len(result)):
    if result_d[i] == 0 and result[i] != 'root':
        result[i] = 'UNK'
    if result_d[i] != 0 and result[i] == 'root':
        result[i] = 'UNK'
    if result_d[i] > len(result):
        result_d[i] = len(result)

In [43]:
for no, i in enumerate(result):
    print(processed[no],result_d[no], i)

Dr 5 det
Mahathir 10 nsubj
menasihati 8 conj
mereka 8 obj
supaya 8 case
berhenti 10 xcomp
berehat 10 advcl
dan 14 cc
tidur 4 conj
sebentar 12 advmod
sekiranya 9 advmod
mengantuk 1 UNK
ketika 9 case
memandu 7 xcomp


In [44]:
import pickle

with open('crf-depend.pkl','wb') as fopen:
    pickle.dump(crf_depend,fopen)