In [1]:
from bs4 import BeautifulSoup
import re
import numpy as np

In [2]:
def process_string(string):
    string = re.sub('[^A-Za-z0-9\-\/ ]+', ' ', string).split()
    return [y.strip() for y in string]

def to_title(string):
    string = re.sub('[^A-Za-z0-9\-\/ ]+', ' ', string)
    return re.sub(r'[ ]+', ' ', string).strip()

In [3]:
def parse_raw(filename):
    with open(filename, 'r') as fopen:
        entities = fopen.read()
    soup = BeautifulSoup(entities, 'html.parser')
    inside_tag = ''
    texts, labels = [], []
    for sentence in soup.prettify().split('\n'):
        if len(inside_tag):
            splitted = process_string(sentence)
            texts += splitted
            labels += [inside_tag] * len(splitted)
            inside_tag = ''
        else:
            if not sentence.find('</'):
                pass
            elif not sentence.find('<'):
                inside_tag = sentence.split('>')[0][1:]
            else:
                splitted = process_string(sentence)
                texts += splitted
                labels += ['OTHER'] * len(splitted)
    assert (len(texts)==len(labels)), "length texts and labels are not same"
    print('len texts and labels: ', len(texts))
    return texts,labels

In [4]:
train_texts, train_labels = parse_raw('data_train.txt')
test_texts, test_labels = parse_raw('data_test.txt')
train_texts += test_texts
train_labels += test_labels

len texts and labels:  34012
len texts and labels:  9249


In [5]:
np.unique(train_labels,return_counts=True)

(array(['OTHER', 'location', 'organization', 'person', 'quantity', 'time'],
       dtype='<U12'), array([35613,  1536,  1592,  2358,  1336,   826]))

In [6]:
with open('entities-bm-normalize-v3.txt','r') as fopen:
    entities_bm = fopen.read().split('\n')[:-1]
entities_bm = [i.split() for i in entities_bm]
entities_bm = [[i[0],'TIME' if i[0] in 'jam' else i[1]] for i in entities_bm]

In [7]:
replace_by = {'organizaiton':'organization','orgnization':'organization',
             'othoer': 'OTHER'}

with open('NER-part1.txt','r') as fopen:
    nexts = fopen.read().split('\n')[:-1]
nexts = [i.split() for i in nexts]
for i in nexts:
    if len(i) == 2:
        label = i[1].lower()
        if 'other' in label:
            label = label.upper()
        if label in replace_by:
            label = replace_by[label]
        train_labels.append(label)
        train_texts.append(i[0])

In [8]:
replace_by = {'LOC':'location','PRN':'person','NORP':'organization','ORG':'organization','LAW':'law',
             'EVENT':'event','FAC':'organization','TIME':'time','O':'OTHER','ART':'person','DOC':'law'}
for i in entities_bm:
    try:
        string = process_string(i[0])
        if len(string):
            train_labels.append(replace_by[i[1]])
            train_texts.append(process_string(i[0])[0])  
    except Exception as e:
        print(e)
        
assert (len(train_texts)==len(train_labels)), "length texts and labels are not same"

for i in range(len(train_texts)):
    train_texts[i] = to_title(train_texts[i].lower())

'KN'
'KA'


In [10]:
seq_len = 50
def iter_seq(x):
    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

In [12]:
from nltk.tag.util import untag

def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'prev_word-prefix-1': '' if index == 0 else sentence[index - 1][0],
        'prev_word-prefix-2': '' if index == 0 else sentence[index - 1][:2],
        'prev_word-prefix-3': '' if index == 0 else sentence[index - 1][:3],
        'prev_word-suffix-1': '' if index == 0 else sentence[index - 1][-1],
        'prev_word-suffix-2': '' if index == 0 else sentence[index - 1][-2:],
        'prev_word-suffix-3': '' if index == 0 else sentence[index - 1][-3:],
        'next_word-prefix-1': '' if index == len(sentence) - 1 else sentence[index + 1][0],
        'next_word-prefix-2': '' if index == len(sentence) - 1 else sentence[index + 1][:2],
        'next_word-prefix-3': '' if index == len(sentence) - 1 else sentence[index + 1][:3],
        'next_word-suffix-1': '' if index == len(sentence) - 1 else sentence[index + 1][-1],
        'next_word-suffix-2': '' if index == len(sentence) - 1 else sentence[index + 1][-2:],
        'next_word-suffix-3': '' if index == len(sentence) - 1 else sentence[index + 1][-3:],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
    }

def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        X.append([features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
 
    return X, y

In [13]:
combined = list(map(lambda X: (X[0],X[1]), list(zip(train_texts,train_labels))))

In [14]:
combined_seq = to_train_seq(combined)[0]
combined_seq.shape

(61767, 50, 2)

In [15]:
X, Y = transform_to_dataset(combined_seq)

from sklearn.cross_validation import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.2)



In [16]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [17]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(train_X, train_Y)

CPU times: user 8min 38s, sys: 488 ms, total: 8min 38s
Wall time: 8min 38s


In [18]:
labels = list(crf.classes_)
labels.remove('OTHER')
labels

['quantity', 'location', 'time', 'person', 'organization', 'event', 'law']

In [19]:
y_pred = crf.predict(test_X)
metrics.flat_f1_score(test_Y, y_pred,
                      average='weighted', labels = labels)

0.9852734679520555

In [20]:
print(metrics.flat_classification_report(
    test_Y, y_pred, labels=labels, digits=3
))

              precision    recall  f1-score   support

    quantity      0.991     0.991     0.991     13891
    location      0.987     0.989     0.988     20798
        time      0.987     0.977     0.982     13264
      person      0.993     0.987     0.990     43590
organization      0.974     0.973     0.973     25426
       event      0.995     0.983     0.989      2417
         law      0.994     0.988     0.991      1686

 avg / total      0.987     0.983     0.985    121072



In [21]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(10))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-10:])

Top likely transitions:
quantity -> quantity 4.731903
location -> location 4.547566
organization -> organization 4.322757
OTHER  -> OTHER   4.267569
event  -> event   3.796581
law    -> law     3.234600
person -> person  3.178005
time   -> time    2.716374
location -> OTHER   0.057188
OTHER  -> location -0.033477

Top unlikely transitions:
event  -> person  -4.618084
event  -> quantity -4.649371
time   -> law     -4.748618
organization -> event   -4.763703
event  -> location -4.995439
organization -> law     -5.343159
person -> law     -6.000496
time   -> quantity -6.551308
organization -> time    -6.602369
quantity -> time    -8.047114


In [22]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
15.295689 person   word:pengarah
12.352726 location word:dibuat-buat
11.206675 organization word:pas
10.718764 person   word:solana
10.579257 person   word:anggodo
10.205311 location word:kenya
10.178896 time     word:jumat
10.138113 person   word:terpantas
9.938075 OTHER    word:sudah
9.896239 location word:pakistan
9.884769 location word:sandakan
9.582762 organization word:pdi-perjuangan
9.488252 organization word:hamas
9.469484 person   word:saan
9.420459 time     next_word:dihargai
9.386450 location word:berlin
9.210472 organization next_word:partai-partai
9.164932 organization word:mesa
9.045899 person   next_word:silahkan
9.008229 person   word:berkelulusan
8.812708 person   word:dinaungi
8.772850 location word:rusia
8.771755 organization word:suruhanjaya
8.768771 location word:polandia
8.729911 event    word:jakarta-palembang
8.662130 person   word:johan
8.523424 organization word:interpol
8.409965 location word:iran
8.399465 person   word:dipertuan
8.385403 organi

In [23]:
string = 'KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar  sekiranya mengantuk ketika memandu.'

In [25]:
processed_str = to_title(string.lower())
processed_str

'kuala lumpur sempena sambutan aidilfitri minggu depan perdana menteri tun dr mahathir mohamad dan menteri pengangkutan anthony loke siew fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing dalam video pendek terbitan jabatan keselamatan jalan raya jkjr itu dr mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar sekiranya mengantuk ketika memandu'

In [30]:
%%time

splitted = processed_str.split()
test = [features(splitted, index) for index in range(len(splitted))]
for no, i in enumerate(crf.predict_single(test)):
    print(splitted[no], i)

kuala location
lumpur location
sempena OTHER
sambutan event
aidilfitri event
minggu OTHER
depan OTHER
perdana person
menteri person
tun person
dr person
mahathir person
mohamad person
dan OTHER
menteri OTHER
pengangkutan OTHER
anthony person
loke person
siew person
fook person
menitipkan OTHER
pesanan OTHER
khas OTHER
kepada OTHER
orang OTHER
ramai OTHER
yang OTHER
mahu OTHER
pulang OTHER
ke OTHER
kampung location
halaman location
masing-masing OTHER
dalam OTHER
video OTHER
pendek OTHER
terbitan OTHER
jabatan organization
keselamatan organization
jalan organization
raya organization
jkjr organization
itu OTHER
dr person
mahathir person
menasihati OTHER
mereka OTHER
supaya OTHER
berhenti OTHER
berehat OTHER
dan OTHER
tidur OTHER
sebentar OTHER
sekiranya OTHER
mengantuk OTHER
ketika OTHER
memandu OTHER
CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 2.77 ms


In [32]:
import pickle

with open('crf-entities.pkl','wb') as fopen:
    pickle.dump(crf,fopen)

In [33]:
with open('crf-entities.pkl', 'rb') as fopen:
    test_crf = pickle.load(fopen)

In [34]:
for no, i in enumerate(test_crf.predict_single(test)):
    print(splitted[no], i)

kuala location
lumpur location
sempena OTHER
sambutan event
aidilfitri event
minggu OTHER
depan OTHER
perdana person
menteri person
tun person
dr person
mahathir person
mohamad person
dan OTHER
menteri OTHER
pengangkutan OTHER
anthony person
loke person
siew person
fook person
menitipkan OTHER
pesanan OTHER
khas OTHER
kepada OTHER
orang OTHER
ramai OTHER
yang OTHER
mahu OTHER
pulang OTHER
ke OTHER
kampung location
halaman location
masing-masing OTHER
dalam OTHER
video OTHER
pendek OTHER
terbitan OTHER
jabatan organization
keselamatan organization
jalan organization
raya organization
jkjr organization
itu OTHER
dr person
mahathir person
menasihati OTHER
mereka OTHER
supaya OTHER
berhenti OTHER
berehat OTHER
dan OTHER
tidur OTHER
sebentar OTHER
sekiranya OTHER
mengantuk OTHER
ketika OTHER
memandu OTHER
