In [1]:
import re

import numpy as np
import xgboost as xgb
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

try:
    # Home of train_test_split since scikit-learn 0.18.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import train_test_split



In [2]:
def process_string(string):
    """Split ``string`` into cleaned tokens.

    Every run of characters other than ASCII letters, digits, ``-``, ``/``
    and space is replaced by a single space, then the result is split on
    whitespace.

    Args:
        string: the text to tokenise.

    Returns:
        A list of non-empty tokens (empty list when nothing survives).
    """
    # Raw string: the previous plain literal relied on the invalid escape
    # sequences '\-' and '\/', which Python warns about.
    cleaned = re.sub(r'[^A-Za-z0-9\-/ ]+', ' ', string)
    # str.split() with no argument never yields tokens with surrounding
    # whitespace, so no per-token strip() is required.
    return cleaned.split()

In [3]:
def parse_raw(filename):
    """Read an inline-tagged corpus and return parallel token / label lists.

    The file is parsed with BeautifulSoup and re-serialised with
    ``prettify()``; the resulting text is then scanned line by line:

    * a line starting with ``</`` (closing tag) is ignored;
    * a line starting with ``<`` (opening tag) sets the label for the next
      line to the text between ``<`` and the first ``>``;
    * any other line is tokenised with ``process_string`` and labelled with
      the pending tag, or ``'OTHER'`` when no tag is pending.

    Args:
        filename: path of the annotated text file.

    Returns:
        ``(texts, labels)`` - two lists of equal length holding the tokens
        and the label of each token.
    """
    with open(filename, 'r') as fopen:
        entities = fopen.read()
    soup = BeautifulSoup(entities, 'html.parser')
    inside_tag = ''
    texts, labels = [], []
    for sentence in soup.prettify().split('\n'):
        if inside_tag:
            # The pending label applies to exactly one line.
            label, inside_tag = inside_tag, ''
            if sentence.startswith('</'):
                # Empty element: the line after the opening tag is already the
                # closing tag. Previously it was tokenised and its tag name
                # (e.g. '/person') was stored as an entity token.
                continue
            splitted = process_string(sentence)
            texts += splitted
            labels += [label] * len(splitted)
        elif sentence.startswith('</'):
            # Closing tag of an element whose text was already consumed.
            continue
        elif sentence.startswith('<'):
            # Opening tag: keep everything between '<' and the first '>'.
            inside_tag = sentence.split('>')[0][1:]
        else:
            # Untagged text.
            splitted = process_string(sentence)
            texts += splitted
            labels += ['OTHER'] * len(splitted)
    assert (len(texts)==len(labels)), "length texts and labels are not same"
    print('len texts and labels: ', len(texts))
    return texts,labels

In [4]:
# Parse the annotated training file into parallel token / label lists.
train_texts, train_labels = parse_raw('data_train.txt')

len texts and labels:  34012


In [5]:
# Parse the second annotated file and fold it into the training lists; the
# hold-out split is drawn later from the combined data.
# NOTE: this extends the lists in place, so re-running the cell appends the
# same tokens again.
test_texts, test_labels = parse_raw('data_test.txt')
train_texts.extend(test_texts)
train_labels.extend(test_labels)

len texts and labels:  9249


In [6]:
# Distinct labels and how many tokens carry each of them.
np.unique(train_labels,return_counts=True)

(array(['OTHER', 'location', 'organization', 'person', 'quantity', 'time'],
       dtype='<U12'), array([35613,  1536,  1592,  2358,  1336,   826]))

In [7]:
# Load the external entity list: one whitespace-separated "token TAG" entry
# per line. Iterating over the file keeps the final entry even when the file
# has no trailing newline.
with open('entities-bm-normalize-v3.txt','r') as fopen:
    entity_rows = [line.split() for line in fopen]

# Keep [token, tag] pairs, skipping blank or incomplete lines (these used to
# raise IndexError). The token 'jam' is always tagged TIME; the comparison is
# an equality test because `token in 'jam'` is a substring check that also
# matched 'a', 'am', 'j', 'm' and 'ja'.
entities_bm = [
    [row[0], 'TIME' if row[0] == 'jam' else row[1]]
    for row in entity_rows
    if len(row) >= 2
]

In [8]:
# Map the tag set of the external entity list onto this notebook's label names.
replace_by = {'LOC':'location','PRN':'person','NORP':'organization','ORG':'organization','LAW':'law',
             'EVENT':'OTHER','FAC':'organization','TIME':'time','O':'OTHER','ART':'person','DOC':'law'}
# NOTE: the loop appends to train_texts / train_labels in place, so re-running
# the cell adds the same entries again.
for entry in entities_bm:
    token, tag = entry[0], entry[1]
    pieces = process_string(token)
    if not pieces:
        # Nothing left after cleaning; skip without reporting.
        continue
    if tag not in replace_by:
        # Unmapped tag: report it and skip the entry. repr() reproduces the
        # text the former `except Exception: print(e)` handler printed for the
        # KeyError, without hiding unrelated errors.
        print(repr(tag))
        continue
    train_labels.append(replace_by[tag])
    # Only the first cleaned piece of the token is kept.
    train_texts.append(pieces[0])

assert (len(train_texts)==len(train_labels)), "length texts and labels are not same"

'KN'
'KA'


In [9]:
# Label distribution after adding the external entity list.
np.unique(train_labels,return_counts=True)

(array(['OTHER', 'law', 'location', 'organization', 'person', 'quantity',
        'time'], dtype='<U12'),
 array([47406,   107,  2010,  2435,  3913,  1336,  1240]))

In [10]:
# Keep the fitted encoder so the integer ids in `target` can be mapped back to
# label strings (label_encoder.classes_ / inverse_transform).
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(train_labels)

# Character 1- to 4-gram counts per token. fit_transform learns the vocabulary
# and builds the matrix in one pass instead of vectorising the corpus twice.
bow_chars = CountVectorizer(ngram_range=(1, 4), analyzer='char')
vectors = bow_chars.fit_transform(train_texts)
vectors.shape

(58447, 17758)

In [11]:
# Hold out 20% of the tokens. The seed is fixed so that the split, and every
# number reported from it, is the same on each run.
train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.2, random_state = 42)
# Free the full matrix; cell 10 must be re-run before this cell can run again.
del vectors

In [12]:
from sklearn import metrics

In [13]:
# Wrap the feature matrices and integer targets for XGBoost.
train_d = xgb.DMatrix(train_X, label=train_Y)
test_d = xgb.DMatrix(test_X, label=test_Y)

params_xgd = {
    # tree shape / regularisation
    'min_child_weight': 10.0,
    'max_depth': 14,
    # multi-class objective returning one probability per class
    'objective': 'multi:softprob',
    'max_delta_step': 1.8,
    'num_class': len(np.unique(target)),
    # row / column sampling and step size
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'learning_rate': 0.1,
    'gamma': 0.65,
    'silent':True,
    'eval_metric': 'mlogloss'
}

# The round budget is effectively unbounded; training stops once the
# validation mlogloss has not improved for 100 rounds.
# NOTE(review): the same held-out split drives early stopping here and is
# scored again in the evaluation cell, so those metrics are likely optimistic.
model = xgb.train(
    params_xgd,
    train_d,
    num_boost_round=100000,
    evals=[(test_d, 'validation')],
    early_stopping_rounds=100,
    verbose_eval=50,
)

[0]	validation-mlogloss:1.71443
Will train until validation-mlogloss hasn't improved in 100 rounds.
[50]	validation-mlogloss:0.454539
[100]	validation-mlogloss:0.402853
[150]	validation-mlogloss:0.383168
[200]	validation-mlogloss:0.371301
[250]	validation-mlogloss:0.362994
[300]	validation-mlogloss:0.356895
[350]	validation-mlogloss:0.351588
[400]	validation-mlogloss:0.347524
[450]	validation-mlogloss:0.344712
[500]	validation-mlogloss:0.342025
[550]	validation-mlogloss:0.339598
[600]	validation-mlogloss:0.337423
[650]	validation-mlogloss:0.335829
[700]	validation-mlogloss:0.334537
[750]	validation-mlogloss:0.333315
[800]	validation-mlogloss:0.332229
[850]	validation-mlogloss:0.331261
[900]	validation-mlogloss:0.330584
[950]	validation-mlogloss:0.329828
[1000]	validation-mlogloss:0.329271
[1050]	validation-mlogloss:0.328545
[1100]	validation-mlogloss:0.32779
[1150]	validation-mlogloss:0.327664
[1200]	validation-mlogloss:0.327273
[1250]	validation-mlogloss:0.326907
[1300]	validation-mlo

In [14]:
# Class probabilities from the best early-stopped iteration, reduced to the
# most probable class id per token.
class_probabilities = model.predict(xgb.DMatrix(test_X), ntree_limit=model.best_ntree_limit)
predicted = np.argmax(class_probabilities, axis=1)

# The display names come from the sorted unique label strings.
report = metrics.classification_report(test_Y, predicted, target_names = np.unique(train_labels))
print(report)

              precision    recall  f1-score   support

       OTHER       0.93      0.97      0.95      9505
         law       1.00      0.43      0.60        14
    location       0.68      0.61      0.64       389
organization       0.63      0.49      0.55       490
      person       0.83      0.71      0.76       779
    quantity       0.63      0.61      0.62       251
        time       0.84      0.60      0.70       262

 avg / total       0.90      0.90      0.90     11690



In [15]:
# Sample Malay-language news paragraph.
# NOTE(review): no cell visible in this export reads `news`, and cell In [16]
# is missing; presumably it was used for a manual prediction check - confirm.
news = 'ikat penyedia perkhidmatan jalur lebar Telekom Malaysia (TM) perlu mencari jalan penyelesaian bagi meningkatkan akses capaian Internet ke seluruh negara, kata Menteri Komunikasi dan Multimedia, Gobind Singh Deo. Beliau berkata menjadi dasar kerajaan untuk membekalkan akses Internet jalur lebar kepada semua dan memberi penekanan kepada kualiti perkhidmatan yang terbaik. "Dasar kerajaan untuk bekalkan akses kepada semua bukan sekadar pembekalan sahaja tetapi beri penekanan kepada kualiti perkhidmatan yang baik dan dapat bersaing dengan negara lain pada tahap antarabangsa," kata Gobind Singh menerusi catatan di laman rasmi Twitter beliau, malam tadi. Beliau berkata demikian sebagai respons terhadap aduan beberapa pengguna Twitter berhubung akses Internet yang masih tidak stabil serta harga yang tidak berpatutan di beberapa lokasi di seluruh negara.'

In [17]:
import pickle

# Persist the trained booster and the fitted character n-gram vectorizer, one
# pickle file each.
for artifact_path, artifact in (
    ('xgb-entities.pkl', model),
    ('xgb-bow-entities.pkl', bow_chars),
):
    with open(artifact_path, 'wb') as fopen:
        pickle.dump(artifact, fopen)