In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive/


In [0]:
import os
import numpy as np
import codecs
import pickle
import nltk
import pickle
from nltk.classify.maxent import MaxentClassifier, BinaryMaxentFeatureEncoding
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report

In [3]:
# Global variables
rawdata_path = "/content/gdrive/My Drive/ml/data/rawdata/"
data_path = "/content/gdrive/My Drive/ml/data/data/"
model_path = "/content/gdrive/My Drive/ml/model/"
labels = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O']
labels_dict = {labels[i]: i for i in range(len(labels))}
eval_labels = ['B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']
print(labels_dict)

{'B-PER': 0, 'I-PER': 1, 'B-ORG': 2, 'I-ORG': 3, 'B-LOC': 4, 'I-LOC': 5, 'O': 6}


In [0]:
def remove_xml_tags(filename):
  ''' 
  Remove xml tag in file in data folder(raw data)
  Args:
    filename: The name of the data file in dataVLSP folder
  Return:
    File of the same name has removed xml tags in data folder
  Example:
    <editor>Vietlex team, 8-2016</editor>
    -DOCSTART-
    <s>				
    Đó	P	B-NP	O	O
    là	V	B-VP	O	O
    con	Nc	B-NP	O	O
  :converted into:
    Đó	P	B-NP	O	O
    là	V	B-VP	O	O
    con	Nc	B-NP	O	O

    saved in dataVLSP folder(processed data)
  '''
  f1 = open(rawdata_path + filename, 'r',encoding='utf-8')
  f2 = open(data_path + filename, 'w+',encoding='utf-8')
  for line in f1:
    line.strip()
    if(('<title>' in line) or line.startswith('<e') or line.startswith('-D') or line.startswith('<s>')):
      pass
    elif(line.startswith('</')):
      f2.write(line.replace(line,'\n'))
    else:
      f2.write(line)
  f1.close()
  f2.close()

In [0]:
def clean_data(path):
  ''' 
  Remove xml tags of all files in the dataVLSP folder
  Processed data saved in data
  '''
  list_files = os.listdir(path)
  for file in list_files:
    remove_xml_tags(file)

In [0]:
def prepare_data(path):
    ''' Create training data and testing data
        Format of data: CoNLL

        Args:
        path: path of data folder
        scale: test size
        index_attri: Represents the number of attributes and the associated attribute type
            index_attri == 1 : The number of attributes = 1 - only ner label. ex: [('Huế', 'B_LOC'), ('là', 'O'), ('thành_phố', 'O'), ('đẹp', 'O')]
            index_attri == 2.1 : The number of attributes = 2(pos-tagging label, ner label). ex: [('Đó', 'P', 'O'), ('là', 'V',  'O'), ('con', 'Nc', 'O'), ('đường', 'N', , 'O')]
            index_attri = 2.2 : The number of attributes = 2(chunking label, ner label). ex: [('Đó', 'B-NP', 'O'), ('là', 'B-VP', 'O'), ('con', 'B-NP', 'O'), ('đường', 'B-NP', 'O')]
            index_attri = 3 : The number of attributes = 3(pos-tagging label,chunking, ner label). ex: [('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')]
            if index_attri not in {1,2.1,2,2,3} index_attri = 2.1
        Return:
        train_sents, test_sents
        
        Example of format data:
        [[('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')],
        [('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')],
    '''    
    list_files = os.listdir(path)
    all_data = []
    ''' Convert data format to CoNll '''
    #training data
    c = 0;
    pos_tag = []
    chunk_tag = []
    ne_tag = []
    for file in list_files:
        with codecs.open(path + file,'r',encoding='utf8') as f:
            sentence = []
            remove = False
            for line in f:
                line = line.split()
                if len(line) > 3:
                    #label_set.append(line[3])
                    if line[3] not in labels:
                        remove = True
                    else:
                        pos_tag.append(line[1])
                        chunk_tag.append(line[2])
                    sentence.append((line[0],line[1],line[2],line[3]))
                else:
                    if len(sentence) > 0:
                        if remove == False:                            
                            all_data.append(sentence)
                        else:
                            remove = False
                        sentence = []
            f.close()

    pos_tag = set(pos_tag)
    chunk_tag = set(chunk_tag)
    return  all_data, pos_tag, chunk_tag

In [0]:
def shape_feature(word):
    is_lower            = 'is_lower'
    is_capital          = 'is_capital' 
    is_title            = 'is_title' 
    is_mix              = 'is_mix' 
    is_capital_period   = 'is_capital_period' 
    is_digit            = 'is_digit' 
    end_digit           = 'end_digit' 
    has_hyphen          = 'has_hyphen' 
    is_code             = 'is_code' 
    num_syllabus        = 'num_syllabus'
    is_name             = 'is_name' 

    check_code = False
    for char in word:
        if char.isdigit():
            check_code = True
            break;

    ft = {
        'bias'                : 1,
        is_lower            : word.islower(),
        is_capital          : word.isupper(),
        is_title            : word.istitle(),
        is_mix              : not(word.islower() and word.isupper()),
        is_capital_period   : (('.' in word) and word[0].isupper()),
        is_digit            : word.isdigit(),
        end_digit           : word[-1].isdigit(),
        has_hyphen          : ('-' in word),
        is_code             : check_code,
        num_syllabus        : (word.count('_') + 1),
        is_name             : word[0].isupper()
    }   
    return ft

def word_feature(sent, i, pre_state, pre_pre_state, sent_re_ft):
    word = sent[i][0]
    ft = dict()
    ### basic feature 
    # current word
    ft['w0'] = word
    # previous entity tag
    ft['s-1'] = pre_state
    ft['s-2'] = pre_pre_state
    ### basic shape feature
    ft.update(shape_feature(word))
    ### basic joint feature
    if i > 0:
        ft['w-1'] = sent[i-1][0]
    else:
        ft['w-1'] = 'BOS'
    if i > 1:
        ft['w-2'] = sent[i-2][0]
    else:
        ft['w-2'] = 'BOS'
    if i < len(sent)-1:
        ft['w+1'] = sent[i+1][0]
    else:
        ft['w+1'] = 'EOS'
    if i < len(sent)-2:
        ft['w+2'] = sent[i+2][0]
    else:
        ft['w+2'] = 'EOS'
    ### regular expression type
    ft['r0'] = sent_re_ft[i]
    if i > 0:
        ft['r-1'] = sent_re_ft[i-1]
    else:
        ft['r-1'] = 'BOS'
    if i < len(sent)-1:
        ft['r+1'] = sent_re_ft[i+1]
    else:        
        ft['r+1'] = 'EOS'
    if i > 1:
        ft['r-2'] = sent_re_ft[i-2]
    else:
        ft['r-2'] = 'BOS'
    if i < len(sent)-2:
        ft['r+2'] = sent_re_ft[i+2]
    else:
        ft['r+2'] = 'EOS'
    return ft
re_adm_div      = ['ấp', 'buôn', 'bản', 'huyện', 'làng', 'miền', 'nước', 
                   'phường', 'quận', 'tỉnh', 'thành_phố', 'thị_trấn', 'thị_xã', 
                   'thôn', 'TT', 'TP', 'TX', 'TT.', 'TP.', 'TX.', 'xứ', 'xã', 
                   'xóm']
re_org          = ['báo', 'bệnh_viện', 'bệnh_xá', 'công_ty', 'công_ti', 'đài', 'đảng', 'đoàn', 'hội', 'hợp_tác_xã', 'khách_sạn', 'nhà_máy', 'nhà_xuất_bản', 'ngân_hàng', 'quỹ', 'tạp_chí', 'tập đoàn', 'thông_tấn_xã', 'tờ', 'trạm_xá', 'xí_nghiệp','ủy_ban']
re_school       = ['mẫu_giáo', 'tiểu_học', 'trung_học', 'trung_học_cơ_sở', 
                   'trung_học_phổ_thông', 'cao_đẳng', 'trung_cấp', 
                   'trung_cấp_nghề', 'đại_học']
re_street       = ['đại_lộ', 'đường', 'hẻm', 'ngách', 'ngõ', 'nhà', 'phố', 'quốc_lộ']
re_place        = ['ao', 'am', 'bến', 'bến_cảng', 'bến_phà','biển', 'cảng', 
                   'cầu', 'công_viên', 'chợ', 'chùa', 'dãy', 'đảo', 'đầm', 'đèo', 
                   'đền', 'đình', 'đồi', 'động', 'đồng_bằng', 'gềnh', 'gò', 'khu', 'hòn', 'hồ', 
                   'lăng', 'miếu', 'miền', 'nhà_ga', 'núi', 'phà', 'quần_đảo', 
                   'sân_bay', 'sông', 'suối', 'vùng']
re_office       = ['ban', 'bộ', 'chi_cục', 'cục', 'hạt', 'sở']
re_army         = ['binh_đoàn', 'đại_đội', 'đặc_khu', 'đơn_vị', 'lữ_đoàn', 'quân_đoàn', 'quân_đội', 'quân_khu','sư_đoàn', 'tiểu_đội', 'tiểu_đoàn', 'trung_đội']

def re_word(word):
    """
        Return a dict of (regexp Name, regexp Value) of a word
        :type word: string
        :param word: a word in sentence
    """

    check_code = False
    for char in word:
        if char.isdigit():
            check_code = True
            break

    re_dict = dict()
    re_dict['org'] = word.lower() in re_org
    re_dict['name'] = word[0].isupper()
    re_dict['capital'] = word.isupper()
    re_dict['adm_div'] = word.lower() in re_adm_div
    re_dict['is_school'] = word.lower() == 'trường'
    re_dict['school'] = word.lower() in re_school
    re_dict['street'] = word.lower() in re_street
    re_dict['digit'] = word.isdigit()
    re_dict['code'] = check_code
    re_dict['place'] =  word in re_place
    re_dict['office'] = word in re_office
    re_dict['army'] = word in re_army
    return re_dict 

re_type_name = [ 
    ('ofice_name_admdiv_name', ['office', 'name', 'adm_div', 'name']),
    ('school_type_name_name'     , ['is_school', 'school', 'name', 'name']),
    ('school_capital_name_name'  , ['is_school', 'capital', 'name', 'name']),
    ('org_cap_name_name'         , ['org', 'capital', 'name', 'name']),
    ('org_adm_div'          , ['capital', 'adm_div', 'name']),
    ('school_type_name'     , ['is_school', 'school', 'name']),
    ('school_capital_name'  , ['is_school', 'capital', 'name']),
    ('org_cap_name'         , ['org', 'capital', 'name']),
    ('place_name_name'           , ['place', 'name', 'name']),
    ('place_name'           , ['place', 'name']),
    ('org_name'                  , ['org', 'name', 'name']),
    ('school_name_name'          , ['school', 'name', 'name']),
    ('office_name_name'               , ['office', 'name', 'name']),
    ('street_name_name'          , ['street', 'name', 'name']),
    ('org'                  , ['org', 'name']),
    ('school_name'          , ['school', 'name']),
    ('adm_div'              , ['adm_div', 'name']),
    ('office_name'               , ['office', 'name']),
    ('street_name'          , ['street', 'name']),
    ('street_digit'         , ['street', 'digit']),
    ('street_code'          , ['street', 'code']),
    ('army_name'            , ['army', 'code']),
    ('army_name'            , ['army', 'digit']),
    ('army_name'            , ['army', 'name'])
]

def sent_re_feature(sent):
    l = len(sent)
    sent_re_ft = ['NA'] * l
    re_dict_word = [re_word(word[0]) for word in sent]
    for type_name in re_type_name:
        tl = len(type_name[1])
        for i in range(l):
            if (i + tl <= l):
                if (set(['NA']) == set([sent_re_ft[i+j] for j in range(tl)])) and (set([True]) == set([re_dict_word[i+ll][type_name[1][ll]] for ll in range(tl)])):
                    for k in range(tl):
                        sent_re_ft[i+k] = type_name[0]
    return sent_re_ft

def sent_feature_train(sent):
    sent_ft_train = list()
    sent_re_ft = sent_re_feature(sent)
    for i in range(len(sent)):
        if i < 1:
            sent_ft_train.append((word_feature(sent, i, 'BOS', 'BOS', sent_re_ft),
                                  labels_dict[sent[i][3]]))
        elif i < 2:
            sent_ft_train.append((word_feature(sent, i, sent[i-1][3], 'BOS', sent_re_ft),
                                  labels_dict[sent[i][3]]))
        else:
            sent_ft_train.append((word_feature(sent, i, sent[i-1][3], sent[i-2][3], sent_re_ft),
                                  labels_dict[sent[i][3]]))    
    return sent_ft_train          

In [0]:
def sent_feature_test(sent, pre_state, pre_pre_state):
    sent_ft_test = list()
    sent_re_ft = sent_re_feature(sent)
    for i in range(len(sent)):
        sent_ft_test.append(word_feature(sent, i, pre_state, pre_pre_state, sent_re_ft))    
    return sent_ft_test   

In [0]:
def viterbi_decoder(model, sent):
    sent_re_ft = sent_re_feature(sent)
    alpha = [([0] * len(labels)) for i in range(len(sent))]
    trace = np.full(shape=(len(sent), len(labels)), fill_value=-1)

    # start probability
    pdist = model.prob_classify(word_feature(sent, 0, 'BOS', 'BOS', sent_re_ft))    
    alpha[0] = [pdist.prob(l) for l in labels]
    
    for i in range(1, len(sent)):
        alpha[i] = [0] * len(labels)
        for j in range(len(labels)):
            pre_state = labels[j];
            pre_pre_state = 'BOS'
            if i > 1:
                pre_pre_state = labels[trace[i-1][j]];
            feature = word_feature(sent, i, pre_state, pre_pre_state, sent_re_ft)
            pdist = model.prob_classify(feature)                
            posterior = [pdist.prob(l) for l in labels]
            for k in range(len(labels)):
                if alpha[i][k] < (posterior[k] * alpha[i-1][j]):
                    alpha[i][k] = posterior[k] * alpha[i-1][j]
                    trace[i][k] = j
    m = alpha[-1][0]
    idx = 0
    for i in range(1, len(alpha[-1])):
        if (alpha[-1][i] > m):
            m = alpha[-1][i]
            idx = i;
    predict = list()
    for i in range(len(sent)-1, -1, -1):
        predict.append(labels[idx])
        idx = trace[i][idx]
    # print(alpha)
    return reversed(predict)

def predict_sent(model, sent):
    y_test_sent = [sent[i][3] for i in range(len(sent))]   
    y_pred_sent = viterbi_decoder(model, sent)
    return y_test_sent, y_pred_sent

def predict(model, sents):
    y_test = []
    y_pred = []
    for sent in sents:
        test, pred = predict_sent(model, sent)
        y_test.extend(test)
        y_pred.extend(pred)
    return y_test, y_pred            

In [10]:
all_data, pos_tag, chunk_tag = prepare_data(data_path)
train_sents, test_sents = train_test_split(all_data, test_size = 0.15, random_state=42)
print("train_sents", len(train_sents))
print("test_sents", len(test_sents))

train_sents 14087
test_sents 2486


In [11]:
train_data = []
for sent in train_sents:
    for feature, label in sent_feature_train(sent):
        train_data.append((feature, labels[label]))     

print('train_data length', len(train_data))
print(train_data[0])

train_data length 305800
({'w0': 'Ngoài', 's-1': 'BOS', 's-2': 'BOS', 'bias': 1, 'is_lower': False, 'is_capital': False, 'is_title': True, 'is_mix': True, 'is_capital_period': False, 'is_digit': False, 'end_digit': False, 'has_hyphen': False, 'is_code': False, 'num_syllabus': 1, 'is_name': True, 'w-1': 'BOS', 'w-2': 'BOS', 'w+1': 'một_số', 'w+2': 'nhỏ', 'r0': 'NA', 'r-1': 'BOS', 'r+1': 'NA', 'r-2': 'BOS', 'r+2': 'NA'}, 'O')


In [12]:
%%time 
max_iter = 10
encoding = BinaryMaxentFeatureEncoding.train(train_data, count_cutoff=3, labels = labels, alwayson_features=True)
model = MaxentClassifier.train(train_data, algorithm = 'iis', trace=3, encoding=encoding, max_iter=max_iter)
# save model
pickle.dump(model, open(model_path + "mem-single-classifier-featureset2-binaryfeature-maxiter10.model", "wb"))

  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.94591        0.937
             2          -0.10393        0.937
             3          -0.09984        0.937
             4          -0.09194        0.938
             5          -0.08306        0.945
             6          -0.07500        0.953
             7          -0.06811        0.960
             8          -0.06233        0.967
             9          -0.05748        0.973
         Final          -0.05338        0.977
CPU times: user 51min 19s, sys: 5.1 s, total: 51min 24s
Wall time: 51min 32s


In [15]:
%%time
# test model
test_model = pickle.load(open(model_path + "mem-single-classifier-featureset2-binaryfeature-maxiter10.model", "rb"))
y_test, y_pred = predict(test_model, test_sents)
precision, recall, fscore, support = score(y_test, y_pred, labels=eval_labels)
print('labels:    {}'.format(eval_labels))
print('precision: {}'.format([str(round(p*100,2)) + '%' for p in precision]))
print('recall:    {}'.format([str(round(r*100,2)) + '%' for r in recall]))
print('fscore:    {}'.format([str(round(f*100,2)) + '%' for f in fscore]))
print('support:   {}'.format(support))
total_precision = metrics.precision_score(y_test, y_pred, average='weighted', labels=eval_labels)
total_recall = metrics.recall_score(y_test, y_pred, average='weighted', labels=eval_labels)
total_fscore = metrics.f1_score(y_test, y_pred, average='weighted', labels=eval_labels)
print('total precision (weighted): {}'.format(str(round(total_precision*100,2)) + '%'))
print('total recall (weighted): {}'.format(str(round(total_recall*100,2)) + '%'))
print('total fscore (weighted): {}'.format(str(round(total_fscore*100,2)) + '%'))

labels:    ['B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']
precision: ['91.25%', '97.34%', '86.93%', '93.5%', '82.22%', '82.8%']
recall:    ['64.14%', '76.43%', '35.21%', '43.09%', '22.42%', '26.46%']
fscore:    ['75.33%', '85.62%', '50.12%', '58.99%', '35.24%', '40.1%']
support:   [1057  526  869  434  165  291]
total precision (weighted): 90.2%
total recall (weighted): 50.48%
total fscore (weighted): 63.23%
CPU times: user 1min 20s, sys: 27 ms, total: 1min 20s
Wall time: 1min 20s


In [16]:
test_model.show_most_informative_features()

  -3.565 s-1=='O' and label is 'I-PER'
  -3.310 s-1=='O' and label is 'I-ORG'
  -3.138 is_name==False and label is 'B-PER'
  -3.072 is_lower==True and label is 'B-PER'
  -2.842 s-1=='O' and label is 'I-LOC'
  -2.768 s-1=='B-PER' and label is 'B-PER'
  -2.453 w0=='VN' and label is 'O'
  -2.255 is_title==False and label is 'B-PER'
  -2.188 w-1=='Nguyễn' and label is 'O'
   2.184 w0=='sư' and label is 'B-ORG'


In [18]:
%%time 
max_iter = 50
encoding = BinaryMaxentFeatureEncoding.train(train_data, count_cutoff=3, labels = labels, alwayson_features=True)
model = MaxentClassifier.train(train_data, algorithm = 'iis', trace=3, encoding=encoding, max_iter=max_iter)
# save model
pickle.dump(model, open(model_path + "mem-single-classifier-featureset2-binaryfeature-maxiter50.model", "wb"))

  ==> Training (50 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.94591        0.937
             2          -0.10393        0.937
             3          -0.09984        0.937
             4          -0.09194        0.938
             5          -0.08306        0.945
             6          -0.07500        0.953
             7          -0.06811        0.960
             8          -0.06233        0.967
             9          -0.05748        0.973
            10          -0.05338        0.977
            11          -0.04990        0.980
            12          -0.04691        0.982
            13          -0.04433        0.984
            14          -0.04208        0.985
            15          -0.04010        0.986
            16          -0.03835        0.987
            17          -0.03679        0.988
            18          -0.03539        0.989
            19          -0.03413        0.989
  

In [20]:
%%time
# test model
test_model = pickle.load(open(model_path + "mem-single-classifier-featureset2-binaryfeature-maxiter50.model", "rb"))
y_test, y_pred = predict(test_model, test_sents)
precision, recall, fscore, support = score(y_test, y_pred, labels=eval_labels)
print('labels:    {}'.format(eval_labels))
print('precision: {}'.format([str(round(p*100,2)) + '%' for p in precision]))
print('recall:    {}'.format([str(round(r*100,2)) + '%' for r in recall]))
print('fscore:    {}'.format([str(round(f*100,2)) + '%' for f in fscore]))
print('support:   {}'.format(support))
total_precision = metrics.precision_score(y_test, y_pred, average='weighted', labels=eval_labels)
total_recall = metrics.recall_score(y_test, y_pred, average='weighted', labels=eval_labels)
total_fscore = metrics.f1_score(y_test, y_pred, average='weighted', labels=eval_labels)
print('total precision (weighted): {}'.format(str(round(total_precision*100,2)) + '%'))
print('total recall (weighted): {}'.format(str(round(total_recall*100,2)) + '%'))
print('total fscore (weighted): {}'.format(str(round(total_fscore*100,2)) + '%'))

labels:    ['B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']
precision: ['89.89%', '95.99%', '89.56%', '90.71%', '80.34%', '77.98%']
recall:    ['90.82%', '91.06%', '79.98%', '67.51%', '56.97%', '58.42%']
fscore:    ['90.35%', '93.46%', '84.5%', '77.41%', '66.67%', '66.8%']
support:   [1057  526  869  434  165  291]
total precision (weighted): 89.36%
total recall (weighted): 80.52%
total fscore (weighted): 84.42%
CPU times: user 1min 19s, sys: 16 ms, total: 1min 19s
Wall time: 1min 19s


In [21]:
test_model.show_most_informative_features()

  -9.241 s-1=='O' and label is 'I-PER'
  -8.704 s-1=='O' and label is 'I-ORG'
   8.527 w-1=='Ẩn' and label is 'B-PER'
  -7.773 s-1=='B-PER' and label is 'B-PER'
  -7.646 w0=='VN' and label is 'O'
  -7.542 s-1=='O' and label is 'I-LOC'
   6.913 w0=='tổ_hợp' and label is 'B-ORG'
   6.855 w-1=='Nhà' and label is 'I-ORG'
   6.701 w-1=='văn_nghệ' and label is 'B-ORG'
   6.255 w0=='út' and label is 'B-PER'
