In [3]:
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [0]:
import os
import numpy as np
import codecs
import pickle
import nltk
import pickle
from nltk.classify.maxent import MaxentClassifier, BinaryMaxentFeatureEncoding
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report

In [5]:
# Global variables
rawdata_path = "/content/gdrive/My Drive/ml/data/rawdata/"
data_path = "/content/gdrive/My Drive/ml/data/data/"
model_path = "/content/gdrive/My Drive/ml/model/"
labels = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O']
labels_dict = {labels[i]: i for i in range(len(labels))}
eval_labels = ['B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']
print(labels_dict)

{'B-PER': 0, 'I-PER': 1, 'B-ORG': 2, 'I-ORG': 3, 'B-LOC': 4, 'I-LOC': 5, 'O': 6}


In [0]:
def remove_xml_tags(filename):
  ''' 
  Remove xml tag in file in data folder(raw data)
  Args:
    filename: The name of the data file in dataVLSP folder
  Return:
    File of the same name has removed xml tags in data folder
  Example:
    <editor>Vietlex team, 8-2016</editor>
    -DOCSTART-
    <s>				
    Đó	P	B-NP	O	O
    là	V	B-VP	O	O
    con	Nc	B-NP	O	O
  :converted into:
    Đó	P	B-NP	O	O
    là	V	B-VP	O	O
    con	Nc	B-NP	O	O

    saved in dataVLSP folder(processed data)
  '''
  f1 = open(rawdata_path + filename, 'r',encoding='utf-8')
  f2 = open(data_path + filename, 'w+',encoding='utf-8')
  for line in f1:
    line.strip()
    if(('<title>' in line) or line.startswith('<e') or line.startswith('-D') or line.startswith('<s>')):
      pass
    elif(line.startswith('</')):
      f2.write(line.replace(line,'\n'))
    else:
      f2.write(line)
  f1.close()
  f2.close()

In [0]:
def clean_data(path):
  ''' 
  Remove xml tags of all files in the dataVLSP folder
  Processed data saved in data
  '''
  list_files = os.listdir(path)
  for file in list_files:
    remove_xml_tags(file)

In [0]:
def prepare_data(path):
    ''' Create training data and testing data
        Format of data: CoNLL

        Args:
        path: path of data folder
        scale: test size
        index_attri: Represents the number of attributes and the associated attribute type
            index_attri == 1 : The number of attributes = 1 - only ner label. ex: [('Huế', 'B_LOC'), ('là', 'O'), ('thành_phố', 'O'), ('đẹp', 'O')]
            index_attri == 2.1 : The number of attributes = 2(pos-tagging label, ner label). ex: [('Đó', 'P', 'O'), ('là', 'V',  'O'), ('con', 'Nc', 'O'), ('đường', 'N', , 'O')]
            index_attri = 2.2 : The number of attributes = 2(chunking label, ner label). ex: [('Đó', 'B-NP', 'O'), ('là', 'B-VP', 'O'), ('con', 'B-NP', 'O'), ('đường', 'B-NP', 'O')]
            index_attri = 3 : The number of attributes = 3(pos-tagging label,chunking, ner label). ex: [('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')]
            if index_attri not in {1,2.1,2,2,3} index_attri = 2.1
        Return:
        train_sents, test_sents
        
        Example of format data:
        [[('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')],
        [('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')],
    '''    
    list_files = os.listdir(path)
    all_data = []
    ''' Convert data format to CoNll '''
    #training data
    c = 0;
    pos_tag = []
    chunk_tag = []
    ne_tag = []
    for file in list_files:
        with codecs.open(path + file,'r',encoding='utf8') as f:
            sentence = []
            remove = False
            for line in f:
                line = line.split()
                if len(line) > 3:
                    #label_set.append(line[3])
                    if line[3] not in labels:
                        remove = True
                    else:
                        pos_tag.append(line[1])
                        chunk_tag.append(line[2])
                    sentence.append((line[0],line[1],line[2],line[3]))
                else:
                    if len(sentence) > 0:
                        if remove == False:                            
                            all_data.append(sentence)
                        else:
                            remove = False
                        sentence = []
            f.close()

    pos_tag = set(pos_tag)
    chunk_tag = set(chunk_tag)
    return  all_data, pos_tag, chunk_tag

In [0]:
def shape_feature(word):
    is_lower            = 'is_lower'
    is_capital          = 'is_capital' 
    is_title            = 'is_title' 
    is_mix              = 'is_mix' 
    is_capital_period   = 'is_capital_period' 
    is_digit            = 'is_digit' 
    end_digit           = 'end_digit' 
    has_hyphen          = 'has_hyphen' 
    is_code             = 'is_code' 
    num_syllabus        = 'num_syllabus'
    is_name             = 'is_name' 

    check_code = False
    for char in word:
        if char.isdigit():
            check_code = True
            break;

    ft = {
        'bias'                : 1,
        is_lower            : word.islower(),
        is_capital          : word.isupper(),
        is_title            : word.istitle(),
        is_mix              : not(word.islower() and word.isupper()),
        is_capital_period   : (('.' in word) and word[0].isupper()),
        is_digit            : word.isdigit(),
        end_digit           : word[-1].isdigit(),
        has_hyphen          : ('-' in word),
        is_code             : check_code,
        num_syllabus        : (word.count('_') + 1),
        is_name             : word[0].isupper()
    }   
    return ft

def word_feature(sent, i, pre_state, pre_pre_state):
    word = sent[i][0]
    ft = dict()
    ### basic feature 
    # current word
    ft['w0'] = word
    # previous entity tag
    ft['s-1'] = pre_state
    ### basic shape feature
    ft.update(shape_feature(word))
    #### basic joint feature
    if i > 0:
        ft['w-1'] = sent[i-1][0]
    else:
        ft['w-1'] = 'BOS'
    if i > 1:
        ft['w-2'] = sent[i-2][0]
    else:
        ft['w-2'] = 'BOS'
    if i < len(sent)-1:
        ft['w+1'] = sent[i+1][0]
    else:
        ft['w+1'] = 'EOS'
    if i < len(sent)-2:
        ft['w+2'] = sent[i+2][0]
    else:
        ft['w+2'] = 'EOS'
    return ft

def sent_feature_train(sent):
    sent_ft_train = list()
    for i in range(len(sent)):
        if i < 1:
            sent_ft_train.append((word_feature(sent, i, 'BOS', 'BOS'),
                                  labels_dict[sent[i][3]]))
        elif i < 2:
            sent_ft_train.append((word_feature(sent, i, sent[i-1][3], 'BOS'),
                                  labels_dict[sent[i][3]]))
        else:
            sent_ft_train.append((word_feature(sent, i, sent[i-1][3], sent[i-2][3]),
                                  labels_dict[sent[i][3]]))    
    return sent_ft_train

In [0]:
def sent_feature_test(sent, pre_state, pre_pre_state):
    sent_ft_test = list()
    for i in range(len(sent)):
        sent_ft_test.append(word_feature(sent, i, pre_state, pre_pre_state))    
    return sent_ft_test   

In [0]:
def viterbi_decoder(model, sent):
    alpha = [([0] * len(labels)) for i in range(len(sent))]
    trace = np.full(shape=(len(sent), len(labels)), fill_value=-1)

    # start probability
    pdist = model.prob_classify(word_feature(sent, 0, 'BOS', 'BOS'))    
    alpha[0] = [pdist.prob(l) for l in labels]
    
    for i in range(1, len(sent)):
        alpha[i] = [0] * len(labels)
        for j in range(len(labels)):
            pre_state = labels[j];
            pre_pre_state = 'BOS'
            if i > 1:
                pre_pre_state = labels[trace[i-1][j]];
            feature = word_feature(sent, i, pre_state, pre_pre_state)
            pdist = model.prob_classify(feature)                
            posterior = [pdist.prob(l) for l in labels]
            for k in range(len(labels)):
                if alpha[i][k] < (posterior[k] * alpha[i-1][j]):
                    alpha[i][k] = posterior[k] * alpha[i-1][j]
                    trace[i][k] = j
    m = alpha[-1][0]
    idx = 0
    for i in range(1, len(alpha[-1])):
        if (alpha[-1][i] > m):
            m = alpha[-1][i]
            idx = i;
    predict = list()
    for i in range(len(sent)-1, -1, -1):
        predict.append(labels[idx])
        idx = trace[i][idx]
    # print(alpha)
    return reversed(predict)

def predict_sent(model, sent):
    y_test_sent = [sent[i][3] for i in range(len(sent))]   
    y_pred_sent = viterbi_decoder(model, sent)
    return y_test_sent, y_pred_sent

def predict(model, sents):
    y_test = []
    y_pred = []
    for sent in sents:
        test, pred = predict_sent(model, sent)
        y_test.extend(test)
        y_pred.extend(pred)
    return y_test, y_pred   

In [12]:
all_data, pos_tag, chunk_tag = prepare_data(data_path)
train_sents, test_sents = train_test_split(all_data, test_size = 0.15, random_state=42)
print("train_sents", len(train_sents))
print("test_sents", len(test_sents))

train_sents 14087
test_sents 2486


In [16]:
train_data = []
for sent in train_sents:
    for feature, label in sent_feature_train(sent):
        train_data.append((feature, labels[label]))     

print('train_data length', len(train_data))
print(train_data[0])

train_data length 305800
({'w0': 'Ngoài', 's-1': 'BOS', 'bias': 1, 'is_lower': False, 'is_capital': False, 'is_title': True, 'is_mix': True, 'is_capital_period': False, 'is_digit': False, 'end_digit': False, 'has_hyphen': False, 'is_code': False, 'num_syllabus': 1, 'is_name': True, 'w-1': 'BOS', 'w-2': 'BOS', 'w+1': 'một_số', 'w+2': 'nhỏ'}, 'O')


In [17]:
%%time 
max_iter = 10
encoding = BinaryMaxentFeatureEncoding.train(train_data, count_cutoff=3, labels = labels, alwayson_features=True)
model = MaxentClassifier.train(train_data, algorithm = 'iis', trace=3, encoding=encoding, max_iter=max_iter)
# save model
pickle.dump(model, open(model_path + "mem-single-classifier-featureset1-binaryfeature-maxiter10.model", "wb"))

  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.94591        0.937
             2          -0.10011        0.937
             3          -0.09519        0.937
             4          -0.08642        0.941
             5          -0.07782        0.950
             6          -0.07047        0.960
             7          -0.06436        0.968
             8          -0.05929        0.974
             9          -0.05506        0.978
         Final          -0.05150        0.981
CPU times: user 34min 27s, sys: 3.47 s, total: 34min 30s
Wall time: 34min 34s


In [26]:
%%time
# test model
test_model = pickle.load(open(model_path + "mem-single-classifier-featureset1-binaryfeature-maxiter10.model", "rb"))
y_test, y_pred = predict(test_model, test_sents)
precision, recall, fscore, support = score(y_test, y_pred, labels=eval_labels)
print('labels:    {}'.format(eval_labels))
print('precision: {}'.format([str(round(p*100,2)) + '%' for p in precision]))
print('recall:    {}'.format([str(round(r*100,2)) + '%' for r in recall]))
print('fscore:    {}'.format([str(round(f*100,2)) + '%' for f in fscore]))
print('support:   {}'.format(support))
total_precision = metrics.precision_score(y_test, y_pred, average='weighted', labels=eval_labels)
total_recall = metrics.recall_score(y_test, y_pred, average='weighted', labels=eval_labels)
total_fscore = metrics.f1_score(y_test, y_pred, average='weighted', labels=eval_labels)
print('total precision (weighted): {}'.format(str(round(total_precision*100,2)) + '%'))
print('total recall (weighted): {}'.format(str(round(total_recall*100,2)) + '%'))
print('total fscore (weighted): {}'.format(str(round(total_fscore*100,2)) + '%'))

labels:    ['B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']
precision: ['91.03%', '93.65%', '88.12%', '87.5%', '88.57%', '90.28%']
recall:    ['74.93%', '78.52%', '30.72%', '9.68%', '18.79%', '22.34%']
fscore:    ['82.2%', '85.42%', '45.56%', '17.43%', '31.0%', '35.81%']
support:   [1057  526  869  434  165  291]
total precision (weighted): 90.04%
total recall (weighted): 48.17%
total fscore (weighted): 58.2%
CPU times: user 57.8 s, sys: 17 ms, total: 57.8 s
Wall time: 57.9 s


In [0]:
test_model.show_most_informative_features()

  -4.536 s-1=='O' and label is 'I-PER'
  -4.420 s-1=='O' and label is 'I-ORG'
  -3.836 is_name==False and label is 'B-PER'
  -3.750 s-1=='O' and label is 'I-LOC'
  -3.600 is_lower==True and label is 'B-PER'
  -3.546 s-1=='B-PER' and label is 'B-PER'
  -3.102 w0=='VN' and label is 'O'
  -2.956 s-1=='I-PER' and label is 'B-LOC'
   2.867 w0=='sư' and label is 'B-ORG'
  -2.755 w+1=='EOS' and label is 'B-LOC'
