In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Word2Vec

#tune hyperparam
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

import numpy as np

In [0]:
class Config():
  # directory of raw VLSP data
  rawdata_path = '/content/drive/My Drive/NLP/NER/dataVLSP/'
  # directory of VLSP data  has been removed the xml tags
  data_path = '/content/drive/My Drive/NLP/NER/data/'
  # word2vec trained file
  embedd_pathfile = '/content/drive/My Drive/NLP/NER/word2vec.ipynb'
  #labels
  labels = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O']

config = Config()

In [0]:
#clean data

import codecs
import os

config = Config()

def remove_xml_tags(filename):
  ''' 
  Remove xml tag in file in data folder(raw data)
  Args:
    filename: The name of the data file in dataVLSP folder
  Return:
    File of the same name has removed xml tags in data folder
  Example:
    <editor>Vietlex team, 8-2016</editor>
    -DOCSTART-
    <s>				
    Đó	P	B-NP	O	O
    là	V	B-VP	O	O
    con	Nc	B-NP	O	O
  :converted into:
    Đó	P	B-NP	O	O
    là	V	B-VP	O	O
    con	Nc	B-NP	O	O

    saved in dataVLSP folder(processed data)
  '''
  f1 = open(config.rawdata_path + filename, 'r',encoding='utf-8')
  f2 = open(config.data_path + filename, 'w+',encoding='utf-8')
  for line in f1:
    line.strip()
    if(('<title>' in line) or line.startswith('<e') or line.startswith('-D') or line.startswith('<s>')):
      pass
    elif(line.startswith('</')):
      f2.write(line.replace(line,'\n'))
    else:
      f2.write(line)
  f1.close()
  f2.close()

def clean_data(path):
  ''' 
  Remove xml tags of all files in the dataVLSP folder
  Processed data saved in data
  '''
  list_files = os.listdir(path)
  for file in list_files:
    remove_xml_tags(file)

  clean_data(config.rawdata_path)

In [0]:
from sklearn.model_selection import train_test_split
import os
import codecs
config = Config()
def prepare_data(path, scale,index_attri):
  ''' Create training data and testing data
      Format of data: CoNLL

      Args:
        path: path of data folder
        scale: test size
        index_attri: Represents the number of attributes and the associated attribute type
          index_attri == 1 : The number of attributes = 1 - only ner label. ex: [('Huế', 'B_LOC'), ('là', 'O'), ('thành_phố', 'O'), ('đẹp', 'O')]
          index_attri == 2.1 : The number of attributes = 2(pos-tagging label, ner label). ex: [('Đó', 'P', 'O'), ('là', 'V',  'O'), ('con', 'Nc', 'O'), ('đường', 'N', , 'O')]
          index_attri = 2.2 : The number of attributes = 2(chunking label, ner label). ex: [('Đó', 'B-NP', 'O'), ('là', 'B-VP', 'O'), ('con', 'B-NP', 'O'), ('đường', 'B-NP', 'O')]
          index_attri = 3 : The number of attributes = 3(pos-tagging label,chunking, ner label). ex: [('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')]
          if index_attri not in {1,2.1,2,2,3} index_attri = 2.1
      Return:
        train_sents, test_sents
      
      Example of format data:
      [[('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')],
      [('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')],
      ...
      ]

  '''

  # check index_attri
  if index_attri not in {1,2.1,2,2,3} :
    index_attri = 2.1
  # split data by file
  list_files = os.listdir(path)
  # train_files, test_files = train_test_split(list_files,test_size=scale,random_state=42)
  all_data = []

  ''' Convert data format to CoNll '''
  #training data
  for file in list_files:
    with codecs.open(path + file,'r',encoding='utf8') as f:
      sentence = []
      remove = False;
      for line in f:
        line = line.split()
        if len(line) > 3 :
          if(line[3] not in config.labels):
            remove = True
          else:
            if index_attri == 1:
                sentence.append((line[0],line[3]))
            elif index_attri == 2.2:
              sentence.append((line[0],line[2],line[3]))
            elif index_attri == 3:
              sentence.append((line[0],line[1],line[2],line[3]))
            else:
              sentence.append((line[0],line[1],line[3]))
        else:
          if len(sentence) > 0:
            if remove == False:                            
              all_data.append(sentence)
            else:
              remove = False
            sentence = []
    f.close()
  
  train_sent_data, test_sent_data = train_test_split(all_data,test_size=scale,random_state=42)
  return  train_sent_data,test_sent_data
  

In [0]:
train_sents,test_sents = prepare_data(config.data_path,0.15,2.1)
print(len(train_sents))
print(len(test_sents))

14087
2486


In [0]:
pip install sklearn_crfsuite



In [0]:
pip install eli5



In [0]:
cd '/content/drive/My Drive/NLP/NER/'

/content/drive/My Drive/NLP/NER


In [0]:
def is_mix(word):
  return not(word.islower() and word.isupper())   

def check_code(word):
  check_code = False
  for char in word:
      if char.isdigit():
          check_code = True
          break; 
  return check_code    

In [0]:
import pandas as pd
import pycrfsuite
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
import re
def word2feature(sent, i):
  word = sent[i][0]
  
  # if '_' in word:
    # word.replace('_', ' ')
  #print(sent[i],'\n')
  Tag = sent[i][2]

  features = ([
      'bias',
      'sentence_[-3:]='+word[-3:],
      'sentence_[-2:]='+word[-2:],
      'sentence_.lower=%s'%word.lower(),
      'sentence_.isupper=%s'%word.isupper(),
      'sentence_.istitle=%s' % word.istitle(),
      'sentence_.is_mix=%s' % is_mix(word),
      'sentence_.is_capital_period=%s' % (('.' in word) and word[0].isupper()),
      'sentence_.isdigit=%s' % word.isdigit(),
      'sentence_end_digit=%s' %word[-1].isdigit(),
      'sentence_.hashyphen=%s' % word.find('-'),
      'sentence_.is_code=%s' % check_code(word),
      'sentence_.num_syllabus=%s' %(word.count('_') + 1),
      'sentence_.is_name=%s' % word[0].isupper(),
       #'tag='+ Tag
  ])
  '''
  wordembdding=get_features(word)
  for iv,value in enumerate(wordembdding):
    features.extend(['v{}'.format(iv) +': '+ str(value)])
  '''
  if i > 0:
    word1 = sent[i - 1][0]
    if '_' in word1:
      word.replace('_', ' ')
    #Tag1 = sent[i - 1][2]
    features.extend([
        '-1:sentence_.lower=' + word1.lower(),
        '-1:sentence_.istitle=%s' % word1.istitle(),
        '-1:sentence_.isupper=%s' % word1.isupper(),
        '-1:sentence_.isdigit=%s' % word1.isdigit(),
        #'-1:tag=' + Tag1,
        # '-1:tag[:2]=' + Tag1[:2],
    ])
  else:
    features.append('BOS')
  if i < len(sent)-1:
    word1 = sent[i+1][0]
    if '_' in word1:
      word.replace('_', ' ')
    #Tag1 = sent[i+1][2]
    # print(word1,'_',Tag1)

    features.extend([
        '+1:sentence_.lower=' + word1.lower(),
        '+1:sentence_.istitle=%s' % word1.istitle(),
        '+1:sentence_.isupper=%s' % word1.isupper(),
        '+1:sentence_.isdigit=%s' % word1.isdigit(),
        #'+1:tag=' + Tag1,
        # '+1:tag[:2]=' + Tag1[:2],
    ])
  else:
    features.append('EOS')

  return features



def sent2features(sent):
  return [word2feature(sent, i) for i in range(len(sent))]

def sent2labels(sent):
  return [label for token, postag, label in sent]

def sent2tokens(sent):
  return [token for token, postag, label in sent]

In [0]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

In [0]:
X_train[0][1]

['bias',
 'sentence_[-3:]=đẹp',
 'sentence_[-2:]=ẹp',
 'sentence_.lower=xinh_đẹp',
 'sentence_.isupper=False',
 'sentence_.istitle=False',
 'sentence_.is_mix=True',
 'sentence_.is_capital_period=False',
 'sentence_.isdigit=False',
 'sentence_end_digit=False',
 'sentence_.hashyphen=-1',
 'sentence_.is_code=False',
 'sentence_.num_syllabus=2',
 'sentence_.is_name=False',
 '-1:sentence_.lower=đào',
 '-1:sentence_.istitle=True',
 '-1:sentence_.isupper=False',
 '-1:sentence_.isdigit=False',
 '+1:sentence_.lower=,',
 '+1:sentence_.istitle=False',
 '+1:sentence_.isupper=False',
 '+1:sentence_.isdigit=False']

In [0]:
import nltk
import sklearn_crfsuite
import eli5
crf = sklearn_crfsuite.CRF(
  algorithm='lbfgs',
  c1=0.1,
  c2= 1e-17,
  max_iterations=50,
)


In [0]:
a = 1331
#crf.fit(X_train,y_train)
#crf.fit(X_train[:a],y_train[:a])
crf.fit(X_train[a+1:]+ X_train[:a], y_train[a+1:]+y_train[:a])
#crf.fit(X_train[a+1:], y_train[a+1:])



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=None,
    averaging=None, c=None, c1=0.1, c2=1e-17, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=50,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [0]:
# Save Model Using joblib
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import joblib

# save the model to disk
filename = 'base_model1.sav'
joblib.dump(crf, filename)
  
# load the model from disk
#loaded_model = joblib.load(filename)
# result = loaded_model.score(X_test, Y_test)
#print(result)

['base_model1.sav']

In [0]:
eli5.show_weights(crf, top=30)



From \ To,O,B-LOC,I-LOC,B-ORG,I-ORG,B-PER,I-PER
O,0.911,0.895,-5.015,0.587,-6.142,0.793,-4.939
B-LOC,-0.617,-0.341,5.18,-0.551,0.0,-3.362,0.0
I-LOC,-0.336,0.44,5.415,0.0,0.0,-2.719,0.0
B-ORG,-3.175,-2.35,-0.4,-0.249,4.425,-3.174,0.0
I-ORG,-1.874,-1.943,0.0,0.787,5.539,-0.528,0.0
B-PER,-0.162,0.0,0.0,0.0,0.0,-3.687,5.98
I-PER,-0.174,0.0,0.0,0.0,0.0,0.0,7.286

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6
+9.216,sentence_.lower=chỉ_đạo,,,,,
+8.849,sentence_[-3:]=tác,,,,,
+6.564,sentence_.lower=mình,,,,,
+6.268,-1:sentence_.lower=chiếc,,,,,
+6.186,sentence_[-2:]=52,,,,,
+5.708,-1:sentence_.lower=sân_khấu,,,,,
+5.702,sentence_.lower=cấp,,,,,
+5.573,+1:sentence_.lower=sáu,,,,,
+5.532,sentence_.lower=lâm_nhi,,,,,
+5.482,sentence_.lower=phải,,,,,

Weight?,Feature
+9.216,sentence_.lower=chỉ_đạo
+8.849,sentence_[-3:]=tác
+6.564,sentence_.lower=mình
+6.268,-1:sentence_.lower=chiếc
+6.186,sentence_[-2:]=52
+5.708,-1:sentence_.lower=sân_khấu
+5.702,sentence_.lower=cấp
+5.573,+1:sentence_.lower=sáu
+5.532,sentence_.lower=lâm_nhi
+5.482,sentence_.lower=phải

Weight?,Feature
+9.127,sentence_.lower=đài_loan
+8.709,+1:sentence_.lower=thao_thao_bất_tuyệt
+7.803,sentence_.lower=isan
+7.769,sentence_.lower=campuchia
+7.141,sentence_.lower=biển_đông
+6.570,+1:sentence_.lower=sông_nước
+6.368,-1:sentence_.lower=tiền_giang
+6.336,sentence_.lower=dankia
+6.282,sentence_.lower=cổng_trắng
+6.194,sentence_.lower=bom_loọng

Weight?,Feature
+6.751,+1:sentence_.lower=trà_mai
+6.649,-1:sentence_.lower=vương_tửu
+6.576,-1:sentence_.lower=đảo
+6.172,-1:sentence_.lower=thành_phố
+6.143,-1:sentence_.lower=địa_đạo
+6.048,+1:sentence_.lower=xẻ
+5.837,-1:sentence_.lower=bar
+5.733,-1:sentence_.lower=phố
+5.688,-1:sentence_.lower=q.
+5.590,-1:sentence_.lower=lệ_viên

Weight?,Feature
+6.948,sentence_.lower=phong_phú
+6.615,sentence_.lower=thành_công
+6.608,sentence_.lower=unilever
+6.473,sentence_[-2:]=ex
+6.427,sentence_.lower=môi_trường
+6.242,sentence_.lower=khai_minh
+6.150,sentence_.lower=nhị_xuân
+5.884,sentence_.lower=vksnd_tối_cao
+5.871,-1:sentence_.lower=dntn
+5.793,+1:sentence_.lower=ư

Weight?,Feature
+6.426,+1:sentence_.lower=ẩn_danh
+4.776,sentence_.lower=kỳ_quang
+4.240,-1:sentence_.lower=thpt
+4.227,+1:sentence_.lower=tncs
+3.977,sentence_.lower=corporation
+3.962,sentence_.lower=thế_giới
+3.890,-1:sentence_.lower=cienco
+3.782,-1:sentence_.lower=phathet
+3.610,-1:sentence_.lower=nghề
+3.517,-1:sentence_.lower=phân_xã

Weight?,Feature
+7.991,-1:sentence_.lower=ẩn
+7.933,-1:sentence_.lower=vì_sao
+7.091,sentence_.lower=james
+6.467,+1:sentence_.lower=tranh_thủ
+6.276,sentence_.lower=nong_tum
+6.154,-1:sentence_.lower=bà
+6.021,-1:sentence_.lower=chị
+5.999,-1:sentence_.lower=vương
+5.763,sentence_.lower=phật_bà
+5.657,-1:sentence_.lower=vợ_chồng

Weight?,Feature
+6.157,sentence_.lower=luận
+4.867,-1:sentence_.lower=bựa
+4.670,sentence_[-3:]=tây
+3.964,-1:sentence_.lower=xà_lách
+3.962,sentence_.lower=xà_lách
+3.761,-1:sentence_.lower=điện
+3.531,-1:sentence_.lower=khắc
+3.486,-1:sentence_.lower=lang
+3.301,sentence_.lower=bụt
+3.301,sentence_[-3:]=Bụt


In [0]:
eli5.show_weights(crf, top=5, show=['transition_features'])



From \ To,O,B-LOC,I-LOC,B-ORG,I-ORG,B-PER,I-PER
O,0.911,0.895,-5.015,0.587,-6.142,0.793,-4.939
B-LOC,-0.617,-0.341,5.18,-0.551,0.0,-3.362,0.0
I-LOC,-0.336,0.44,5.415,0.0,0.0,-2.719,0.0
B-ORG,-3.175,-2.35,-0.4,-0.249,4.425,-3.174,0.0
I-ORG,-1.874,-1.943,0.0,0.787,5.539,-0.528,0.0
B-PER,-0.162,0.0,0.0,0.0,0.0,-3.687,5.98
I-PER,-0.174,0.0,0.0,0.0,0.0,0.0,7.286


In [0]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [0]:
labels = list(crf.classes_)
labels

['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']

In [0]:
labels = [
 'B-LOC',
 'I-LOC',
 'B-ORG',
 'I-ORG',
 'B-PER',
 'I-PER']

In [0]:
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [0]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,average='weighted', labels=labels)

0.9275999703919506

In [0]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.937     0.924     0.930       852
       I-LOC      0.904     0.892     0.898       371
       B-ORG      0.840     0.676     0.749       148
       I-ORG      0.862     0.751     0.803       249
       B-PER      0.976     0.962     0.969      1125
       I-PER      0.968     0.968     0.968       497

   micro avg      0.943     0.915     0.929      3242
   macro avg      0.914     0.862     0.886      3242
weighted avg      0.941     0.915     0.928      3242



In [0]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train[a+1:]+ X_train[:a], y_train[a+1:]+y_train[:a])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 31.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 100.0min finished


CPU times: user 1h 33min 8s, sys: 1min 47s, total: 1h 34min 56s
Wall time: 1h 40min 32s


In [0]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.004402298222743836, 'c2': 0.05837238034057523}
best CV score: 0.8955341537047298
model size: 1.61M


In [0]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print('f1_scorer:')
print(metrics.flat_f1_score(y_test, y_pred,average='weighted', labels=labels))
print('precision:')
print(metrics.flat_precision_score(y_test, y_pred,average='weighted', labels=labels))
print('recall:')
print(metrics.flat_recall_score(y_test, y_pred,average='weighted', labels=labels))
print('accuracy:')
print(metrics.flat_accuracy_score(y_test, y_pred))

f1_scorer:
0.9243921287032493
precision:
0.9366997393064722
recall:
0.9136335595311537
accuracy:
0.993615058128806


In [0]:
filename = 'model1.sav'
joblib.dump(crf, filename)

['model1.sav']

In [0]:
# crf = rs.best_estimator_
# y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.930     0.924     0.927       852
       I-LOC      0.899     0.865     0.882       371
       B-ORG      0.847     0.676     0.752       148
       I-ORG      0.853     0.743     0.794       249
       B-PER      0.972     0.967     0.970      1125
       I-PER      0.964     0.968     0.966       497

   micro avg      0.939     0.914     0.926      3242
   macro avg      0.911     0.857     0.882      3242
weighted avg      0.937     0.914     0.924      3242

