In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Word2Vec

#tune hyperparam
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

import numpy as np

In [0]:
class Config():
  # directory of raw VLSP data
  rawdata_path = '/content/drive/My Drive/NLP/NER/dataVLSP/'
  # directory of VLSP data  has been removed the xml tags
  data_path = '/content/drive/My Drive/NLP/NER/data/'
  # word2vec trained file
  embedd_pathfile = '/content/drive/My Drive/NLP/NER/word2vec.ipynb'
  #labels
  labels = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O']

config = Config()

In [0]:
#clean data

import codecs
import os

config = Config()

def remove_xml_tags(filename):
  ''' 
  Remove xml tag in file in data folder(raw data)
  Args:
    filename: The name of the data file in dataVLSP folder
  Return:
    File of the same name has removed xml tags in data folder
  Example:
    <editor>Vietlex team, 8-2016</editor>
    -DOCSTART-
    <s>				
    Đó	P	B-NP	O	O
    là	V	B-VP	O	O
    con	Nc	B-NP	O	O
  :converted into:
    Đó	P	B-NP	O	O
    là	V	B-VP	O	O
    con	Nc	B-NP	O	O

    saved in dataVLSP folder(processed data)
  '''
  f1 = open(config.rawdata_path + filename, 'r',encoding='utf-8')
  f2 = open(config.data_path + filename, 'w+',encoding='utf-8')
  for line in f1:
    line.strip()
    if(('<title>' in line) or line.startswith('<e') or line.startswith('-D') or line.startswith('<s>')):
      pass
    elif(line.startswith('</')):
      f2.write(line.replace(line,'\n'))
    else:
      f2.write(line)
  f1.close()
  f2.close()

def clean_data(path):
  ''' 
  Remove xml tags of all files in the dataVLSP folder
  Processed data saved in data
  '''
  list_files = os.listdir(path)
  for file in list_files:
    remove_xml_tags(file)

  clean_data(config.rawdata_path)

In [0]:
from sklearn.model_selection import train_test_split
import os
import codecs
config = Config()
def prepare_data(path, scale,index_attri):
  ''' Create training data and testing data
      Format of data: CoNLL

      Args:
        path: path of data folder
        scale: test size
        index_attri: Represents the number of attributes and the associated attribute type
          index_attri == 1 : The number of attributes = 1 - only ner label. ex: [('Huế', 'B_LOC'), ('là', 'O'), ('thành_phố', 'O'), ('đẹp', 'O')]
          index_attri == 2.1 : The number of attributes = 2(pos-tagging label, ner label). ex: [('Đó', 'P', 'O'), ('là', 'V',  'O'), ('con', 'Nc', 'O'), ('đường', 'N', , 'O')]
          index_attri = 2.2 : The number of attributes = 2(chunking label, ner label). ex: [('Đó', 'B-NP', 'O'), ('là', 'B-VP', 'O'), ('con', 'B-NP', 'O'), ('đường', 'B-NP', 'O')]
          index_attri = 3 : The number of attributes = 3(pos-tagging label,chunking, ner label). ex: [('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')]
          if index_attri not in {1,2.1,2,2,3} index_attri = 2.1
      Return:
        train_sents, test_sents
      
      Example of format data:
      [[('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')],
      [('Đó', 'P', 'B-NP', 'O'), ('là', 'V', 'B-VP', 'O'), ('con', 'Nc', 'B-NP', 'O'), ('đường', 'N', 'B-NP', 'O')],
      ...
      ]

  '''

  # check index_attri
  if index_attri not in {1,2.1,2,2,3} :
    index_attri = 2.1
  # split data by file
  list_files = os.listdir(path)
  # train_files, test_files = train_test_split(list_files,test_size=scale,random_state=42)
  all_data = []

  ''' Convert data format to CoNll '''
  #training data
  for file in list_files:
    with codecs.open(path + file,'r',encoding='utf8') as f:
      sentence = []
      remove = False;
      for line in f:
        line = line.split()
        if len(line) > 3 :
          if(line[3] not in config.labels):
            remove = True
          else:
            if index_attri == 1:
                sentence.append((line[0],line[3]))
            elif index_attri == 2.2:
              sentence.append((line[0],line[2],line[3]))
            elif index_attri == 3:
              sentence.append((line[0],line[1],line[2],line[3]))
            else:
              sentence.append((line[0],line[1],line[3]))
        else:
          if len(sentence) > 0:
            if remove == False:                            
              all_data.append(sentence)
            else:
              remove = False
            sentence = []
    f.close()
  
  train_sent_data, test_sent_data = train_test_split(all_data,test_size=scale,random_state=42)
  return  train_sent_data,test_sent_data
  

In [0]:
train_sents,test_sents = prepare_data(config.data_path,0.15,2.1)
print(len(train_sents))
print(len(test_sents))

14087
2486


In [0]:
pip install sklearn_crfsuite



In [0]:
pip install eli5



In [0]:
cd '/content/drive/My Drive/NLP/NER/'

/content/drive/My Drive/NLP/NER


In [0]:
re_adm_div      = ['ấp', 'buôn', 'bản', 'huyện', 'làng', 'miền', 'nước', 
                   'phường', 'quận', 'tỉnh', 'thành_phố', 'thị_trấn', 'thị_xã', 
                   'thôn', 'TT', 'TP', 'TX', 'TT.', 'TP.', 'TX.', 'xứ', 'xã', 
                   'xóm']
re_org          = ['báo', 'bệnh_viện', 'bệnh_xá', 'công_ty', 'công_ti', 'đài', 'đảng', 'đoàn', 'hội', 'hợp_tác_xã', 'khách_sạn', 'nhà_máy', 'nhà_xuất_bản', 'ngân_hàng', 'quỹ', 'tạp_chí', 'tập đoàn', 'thông_tấn_xã', 'tờ', 'trạm_xá', 'xí_nghiệp','ủy_ban']
re_school       = ['mẫu_giáo', 'tiểu_học', 'trung_học', 'trung_học_cơ_sở', 
                   'trung_học_phổ_thông', 'cao_đẳng', 'trung_cấp', 
                   'trung_cấp_nghề', 'đại_học']
re_street       = ['đại_lộ', 'đường', 'hẻm', 'ngách', 'ngõ', 'nhà', 'phố', 'quốc_lộ']
re_place        = ['ao', 'am', 'bến', 'bến_cảng', 'bến_phà','biển', 'cảng', 
                   'cầu', 'công_viên', 'chợ', 'chùa', 'dãy', 'đảo', 'đầm', 'đèo', 
                   'đền', 'đình', 'đồi', 'động', 'đồng_bằng', 'gềnh', 'gò', 'khu', 'hòn', 'hồ', 
                   'lăng', 'miếu', 'miền', 'nhà_ga', 'núi', 'phà', 'quần_đảo', 
                   'sân_bay', 'sông', 'suối', 'vùng']
re_office       = ['ban', 'bộ', 'chi_cục', 'cục', 'hạt', 'sở']
re_army         = ['binh_đoàn', 'đại_đội', 'đặc_khu', 'đơn_vị', 'lữ_đoàn', 'quân_đoàn', 'quân_đội', 'quân_khu','sư_đoàn', 'tiểu_đội', 'tiểu_đoàn', 'trung_đội']

def re_word(word):
    """
        Return a dict of (regexp Name, regexp Value) of a word
        :type word: string
        :param word: a word in sentence
    """

    check_code = False
    for char in word:
        if char.isdigit():
            check_code = True
            break

def re_word_org(word):
  return word.lower() in re_org

def re_word_name(word):
  return word[0].isupper()

def re_word_capital(word):
  return word.isupper()

def re_word_adm_div(word):
  return word.lower() in re_adm_div

def re_word_is_school(word):
  return word.lower() == 'trường'

def re_word_street(word):
  return word.lower() in re_street

def re_word_place(word):
  return word.lower() in re_place

def re_word_office(word):
  return word in re_office

def re_word_army(word):
  return word in re_army


In [0]:
def check_code(word):
  check_code = False
  for char in word:
      if char.isdigit():
          check_code = True
          break; 
  return check_code    

In [0]:
def is_mix(word):
  return not(word.islower() and word.isupper())   

In [0]:
import pandas as pd
import pycrfsuite
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
import re
def word2feature(sent, i):
  word = sent[i][0]
  
  # if '_' in word:
    # word.replace('_', ' ')
  #print(sent[i],'\n')
  #Tag = sent[i][2]

  features = ([
      'bias',
      'sentence_[-3:]='+word[-3:],
      'sentence_[-2:]='+word[-2:],
      'sentence_.lower=%s'%word.lower(),
      'sentence_.isupper=%s'%word.isupper(),
      'sentence_.istitle=%s' % word.istitle(),
      'sentence_.is_mix=%s' % is_mix(word),
      'sentence_.is_capital_period=%s' % (('.' in word) and word[0].isupper()),
      'sentence_.isdigit=%s' % word.isdigit(),
      'sentence_end_digit=%s' %word[-1].isdigit(),
      'sentence_.hashyphen=%s' % word.find('-'),
      'sentence_.is_code=%s' % check_code(word),
      'sentence_.num_syllabus=%s' %(word.count('_') + 1),
      'sentence_.is_name=%s' % word[0].isupper(),
      'sentence_.re_word_org=%s'%re_word_org(word),
      'sentence_.re_word_name=%s'%re_word_name(word),
      'sentence_.re_word_capital=%s'%re_word_capital(word),
      'sentence_.re_word_adm_div=%s'%re_word_adm_div(word),
      'sentence_.re_word_is_school=%s'%re_word_is_school(word),
      'sentence_.re_word_street=%s'%re_word_street(word),
      'sentence_.re_word_place=%s'%re_word_place(word),
      'sentence_.re_word_office=%s'%re_word_office(word),
      'sentence_.re_word_army=%s'%re_word_army(word),
       #'tag='+ Tag
  ])
  '''
  wordembdding=get_features(word)
  for iv,value in enumerate(wordembdding):
    features.extend(['v{}'.format(iv) +': '+ str(value)])
  '''
  if i > 0:
    word1 = sent[i - 1][0]
    if '_' in word1:
      word.replace('_', ' ')
    Tag1 = sent[i - 1][2]
    features.extend([
        '-1:sentence_.lower=' + word1.lower(),
        '-1:sentence_.istitle=%s' % word1.istitle(),
        '-1:sentence_.isupper=%s' % word1.isupper(),
        '-1:sentence_.isdigit=%s' % word1.isdigit(),
        #'-1:tag=' + Tag1,
        # '-1:tag[:2]=' + Tag1[:2],
    ])
  else:
    features.append('BOS')
  if i < len(sent)-1:
    word1 = sent[i+1][0]
    if '_' in word1:
      word.replace('_', ' ')
    #Tag1 = sent[i+1][2]
    # print(word1,'_',Tag1)

    features.extend([
        '+1:sentence_.lower=' + word1.lower(),
        '+1:sentence_.istitle=%s' % word1.istitle(),
        '+1:sentence_.isupper=%s' % word1.isupper(),
        '+1:sentence_.isdigit=%s' % word1.isdigit(),
        #'+1:tag=' + Tag1,
        # '+1:tag[:2]=' + Tag1[:2],
    ])
  else:
    features.append('EOS')

  return features



def sent2features(sent):
  return [word2feature(sent, i) for i in range(len(sent))]

def sent2labels(sent):
  return [label for token, postag, label in sent]

def sent2tokens(sent):
  return [token for token, postag, label in sent]

In [0]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

In [0]:
X_train[0][0]

['bias',
 'sentence_[-3:]=Đào',
 'sentence_[-2:]=ào',
 'sentence_.lower=đào',
 'sentence_.isupper=False',
 'sentence_.istitle=True',
 'sentence_.is_mix=True',
 'sentence_.is_capital_period=False',
 'sentence_.isdigit=False',
 'sentence_end_digit=False',
 'sentence_.hashyphen=-1',
 'sentence_.is_code=False',
 'sentence_.num_syllabus=1',
 'sentence_.is_name=True',
 'sentence_.re_word_org=False',
 'sentence_.re_word_name=True',
 'sentence_.re_word_capital=False',
 'sentence_.re_word_adm_div=False',
 'sentence_.re_word_is_school=False',
 'sentence_.re_word_street=False',
 'sentence_.re_word_place=False',
 'sentence_.re_word_office=False',
 'sentence_.re_word_army=False',
 'BOS',
 '+1:sentence_.lower=xinh_đẹp',
 '+1:sentence_.istitle=False',
 '+1:sentence_.isupper=False',
 '+1:sentence_.isdigit=False']

In [0]:
import nltk
import sklearn_crfsuite
import eli5
crf = sklearn_crfsuite.CRF(
  algorithm='lbfgs',
  c1=0.1,
  c2= 1e-17,
  max_iterations=50,
)


In [0]:
a = 1331
#crf.fit(X_train,y_train)
#crf.fit(X_train[:a],y_train[:a])
crf.fit(X_train[a+1:]+ X_train[:a], y_train[a+1:]+y_train[:a])
#crf.fit(X_train[a+1:], y_train[a+1:])



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=None,
    averaging=None, c=None, c1=0.1, c2=1e-17, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=50,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [0]:
# Save Model Using joblib
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import joblib

# save the model to disk
filename = 'base_model2.sav'
joblib.dump(crf, filename)
  
# load the model from disk
#loaded_model = joblib.load(filename)
# result = loaded_model.score(X_test, Y_test)
#print(result)

['base_model2.sav']

In [0]:
eli5.show_weights(crf, top=30)



From \ To,O,B-LOC,I-LOC,B-ORG,I-ORG,B-PER,I-PER
O,0.669,0.943,-5.269,0.72,-4.45,0.942,-4.113
B-LOC,-0.849,-0.329,4.891,-0.643,0.0,-2.823,0.0
I-LOC,-0.329,0.219,4.808,0.0,0.0,-2.502,0.0
B-ORG,-1.772,-1.11,0.037,0.578,6.099,-2.933,0.0
I-ORG,-1.444,-1.415,0.0,1.244,5.873,-0.429,0.0
B-PER,0.013,0.0,0.0,0.0,0.0,-3.999,5.954
I-PER,-0.119,0.0,0.0,0.0,0.0,0.0,6.927

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6
+7.104,sentence_[-3:]=tác,,,,,
+6.064,sentence_.lower=phải,,,,,
+5.921,-1:sentence_.lower=chiếc,,,,,
+5.845,sentence_.lower=muay_thái,,,,,
+5.668,sentence_.lower=mình,,,,,
+5.607,sentence_.lower=lâm_nhi,,,,,
+5.219,sentence_.lower=cùng,,,,,
+5.196,-1:sentence_.lower=từng,,,,,
+4.828,sentence_.lower=không,,,,,
+4.739,sentence_.lower=nhưng,,,,,

Weight?,Feature
+7.104,sentence_[-3:]=tác
+6.064,sentence_.lower=phải
+5.921,-1:sentence_.lower=chiếc
+5.845,sentence_.lower=muay_thái
+5.668,sentence_.lower=mình
+5.607,sentence_.lower=lâm_nhi
+5.219,sentence_.lower=cùng
+5.196,-1:sentence_.lower=từng
+4.828,sentence_.lower=không
+4.739,sentence_.lower=nhưng

Weight?,Feature
+8.799,sentence_.lower=đài_loan
+7.242,sentence_.lower=biển_đông
+6.551,sentence_.lower=dankia
+6.289,sentence_.lower=cẩm_nhân
+5.919,sentence_.lower=bom_loọng
+5.768,sentence_.lower=pháp
+5.612,sentence_.lower=campuchia
+5.432,sentence_.lower=quỳnh_thanh
+5.333,sentence_.lower=cái_nước
+5.224,sentence_.lower=phan_thiết

Weight?,Feature
+6.986,-1:sentence_.lower=stadium
+5.995,-1:sentence_.lower=vương_tửu
+5.615,-1:sentence_.lower=đường
+5.516,+1:sentence_.lower=xẻ
+5.353,-1:sentence_.lower=chung_cư
+5.149,-1:sentence_.lower=đảo
+5.136,-1:sentence_.lower=ntls
+4.967,-1:sentence_.lower=q.
+4.612,-1:sentence_.lower=bar
+3.985,-1:sentence_.lower=đèo

Weight?,Feature
+6.220,sentence_.lower=cao_thắng
+6.048,sentence_.lower=phong_phú
+5.618,-1:sentence_.lower=dntn
+5.426,sentence_.lower=nhị_xuân
+5.149,sentence_.lower=khai_minh
+5.095,-1:sentence_.lower=đại_diện
+5.080,+1:sentence_.lower=sản_xuất
+4.348,sentence_[-2:]=ex
+4.341,+1:sentence_.lower=tuyên_huấn
+4.156,sentence_[-2:]=co

Weight?,Feature
+4.134,sentence_.lower=thế_giới
+3.825,+1:sentence_.lower=cấp
+3.327,sentence_.lower=corporation
+3.294,-1:sentence_.lower=báo
+3.235,sentence_.lower=kỳ_quang
+3.195,-1:sentence_.lower=liên_hiệp
+3.058,+1:sentence_.lower=ẩn_danh
+3.053,-1:sentence_.lower=tiểu_ban
+3.035,sentence_.lower=cộng_hoà
+2.951,sentence_.lower=văn_lang

Weight?,Feature
+6.002,-1:sentence_.lower=vì_sao
+5.629,sentence_.lower=loan
+5.577,-1:sentence_.lower=chị
+5.330,-1:sentence_.lower=vợ_chồng
+5.266,sentence_.lower=yàng
+5.257,sentence_[-2:]=T.
+5.242,+1:sentence_.lower=ôm
+5.208,sentence_.lower=dũng
+5.163,sentence_.lower=hường
+5.032,sentence_[-3:]=Lan

Weight?,Feature
+4.778,-1:sentence_.lower=bựa
+3.725,-1:sentence_.lower=hổ
+3.701,sentence_[-3:]=tây
+3.632,sentence_.lower=luận
+3.581,-1:sentence_.lower=xà_lách
+3.535,-1:sentence_.lower=vũ
+3.143,sentence_.lower=xà_lách
+3.067,+1:sentence_.lower=đĩnh
+2.942,sentence_.lower=bụt
+2.942,sentence_[-3:]=Bụt


In [0]:
eli5.show_weights(crf, top=5, show=['transition_features'])



From \ To,O,B-LOC,I-LOC,B-ORG,I-ORG,B-PER,I-PER
O,0.669,0.943,-5.269,0.72,-4.45,0.942,-4.113
B-LOC,-0.849,-0.329,4.891,-0.643,0.0,-2.823,0.0
I-LOC,-0.329,0.219,4.808,0.0,0.0,-2.502,0.0
B-ORG,-1.772,-1.11,0.037,0.578,6.099,-2.933,0.0
I-ORG,-1.444,-1.415,0.0,1.244,5.873,-0.429,0.0
B-PER,0.013,0.0,0.0,0.0,0.0,-3.999,5.954
I-PER,-0.119,0.0,0.0,0.0,0.0,0.0,6.927


In [0]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [0]:
labels = list(crf.classes_)
labels

['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']

In [0]:
labels = [
 'B-LOC',
 'I-LOC',
 'B-ORG',
 'I-ORG',
 'B-PER',
 'I-PER']

In [0]:
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [0]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,average='weighted', labels=labels)

0.9211496308053838

In [0]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.931     0.912     0.921       852
       I-LOC      0.893     0.854     0.873       371
       B-ORG      0.821     0.682     0.745       148
       I-ORG      0.881     0.743     0.806       249
       B-PER      0.970     0.962     0.966      1125
       I-PER      0.957     0.976     0.966       497

   micro avg      0.937     0.909     0.923      3242
   macro avg      0.909     0.855     0.880      3242
weighted avg      0.935     0.909     0.921      3242



In [0]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train[a+1:]+ X_train[:a], y_train[a+1:]+y_train[:a])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 47.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 152.8min finished


CPU times: user 2h 26min 31s, sys: 3min, total: 2h 29min 32s
Wall time: 2h 33min 44s


In [0]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.027665607640481617, 'c2': 0.042027566521931764}
best CV score: 0.895642104051181
model size: 1.11M


In [0]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print('f1_scorer:')
print(metrics.flat_f1_score(y_test, y_pred,average='weighted', labels=labels))
print('precision:')
print(metrics.flat_precision_score(y_test, y_pred,average='weighted', labels=labels))
print('recall:')
print(metrics.flat_recall_score(y_test, y_pred,average='weighted', labels=labels))
print('accuracy:')
print(metrics.flat_accuracy_score(y_test, y_pred))

f1_scorer:
0.9294307304330858
precision:
0.9417365734805938
recall:
0.9185687847008019
accuracy:
0.9939287691455988


In [0]:
filename = 'model2.sav'
joblib.dump(crf, filename)

['model2.sav']

In [0]:
# crf = rs.best_estimator_
# y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.938     0.924     0.931       852
       I-LOC      0.920     0.873     0.896       371
       B-ORG      0.840     0.676     0.749       148
       I-ORG      0.855     0.759     0.804       249
       B-PER      0.972     0.969     0.971      1125
       I-PER      0.968     0.982     0.975       497

   micro avg      0.944     0.919     0.931      3242
   macro avg      0.916     0.864     0.888      3242
weighted avg      0.942     0.919     0.929      3242

