In [14]:
import warnings
warnings.filterwarnings('ignore')
import sklearn_crfsuite 
import pandas as pd
import eli5
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

In [15]:
CHAR_TYPE = {
    u'กขฃคฆงจชซญฎฏฐฑฒณดตถทธนบปพฟภมยรลวศษสฬอ': 'c',
    u'ฅฉผฟฌหฝฮฤ': 'n',
    u'ัะาำิีืึุู': 'v',  # า ะ ำ ิ ี ึ ื ั ู ุ
    u'เแโใไ': 'w',
    u'่้๊๋็': 't', # วรรณยุกต์ ่ ้ ๊ ๋
    u'์ๆฯ.': 's', # ์  ๆ ฯ .
    u'0123456789๑๒๓๔๕๖๗๘๙': 'd',
    u'"': 'q',
    u"‘": 'q',
    u"’": 'q',
    u"'": 'q',
    u' ': 'p',
    u'<>`~๐;:-({)},./+*/-?!@#$%^&=][': 'p',
    u'abcdefghijklmnopqrstuvwxyz': 's_e',
    u'ABCDEFGHIJKLMNOPQRSTUVWXYZ': 'b_e'
}
CHAR_TYPE_FLATTEN = {}
for ks, v in CHAR_TYPE.items():
    for k in ks:
        CHAR_TYPE_FLATTEN[k] = v

CHAR_TYPES = [
    'b_e', 'c', 'd', 'n', 'o',
    'p', 'q', 's', 's_e', 't',
    'v', 'w'
]
CHAR_TYPES_MAP = {v: k for k, v in enumerate(CHAR_TYPES)}

<h1>Try CRF:STOAs

In [16]:
import pycrfsuite
import numpy as np

In [17]:
#best_df = pd.read_csv('corpus/wisesigth_entropy_deepcut_bsws.csv') 
#best_df = pd.read_csv('corpus/wisesigth_entropy_deepcut_bl.csv') #Normally we used this
#best_df = pd.read_csv('corpus/wisesigth_entropy_deepcut_ws_only.csv')  #Entropy from DC ws only
best_df = pd.read_csv('corpus/wisesigth_entropy_deepcut_bl.csv')
best_df

Unnamed: 0,Char_sequence,Entropy,true_pred,pred
0,Euc,0.000210,1,1
1,Euce,0.000000,0,0
2,Eucer,0.000000,0,0
3,Euceri,0.000000,0,0
4,Eucerin,0.000096,0,0
...,...,...,...,...
75130,อบคุณคร,0.000000,0,0
75131,บคุณครั,0.003988,1,1
75132,คุณครับ,0.000000,0,0
75133,ุณครับ,0.000000,0,0


In [18]:
X = best_df[['Char_sequence','Entropy','pred']].values.tolist()
y = best_df['true_pred'].values.tolist()

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [54]:
def feature_preprocessing(X_items):
    dic_return = []
    for items in X_items:
        list_backup = []
        feature = {}
        feature = {
            "entropy" : items[1]
        }    
        feature.update({
                f"pred_" : items[2],
        })
        for idx,ch in enumerate(items[0]):
            try:
                feature.update({
                    f"char_{idx+1}" : ch,
                    #f"vowel_{idx+1}" : CHAR_TYPES_MAP[CHAR_TYPE_FLATTEN[ch]]
                    #f"vowel_{idx+1}" : CHAR_TYPES_MAP[ch]
                })
            except:
                feature.update({
                    f"char_{idx+1}" : ch,
                    #f"vowel_{idx+1}" : CHAR_TYPES_MAP['p']
                })
        list_backup.append(feature)
        dic_return.append(list_backup)
    return dic_return

In [55]:
X_train = feature_preprocessing(X_train)
X_test = feature_preprocessing(X_test)
        
y_train = [str(x) for x in y_train]
y_test = [str(x) for x in y_test]

In [56]:
X_train[0]

[{'entropy': 0.0,
  'pred_': 0,
  'char_1': 'า',
  'char_2': 'ซ',
  'char_3': '่',
  'char_4': 'อ',
  'char_5': 'น',
  'char_6': 'อ',
  'char_7': 'ย'}]

In [41]:
# Train model

trainer = pycrfsuite.Trainer(verbose=True)
#trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 0.01,
    'c2': 0.01,
    'max_iterations': 200,
    'feature.possible_transitions': True,
})

trainer.train('model/crf_wisesight_entropyfrom_dc_bl_2.save') ####### STOA !!!!!!!! #########


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 2598
Seconds required: 0.046

L-BFGS optimization
c1: 0.010000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 32923.665926
Feature norm: 0.062500
Error norm: 68821.012025
Active features: 2502
Line search trials: 5
Line search step: 0.000000
Seconds required for this iteration: 0.134

***** Iteration #2 *****
Loss: 32229.143368
Feature norm: 0.048931
Error norm: 27968.323550
Active features: 2450
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #3 *****
Loss: 31977.095294
Feature norm: 0.047650
Error norm: 23792.986273
Active features: 2434
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #57 *****
Loss: 4393.251350
Feature norm: 52.542663
Error norm: 161.357803
Active features: 2399
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #58 *****
Loss: 4385.672815
Feature norm: 52.970264
Error norm: 512.789177
Active features: 2375
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #59 *****
Loss: 4378.680816
Feature norm: 53.647105
Error norm: 479.181933
Active features: 2371
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #60 *****
Loss: 4375.414780
Feature norm: 54.037640
Error norm: 188.145790
Active features: 2372
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #61 *****
Loss: 4372.432097
Feature norm: 53.978154
Error norm: 58.218367
Active features: 2368
Line search trials: 1
Line search step: 1.000000
Seconds require

***** Iteration #100 *****
Loss: 4322.900377
Feature norm: 57.688345
Error norm: 96.394046
Active features: 2316
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #101 *****
Loss: 4322.545139
Feature norm: 57.714828
Error norm: 88.690784
Active features: 2316
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #102 *****
Loss: 4321.855638
Feature norm: 57.802963
Error norm: 41.874912
Active features: 2317
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #103 *****
Loss: 4321.626420
Feature norm: 57.859499
Error norm: 164.549561
Active features: 2314
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #104 *****
Loss: 4321.243552
Feature norm: 57.891056
Error norm: 131.218494
Active features: 2318
Line search trials: 1
Line search step: 1.000000
Seconds requ

***** Iteration #151 *****
Loss: 4310.614921
Feature norm: 59.339815
Error norm: 29.739884
Active features: 2310
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #152 *****
Loss: 4310.599424
Feature norm: 59.340625
Error norm: 45.486412
Active features: 2310
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #153 *****
Loss: 4310.571999
Feature norm: 59.342486
Error norm: 22.011269
Active features: 2310
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #154 *****
Loss: 4310.473837
Feature norm: 59.350130
Error norm: 111.022167
Active features: 2311
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #155 *****
Loss: 4310.348094
Feature norm: 59.364353
Error norm: 175.154301
Active features: 2311
Line search trials: 1
Line search step: 1.000000
Seconds requ

In [57]:
tagger = pycrfsuite.Tagger()
tagger.open('model/crf_wisesight_entropyfrom_dc_bl_2.save')
y_pred = [tagger.tag(xseq) for xseq in X_test]

FileNotFoundError: [Errno 2] No such file or directory: 'model/crf_wisesight_entropyfrom_dc_bl_2.save'

In [43]:
# Evaluate

labels = {'1': 1, "0": 0} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

print(classification_report(
    truths, predictions,
    target_names=["B", "I"]))

              precision    recall  f1-score   support

           B       0.98      0.97      0.98     15747
           I       0.94      0.95      0.95      6794

    accuracy                           0.97     22541
   macro avg       0.96      0.96      0.96     22541
weighted avg       0.97      0.97      0.97     22541



<h1> 5 Character

In [65]:
import pycrfsuite
import numpy as np

In [87]:
best_df = pd.read_csv('corpus/wisesigth_entropy_100per_deepcut_5char.csv')
best_df = best_df.dropna()
best_df

Unnamed: 0,Char_sequence,Entropy,true_pred,pred
0,Euc,1.395830e-02,1,1
1,Euce,3.874120e-06,0,0
2,Eucer,0.000000e+00,0,0
3,uceri,1.745200e-05,0,0
4,cerin,5.029277e-05,0,0
...,...,...,...,...
13524,ะมึงง,0.000000e+00,0,0
13525,มึงง่,5.462372e-07,0,0
13526,ึงง่ะ,4.700074e-03,1,1
13527,งง่ะ,0.000000e+00,0,0


In [88]:
X = best_df[['Char_sequence','Entropy','pred']].values.tolist()
y = best_df['true_pred'].values.tolist()

In [89]:
def feature_preprocessing(X_items):
    dic_return = []
    for items in X_items:
        list_backup = []
        feature = {}
        feature = {
            "entropy" : items[1]
        }    
        feature.update({
                f"pred_" : items[2],
        })
        for idx,ch in enumerate(items[0]):
            try:
                feature.update({
                    f"char_{idx+1}" : ch,
                    f"vowel_{idx+1}" : CHAR_TYPES_MAP[ch]
                })
            except:
                feature.update({
                    f"char_{idx+1}" : ch,
                    f"vowel_{idx+1}" : CHAR_TYPES_MAP['p']
                })
        list_backup.append(feature)
        dic_return.append(list_backup)
    return dic_return

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

In [92]:
X_train = feature_preprocessing(X_train)
X_test = feature_preprocessing(X_test)
        
y_train = [str(x) for x in y_train]
y_test = [str(x) for x in y_test]

In [94]:
# Train model

trainer = pycrfsuite.Trainer(verbose=True)
#trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 0.01,
    'c2': 0.01,
    'max_iterations': 200,
    'feature.possible_transitions': True,
})

trainer.train('model/crf_wisesigth_deepcut_p_art_method_5char.save') ####### STOA !!!!!!!! #########

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 1284
Seconds required: 0.012

L-BFGS optimization
c1: 0.010000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5759.831567
Feature norm: 0.062500
Error norm: 4775.407883
Active features: 1230
Line search trials: 5
Line search step: 0.000002
Seconds required for this iteration: 0.025

***** Iteration #2 *****
Loss: 5731.900205
Feature norm: 0.056756
Error norm: 2947.410729
Active features: 1198
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #3 *****
Loss: 5697.485511
Feature norm: 0.055498
Error norm: 3566.874793
Active features: 1198
Line search trials: 1
Line search step: 1.000000
Seconds required for this iter

***** Iteration #40 *****
Loss: 487.376951
Feature norm: 38.330438
Error norm: 34.877116
Active features: 1189
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #41 *****
Loss: 485.448236
Feature norm: 38.656477
Error norm: 34.684856
Active features: 1172
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.015

***** Iteration #42 *****
Loss: 483.876911
Feature norm: 38.825162
Error norm: 76.196533
Active features: 1167
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #43 *****
Loss: 481.919862
Feature norm: 39.118380
Error norm: 33.954230
Active features: 1154
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #44 *****
Loss: 480.566652
Feature norm: 39.344398
Error norm: 111.154338
Active features: 1142
Line search trials: 1
Line search step: 1.000000
Seconds required for th

***** Iteration #82 *****
Loss: 465.915138
Feature norm: 44.237956
Error norm: 2.133377
Active features: 1080
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #83 *****
Loss: 465.859422
Feature norm: 44.290873
Error norm: 48.118328
Active features: 1080
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #84 *****
Loss: 465.803804
Feature norm: 44.325381
Error norm: 3.964666
Active features: 1078
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #85 *****
Loss: 465.775774
Feature norm: 44.336821
Error norm: 16.051950
Active features: 1078
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.004

***** Iteration #86 *****
Loss: 465.709878
Feature norm: 44.357549
Error norm: 6.711045
Active features: 1074
Line search trials: 1
Line search step: 1.000000
Seconds required for this i

***** Iteration #131 *****
Loss: 464.776994
Feature norm: 44.971156
Error norm: 2.234655
Active features: 1058
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #132 *****
Loss: 464.774368
Feature norm: 44.980697
Error norm: 12.302586
Active features: 1058
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.009

***** Iteration #133 *****
Loss: 464.767198
Feature norm: 44.990462
Error norm: 6.614440
Active features: 1057
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #134 *****
Loss: 464.762140
Feature norm: 44.998157
Error norm: 7.827464
Active features: 1059
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #135 *****
Loss: 464.756474
Feature norm: 45.003442
Error norm: 7.898856
Active features: 1059
Line search trials: 1
Line search step: 1.000000
Seconds required for th

***** Iteration #175 *****
Loss: 464.606881
Feature norm: 45.145327
Error norm: 7.134006
Active features: 1055
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.010

***** Iteration #176 *****
Loss: 464.605046
Feature norm: 45.150483
Error norm: 13.489221
Active features: 1056
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #177 *****
Loss: 464.601440
Feature norm: 45.152271
Error norm: 7.610629
Active features: 1058
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #178 *****
Loss: 464.599234
Feature norm: 45.155748
Error norm: 4.481450
Active features: 1058
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #179 *****
Loss: 464.597481
Feature norm: 45.157851
Error norm: 2.460037
Active features: 1058
Line search trials: 1
Line search step: 1.000000
Seconds required for th

In [98]:
tagger = pycrfsuite.Tagger()
tagger.open('model/crf_wisesigth_deepcut_p_art_method_5char.save')
y_pred = [tagger.tag(xseq) for xseq in X_test]

In [99]:
# Evaluate

labels = {'1': 1, "0": 0} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

print(classification_report(
    truths, predictions,
    target_names=["B", "I"]))

              precision    recall  f1-score   support

           B       0.98      0.98      0.98      2877
           I       0.94      0.96      0.95      1182

    accuracy                           0.97      4059
   macro avg       0.96      0.97      0.96      4059
weighted avg       0.97      0.97      0.97      4059



<h1>use entropy from DC(best+ws)

In [36]:
best_df = pd.read_csv('corpus/wisesigth_entropy_deepcut_bsws.csv') #Normally we used this
best_df

Unnamed: 0,Char_sequence,Entropy,true_pred,pred
0,Euc,0.013958,1,1
1,Euce,0.000004,0,0
2,Eucer,0.000000,0,0
3,Euceri,0.000017,0,0
4,Eucerin,0.000050,0,0
...,...,...,...,...
75130,อบคุณคร,0.000000,0,0
75131,บคุณครั,0.001257,1,1
75132,คุณครับ,0.000000,0,0
75133,ุณครับ,0.000000,0,0


In [37]:
X = best_df[['Char_sequence','Entropy','pred']].values.tolist()
y = best_df['true_pred'].values.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [38]:
def feature_preprocessing(X_items):
    dic_return = []
    for items in X_items:
        list_backup = []
        feature = {}
        feature = {
            "entropy" : items[1]
        }    
        feature.update({
                f"pred_" : items[2],
        })
        for idx,ch in enumerate(items[0]):
            try:
                feature.update({
                    f"char_{idx+1}" : ch,
                    #f"vowel_{idx+1}" : CHAR_TYPES_MAP[CHAR_TYPE_FLATTEN[ch]]
                    #f"vowel_{idx+1}" : CHAR_TYPES_MAP[ch]
                })
            except:
                feature.update({
                    f"char_{idx+1}" : ch,
                    #f"vowel_{idx+1}" : CHAR_TYPES_MAP['p']
                })
        list_backup.append(feature)
        dic_return.append(list_backup)
    return dic_return

In [39]:
X_train = feature_preprocessing(X_train)
X_test = feature_preprocessing(X_test)
        
y_train = [str(x) for x in y_train]
y_test = [str(x) for x in y_test]

In [12]:
# Train model

trainer = pycrfsuite.Trainer(verbose=True)
#trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 0.01,
    'c2': 0.01,
    'max_iterations': 200,
    'feature.possible_transitions': True,
})

trainer.train('model/crf_ws1000_entropyfrom_dc_bsws_no_chartype.save') ####### STOA !!!!!!!! #########


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 2604
Seconds required: 0.031

L-BFGS optimization
c1: 0.010000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 25554.414706
Feature norm: 1.000000
Error norm: 8982.637577
Active features: 2488
Line search trials: 1
Line search step: 0.000074
Seconds required for this iteration: 0.031

***** Iteration #2 *****
Loss: 11028.002861
Feature norm: 3.581084
Error norm: 3123.040708
Active features: 2416
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.018

***** Iteration #3 *****
Loss: 7808.860331
Feature norm: 4.952114
Error norm: 1586.091006
Active features: 2526
Line search trials: 1
Line search step: 1.000000
Seconds required for this it

***** Iteration #56 *****
Loss: 2812.644634
Feature norm: 51.658673
Error norm: 11.804783
Active features: 2232
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #57 *****
Loss: 2812.402855
Feature norm: 51.728594
Error norm: 11.652824
Active features: 2233
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #58 *****
Loss: 2812.286623
Feature norm: 51.779887
Error norm: 19.382906
Active features: 2231
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #59 *****
Loss: 2811.965172
Feature norm: 51.865822
Error norm: 13.218656
Active features: 2234
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #60 *****
Loss: 2811.786594
Feature norm: 51.900710
Error norm: 14.603397
Active features: 2234
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #108 *****
Loss: 2805.812643
Feature norm: 53.456312
Error norm: 8.370742
Active features: 2236
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #109 *****
Loss: 2805.745593
Feature norm: 53.492652
Error norm: 6.712762
Active features: 2238
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #110 *****
Loss: 2805.693774
Feature norm: 53.513299
Error norm: 7.447181
Active features: 2243
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #111 *****
Loss: 2805.644319
Feature norm: 53.556879
Error norm: 8.346935
Active features: 2242
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #112 *****
Loss: 2805.584377
Feature norm: 53.578791
Error norm: 7.613680
Active features: 2238
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #161 *****
Loss: 2804.201160
Feature norm: 54.166649
Error norm: 2.994652
Active features: 2229
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.016

***** Iteration #162 *****
Loss: 2804.195103
Feature norm: 54.167462
Error norm: 4.781307
Active features: 2223
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #163 *****
Loss: 2804.174788
Feature norm: 54.178734
Error norm: 2.252247
Active features: 2224
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.018

***** Iteration #164 *****
Loss: 2804.168162
Feature norm: 54.180725
Error norm: 4.416973
Active features: 2224
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.017

***** Iteration #165 *****
Loss: 2804.153188
Feature norm: 54.192746
Error norm: 2.720485
Active features: 2222
Line search trials: 1
Line search step: 1.000000
Seconds required fo

In [78]:
tagger = pycrfsuite.Tagger()
tagger.open('model/crf_ws1000_entropyfrom_dc_bsws_no_chartype.save')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Evaluate

labels = {'1': 1, "0": 0} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

print(classification_report(
    truths, predictions,
    target_names=["B", "I"]))

              precision    recall  f1-score   support

           B       0.99      0.98      0.99     15792
           I       0.96      0.98      0.97      6749

    accuracy                           0.98     22541
   macro avg       0.98      0.98      0.98     22541
weighted avg       0.98      0.98      0.98     22541



<h1>Pickle

In [1]:
import warnings
warnings.filterwarnings('ignore')
import sklearn_crfsuite 
import pandas as pd
import eli5
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from preprocessing import preprocess #Our class
prepro = preprocess()

Using TensorFlow backend.


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [2]:
import pickle
import extract_features
import pycrfsuite
import numpy as np

In [3]:
with open('x_true.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
    x_true = pickle.load(f)
    
with open('y_true.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
    y_true = pickle.load(f)
    
with open('y_original.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
    y_original = pickle.load(f)
    
with open('y_entropy_original.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
    y_entropy_original = pickle.load(f) 
    
with open('y_prob_original.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
    y_prob_original = pickle.load(f)   

In [4]:
# with open('x_true_best.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     x_true = pickle.load(f)
    
# with open('y_true_best.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_true = pickle.load(f)
    
# with open('y_original_best.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_original = pickle.load(f)
    
# with open('y_entropy_original_best.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_entropy_original = pickle.load(f) 
    
# with open('y_prob_original_best.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_prob_original = pickle.load(f)   

In [5]:
# with open('x_true_tnhc.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     x_true = pickle.load(f)
    
# with open('y_true_tnhc.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_true = pickle.load(f)
    
# with open('y_original_tnhc.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_original = pickle.load(f)
    
# with open('y_entropy_original_tnhc.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_entropy_original = pickle.load(f) 
    
# with open('y_prob_original_tnhc.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_prob_original = pickle.load(f)   

In [4]:
X_data = []
for idx,item in enumerate(x_true):
    X_data.append(extract_features.extract_features_crf(x_true[idx],idx,y_entropy_original,y_prob_original))
    
y_data = [list(map(str, l)) for l in y_true]

In [5]:
X_data_ = [j for sub in X_data for j in sub]
y_data_ = [j for sub in y_data for j in sub]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_data_, y_data_, test_size=0.2, random_state=99)

In [7]:
X_test[:10]

[[{'bias': 'b',
   'char': '่',
   'entropy': 0.0,
   'prob': 0.0,
   'start': False,
   'end': False,
   'char_[-1]': 'ม',
   'ctype[-1]': 'c',
   'char_[-2]': 'ไ',
   'ctype[-2]': 'w',
   'char_[-3]': 'ล',
   'ctype[-3]': 'c',
   'char_[-4]': 'ั',
   'ctype[-4]': 'v',
   'char_[+1]': 'ส',
   'ctype[+1]': 'c',
   'char_[+2]': 'า',
   'ctype[+2]': 'v',
   'dict_start': False,
   'dict_end': True}],
 [{'bias': 'b',
   'char': 'ล',
   'entropy': 3.4177119977607217e-06,
   'prob': 2.086162567138672e-07,
   'start': False,
   'end': False,
   'char_[-1]': 'ก',
   'ctype[-1]': 'c',
   'char_[-2]': 'ว',
   'ctype[-2]': 'c',
   'char_[-3]': 'า',
   'ctype[-3]': 'v',
   'char_[-4]': '้',
   'ctype[-4]': 't',
   'char_[+1]': 'ั',
   'ctype[+1]': 'v',
   'char_[+2]': 'บ',
   'ctype[+2]': 'c',
   'dict_start': True,
   'dict_end': True}],
 [{'bias': 'b',
   'char': 'ด',
   'entropy': 5.462372153728447e-07,
   'prob': 2.9802322387695312e-08,
   'start': False,
   'end': False,
   'char_[-1]': 'ก',

In [7]:
# Train model

trainer = pycrfsuite.Trainer(verbose=True)
#trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 0.01,
    'c2': 0.01,
    'max_iterations': 1000,
    'feature.possible_transitions': True,
})

trainer.train('model/I_just_want_to_try.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 2782
Seconds required: 0.086

L-BFGS optimization
c1: 0.010000
c2: 0.010000
num_memories: 6
max_iterations: 1000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 26091.102782
Feature norm: 1.000000
Error norm: 17357.279445
Active features: 2656
Line search trials: 1
Line search step: 0.000036
Seconds required for this iteration: 0.056

***** Iteration #2 *****
Loss: 15978.784350
Feature norm: 1.755144
Error norm: 7293.015690
Active features: 2648
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.040

***** Iteration #3 *****
Loss: 11344.446926
Feature norm: 2.624214
Error norm: 4247.216823
Active features: 2716
Line search trials: 1
Line search step: 1.000000
Seconds required for this

***** Iteration #41 *****
Loss: 4517.910397
Feature norm: 52.646851
Error norm: 56.063114
Active features: 2473
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.038

***** Iteration #42 *****
Loss: 4515.680529
Feature norm: 52.944375
Error norm: 38.724406
Active features: 2471
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.037

***** Iteration #43 *****
Loss: 4513.867285
Feature norm: 53.148673
Error norm: 17.806327
Active features: 2466
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.037

***** Iteration #44 *****
Loss: 4511.643605
Feature norm: 53.464967
Error norm: 22.820549
Active features: 2456
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.030

***** Iteration #45 *****
Loss: 4509.580405
Feature norm: 53.686169
Error norm: 40.205260
Active features: 2458
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #88 *****
Loss: 4488.488777
Feature norm: 57.998224
Error norm: 34.264855
Active features: 2437
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.038

***** Iteration #89 *****
Loss: 4488.333305
Feature norm: 57.996756
Error norm: 4.143595
Active features: 2435
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.041

***** Iteration #90 *****
Loss: 4488.256975
Feature norm: 57.994430
Error norm: 6.522185
Active features: 2441
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.031

***** Iteration #91 *****
Loss: 4488.106873
Feature norm: 57.975335
Error norm: 13.260323
Active features: 2441
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.027

***** Iteration #92 *****
Loss: 4488.011365
Feature norm: 57.966778
Error norm: 12.016692
Active features: 2441
Line search trials: 1
Line search step: 1.000000
Seconds required for 

***** Iteration #134 *****
Loss: 4485.369192
Feature norm: 58.231142
Error norm: 17.658358
Active features: 2426
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.026

***** Iteration #135 *****
Loss: 4485.328032
Feature norm: 58.234516
Error norm: 11.072319
Active features: 2424
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.039

***** Iteration #136 *****
Loss: 4485.305239
Feature norm: 58.242292
Error norm: 16.354759
Active features: 2422
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.034

***** Iteration #137 *****
Loss: 4485.272887
Feature norm: 58.249353
Error norm: 12.692285
Active features: 2424
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.027

***** Iteration #138 *****
Loss: 4485.256538
Feature norm: 58.259110
Error norm: 18.307294
Active features: 2428
Line search trials: 1
Line search step: 1.000000
Seconds requir

***** Iteration #180 *****
Loss: 4484.322041
Feature norm: 58.522470
Error norm: 12.356640
Active features: 2429
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.031

***** Iteration #181 *****
Loss: 4484.304965
Feature norm: 58.526738
Error norm: 6.803036
Active features: 2431
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.029

***** Iteration #182 *****
Loss: 4484.293407
Feature norm: 58.529044
Error norm: 5.032408
Active features: 2431
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.027

***** Iteration #183 *****
Loss: 4484.282959
Feature norm: 58.532397
Error norm: 3.061643
Active features: 2429
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.029

***** Iteration #184 *****
Loss: 4484.271692
Feature norm: 58.538956
Error norm: 6.979202
Active features: 2429
Line search trials: 1
Line search step: 1.000000
Seconds required f

In [9]:
tagger = pycrfsuite.Tagger()
tagger.open('model/I_just_want_to_try.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Evaluate
labels = {'1': 1, "0": 0} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

print(classification_report(
    truths, predictions,
    target_names=["B", "I"]))

              precision    recall  f1-score   support

           B       0.98      0.97      0.98     10493
           I       0.94      0.95      0.95      4534

    accuracy                           0.97     15027
   macro avg       0.96      0.96      0.96     15027
weighted avg       0.97      0.97      0.97     15027



<h1>LR/CRF ensemble CRF

In [3]:
import warnings
warnings.filterwarnings('ignore')
import sklearn_crfsuite 
import pandas as pd
import eli5
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from preprocessing import preprocess #Our class
prepro = preprocess()

import pickle
import extract_features
import pycrfsuite
import numpy as np

crf_model_og = pycrfsuite.Tagger() 

In [4]:
crf_model_og.open('model/crf_ws1000_entropyfrom_dc_bl_full_socialDict.model') # For baseline #Char type performwell on baseline
with open('x_true.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
    x_true = pickle.load(f)
    
with open('y_true.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
    y_true = pickle.load(f)
    
with open('y_original.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
    y_original = pickle.load(f)
    
with open('y_entropy_original.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
    y_entropy_original = pickle.load(f) 
    
with open('y_prob_original.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
    y_prob_original = pickle.load(f)   

In [5]:
#crf_model_og.open('model/crf_best_entropyfrom_dc_bl_full_socialDict.model')
# with open('x_true_best.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     x_true = pickle.load(f)
    
# with open('y_true_best.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_true = pickle.load(f)
    
# with open('y_original_best.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_original = pickle.load(f)
    
# with open('y_entropy_original_best.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_entropy_original = pickle.load(f) 
    
# with open('y_prob_original_best.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_prob_original = pickle.load(f)   

In [6]:
#crf_model_og.open('model/crf_tnhc_entropyfrom_dc_bl_full_SocialDict.model') 
# with open('x_true_tnhc.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     x_true = pickle.load(f)
    
# with open('y_true_tnhc.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_true = pickle.load(f)
    
# with open('y_original_tnhc.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_original = pickle.load(f)
    
# with open('y_entropy_original_tnhc.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_entropy_original = pickle.load(f) 
    
# with open('y_prob_original_tnhc.pkl', 'rb') as f:  # Python 3: open(..., 'wb')
#     y_prob_original = pickle.load(f)   

In [7]:
X_data = []
for idx,item in enumerate(x_true):
    X_data.append(extract_features.extract_features_crf(x_true[idx],idx,y_entropy_original,y_prob_original))
    
y_data = [list(map(str, l)) for l in y_true]

X_data_ = [j for sub in X_data for j in sub]
y_data_ = [j for sub in y_data for j in sub]

X_train, X_test, y_train, y_test = train_test_split(X_data_, y_data_, test_size=0.2, random_state=99)

In [8]:
def crf2lr(x_seq):
    prob = [crf_model_og.probability(crf_model_og.tag(x_seq)),crf_model_og.tag(x_seq)]
    if int(prob[1][0]) == 1:
        prob = [1-prob[0],prob[0]]
    else:
        prob = [prob[0],1-prob[0]]
    entropy = prepro.find_entropy([prob])
    x_seq[0]['entropy'] = entropy[0]
    x_seq[0]['prob'] = prob[1]
    return x_seq

In [11]:
X_train = list(map(crf2lr,X_train))
X_test = list(map(crf2lr,X_test))

In [12]:
# Train model

trainer = pycrfsuite.Trainer(verbose=True)
#trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 0.01,
    'c2': 0.01,
    'max_iterations': 1000,
    'feature.possible_transitions': True,
})

trainer.train('model/I_just_want_to_try.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 2933
Seconds required: 0.075

L-BFGS optimization
c1: 0.010000
c2: 0.010000
num_memories: 6
max_iterations: 1000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 33190.026169
Feature norm: 1.000000
Error norm: 34089.716851
Active features: 2805
Line search trials: 1
Line search step: 0.000028
Seconds required for this iteration: 0.063

***** Iteration #2 *****
Loss: 23763.356765
Feature norm: 1.058820
Error norm: 21333.501147
Active features: 2795
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.042

***** Iteration #3 *****
Loss: 13851.716113
Feature norm: 1.865930
Error norm: 10551.225073
Active features: 2767
Line search trials: 1
Line search step: 1.000000
Seconds required for th

***** Iteration #51 *****
Loss: 4501.613307
Feature norm: 28.158260
Error norm: 29.652003
Active features: 2604
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #52 *****
Loss: 4501.182959
Feature norm: 28.276302
Error norm: 41.549341
Active features: 2579
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #53 *****
Loss: 4500.812877
Feature norm: 28.352763
Error norm: 37.218783
Active features: 2581
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #54 *****
Loss: 4500.514945
Feature norm: 28.397787
Error norm: 16.932371
Active features: 2583
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #55 *****
Loss: 4500.179085
Feature norm: 28.457473
Error norm: 21.659395
Active features: 2567
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #96 *****
Loss: 4495.996350
Feature norm: 29.880682
Error norm: 14.334792
Active features: 2544
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #97 *****
Loss: 4495.963294
Feature norm: 29.928844
Error norm: 18.994237
Active features: 2548
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.034

***** Iteration #98 *****
Loss: 4495.924698
Feature norm: 29.955819
Error norm: 15.532208
Active features: 2547
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #99 *****
Loss: 4495.893593
Feature norm: 29.989254
Error norm: 16.927333
Active features: 2544
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #100 *****
Loss: 4495.856597
Feature norm: 30.001638
Error norm: 9.565170
Active features: 2545
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #149 *****
Loss: 4495.079955
Feature norm: 30.359690
Error norm: 13.426234
Active features: 2532
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #150 *****
Loss: 4495.062314
Feature norm: 30.361766
Error norm: 7.969751
Active features: 2532
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #151 *****
Loss: 4495.054791
Feature norm: 30.367461
Error norm: 9.858926
Active features: 2530
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #152 *****
Loss: 4495.040949
Feature norm: 30.370902
Error norm: 8.038397
Active features: 2530
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #153 *****
Loss: 4495.035049
Feature norm: 30.380256
Error norm: 10.148744
Active features: 2530
Line search trials: 1
Line search step: 1.000000
Seconds required 

***** Iteration #202 *****
Loss: 4494.631264
Feature norm: 30.446938
Error norm: 4.371007
Active features: 2522
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.034

***** Iteration #203 *****
Loss: 4494.629133
Feature norm: 30.452068
Error norm: 4.979708
Active features: 2521
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.033

***** Iteration #204 *****
Loss: 4494.624166
Feature norm: 30.451463
Error norm: 4.396106
Active features: 2521
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.034

L-BFGS terminated with the stopping criteria
Total seconds required for training: 6.822

Storing the model
Number of active features: 2521 (2933)
Number of active attributes: 1432 (1767)
Number of active labels: 2 (2)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.001



In [13]:
tagger = pycrfsuite.Tagger()
tagger.open('model/I_just_want_to_try.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Evaluate
labels = {'1': 1, "0": 0} # classification_report() needs values in 0s and 1s
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

print(classification_report(
    truths, predictions,
    target_names=["B", "I"]))

              precision    recall  f1-score   support

           B       0.98      0.97      0.97     10493
           I       0.92      0.94      0.93      4534

    accuracy                           0.96     15027
   macro avg       0.95      0.96      0.95     15027
weighted avg       0.96      0.96      0.96     15027

