In [22]:
import dill
import pandas as pd
from sklearn.model_selection import train_test_split

In [202]:
path_name = "../dataset/data/"

with open(path_name + 'comment-pos.data', 'rb') as file:
    datatofile = dill.load(file)

tagged_sents = []
for data in datatofile:
    text_inside = []
    for word, pos, label in data:
        if word.strip() == '':
            text_inside.append(('[SP]', label))
        else:
            text_inside.append((word, label))
    tagged_sents.append(text_inside)

train_sents, test_sents = train_test_split(tagged_sents, test_size=0.2, random_state=42)
print(len(train_sents))
print(len(test_sents))
print(train_sents[1])

552
138
[('ถ้า', 'B-c'), ('เดินทาง', 'I-c'), ('กลางคืน', 'I-c'), ('ก็', 'I-c'), ('รถทัวร์', 'I-c'), ('ครับ', 'I-c'), ('[SP]', 'I-c'), ('[SP]', 'O'), ('เพราะ', 'B-p'), ('รถ', 'I-p'), ('ไม่', 'I-p'), ('เยอะ', 'I-p'), ('[SP]', 'I-p'), ('ความเสี่ยง', 'I-p'), ('การ', 'I-p'), ('เกิด', 'I-p'), ('อุบัติ', 'I-p'), ('ห', 'I-p'), ('ตุ', 'I-p'), ('ก็', 'I-p'), ('น้อย', 'I-p'), ('(', 'I-p'), ('มั้ง', 'I-p'), (')', 'I-p'), ('[SP]', 'I-p'), ('[SP]', 'O'), ('ถ้า', 'B-c'), ('กลางวัน', 'I-c'), ('ก็', 'I-c'), ('เครื่องบิน', 'I-c'), ('ครับ', 'I-c'), ('[SP]', 'I-c'), ('[SP]', 'O'), ('เพราะ', 'B-p'), ('[SP]', 'I-p'), ('มัน', 'I-p'), ('ใช้เวลา', 'I-p'), ('น้อย', 'I-p'), ('จะ', 'I-p'), ('ได้', 'I-p'), ('มี', 'I-p'), ('เวลา', 'I-p'), ('ระหว่าง', 'I-p'), ('วัน', 'I-p'), ('เยอะ', 'I-p'), ('ๆ', 'I-p'), ('[SP]', 'I-p')]


In [203]:
_NER_TAGS = [
        "O",
        "B_C",
        "B_P",
        "I_C",
        "I_P"
    ]

In [204]:
def convert_to_simple_transformer_format(sentences):
    sentence_id = []
    words = []
    labels = []

    for idx, sents in enumerate(sentences):
        for word, label in sents:
            label = label.upper().replace("-", "_")
            sentence_id.append(idx)
            words.append(word)
            labels.append(label)
    return pd.DataFrame(
        {"sentence_id": sentence_id, "words": words, "labels": labels}
    )    
            


In [205]:
train_ = convert_to_simple_transformer_format(train_sents)
train_

Unnamed: 0,sentence_id,words,labels
0,0,อะไหล่,B_C
1,0,เทอร์โบ,I_C
2,0,[SP],I_C
3,0,อี,I_C
4,0,ซุ,I_C
...,...,...,...
37405,551,ทำ,I_P
37406,551,อะไร,I_P
37407,551,ได้,I_P
37408,551,หลายอย่าง,I_P


In [206]:
test_ = convert_to_simple_transformer_format(test_sents)

In [10]:
import torch
from simpletransformers.ner import NERModel, NERArgs

# Configure the model
ner_args = NERArgs()
ner_args.train_batch_size = 12
ner_args.evaluate_during_training = False
ner_args.overwrite_output_dir = True
ner_args.num_train_epochs = 100 #10


model = NERModel(
    "bert", "monsoon-nlp/bert-base-thai", args=ner_args, use_cuda=torch.cuda.is_available(), labels=_NER_TAGS
)

# Train the modelk
model.train_model(train_)

Some weights of the model checkpoint at monsoon-nlp/bert-base-thai were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized

In [None]:
# # Evaluate the model
# result, model_outputs, preds_list = test_ner.eval_model(test_)
# result

# Predict

In [287]:
idx = 10
test_pred = " ".join(list(map(lambda word: word[0], test_sents[idx])))
print(test_pred)
print(test_sents[idx])

ผม เห็น ปัญหา นี้ เยอะ มา ก. .. พ่อแม่ [SP] สั่งสอน ลูก ไม่ ได้ [SP] ยิ่ง เด็ก ตัวเล็ก [SP] ๆ [SP] พ่อแม่ [SP] โยน มือถือ ให้ เพราะ ไม่ อยาก ให้ รบกวน ตัวเอง [SP] พอให้ เรียน ออนไลน์ บอก ไม่ มี อุปกรณ์ [SP] ปล. จาก ที่ ผม เห็น มา จริง ๆ [SP] นะ ครับ
[('ผม', 'B-c'), ('เห็น', 'I-c'), ('ปัญหา', 'I-c'), ('นี้', 'I-c'), ('เยอะ', 'I-c'), ('มา', 'I-c'), ('ก.', 'I-c'), ('..', 'I-c'), ('พ่อแม่', 'I-c'), ('[SP]', 'I-c'), ('สั่งสอน', 'I-c'), ('ลูก', 'I-c'), ('ไม่', 'I-c'), ('ได้', 'I-c'), ('[SP]', 'I-c'), ('ยิ่ง', 'I-c'), ('เด็ก', 'I-c'), ('ตัวเล็ก', 'I-c'), ('[SP]', 'I-c'), ('ๆ', 'I-c'), ('[SP]', 'I-c'), ('พ่อแม่', 'I-c'), ('[SP]', 'I-c'), ('โยน', 'I-c'), ('มือถือ', 'I-c'), ('ให้', 'I-c'), ('เพราะ', 'B-p'), ('ไม่', 'I-p'), ('อยาก', 'I-p'), ('ให้', 'I-p'), ('รบกวน', 'I-p'), ('ตัวเอง', 'I-p'), ('[SP]', 'I-p'), ('พอให้', 'I-p'), ('เรียน', 'I-p'), ('ออนไลน์', 'I-p'), ('บอก', 'I-p'), ('ไม่', 'I-p'), ('มี', 'I-p'), ('อุปกรณ์', 'I-p'), ('[SP]', 'O'), ('ปล.', 'O'), ('จาก', 'O'), ('ที่', 'O'), ('ผม', 'O'

In [234]:
# Make predictions with the model
predictions, raw_outputs = model.predict([test_pred])
print(predictions[0]) 

100%|██████████| 1/1 [00:07<00:00,  7.30s/it]
Running Prediction: 100%|██████████| 1/1 [00:00<00:00, 31.23it/s]

[{'หาก': 'B_C'}, {'พิจารณา': 'B_C'}, {'จาก': 'B_C'}, {'การเข้าสู่': 'I_P'}, {'การ': 'B_C'}, {'เป็น': 'B_C'}, {'[SP]': 'B_C'}, {'AEC': 'B_C'}, {'[SP]': 'B_C'}, {'ใน': 'B_C'}, {'สิ้นปี': 'B_C'}, {'นี้': 'B_C'}, {'แล้ว': 'B_C'}, {'[SP]': 'O'}, {'ผม': 'I_P'}, {'ว่า': 'B_P'}, {'ความสามารถ': 'B_C'}, {'ทาง': 'B_C'}, {'ด้าน': 'B_C'}, {'ภาษาอังกฤษ': 'I_P'}, {'จะ': 'B_C'}, {'ใช้ประโยชน์': 'B_C'}, {'ได้': 'B_C'}, {'มากกว่า': 'B_C'}, {'คณิตศาสตร์': 'B_C'}, {'เยอะ': 'B_C'}, {'มาก': 'B_C'}, {'ครับ': 'B_C'}, {'[SP]': 'B_C'}, {'ได้': 'B_C'}, {'ทั้ง': 'B_C'}, {'ด้าน': 'B_C'}, {'ติดต่อ': 'B_C'}, {'ธุรกิจ': 'I_P'}, {'[SP]': 'B_C'}, {'การท่องเที่ยว': 'I_P'}, {'[SP]': 'B_C'}, {'การศึกษา': 'B_C'}]





# test predict

In [288]:
_test = (list(map(lambda word: word[0], test_sents[10])))
predictions, raw_outputs = model.predict([_test], split_on_space=False)
print(predictions[0]) 


100%|██████████| 1/1 [00:07<00:00,  7.24s/it]
Running Prediction: 100%|██████████| 1/1 [00:00<00:00,  7.70it/s]

[{'ผม': 'B_C'}, {'เห็น': 'B_C'}, {'ปัญหา': 'B_C'}, {'นี้': 'I_P'}, {'เยอะ': 'B_C'}, {'มา': 'B_C'}, {'ก.': 'B_C'}, {'..': 'B_C'}, {'พ่อแม่': 'B_C'}, {'[SP]': 'B_C'}, {'สั่งสอน': 'B_C'}, {'ลูก': 'B_C'}, {'ไม่': 'B_C'}, {'ได้': 'O'}, {'[SP]': 'B_C'}, {'ยิ่ง': 'B_P'}, {'เด็ก': 'B_C'}, {'ตัวเล็ก': 'B_C'}, {'[SP]': 'B_C'}, {'ๆ': 'B_P'}, {'[SP]': 'B_C'}, {'พ่อแม่': 'B_C'}, {'[SP]': 'B_C'}, {'โยน': 'B_C'}, {'มือถือ': 'B_C'}, {'ให้': 'B_C'}, {'เพราะ': 'B_C'}, {'ไม่': 'B_C'}, {'อยาก': 'B_C'}, {'ให้': 'B_C'}, {'รบกวน': 'B_C'}, {'ตัวเอง': 'B_C'}, {'[SP]': 'B_C'}, {'พอให้': 'I_P'}, {'เรียน': 'B_C'}, {'ออนไลน์': 'B_C'}, {'บอก': 'B_C'}, {'ไม่': 'B_C'}, {'มี': 'B_C'}, {'อุปกรณ์': 'B_C'}, {'[SP]': 'B_C'}, {'ปล.': 'B_C'}, {'จาก': 'B_C'}, {'ที่': 'B_C'}, {'ผม': 'I_P'}, {'เห็น': 'B_C'}, {'มา': 'B_C'}, {'จริง ๆ': 'B_C'}, {'[SP]': 'B_C'}, {'นะ': 'B_C'}, {'ครับ': 'I_P'}]





# Evaluate the model

In [209]:
import torch
from simpletransformers.ner import NERModel, NERArgs
ner_args = NERArgs()
model = NERModel(
    "bert", "monsoon-nlp/bert-base-thai", args=ner_args, use_cuda=torch.cuda.is_available(), labels=_NER_TAGS
)


test_ner = NERModel("bert", './weight3/checkpoint-4600-epoch-100', args=ner_args)

Some weights of the model checkpoint at monsoon-nlp/bert-base-thai were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized

In [260]:
y_test = []
for sent in test_sents:
    labels = []
    for word, label in sent:
        label = label.upper().replace("-", "_")
        labels.append(label)
    y_test.append(labels)
    
print(y_test[1])


['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'B_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [261]:
test_list = []
for sent in test_sents:
    words = []
    for word, label in sent:
        words.append(word)
    test_list.append(" ".join(words))

predictions, raw_outputs = test_ner.predict(test_list)

y_pred = []
for preds in predictions:
    y_pred.append([list(pred.items())[0][1] for pred in preds])

print(y_pred[1])

100%|██████████| 1/1 [00:07<00:00,  7.66s/it]
Running Prediction: 100%|██████████| 18/18 [00:02<00:00,  7.22it/s]


['B_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'I_C', 'O', 'O', 'O', 'O', 'I_P', 'I_P', 'I_P', 'I_P', 'O', 'I_P', 'O', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P', 'O', 'I_P', 'O', 'I_P', 'O', 'O', 'O', 'O', 'O', 'O', 'I_P', 'O', 'I_P', 'O', 'O', 'O', 'I_P', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I_C', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_P', 'I_P', 'I_P', 'O', 'B_P', 'I_P', 'I_P', 'I_P', 'O', 'I_P', 'O', 'O', 'I_P', 'O', 'I_P', 'I_P', 'I_P', 'I_P', 'I_P']


# Test ==============

In [257]:
idx_ = 115 
temp = []
for i in range(len(test_sents[idx_])):
    word_test = test_sents[idx_][i][0]
    word_pred = predictions_[idx_][i][0]
    if word_test != word_pred:
        print(i, word_test, '||', word_pred)

IndexError: list index out of range

In [258]:
print(test_sents[idx_])
print(len(test_sents[idx_]))

[('เห็นด้วย', 'B-c'), ('กับ', 'I-c'), ('[SP]', 'I-c'), ('คห.', 'I-c'), ('[SP]', 'I-c'), ('บน', 'I-c'), ('[SP]', 'I-c'), ('ว่า', 'I-c'), ('ไม่', 'I-c'), ('ใช่', 'I-c'), ('การ', 'I-c'), ('สอบ', 'I-c'), ('ได้', 'I-c'), ('เพราะ', 'B-p'), ('ใคร', 'I-p'), ('ไป', 'I-p'), ('สอบ', 'I-p'), ('ส่วนใหญ่', 'I-p'), ('ก็ได้', 'I-p'), ('มัน', 'I-p'), ('เหมือน', 'I-p'), ('พ่อแม่', 'I-p'), ('จ่าย', 'I-p'), ('เงิน', 'I-p'), ('ให้', 'I-p'), ('ลูก', 'I-p'), ('ไป', 'I-p'), ('เรียน', 'I-p'), ('[SP]', 'I-p'), ('highschool', 'I-p'), ('[SP]', 'I-p'), ('ที่', 'I-p'), ('[SP]', 'I-p'), ('ตปท.', 'I-p'), ('[SP]', 'I-p'), ('1', 'I-p'), ('[SP]', 'I-p'), ('ปี', 'I-p'), ('[SP]', 'O'), ('ส่วนตัว', 'O'), ('เคย', 'O'), ('เป็น', 'O'), ('[SP]', 'O'), ('นร.', 'O'), ('[SP]', 'O'), ('แลกเปลี่ยน', 'O'), ('เมื่อ', 'O'), ('[SP]', 'O'), ('20', 'O'), ('ปี', 'O'), ('ที่แล้ว', 'O'), ('[SP]', 'O'), ('เป็น', 'O'), ('ประสบการณ์', 'O'), ('ที่', 'O'), ('ดีมาก', 'O'), ('ๆ', 'O'), ('นอกจาก', 'O'), ('ได้', 'O'), ('ภาษา', 'O'), ('[SP]', 'O'), (

In [259]:
predictions_ = []
for sent in predictions:
    sent_ = []
    for word_lable in sent:
        sent_.append(list(word_lable.items())[0])
    predictions_.append(sent_)

print(predictions_[idx_])
print(len(predictions_[idx_]))


[('เห็นด้วย', 'B_C'), ('กับ', 'I_C'), ('[SP]', 'O'), ('คห.', 'I_C'), ('[SP]', 'O'), ('บน', 'I_C'), ('[SP]', 'I_P'), ('ว่า', 'I_C'), ('ไม่', 'I_C'), ('ใช่', 'I_C'), ('การ', 'I_P'), ('สอบ', 'I_C'), ('ได้', 'I_P'), ('เพราะ', 'B_P'), ('ใคร', 'I_P'), ('ไป', 'I_P'), ('สอบ', 'I_P'), ('ส่วนใหญ่', 'I_P'), ('ก็ได้', 'I_P'), ('มัน', 'I_P'), ('เหมือน', 'I_P'), ('พ่อแม่', 'I_P'), ('จ่าย', 'I_P'), ('เงิน', 'I_P'), ('ให้', 'I_P'), ('ลูก', 'I_P'), ('ไป', 'I_P'), ('เรียน', 'I_P'), ('[SP]', 'I_P'), ('highschool', 'I_P'), ('[SP]', 'I_P'), ('ที่', 'I_P'), ('[SP]', 'I_P'), ('ตปท.', 'I_P'), ('[SP]', 'I_P'), ('1', 'I_P'), ('[SP]', 'I_P'), ('ปี', 'I_P'), ('[SP]', 'O'), ('ส่วนตัว', 'B_C'), ('เคย', 'O'), ('เป็น', 'O'), ('[SP]', 'O'), ('นร.', 'O'), ('[SP]', 'O'), ('แลกเปลี่ยน', 'O'), ('เมื่อ', 'O'), ('[SP]', 'O'), ('20', 'O'), ('ปี', 'O'), ('ที่แล้ว', 'O'), ('[SP]', 'O'), ('เป็น', 'O'), ('ประสบการณ์', 'O'), ('ที่', 'O'), ('ดีมาก', 'O'), ('ๆ', 'O'), ('นอกจาก', 'O'), ('ได้', 'O'), ('ภาษา', 'O'), ('[SP]', 'O'), ('ย

## ปัญหา
#### ติด ๆ แยกจากกันตรง index ที่ 10
#### ติด เวลา predict มีการตัดข้อความทำให้ ข้อความ predict ไม่เท่ากับ test

# End test =======

In [256]:
y_pred_ = []
y_test_ = []
for i in range(len(y_test)):
    if len(y_pred[i]) != len(y_test[i]):
        # print(len(y_pred[i]), len(y_test[i]))
        print(i, end=" ")
        continue;
    y_pred_.append(y_pred[i])
    y_test_.append(y_test[i])

1 6 8 10 18 19 21 23 29 34 36 37 39 42 44 55 60 62 71 81 85 97 98 99 101 106 115 120 121 123 124 127 128 133 134 

In [62]:
print(len(y_pred_), len(y_test_))

99 99


In [250]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

print("accuracy:" ,accuracy_score(y_test_, y_pred_))
print(classification_report(y_test_, y_pred_))

accuracy: 0.7135542980328753
              precision    recall  f1-score   support

          _C       0.24      0.37      0.29       111
          _P       0.16      0.41      0.23       111

   micro avg       0.19      0.39      0.26       222
   macro avg       0.20      0.39      0.26       222
weighted avg       0.20      0.39      0.26       222





In [252]:
# model_continue = NERModel("bert", './weigth_SP', args=ner_args, use_cuda=torch.cuda.is_available(), labels=_NER_TAGS)
# model_continue.train_model(train_)