In [1]:
import dill
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
path_name = "../dataset/data/"

# NOTE(review): dill.load, like pickle, can run arbitrary code while
# deserialising -- only open this file if it comes from a trusted source.
with open(path_name + 'comment-pos.data', 'rb') as file:
    datatofile = dill.load(file)

# Each item in a sentence is a (word, pos, label) triple. Drop the POS tag and
# replace whitespace-only words with the placeholder "_".
tagged_sents = [
    [('_' if word.strip() == '' else word, label) for word, _pos, label in data]
    for data in datatofile
]

# Fixed random_state so the 80/20 sentence-level split is the same on every run.
train_sents, test_sents = train_test_split(tagged_sents, test_size=0.2, random_state=42)
print(len(train_sents), len(test_sents), train_sents[1], sep="\n")

552
138
[('ถ้า', 'B-c'), ('เดินทาง', 'I-c'), ('กลางคืน', 'I-c'), ('ก็', 'I-c'), ('รถทัวร์', 'I-c'), ('ครับ', 'I-c'), ('_', 'I-c'), ('_', 'O'), ('เพราะ', 'B-p'), ('รถ', 'I-p'), ('ไม่', 'I-p'), ('เยอะ', 'I-p'), ('_', 'I-p'), ('ความเสี่ยง', 'I-p'), ('การ', 'I-p'), ('เกิด', 'I-p'), ('อุบัติ', 'I-p'), ('ห', 'I-p'), ('ตุ', 'I-p'), ('ก็', 'I-p'), ('น้อย', 'I-p'), ('(', 'I-p'), ('มั้ง', 'I-p'), (')', 'I-p'), ('_', 'I-p'), ('_', 'O'), ('ถ้า', 'B-c'), ('กลางวัน', 'I-c'), ('ก็', 'I-c'), ('เครื่องบิน', 'I-c'), ('ครับ', 'I-c'), ('_', 'I-c'), ('_', 'O'), ('เพราะ', 'B-p'), ('_', 'I-p'), ('มัน', 'I-p'), ('ใช้เวลา', 'I-p'), ('น้อย', 'I-p'), ('จะ', 'I-p'), ('ได้', 'I-p'), ('มี', 'I-p'), ('เวลา', 'I-p'), ('ระหว่าง', 'I-p'), ('วัน', 'I-p'), ('เยอะ', 'I-p'), ('ๆ', 'I-p'), ('_', 'I-p')]


In [3]:
def convert_to_simple_transformer_format(sentences):
    """Flatten tagged sentences into a one-row-per-token DataFrame.

    Args:
        sentences: Iterable of sentences, each an iterable of
            ``(word, label)`` pairs.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sentence_id`` (position of
        the sentence in ``sentences``), ``words`` and ``labels``. Labels are
        upper-cased and have ``-`` replaced by ``_`` (``"B-c"`` -> ``"B_C"``).
    """
    rows = [
        (sent_idx, token, tag.upper().replace("-", "_"))
        for sent_idx, sentence in enumerate(sentences)
        for token, tag in sentence
    ]
    return pd.DataFrame(
        {
            "sentence_id": [row[0] for row in rows],
            "words": [row[1] for row in rows],
            "labels": [row[2] for row in rows],
        }
    )
            


In [4]:
# Flatten each split into the long (sentence_id, words, labels) frame.
train_, test_ = (
    convert_to_simple_transformer_format(split)
    for split in (train_sents, test_sents)
)
train_

Unnamed: 0,sentence_id,words,labels
0,0,อะไหล่,B_C
1,0,เทอร์โบ,I_C
2,0,_,I_C
3,0,อี,I_C
4,0,ซุ,I_C
...,...,...,...
37405,551,ทำ,I_P
37406,551,อะไร,I_P
37407,551,ได้,I_P
37408,551,หลายอย่าง,I_P


# Train

In [7]:
import torch
from simpletransformers.ner import NERModel, NERArgs

# Label set handed to NERModel; these are the tag strings produced by
# convert_to_simple_transformer_format ("B-c" -> "B_C", ...).
_NER_TAGS = ["O", "B_C", "B_P", "I_C", "I_P"]

# Configure the model
ner_args = NERArgs()
# Seed the training run so results are reproducible; 42 matches the
# random_state already used for the train/test split.
ner_args.manual_seed = 42
ner_args.train_batch_size = 12
ner_args.evaluate_during_training = False
ner_args.overwrite_output_dir = True
ner_args.max_seq_length = 512
ner_args.num_train_epochs = 100  # NOTE(review): original comment read "#10", presumably an earlier setting


model = NERModel(
    "bert", "monsoon-nlp/bert-base-thai", args=ner_args, use_cuda=torch.cuda.is_available(), labels=_NER_TAGS
)

# Train the model
model.train_model(train_)

# Predict Test

In [5]:
import torch
from simpletransformers.ner import NERModel, NERArgs

# Same tag set as used for training.
_NER_TAGS = ["O", "B_C", "B_P", "I_C", "I_P"]

ner_args = NERArgs()
ner_args.max_seq_length = 512

# Build the prediction model from a saved checkpoint directory.
test_ner = NERModel(
    "bert",
    'outputs/checkpoint-4600-epoch-3',
    args=ner_args,
    use_cuda=torch.cuda.is_available(),
    labels=_NER_TAGS,
)

In [6]:
idx = 1
# Keep only the word of each (word, label) pair so the gold labels are not
# passed to the model.
test_pred = [word_lable[0] for word_lable in test_sents[idx]]
predictions, raw_outputs = test_ner.predict([test_pred], split_on_space=False)
print(predictions[0])

100%|██████████| 1/1 [00:07<00:00,  7.22s/it]
Running Prediction: 100%|██████████| 1/1 [00:02<00:00,  2.04s/it]

[{'อย่า': 'B_C'}, {'ไป': 'I_C'}, {'มอง': 'I_C'}, {'เรื่อง': 'I_P'}, {'ความ': 'I_C'}, {'คุ้ม': 'I_C'}, {'ความ': 'I_C'}, {'กำไร': 'I_C'}, {'อะไร': 'O'}, {'เลย': 'I_C'}, {'ครับ': 'I_C'}, {'_': 'O'}, {'เอา': 'O'}, {'ที่': 'I_P'}, {'มีประโยชน์': 'I_P'}, {'ต่อ': 'I_P'}, {'เรา': 'B_P'}, {'มาก': 'I_P'}, {'ๆ': 'I_P'}, {'_': 'O'}, {'ดีกว่า': 'I_P'}, {'_': 'O'}, {'ต้อง': 'I_P'}, {'ถาม': 'O'}, {'ก่อน': 'O'}, {'ว่า': 'O'}, {'คุณ': 'I_P'}, {'จะ': 'I_P'}, {'เปลี่ยน': 'O'}, {'ที่ทำงาน': 'I_P'}, {'อีก': 'I_P'}, {'ไหม': 'O'}, {'จะ': 'I_P'}, {'ไป': 'O'}, {'จังหวัด': 'O'}, {'อื่น': 'O'}, {'หรือ': 'O'}, {'ป่าว': 'O'}, {'อยู่': 'I_P'}, {'คนเดียว': 'O'}, {'หรือ': 'O'}, {'มีครอบครัว': 'O'}, {'พ่อแม่': 'O'}, {'_': 'O'}, {'พี่น้อง': 'O'}, {'_': 'O'}, {'มี': 'O'}, {'แฟน': 'O'}, {'ไหม': 'O'}, {'_': 'O'}, {'ถ้า': 'O'}, {'มี': 'O'}, {'แล้ว': 'O'}, {'แฟน': 'O'}, {'มี': 'O'}, {'รถ': 'O'}, {'ไหม': 'O'}, {'หรือ': 'O'}, {'มี': 'O'}, {'บ้าน': 'O'}, {'ก็': 'O'}, {'ว่า': 'O'}, {'ไป': 'O'}, {'_': 'O'}, {'แล้ว': 'O'}, {'ค่อย




# Predict Input Text

In [7]:
from pythainlp.tokenize import word_tokenize

# Tokenise a raw Thai string with PyThaiNLP, then tag the resulting token
# list as a one-sentence batch.
text = "ฉันชอบหมาเพราะมันน่ารักมาก"
text_token = word_tokenize(text)
# The sentence is passed as a list of tokens, hence split_on_space=False.
predictions, raw_outputs = test_ner.predict([text_token], split_on_space=False)
# predictions[0] is the list of {word: tag} dicts for the single input sentence.
print(predictions[0]) 

100%|██████████| 1/1 [00:07<00:00,  7.33s/it]
Running Prediction: 100%|██████████| 1/1 [00:00<00:00, 10.18it/s]

[{'ฉัน': 'B_C'}, {'ชอบ': 'I_C'}, {'หมา': 'I_C'}, {'เพราะ': 'B_P'}, {'มัน': 'I_P'}, {'น่ารัก': 'I_P'}, {'มาก': 'I_P'}]





# Evaluate the model

In [8]:
# Gold labels per test sentence, rewritten to the "B_C" form the model uses.
y_test = [
    [label.upper().replace("-", "_") for _word, label in sent]
    for sent in test_sents
]
    
# print(y_test[1])


In [9]:
# Word-only view of every test sentence, used as model input.
test_list = [[word for word, _label in sent] for sent in test_sents]

predictions, raw_outputs = test_ner.predict(test_list, split_on_space=False)

# Each prediction is a one-entry {word: tag} dict; keep just the tag.
y_pred = [[list(pred.values())[0] for pred in preds] for preds in predictions]

# print(y_pred[1])

100%|██████████| 1/1 [00:07<00:00,  7.86s/it]
Running Prediction: 100%|██████████| 18/18 [00:11<00:00,  1.63it/s]


In [10]:
# Keep only the sentences for which the model returned exactly one tag per
# gold token, so gold and predicted sequences stay aligned for scoring.
# NOTE(review): mismatches presumably come from inputs truncated at
# max_seq_length -- confirm.
y_pred_ = []
y_test_ = []
n_skipped = 0
for i, gold_tags in enumerate(y_test):
    pred_tags = y_pred[i]
    if len(pred_tags) != len(gold_tags):
        n_skipped += 1
        continue
    y_pred_.append(pred_tags)
    y_test_.append(gold_tags)

# Report the exclusions: the metrics computed afterwards only cover the
# sentences that were kept.
if n_skipped:
    print(
        f"Skipped {n_skipped} of {len(y_test)} test sentences "
        "whose prediction length differs from the gold length"
    )

In [11]:
def convert_format_lable(list_lable):
    """Rewrite tags from the ``B_C`` form to the ``B-c`` form.

    Each tag is lower-cased, has its first character capitalised and has
    ``_`` replaced by ``-`` (``"B_C"`` -> ``"B-c"``, ``"O"`` -> ``"O"``).

    Args:
        list_lable: Iterable of tag sequences (one sequence per sentence).

    Returns:
        A new list of lists holding the converted tags, in the same order.
    """
    return [
        [tag.lower().capitalize().replace("_", "-") for tag in tags]
        for tags in list_lable
    ]

# Convert both the gold and the predicted tags from "B_C" back to "B-c"
# before they are scored.
y_test_ = convert_format_lable(y_test_)
y_pred_ = convert_format_lable(y_pred_)



In [12]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

# Score the aligned gold/predicted tag sequences with seqeval: the overall
# accuracy first, then the precision/recall/F1 table per label type.
print("accuracy:" ,accuracy_score(y_test_, y_pred_))
print(classification_report(y_test_, y_pred_))

accuracy: 0.5752765752765753
              precision    recall  f1-score   support

           c       0.07      0.26      0.11       154
           p       0.04      0.23      0.06       155

   micro avg       0.05      0.24      0.08       309
   macro avg       0.05      0.24      0.09       309
weighted avg       0.05      0.24      0.09       309



# Continue training from a checkpoint

In [5]:
import torch
from simpletransformers.ner import NERModel, NERArgs

# Same tag set as used for the initial training run.
_NER_TAGS = ["O", "B_C", "B_P", "I_C", "I_P"]

ner_args = NERArgs()
# Seed the run so continued training is reproducible; 42 matches the
# random_state used for the train/test split.
ner_args.manual_seed = 42
ner_args.train_batch_size = 12
ner_args.evaluate_during_training = False
ner_args.overwrite_output_dir = True
ner_args.max_seq_length = 512
# NOTE(review): the recorded progress bar starts at 97/100, which suggests the
# epochs already stored in the checkpoint count towards this total -- confirm.
ner_args.num_train_epochs = 100  # original comment read "#10", presumably an earlier setting

# Resume from the weights saved in the checkpoint directory.
model_continue = NERModel(
    "bert",
    './weight/checkpoint-4462-epoch-97',
    args=ner_args,
    use_cuda=torch.cuda.is_available(),
    labels=_NER_TAGS,
)
model_continue.train_model(train_)

100%|██████████| 2/2 [00:08<00:00,  4.26s/it]
Epoch 1 of 100:  97%|█████████▋| 97/100 [00:00<00:00, 550.18it/s]