In [1]:
import dill
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
path_name = "../../dataset/data/"

# NOTE: dill, like pickle, can run arbitrary code while deserialising;
# only load files from a trusted source.
with open(path_name + 'comment-pos.data', 'rb') as data_file:
    datatofile = dill.load(data_file)

# Reduce every (word, pos, label) triple to a (word, label) pair; the POS tag
# is dropped and tokens that are empty after stripping are replaced by "_".
tagged_sents = [
    [(word if word.strip() != '' else '_', label) for word, _pos, label in data]
    for data in datatofile
]

# Fixed random_state keeps the 80/20 split reproducible across runs.
train_sents, test_sents = train_test_split(tagged_sents, test_size=0.2, random_state=42)
print(len(train_sents), len(test_sents), train_sents[1], sep="\n")

552
138
[('ผญ.', 'B-c'), ('_', 'I-c'), ('เวน', 'I-c'), ('ดี้', 'I-c'), ('_', 'I-c'), ('เพราะ', 'B-p'), ('หน้า', 'I-p'), ('สวย', 'I-p'), ('ชอบ', 'I-p'), ('ผญเเบบ', 'I-p'), ('เวน', 'I-p'), ('ดี้', 'I-p'), ('เสียง', 'I-p'), ('ใส', 'I-p'), ('พูด', 'I-p'), ('อิ้ง', 'I-p'), ('เก่ง', 'I-p'), ('มาก', 'I-p'), ('_', 'I-p'), ('_', 'O'), ('ผช.', 'B-c'), ('_', 'I-c'), ('อยาก', 'I-c'), ('เป็น', 'I-c'), ('จี', 'I-c'), ('มิ', 'I-c'), ('น', 'I-c'), ('_', 'I-c'), ('อยาก', 'B-p'), ('เป็น', 'I-p'), ('หนุ่ม', 'I-p'), ('เ', 'I-p'), ('เพ', 'I-p'), ('รว', 'I-p'), ('พราว', 'I-p'), ('ให้', 'I-p'), ('ผญ.', 'I-p'), ('ใจ', 'I-p'), ('ละลาย', 'I-p'), ('เล่น', 'I-p'), ('ดู', 'I-p'), ('_', 'I-p'), ('555555555', 'I-p'), ('_', 'I-p')]


In [3]:
def convert_to_simple_transformer_format(sentences):
    """Flatten tagged sentences into a three-column token table.

    Args:
        sentences: Iterable of sentences, each an iterable of
            ``(word, label)`` pairs.

    Returns:
        A ``pd.DataFrame`` with one row per token and the columns
        ``sentence_id`` (position of the sentence in ``sentences``),
        ``words`` and ``labels``. Labels are upper-cased and every "-"
        is replaced by "_" (e.g. "B-c" becomes "B_C").
    """
    # Single pass over the input: one (sentence index, word, label) row per token.
    rows = [
        (sent_no, token, tag.upper().replace("-", "_"))
        for sent_no, sent in enumerate(sentences)
        for token, tag in sent
    ]
    return pd.DataFrame(
        {
            "sentence_id": [row[0] for row in rows],
            "words": [row[1] for row in rows],
            "labels": [row[2] for row in rows],
        }
    )
            


In [4]:
# Convert both splits to the flat token table; show the training table.
train_, test_ = map(convert_to_simple_transformer_format, (train_sents, test_sents))
train_

Unnamed: 0,sentence_id,words,labels
0,0,ถ้า,B_C
1,0,ตัว,I_C
2,0,เริ่มต้น,I_C
3,0,_,I_C
4,0,ไป,I_C
...,...,...,...
37308,551,มาก,I_P
37309,551,ไท,I_P
37310,551,ป์,I_P
37311,551,เรา,I_P


In [5]:
import torch
from simpletransformers.ner import NERModel, NERArgs

# Tag inventory in the upper-case/underscore spelling used throughout this notebook.
_NER_TAGS = ["O", "B_C", "B_P", "I_C", "I_P"]

ner_args = NERArgs()
# NOTE(review): presumably inputs longer than this are truncated by the model — confirm.
ner_args.max_seq_length = 480

# Load the published checkpoint; run on GPU only when CUDA is available.
test_ner = NERModel(
    "camembert",
    'pitiwat/argument_wangchanberta2',
    args=ner_args,
    use_cuda=torch.cuda.is_available(),
    labels=_NER_TAGS,
)

2022-04-21 14:17:49.771485: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-21 14:17:49.771528: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [6]:
idx = 5
# Tokens of one test sentence with the gold labels stripped off.
test_pred = [word_label[0] for word_label in test_sents[idx]]
# Input is already tokenised, so the model must not split on spaces again.
predictions, raw_outputs = test_ner.predict([test_pred], split_on_space=False)
print(predictions[0])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'ใช้': 'B_P'}, {'เรียน': 'I_P'}, {'_': 'I_P'}, {'พิมพ์': 'I_P'}, {'งาน': 'I_P'}, {'_': 'I_P'}, {'ใช้': 'I_P'}, {'_': 'I_P'}, {'excel': 'I_P'}, {'_': 'I_P'}, {'เยอะ': 'I_P'}, {'ๆ': 'I_P'}, {'_': 'I_P'}, {'_': 'O'}, {'โน้ตบุ๊ก': 'B_C'}, {'ดีกว่า': 'I_C'}, {'_': 'I_C'}, {'รุ่น': 'O'}, {'_': 'O'}, {'15.6': 'O'}, {'_': 'O'}, {'นิ้ว': 'O'}, {'มี': 'O'}, {'แป้น': 'O'}, {'ตัวเลข': 'O'}, {'_': 'O'}, {'หนัก': 'O'}, {'หน่อย': 'O'}, {'_': 'O'}, {'แต่': 'O'}, {'สะดวก': 'O'}, {'ดี': 'O'}, {'หรือ': 'O'}, {'จะ': 'O'}, {'ซื้อ': 'O'}, {'จอ': 'O'}, {'_': 'O'}, {'13.3': 'O'}, {'-': 'O'}, {'14': 'O'}, {'_': 'O'}, {'นิ้ว': 'O'}, {'แล้': 'O'}, {'วหา': 'O'}, {'แป้น': 'O'}, {'ตัวเลข': 'O'}, {'แบบ': 'O'}, {'_': 'O'}, {'USB': 'O'}, {'_': 'O'}, {'มา': 'O'}, {'ต่อ': 'O'}, {'ก็ได้': 'O'}]


In [7]:
from pythainlp.tokenize import word_tokenize

# Tag a free-text example: tokenise it first, then pass the token list to the model.
text = "การใช้ถุงพลาสติดแทนถุงผ้าผมคิดว่ามันจะทำให้ช่วยลดโลกร้อนได้มากขึ้น"
text_token = word_tokenize(text)
predictions, raw_outputs = test_ner.predict(
    [text_token],
    split_on_space=False,
)
print(predictions[0])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'การ': 'B_C'}, {'ใช้': 'I_C'}, {'ถุง': 'I_C'}, {'พ': 'I_C'}, {'ลา': 'I_C'}, {'สติ': 'I_C'}, {'ด': 'I_C'}, {'แทน': 'I_C'}, {'ถุง': 'I_C'}, {'ผ้า': 'I_C'}, {'ผม': 'I_C'}, {'คิด': 'I_C'}, {'ว่า': 'I_C'}, {'มัน': 'I_C'}, {'จะ': 'I_C'}, {'ทำให้': 'I_C'}, {'ช่วย': 'I_C'}, {'ลด': 'I_P'}, {'โลก': 'I_P'}, {'ร้อน': 'I_P'}, {'ได้': 'I_P'}, {'มากขึ้น': 'I_P'}]


In [8]:
# Gold labels per test sentence, rewritten to the upper-case/underscore
# spelling (e.g. "B-c" -> "B_C") so they compare directly with the model output.
y_test = [
    [label.upper().replace("-", "_") for _word, label in sent]
    for sent in test_sents
]
    
# print(y_test[1])


In [9]:
# Token lists of the test sentences, with the gold labels removed.
test_list = [[word for word, _label in sent] for sent in test_sents]

predictions, raw_outputs = test_ner.predict(test_list, split_on_space=False)

# Every predicted token is a one-entry {word: label} dict; keep only the label.
y_pred = [[list(pred.values())[0] for pred in preds] for preds in predictions]

# print(y_pred[1])

  0%|          | 0/138 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/18 [00:00<?, ?it/s]

In [10]:
# Keep only the sentences whose prediction has exactly one label per gold
# token; otherwise the two sequences cannot be compared position by position.
# NOTE(review): mismatches presumably come from truncation at max_seq_length — confirm.
assert len(y_pred) == len(y_test), "expected one prediction list per test sentence"

y_pred_ = []
y_test_ = []
skipped = 0
for pred_labels, gold_labels in zip(y_pred, y_test):
    if len(pred_labels) != len(gold_labels):
        skipped += 1
        continue
    y_pred_.append(pred_labels)
    y_test_.append(gold_labels)

# Report exclusions instead of dropping them silently: every metric computed
# from y_test_/y_pred_ describes only the sentences that passed this filter.
if skipped:
    print(f"Skipped {skipped} of {len(y_test)} sentences with mismatched prediction length")

In [11]:
def convert_format_lable(list_lable):
    """Rewrite labels from the "B_C" spelling to the "B-c" spelling.

    Args:
        list_lable: Iterable of label sequences (one sequence per sentence).

    Returns:
        A new list of lists in which every label has been lower-cased,
        had its first character capitalised, and had "_" replaced by "-".
    """
    return [
        [tag.lower().capitalize().replace("_", "-") for tag in sentence_tags]
        for sentence_tags in list_lable
    ]

# Put gold and predicted labels into the "B-c" spelling before scoring.
y_test_, y_pred_ = map(convert_format_lable, (y_test_, y_pred_))



In [12]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

def pos_classification_report(y_true, y_pred):
    """Print accuracy over the flattened tokens and return a per-tag report without "O".

    Args:
        y_true: Gold labels, one sequence of tag strings per sentence.
        y_pred: Predicted labels with the same nesting as ``y_true``.

    Returns:
        The text report produced by scikit-learn's ``classification_report``,
        restricted to the tags other than "O".

    NOTE(review): LabelBinarizer emits a single column when only two distinct
    labels are present; the column indices used below would then not line
    up. Confirm inputs always contain at least three distinct tags.
    """
    # Bind the scikit-learn metrics locally: a later cell imports seqeval
    # functions under the same two names, and a module-level lookup would pick
    # those up if this function were called again afterwards.
    from sklearn.metrics import accuracy_score, classification_report

    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
    print("accuracy",accuracy_score(y_true_combined, y_pred_combined))

    # Exclude "O" by name rather than by deleting the last sorted class: the
    # positional delete removed a real tag whenever "O" was absent or did not
    # sort last, and could leave `labels` and `target_names` out of step.
    tagset = [cls for cls in lb.classes_ if cls != "O"]
    # Column index of each class in the binarised matrices.
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
        zero_division=0,
    )
# Per-tag (token-level) scores for the filtered test sentences.
sklearn_report = pos_classification_report(y_test_, y_pred_)
print(sklearn_report)

accuracy 0.939703034215623
              precision    recall  f1-score   support

         B-c       0.92      0.98      0.95       163
         B-p       0.92      0.97      0.94       155
         I-c       0.97      0.94      0.95      1765
         I-p       0.93      0.98      0.96      3852

   micro avg       0.94      0.97      0.95      5935
   macro avg       0.94      0.97      0.95      5935
weighted avg       0.94      0.97      0.95      5935
 samples avg       0.74      0.74      0.74      5935



In [13]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report

# seqeval scoring of the same label sequences; its report is grouped by
# entity type rather than by individual B-/I- tag.
token_accuracy = accuracy_score(y_test_, y_pred_)
print("accuracy:", token_accuracy)
entity_report = classification_report(y_test_, y_pred_)
print(entity_report)

accuracy: 0.939703034215623
              precision    recall  f1-score   support

           c       0.87      0.96      0.91       163
           p       0.80      0.92      0.86       155

   micro avg       0.83      0.94      0.88       318
   macro avg       0.83      0.94      0.88       318
weighted avg       0.84      0.94      0.88       318

