In [1]:
import pickle
import dill
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
from sklearn.model_selection import cross_validate,train_test_split
import sklearn_crfsuite

# load dataset

In [2]:
path = "../../dataset/data/"

with open(path+"comment-pos.data", 'rb') as file:
    datatofile = dill.load(file)

In [3]:
datatofile[0]

[('สำหรับ', 'JSBR', 'O'),
 ('คน', 'NCMN', 'O'),
 ('ที่', 'PREL', 'O'),
 ('มีประวัติ', 'VACT', 'O'),
 ('แพ้ยา', 'NCMN', 'O'),
 ('หรือ', 'JCRG', 'O'),
 ('มี', 'VSTA', 'O'),
 ('โรคประจำตัว', 'NCMN', 'O'),
 (' ', 'PUNC', 'O'),
 ('เรา', 'PPRS', 'B-c'),
 ('ว่า', 'JSBR', 'I-c'),
 ('ตัว', 'CNIT', 'I-c'),
 ('นี้', 'DDAC', 'I-c'),
 ('น่าสนใจ', 'VATT', 'I-c'),
 (' ', 'PUNC', 'I-c'),
 ('เพราะ', 'JSBR', 'B-p'),
 ('ผลข้างเคียง', 'NCMN', 'I-p'),
 ('น้อย', 'VATT', 'I-p'),
 (' ', 'PUNC', 'I-p'),
 ('ประสิทธิภาพ', 'NCMN', 'I-p'),
 ('พอใช้', 'NCMN', 'I-p'),
 (' ', 'PUNC', 'I-p')]

# Preparing the Data

In [4]:
def doc2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]
    
    #test add features
    key_claim = ["ดังนั้น","เพราะฉะนั้น","แสดงว่า"]
    key_premise = ["เพราะ", "เพราะว่า", "เนื่องจาก","เพื่อ","เช่น","เหตุผล","คือ"]
    word_claim = word in key_claim
    word_premise = word in key_premise
    # Features from current word
    features={
        'word.word': word,
        'word.isspace':word.isspace(),
        'postag':postag,
        'word.isdigit()': word.isdigit(),
        'woed.claim':word_claim,
        'word.premise':word_premise
    }
    if i > 0:
        prevword = doc[i-1][0]
        postag1 = doc[i-1][1]
        features['word.prevword'] = prevword
        features['word.previsspace']=prevword.isspace()
        features['word.prepostag'] = postag1
        features['word.prevwordisdigit'] = prevword.isdigit()
    else:
        features['BOS'] = True # Special "Beginning of Sequence" tag
    # Features from next word
    if i < len(doc)-1:
        nextword = doc[i+1][0]
        postag1 = doc[i+1][1]
        features['word.nextword'] = nextword
        features['word.nextisspace']=nextword.isspace()
        features['word.nextpostag'] = postag1
        features['word.nextwordisdigit'] = nextword.isdigit()
    else:
        features['EOS'] = True # Special "End of Sequence" tag
    return features

In [5]:
def extract_features(doc):
    return [doc2features(doc, i) for i in range(len(doc))]

def get_labels(doc):
    return [tag for (token,postag,tag) in doc]

X_data = [extract_features(doc) for doc in datatofile] # เอา คำ แยกออกมา
y_data = [get_labels(doc) for doc in datatofile] # เอา tag แยกออกมา

X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.2) # แบ่ง 0.2 หรือ 20%

# show extrac_features

In [6]:
import pandas as pd
pd.DataFrame(X[0])

Unnamed: 0,word.word,word.isspace,postag,word.isdigit(),woed.claim,word.premise,BOS,word.nextword,word.nextisspace,word.nextpostag,word.nextwordisdigit,word.prevword,word.previsspace,word.prepostag,word.prevwordisdigit,EOS
0,เชียงราย,False,NCMN,False,False,False,True,ค่ะ,False,NCMN,False,,,,,
1,ค่ะ,False,NCMN,False,False,False,,,True,PUNC,False,เชียงราย,False,NCMN,False,
2,,True,PUNC,False,False,False,,,True,PUNC,False,ค่ะ,False,NCMN,False,
3,,True,PUNC,False,False,False,,น่าน,False,VSTA,False,,True,PUNC,False,
4,น่าน,False,VSTA,False,False,False,,เดินทาง,False,NCMN,False,,True,PUNC,False,
5,เดินทาง,False,NCMN,False,False,False,,ลำบาก,False,VATT,False,น่าน,False,VSTA,False,
6,ลำบาก,False,VATT,False,False,False,,เกิน,False,ADVN,False,เดินทาง,False,NCMN,False,
7,เกิน,False,ADVN,False,False,False,,,True,PUNC,False,ลำบาก,False,VATT,False,
8,,True,PUNC,False,False,False,,,,,,เกิน,False,ADVN,False,True


# train model

In [7]:
path = "../../trained_model/CRF/"

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=400,
    all_possible_transitions=True,
    model_filename=path+"model_CRF-test.model0" # ตั้งชื่อโมเดล
)

In [8]:
crf.fit(X, y); # train

In [9]:
labels = list(crf.classes_)
labels.remove('O')
y_pred = crf.predict(X_test)

# Evaluatetion model

In [262]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

def pos_classification_report(y_true, y_pred):
 
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
    print("accuracy",accuracy_score(y_true_combined, y_pred_combined))
    tagset = list(sorted(set(lb.classes_)))
    del tagset[len(tagset)-1] # del O
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset if cls!="O"],
        target_names = tagset,
        zero_division=0
    )
print(pos_classification_report(y_test,y_pred))

accuracy 0.5924520117123956
              precision    recall  f1-score   support

         B-c       0.74      0.59      0.66       176
         B-p       0.65      0.49      0.56       176
         I-c       0.52      0.49      0.50      1901
         I-p       0.71      0.73      0.72      5083

   micro avg       0.66      0.66      0.66      7336
   macro avg       0.65      0.57      0.61      7336
weighted avg       0.66      0.66      0.66      7336
 samples avg       0.52      0.52      0.52      7336



In [263]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

print("accuracy:" ,accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy: 0.5924520117123956
              precision    recall  f1-score   support

           c       0.41      0.33      0.37       176
           p       0.38      0.28      0.32       176

   micro avg       0.40      0.31      0.35       352
   macro avg       0.40      0.31      0.35       352
weighted avg       0.40      0.31      0.35       352



# test Prediction

In [264]:
def tag_html_format(predict_list, pos=False): # get ist of tuple 1 sentent [(word, pos, tag), .....]
    text_result = ""
    label_start = ""
    start_tag = False
    tag_label = ""
        
    for token in predict_list: # list of tuple
        if pos == True:
            word = token[0]
            tag = token[2]
        else:
            word = token[0]
            tag = token[1]
        
        if tag == "O":
            if start_tag == True :
                label_end = "</claim>" if label_start == "<claim>" else "</premise>"
                text_result += label_end
                text_result += word
                start_tag = False
            else:
                text_result += word
        else:
            if start_tag == False:
                tag_label = tag.split("-")[1]  #I-c  = c 
                label_start = "<claim>" if tag_label == "c" else "<premise>"
                text_result += label_start
                text_result += word
                start_tag = True
            else:
                if tag_label != tag.split("-")[1]: #กรณีที่tag ต่างกันอยู่ติดกัน
                    label_end = "</claim>" if label_start == "<claim>" else "</premise>"
                    text_result += label_end
                    tag_label = tag.split("-")[1]  #I-c  = c 
                    label_start = "<claim>" if tag_label == "c" else "<premise>"
                    text_result += label_start
                    text_result += word
                    start_tag = True
                else:
                    text_result += word
     
    if start_tag == True:
        label_end = "</claim>" if label_start == "<claim>" else "</premise>"
        text_result += label_end
                
    return text_result

In [265]:
def get_ner(text):
    word_cut=word_tokenize(text,engine="newmm")
    list_word=pos_tag(word_cut,engine='perceptron')
    X_test = extract_features([(data,list_word[i][1]) for i,data in enumerate(word_cut)])
    y_=crf.predict_single(X_test)
    return [(word_cut[i],list_word[i][1],data) for i,data in enumerate(y_)]

In [266]:
text = \
"""\
ฉันชอบหมา เพราะมันน่ารัก
"""

def predict_argument(text):
    text_preporcess = text.replace("\n"," ")
    text_preporcess = text_preporcess[:-1] if text_preporcess[-1] == " " else text_preporcess
    w_ner = get_ner(text_preporcess)
    return w_ner

list_predict = predict_argument(text)
print("predict: \n",list_predict)
print("\n")
print("html_tag: \n", tag_html_format(list_predict, pos=True))

predict: 
 [('ฉัน', 'PPRS', 'B-c'), ('ชอบ', 'VACT', 'I-c'), ('หมา', 'NCMN', 'I-c'), (' ', 'PUNC', 'I-c'), ('เพราะ', 'JSBR', 'B-p'), ('มัน', 'PPRS', 'I-p'), ('น่ารัก', 'VATT', 'I-p')]


html_tag: 
 <claim>ฉันชอบหมา </claim><premise>เพราะมันน่ารัก</premise>
