In [9]:
from pythainlp.tokenize import word_tokenize

from pythainlp.tag import pos_tag
import sklearn_crfsuite

In [10]:
# Relative directory prefix for the trained CRF model; it is concatenated with
# the model file name when the CRF object is constructed.
path_model = "../trained_model/CRF/"

def doc2features(doc, i):
    """Build the feature dict for the item at position ``i`` of ``doc``.

    Args:
        doc: Indexable sequence whose items expose the word at index 0 and
            its POS tag at index 1.
        i: Position of the item to describe.

    Returns:
        dict: Features of the current word (surface form, whitespace/digit
        flags, POS tag, claim/premise keyword flags), the same kind of
        features for the previous and next words when they exist, and the
        ``'BOS'`` / ``'EOS'`` markers when they do not.
    """
    current = doc[i]
    token, pos = current[0], current[1]

    # Keyword cues checked against the current word only.
    claim_markers = ("ดังนั้น", "เพราะฉะนั้น", "แสดงว่า")
    premise_markers = ("เพราะ", "เพราะว่า", "เนื่องจาก", "เพื่อ", "เช่น", "เหตุผล", "คือ")
    is_claim_marker = token in claim_markers
    is_premise_marker = token in premise_markers

    # NOTE(review): the key 'woed.claim' looks like a typo for 'word.claim'.
    # It is kept as-is because a previously trained model presumably depends
    # on the exact feature names -- confirm before renaming.
    feats = {
        'word.word': token,
        'word.isspace': token.isspace(),
        'postag': pos,
        'word.isdigit()': token.isdigit(),
        'woed.claim': is_claim_marker,
        'word.premise': is_premise_marker,
    }

    # Left-hand context, or the beginning-of-sequence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = doc[i - 1]
        left_token, left_pos = left[0], left[1]
        feats.update({
            'word.prevword': left_token,
            'word.previsspace': left_token.isspace(),
            'word.prepostag': left_pos,
            'word.prevwordisdigit': left_token.isdigit(),
        })

    # Right-hand context, or the end-of-sequence marker.
    if i >= len(doc) - 1:
        feats['EOS'] = True
    else:
        right = doc[i + 1]
        right_token, right_pos = right[0], right[1]
        feats.update({
            'word.nextword': right_token,
            'word.nextisspace': right_token.isspace(),
            'word.nextpostag': right_pos,
            'word.nextwordisdigit': right_token.isdigit(),
        })

    return feats

In [11]:
def extract_features(doc):
    """Return one ``doc2features`` result per position of ``doc``, in order."""
    rows = []
    for position in range(len(doc)):
        rows.append(doc2features(doc, position))
    return rows

def get_labels(doc):
    """Return the third field (the tag) of every 3-item entry in ``doc``."""
    labels = []
    for _token, _postag, label in doc:
        labels.append(label)
    return labels

In [12]:
def tag_html_format(predict_list, pos=False):
    """Render one tagged sentence as text wrapped in <claim>/<premise> elements.

    Args:
        predict_list: List of tuples for a single sentence. Each tuple is
            ``(word, tag)``, or ``(word, pos, tag)`` when ``pos`` is truthy.
            A tag is either ``"O"`` or ``"<prefix>-<label>"`` (for example
            ``"B-c"`` or ``"I-p"``). Label ``"c"`` is rendered as ``<claim>``;
            any other label is rendered as ``<premise>``.
        pos: Truthy when the tuples carry a POS tag at index 1, so the tag is
            read from index 2 instead of index 1.

    Returns:
        str: The words concatenated in order, with every tagged span enclosed
        in its opening and closing element. An empty list gives ``""``.

    Raises:
        IndexError: If a tag other than ``"O"`` contains no ``"-"``.
    """
    def element_for(label):
        # Only "c" maps to claim; every other label is treated as premise.
        return "claim" if label == "c" else "premise"

    tag_index = 2 if pos else 1
    pieces = []
    open_label = None  # label of the span currently open, None when outside

    for token in predict_list:
        word = token[0]
        tag = token[tag_index]

        if tag == "O":
            # Leaving a span: close it before emitting the untagged word.
            if open_label is not None:
                pieces.append("</%s>" % element_for(open_label))
                open_label = None
            pieces.append(word)
            continue

        parts = tag.split("-")
        prefix, label = parts[0], parts[1]  # "I-c" -> ("I", "c")

        # A new element starts when no span is open, when the label changes
        # between adjacent tagged words, or when a "B" prefix marks the
        # beginning of a new span of the same label (so two adjacent claims
        # are not merged into one element).
        starts_new_span = (
            open_label is None or label != open_label or prefix == "B"
        )
        if starts_new_span:
            if open_label is not None:
                pieces.append("</%s>" % element_for(open_label))
            pieces.append("<%s>" % element_for(label))
            open_label = label
        pieces.append(word)

    # Close a span that runs to the end of the sentence.
    if open_label is not None:
        pieces.append("</%s>" % element_for(open_label))

    return "".join(pieces)

In [13]:
def get_ner(text):
    """Tokenize ``text``, POS-tag it and label each token with the CRF model.

    Args:
        text: Input string handed to ``word_tokenize``.

    Returns:
        list: One ``(word, pos_tag, predicted_label)`` tuple per label
        returned by ``crf.predict_single``.
    """
    tokens = word_tokenize(text, engine="newmm")
    tagged = pos_tag(tokens, engine='perceptron')

    # Pair every token with the POS tag found at the same position.
    token_pos_pairs = []
    for idx in range(len(tokens)):
        token_pos_pairs.append((tokens[idx], tagged[idx][1]))

    labels = crf.predict_single(extract_features(token_pos_pairs))

    result = []
    for idx, label in enumerate(labels):
        result.append((tokens[idx], tagged[idx][1], label))
    return result

In [14]:
def predict_argument(text):
    """Normalize ``text`` and return the tagging produced by ``get_ner``.

    Newlines are replaced by spaces and a single trailing space (if any) is
    removed before tagging. Empty input is passed on as an empty string
    instead of failing while inspecting its last character.

    Args:
        text: Raw input string; may be empty.

    Returns:
        Whatever ``get_ner`` returns for the normalized text.
    """
    text_preprocess = text.replace("\n", " ")
    # endswith() is safe on "", unlike indexing the last character.
    if text_preprocess.endswith(" "):
        text_preprocess = text_preprocess[:-1]
    return get_ner(text_preprocess)

In [15]:
# CRF object whose predict_single() is called by get_ner(). No fit() call
# appears in this part of the notebook, so predictions presumably rely on the
# model stored at model_filename -- TODO confirm the file exists at that path.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=400,
    all_possible_transitions=True,
    model_filename=path_model+"model_CRF.model0" # name of the model file
)

In [16]:
# Sample input: a two-line Thai passage. The backslash after the opening
# quotes keeps the string from starting with a newline.
text = \
"""\
ทำงานดีกว่าเรียน เพราะได้ผลตอบแทน
ถ้าเรียนผลแค่สอบผ่านยังไงก็ทำงานอยู่ดีเมื่อจบ
"""


# Tag the sample, print the raw (word, pos, label) tuples, then print the
# same prediction rendered with <claim>/<premise> markup.
list_predict = predict_argument(text)
print("predict: \n",list_predict)
print("\n")
print("html_tag: \n", tag_html_format(list_predict, pos=True))

predict: 
 [('ทำงาน', 'VACT', 'B-c'), ('ดีกว่า', 'JSBR', 'I-c'), ('เรียน', 'VACT', 'I-c'), (' ', 'PUNC', 'I-c'), ('เพราะ', 'JSBR', 'B-p'), ('ได้', 'XVAE', 'I-p'), ('ผลตอบแทน', 'NCMN', 'I-p'), (' ', 'PUNC', 'I-p'), ('ถ้า', 'JSBR', 'I-p'), ('เรียน', 'VACT', 'I-p'), ('ผล', 'NCMN', 'I-p'), ('แค่', 'VATT', 'I-p'), ('สอบผ่าน', 'VACT', 'I-p'), ('ยังไง', 'NCMN', 'I-p'), ('ก็', 'JSBR', 'I-p'), ('ทำงาน', 'VACT', 'I-p'), ('อยู่ดี', 'ADVN', 'I-p'), ('เมื่อ', 'JSBR', 'I-p'), ('จบ', 'VSTA', 'I-p')]


html_tag: 
 <claim>ทำงานดีกว่าเรียน </claim><premise>เพราะได้ผลตอบแทน ถ้าเรียนผลแค่สอบผ่านยังไงก็ทำงานอยู่ดีเมื่อจบ</premise>
