In [15]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
from sklearn.model_selection import cross_validate,train_test_split
import torch.nn.functional as F
from tqdm.notebook import tqdm, trange
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim


print(torch.__version__)

1.9.0


In [16]:
import dill
with open('../../dataset/data/comment-pos.data', 'rb') as file:
    datatofile = dill.load(file)

In [17]:
word_ner=[]
for i in datatofile:
    t1=[]
    t2=[]
    for j in i:
        t1.append(j[0])
        t2.append(j[2])
    word_ner.append((t1,t2))

In [18]:
tagged_sents=word_ner

In [19]:
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] if w in to_ix else to_ix["UNK"] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [20]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [21]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 64
HIDDEN_DIM = 128

In [22]:
#split data into train set and test set
train_size = int(len(tagged_sents) * 0.8)
train_sents = tagged_sents[:train_size]
test_sents = tagged_sents[train_size:]

In [23]:
word_list=[]
pos_list=[]

for sent,tags in train_sents:
    for word in sent:
        word_list.append(word)
    for tag in tags:
        pos_list.append(tag)

#Word to Index
word_list.append("UNK") #Special Token for unknown words
pos_list.append(START_TAG)
pos_list.append(STOP_TAG)
all_words = sorted(set(word_list))
all_pos = sorted(set(pos_list))
#del word_list, pos_list
word_to_ix = dict((c, i) for i, c in enumerate(all_words)) #convert word to index 
pos_to_ix = dict((c, i) for i, c in enumerate(all_pos)) #convert pos to index

In [24]:
ix_to_word = dict((v,k) for k,v in word_to_ix.items()) #convert index to word
ix_to_pos = dict((v,k) for k,v in pos_to_ix.items())  #convert index to word

In [25]:
model = BiLSTM_CRF(len(word_to_ix), pos_to_ix, EMBEDDING_DIM, HIDDEN_DIM)

In [26]:
tag_to_ix=pos_to_ix

In [27]:
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [28]:
with torch.no_grad():
    precheck_sent = prepare_sequence(train_sents[0][0], word_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in train_sents[0][1]], dtype=torch.long)
    print(model(precheck_sent))

(tensor(37.0389), [2, 4, 3, 2, 4, 3, 2, 4, 3, 2, 4, 3, 2, 4, 3, 2, 4, 5, 6, 2, 4, 3])


In [29]:
def train(sentence, tags,train=True):
    if train:
        model.train(True)
    else:
        model.eval()
    model.zero_grad()
    sentence_in = prepare_sequence(sentence, word_to_ix)
    targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
    loss = model.neg_log_likelihood(sentence_in, targets)
    if train:
        loss.backward()#backprop
        optimizer.step()#update parameters
    return loss.item()

In [30]:
import time
import math

n_iters = 50
print_every = 1

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()
train_losses=[]
val_losses=[]
for iter in range(1, n_iters + 1):#Epoch
    loader = tqdm(train_sents, total=len(train_sents))
    for sentence, pos_tags in loader: #Sentence
        loss = train(sentence, pos_tags) #fwd
    loader = tqdm(train_sents, total=len(train_sents))
    for sentence, pos_tags in loader: #Sentence
        val_loss = train(sentence, pos_tags,train=False) #fwd
    # Print iter number,time, loss
    if iter % print_every == 0:
        print('ep %d %d%% (%s) loss %.4f ' % (iter, iter / n_iters * 100, timeSince(start), loss))
        print('val loss %.4f ' % (val_loss))
        train_losses.append(loss)
        val_losses.append(val_loss)
        torch.save(model.state_dict(), "ner-2.pt.ep"+str(iter))

  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 1 2% (1m 33s) loss 33.4875 
val loss 65.4670 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 2 4% (3m 15s) loss 52.8727 
val loss 74.9430 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 3 6% (5m 0s) loss 20.7994 
val loss 10.7498 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 4 8% (6m 45s) loss 30.1999 
val loss 28.4387 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 5 10% (8m 28s) loss 66.7371 
val loss 95.8873 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 6 12% (10m 12s) loss 14.8168 
val loss 2.7887 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 7 14% (11m 56s) loss 5.5522 
val loss 3.3772 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 8 16% (13m 41s) loss 6.6496 
val loss 3.2325 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 9 18% (15m 19s) loss 12.5270 
val loss 9.4277 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 10 20% (16m 55s) loss 8.1930 
val loss 1.6738 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 11 22% (18m 29s) loss 16.3594 
val loss 4.3759 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 12 24% (20m 6s) loss 4.8148 
val loss 0.8694 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 13 26% (21m 43s) loss 9.3201 
val loss 28.2744 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 14 28% (23m 18s) loss 0.5376 
val loss 0.3160 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 15 30% (24m 51s) loss 1.6785 
val loss 0.7065 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 16 32% (26m 26s) loss 0.4010 
val loss 0.3503 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 17 34% (28m 0s) loss 6.3373 
val loss 1.1238 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 18 36% (29m 34s) loss 0.6400 
val loss 0.5464 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 19 38% (31m 8s) loss 0.6171 
val loss 0.3191 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 20 40% (32m 52s) loss 0.3893 
val loss 0.4663 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 21 42% (34m 30s) loss 0.3995 
val loss 0.3530 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 22 44% (36m 10s) loss 0.6696 
val loss 0.3257 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 23 46% (37m 43s) loss 1.1520 
val loss 0.3561 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 24 48% (39m 16s) loss 0.3119 
val loss 0.2667 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 25 50% (40m 50s) loss 0.2472 
val loss 0.2018 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 26 52% (42m 23s) loss 0.2792 
val loss 0.1865 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 27 54% (43m 55s) loss 0.2456 
val loss 0.1887 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 28 56% (45m 34s) loss 0.1954 
val loss 0.1555 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 29 57% (47m 15s) loss 0.2014 
val loss 0.1652 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 30 60% (49m 29s) loss 0.1511 
val loss 0.1414 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 31 62% (51m 59s) loss 0.1289 
val loss 0.1377 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 32 64% (54m 29s) loss 0.1543 
val loss 0.1536 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 33 66% (56m 59s) loss 0.2257 
val loss 0.1705 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 34 68% (59m 29s) loss 0.2515 
val loss 0.1752 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 35 70% (61m 56s) loss 0.1343 
val loss 0.1371 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 36 72% (64m 22s) loss 0.1835 
val loss 0.1298 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 37 74% (66m 49s) loss 0.1792 
val loss 0.1837 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 38 76% (69m 16s) loss 0.2406 
val loss 0.6951 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 39 78% (71m 44s) loss 0.3622 
val loss 0.1249 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 40 80% (74m 11s) loss 0.1403 
val loss 0.1359 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 41 82% (76m 39s) loss 0.1337 
val loss 0.1217 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 42 84% (79m 6s) loss 0.1688 
val loss 0.6588 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 43 86% (81m 5s) loss 0.1595 
val loss 0.1478 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 44 88% (82m 40s) loss 0.2721 
val loss 0.1552 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 45 90% (84m 17s) loss 0.1769 
val loss 0.1620 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 46 92% (85m 47s) loss 0.1707 
val loss 0.1493 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 47 94% (87m 18s) loss 0.1522 
val loss 0.1360 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 48 96% (88m 52s) loss 0.1993 
val loss 0.1277 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 49 98% (90m 27s) loss 0.1277 
val loss 0.1185 


  0%|          | 0/552 [00:00<?, ?it/s]

  0%|          | 0/552 [00:00<?, ?it/s]

ep 50 100% (91m 58s) loss 0.1571 
val loss 0.1443 


In [35]:
def predict(input_sent):
    y_pred=[]
    temp=[]
    with torch.no_grad():
        precheck_sent = prepare_sequence(input_sent, word_to_ix)
        output=model(precheck_sent)[1]
        y_pred=[ix_to_pos[i] for i in output]
    
    return y_pred

In [36]:
y_pred = []

for test_sent,_ in test_sents:
    try:
        temp_pred = predict(test_sent)
        y_pred.append(temp_pred)
    except:
        print(test_sent)

In [37]:
test_targets =[ner1 for sent,ner1 in test_sents if sent!=[]]

In [38]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
def pos_classification_report(y_true, y_pred):
 
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
    print("accuracy",accuracy_score(y_true_combined, y_pred_combined))
    tagset = list(sorted(set(lb.classes_)))
    del tagset[len(tagset)-1] # del O
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset if cls!="O"],
        target_names = tagset,
        zero_division=0
    )
print(pos_classification_report(test_targets,y_pred))

accuracy 0.4193961298095023
              precision    recall  f1-score   support

         B-c       0.38      0.45      0.41       161
         B-p       0.29      0.40      0.34       170
         I-c       0.32      0.31      0.32      1845
         I-p       0.34      0.78      0.48      3518

   micro avg       0.34      0.60      0.43      5694
   macro avg       0.33      0.48      0.38      5694
weighted avg       0.34      0.60      0.42      5694
 samples avg       0.26      0.26      0.26      5694



In [39]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

print("accuracy:" ,accuracy_score(test_targets, y_pred))
print(classification_report(test_targets, y_pred))

accuracy: 0.4193961298095023
              precision    recall  f1-score   support

           c       0.21      0.25      0.23       161
           p       0.11      0.16      0.13       170

   micro avg       0.15      0.21      0.18       331
   macro avg       0.16      0.21      0.18       331
weighted avg       0.16      0.21      0.18       331



In [40]:
def tag_html_format(predict_list, pos=False): # get ist of tuple 1 sentent [(word, pos, tag), .....]
    text_result = ""
    label_start = ""
    start_tag = False
    tag_label = ""
        
    for token in predict_list: # list of tuple
        if pos == True:
            word = token[0]
            tag = token[2]
        else:
            word = token[0]
            tag = token[1]
        
        if tag == "O":
            if start_tag == True :
                label_end = "</claim>" if label_start == "<claim>" else "</premise>"
                text_result += label_end
                text_result += word
                start_tag = False
            else:
                text_result += word
        else:
            if start_tag == False:
                tag_label = tag.split("-")[1]  #I-c  = c 
                label_start = "<claim>" if tag_label == "c" else "<premise>"
                text_result += label_start
                text_result += word
                start_tag = True
            else:
                if tag_label != tag.split("-")[1]: #กรณีที่tag ต่างกันอยู่ติดกัน
                    label_end = "</claim>" if label_start == "<claim>" else "</premise>"
                    text_result += label_end
                    tag_label = tag.split("-")[1]  #I-c  = c 
                    label_start = "<claim>" if tag_label == "c" else "<premise>"
                    text_result += label_start
                    text_result += word
                    start_tag = True
                else:
                    text_result += word
     
    if start_tag == True:
        label_end = "</claim>" if label_start == "<claim>" else "</premise>"
        text_result += label_end
                
    return text_result

In [41]:
def prepocess_text(text, token=True):
    text = text.replace("\n", "")
    if token == True:
        text = word_tokenize(text)
    return text

In [123]:
from pythainlp.tokenize import word_tokenize

text = 'หาความหมายก่อนครับ  มากพอผ่อนรถถ้าไม่เดือดร้อน ขี่จยย. ไปทำงานได้ ก็ซื้อบ้านก่อนเพราะ "รถ" มันมีแต่ลดมูลค่า ไม่กี่ปีก็หมดราคา จมไปกับดอกเบี้ยและค่าดูแลรักษา ไหนจะเติมน้ำมันอีก ค่าใช้จ่ายหลายเท่าของจยย.ถ้าคุณตีโจทย์ไม่ได้ว่า ซื้อรถแล้ว จะมีรายได้จากการมีรถเพิ่มขึ้นเพียงพอ ก็ใช้ จยย. ไปก่อน'

list_word = prepocess_text(text)
print(list_word)
predict_tag = predict(list_word)
print(predict_tag)

tag_html_format(zip(list_word, predict_tag))

['หา', 'ความหมาย', 'ก่อน', 'ครับ', '  ', 'มาก', 'พอ', 'ผ่อน', 'รถ', 'ถ้า', 'ไม่', 'เดือดร้อน', ' ', 'ขี่', 'จยย.', ' ', 'ไป', 'ทำงาน', 'ได้', ' ', 'ก็', 'ซื้อ', 'บ้าน', 'ก่อน', 'เพราะ', ' ', '"รถ"', ' ', 'มัน', 'มี', 'แต่', 'ลด', 'มูลค่า', ' ', 'ไม่', 'กี่', 'ปี', 'ก็', 'หมด', 'ราคา', ' ', 'จม', 'ไป', 'กับ', 'ดอกเบี้ย', 'และ', 'ค่า', 'ดูแลรักษา', ' ', 'ไหนจะ', 'เติมน้ำมัน', 'อีก', ' ', 'ค่าใช้จ่าย', 'หลายเท่า', 'ของ', 'จยย.', 'ถ้า', 'คุณ', 'ตี', 'โจทย์', 'ไม่', 'ได้', 'ว่า', ' ', 'ซื้อ', 'รถ', 'แล้ว', ' ', 'จะ', 'มี', 'รายได้', 'จาก', 'การ', 'มี', 'รถ', 'เพิ่มขึ้น', 'เพียงพอ', ' ', 'ก็', 'ใช้', ' ', 'จยย.', ' ', 'ไป', 'ก่อน']
['B-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'I-c', 'B-p', 'I-p', 'I-p', 'I-p', 'I-p', 'I-p', 'I-p', 'I-p', 'I-p', 'I-p', 'I-p', 'I-p', 'I-p', 'I-p', 'I-p', 'I-p', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-c', 'I-c', 'I-c',

'<claim>หาความหมายก่อนครับ  มากพอผ่อนรถถ้าไม่เดือดร้อน ขี่จยย. ไปทำงานได้ ก็ซื้อบ้านก่อน</claim><premise>เพราะ "รถ" มันมีแต่ลดมูลค่า ไม่กี่ปีก็หมดราคา</premise> จมไปกับดอกเบี้ยและค่าดูแลรักษา ไหนจะเติมน้ำมันอีก <claim>ค่าใช้จ่ายหลายเท่าของจยย.ถ้าคุณตีโจทย์ไม่ได้ว่า ซื้อรถแล้ว</claim> จะมีรายได้จากการมี<premise>รถเพิ่มขึ้นเพียงพอ ก็ใช้ จยย. ไปก่อน</premise>'

In [124]:
path = "../../trained_model/BiLSTM-CRF/"
with open(path+'word_to_ix.pkl', 'wb') as file:
    dill.dump(word_to_ix, file)

with open(path+'pos_to_ix.pkl', 'wb') as file:
    dill.dump(pos_to_ix, file)

torch.save(model.state_dict(), path+"model_BiLSTM-CRF.model")