# Gán nhãn từ loại với mô hình Markov ẩn
Notebook này hướng dẫn sử dụng mô hình Markov ẩn (HMM) để thực hiện gán nhãn dữ liệu. Chúng ta sẽ sử dụng Python để xây dựng một mô hình gán nhãn dữ liệu bằng HMM và thuật toán Veterbi

Đọc dữ liệu và tổ chức dữ liệu huấn luyện

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time
 
# Tiến hành đọc dữ liệu: đầu ra là danh sách các câu, mỗi một câu là một danh sách cặp các token và nhãn tương ứng.
file_path = '/content/drive/MyDrive/NLP/Đáp án cho bài tập về nhà/Lesson6/train.conll'
def load_data(file_path):
  ### 1. BEGIN CODE HERE ###
  f = open(file_path,'r',encoding='utf-8')
  data = []
  sentence = []
  for line in f:
    if line == '\n':
      data.append(sentence)
      sentence = []
    else:
      args = line.split()
      sentence.append((args[0],args[-1]))
  f.close()
  return data
  ### 1. END CODE HERE ###

data = load_data(file_path)

In [42]:
# Chia bộ dữ liệu thành 2 tập train, test với tỷ lệ 80:20
### 2. BEGIN CODE HERE ###
train_set,test_set =train_test_split(data,train_size=0.8,test_size=0.2,random_state = 101)
### 2. END CODE HERE ###

In [43]:
print(len(train_set))
print(len(test_set))

6212
1553


In [44]:
# Tạo danh sách các cặp (word tagged) ở các tập train và test
### 3. BEGIN CODE HERE ###
train_tagged_words = [ tup for sent in train_set for tup in sent ]
test_tagged_words = [ tup for sent in test_set for tup in sent ]
### 3. BEGIN CODE HERE ###
print(len(train_tagged_words))
print(len(test_tagged_words))

143656
36217


In [45]:
# Kiểm tra 5 cặp đầu tiên
train_tagged_words[:5]

[('Vai', 'O'), ('trò', 'O'), ('người', 'O'), ('đứng', 'O'), ('đầu', 'O')]

In [46]:
#use set datatype to check how many unique tags are present in training data
tags = {tag for word,tag in train_tagged_words}
print(len(tags))
print(tags)
 
# check total words in vocabulary
vocab = {word for word,tag in train_tagged_words}

9
{'I-PERSON', 'B-LOCATION', 'O', 'I-LOCATION', 'B-DATETIME', 'I-DATETIME', 'B-ORGANIZATION', 'I-ORGANIZATION', 'B-PERSON'}


In [47]:
train_tagged_words

[('Vai', 'O'),
 ('trò', 'O'),
 ('người', 'O'),
 ('đứng', 'O'),
 ('đầu', 'O'),
 ('là', 'O'),
 ('vậy,', 'O'),
 ('còn', 'O'),
 ('khi', 'O'),
 ('triển', 'O'),
 ('khai', 'O'),
 ('xuống', 'O'),
 ('các', 'O'),
 ('cấp', 'O'),
 ('dưới,', 'O'),
 ('nếu', 'O'),
 ('học', 'O'),
 ('cho', 'O'),
 ('có,', 'O'),
 ('không', 'O'),
 ('thảo', 'O'),
 ('luận,', 'O'),
 ('không', 'O'),
 ('kiểm', 'O'),
 ('tra', 'O'),
 ('thì', 'O'),
 ('vận', 'O'),
 ('dụng', 'O'),
 ('vào', 'O'),
 ('thực', 'O'),
 ('tế', 'O'),
 ('sẽ', 'O'),
 ('rất', 'O'),
 ('khó..', 'O'),
 ('Tuy', 'O'),
 ('nhiên,', 'O'),
 ('sẽ', 'O'),
 ('có', 'O'),
 ('những', 'O'),
 ('điều', 'O'),
 ('mà', 'O'),
 ('không', 'O'),
 ('thể', 'O'),
 ('làm', 'O'),
 ('thỏa', 'O'),
 ('mãn', 'O'),
 ('được', 'O'),
 ('cho', 'O'),
 ('cả', 'O'),
 ('hai', 'O'),
 ('bên..', 'O'),
 ('Các', 'O'),
 ('Bộ,', 'O'),
 ('ngành,', 'O'),
 ('địa', 'O'),
 ('phương', 'O'),
 ('tăng', 'O'),
 ('cường', 'O'),
 ('tổ', 'O'),
 ('chức', 'O'),
 ('đối', 'O'),
 ('thoại,', 'O'),
 ('tiếp', 'O'),
 ('công', 'O')

Tính Emission Probability và Transition Probability từ dữ liệu huấn luyện

In [48]:
# compute Emission Probability (Xác suất để một từ theo 1 nhãn)
def word_given_tag(word, tag, train_bag = train_tagged_words):
    ### 4. BEGIN CODE HERE ###
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)#total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
    #now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)
    ### 4. END CODE HERE ###
     
    return (count_w_given_tag, count_tag)

In [49]:
# compute  Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [50]:
# Tạo ma trận chuyển trạng thái
# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)
 
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]
 
print(tags_matrix)

[[2.65015811e-01 6.84931502e-03 6.63856685e-01 0.00000000e+00
  4.74183355e-03 0.00000000e+00 1.00105377e-02 0.00000000e+00
  4.95258160e-02]
 [0.00000000e+00 2.40460020e-02 2.96393096e-01 6.51332974e-01
  5.22739161e-03 0.00000000e+00 3.13643483e-03 0.00000000e+00
  1.98640879e-02]
 [0.00000000e+00 1.22056017e-02 9.59923029e-01 0.00000000e+00
  3.96967214e-03 0.00000000e+00 8.99640284e-03 0.00000000e+00
  1.48976780e-02]
 [0.00000000e+00 9.19798091e-02 5.70947826e-01 3.01177800e-01
  1.17779020e-02 0.00000000e+00 1.06561976e-02 0.00000000e+00
  1.34604601e-02]
 [0.00000000e+00 5.07614203e-03 2.16582060e-01 0.00000000e+00
  5.07614203e-03 7.54653156e-01 1.01522841e-02 0.00000000e+00
  8.46023671e-03]
 [0.00000000e+00 2.11538468e-02 7.63461530e-01 0.00000000e+00
  9.61538497e-03 1.42307699e-01 3.84615399e-02 0.00000000e+00
  2.50000004e-02]
 [0.00000000e+00 1.27055310e-02 4.43198800e-01 0.00000000e+00
  5.97907323e-03 0.00000000e+00 2.31689084e-02 5.02989531e-01
  1.19581465e-02]
 [0.00

In [51]:
# convert the matrix to a df for better readability
#the table is same as the transition table shown in section 3 of article
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,I-PERSON,B-LOCATION,O,I-LOCATION,B-DATETIME,I-DATETIME,B-ORGANIZATION,I-ORGANIZATION,B-PERSON
I-PERSON,0.265016,0.006849,0.663857,0.0,0.004742,0.0,0.010011,0.0,0.049526
B-LOCATION,0.0,0.024046,0.296393,0.651333,0.005227,0.0,0.003136,0.0,0.019864
O,0.0,0.012206,0.959923,0.0,0.00397,0.0,0.008996,0.0,0.014898
I-LOCATION,0.0,0.09198,0.570948,0.301178,0.011778,0.0,0.010656,0.0,0.01346
B-DATETIME,0.0,0.005076,0.216582,0.0,0.005076,0.754653,0.010152,0.0,0.00846
I-DATETIME,0.0,0.021154,0.763462,0.0,0.009615,0.142308,0.038462,0.0,0.025
B-ORGANIZATION,0.0,0.012706,0.443199,0.0,0.005979,0.0,0.023169,0.50299,0.011958
I-ORGANIZATION,0.0,0.02763,0.277896,0.0,0.004251,0.0,0.026567,0.642402,0.021254
B-PERSON,0.62444,0.000895,0.350492,0.0,0.002238,0.0,0.001791,0.0,0.020143


Xây dựng thuật toán Viterbi

In [52]:
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))

    #lặp từng từ
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        #Lặp từng nhãn tính xác suất chuyển đổi từ nhãn trước tới các nhãn hiện tại.
        for tag in T:
            ### 5. BEGIN CODE HERE ###

            if key == 0:
                transition_p = tags_df.loc['O', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            ### 5. END CODE HERE ###


        ### 6. BEGIN CODE HERE ###
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)]
        ### 6. END CODE HERE ###

        state.append(state_max)
    return list(zip(words, state))

Test thuật toán Viterbi

In [53]:
# Let's test our Viterbi algorithm on a few sample sentences of test dataset
random.seed(10)      #define a random seed to get same sentences when run multiple times
 
# choose random 10 numbers
rndom = [random.randint(1,len(test_set)-1) for x in range(10)]
 
# list of 10 sents on which we test the model
test_run = [test_set[i] for i in rndom]
 
# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]
 
# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [54]:
#Here We will only test 10 sentences to check the accuracy
#as testing the whole training set takes huge amount of time
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)
 
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Time taken in seconds:  65.2182936668396
Viterbi Algorithm Accuracy:  91.30434782608695


In [36]:
#Code to test all the test sentences
#(takes alot of time to run)
# tagging the test sentences()

test_tagged_words = [tup for sent in test_set for tup in sent]
test_untagged_words = [tup[0] for sent in test_set for tup in sent]
test_untagged_words

 
start = time.time()
tagged_seq = Viterbi(test_untagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)
 
# accuracy
check = [i for i, j in zip(tagged_seq, test_tagged_words) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Time taken in seconds:  640.3209068775177
Viterbi Algorithm Accuracy:  91.19958093242535


Tăng độ chính xác bằng cách kết hợp thêm rule

In [37]:
#To improve the performance,we specify a rule base tagger for unknown words 
# specify patterns for tagging
patterns = [
    (r'^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]))\1|(?:(?:29|30)(\/|-|\.)(?:0?[13-9]|1[0-2])\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)0?2\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9])|(?:1[0-2]))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$', 'B-DATETIME'),              # gerund
]
# rule based tagger
rule_based_tagger = nltk.RegexpTagger(patterns)

In [18]:
rule_based_tagger.tag(['30/04/1975','là','ngày'])

[('30/04/1975', 'B-DATETIME'), ('là', None), ('ngày', None)]

In [38]:
#modified Viterbi to include rule based tagger in it
def Viterbi_rule_based(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            ### 6. BEGIN CODE HERE ###
            if key == 0:
                transition_p = tags_df.loc['O', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]
                 
            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
            ### 6. END CODE HERE ###

        ### 7. BEGIN CODE HERE ###
        pmax = max(p)
        state_max = rule_based_tagger.tag([word])[0][1] 
        ### 7. END CODE HERE ###
        
        ### 8. BEGIN CODE HERE ###
        if(pmax==0 and rule_based_tagger.tag([word])[0][1] != None ):
            state_max = rule_based_tagger.tag([word])[0][1] # assign based on rule based tagger
        else:
            state_max = T[p.index(pmax)]                
        ### 8. END CODE HERE ###
        state.append(state_max)
    return list(zip(words, state))

In [None]:
test_tagged_words

In [39]:
#Kết quả Test cả bộ khá lâu
#test accuracy on subset of test data 
start = time.time()
tagged_seq = Viterbi_rule_based(test_untagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)
 
# accuracy
check = [i for i, j in zip(tagged_seq, test_tagged_words) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)

Time taken in seconds:  621.1240074634552
Viterbi Algorithm Accuracy:  91.19958093242535


In [40]:
#Check how a sentence is tagged by the two POS taggers
#and compare them
test_sent="ngày 20/12/2022 , cú đúp của Sheva đã mang lại chiến thắng cho Ukraine - một chiến thắng không hề' dễ' dàng.."
pred_tags_rule=Viterbi_rule_based(test_sent.split())
pred_tags_withoutRules= Viterbi(test_sent.split())
print(pred_tags_rule)
print(pred_tags_withoutRules)
#Will and Marry are tagged as NUM as they are unknown words for Viterbi Algorithm 

[('ngày', 'O'), ('20/12/2022', 'B-DATETIME'), (',', 'O'), ('cú', 'O'), ('đúp', 'O'), ('của', 'O'), ('Sheva', 'B-PERSON'), ('đã', 'O'), ('mang', 'O'), ('lại', 'O'), ('chiến', 'O'), ('thắng', 'O'), ('cho', 'O'), ('Ukraine', 'B-LOCATION'), ('-', 'I-LOCATION'), ('một', 'O'), ('chiến', 'O'), ('thắng', 'O'), ('không', 'O'), ("hề'", 'O'), ("dễ'", 'I-PERSON'), ('dàng..', 'O')]
[('ngày', 'O'), ('20/12/2022', 'I-PERSON'), (',', 'O'), ('cú', 'O'), ('đúp', 'O'), ('của', 'O'), ('Sheva', 'B-PERSON'), ('đã', 'O'), ('mang', 'O'), ('lại', 'O'), ('chiến', 'O'), ('thắng', 'O'), ('cho', 'O'), ('Ukraine', 'B-LOCATION'), ('-', 'I-LOCATION'), ('một', 'O'), ('chiến', 'O'), ('thắng', 'O'), ('không', 'O'), ("hề'", 'O'), ("dễ'", 'I-PERSON'), ('dàng..', 'O')]
