In [1]:
import io
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
import pandas as pd
import math

In [2]:
gold_corpus = io.open("./Data/gold.txt", encoding="utf-8").readlines()
gold_corpus[:5]

['thành_phố/N washington/N có/V 1/M kiến_trúc/N rất/R đa_dạng/A\n',
 'tuy_nhiên/C vì/E gặp/V nhiều/A khó_khăn/N trong/E cuộc_sống/N ông/N dần/R trở_nên/V khó_tính/A\n',
 'khí_hậu/N hồng_kông/N thuộc/V kiểu/N cận_nhiệt_đới/N và/C chịu/V ảnh_hưởng/V của/E gió_mùa/N\n',
 'khoảng/A hơn/A 70/M bề_mặt/N trái_đất/N được/V bao_phủ/V bởi/E các/D đại_dương/N nước_mặn/N phần/N còn_lại/V là/V các/D lục_địa/N và/C các/D đảo/N\n',
 'đà_lạt/N là/V thành_phố/N trực_thuộc/V tỉnh/N lâm_đồng/N nằm/V trên/E cao_nguyên/N lâm_viên/N thuộc/V vùng/N tây_nguyên/N việt_nam/N\n']

In [3]:
def format(data):
  formated = []
  for line in data:
    sentence = []
    words_tags = line.split()
    for word_tag in words_tags:
      word, tag = word_tag.split("/")
      sentence.append((word,tag))
    formated.append(sentence)
  return formated

In [4]:
data = format(gold_corpus)

In [5]:
train_set, test_set = train_test_split(data, train_size=0.8, random_state=23, shuffle=True)

In [6]:
print(len(train_set))
print(len(test_set))

49
13


In [7]:
with open('./Vocab/vocab.txt', encoding='utf8') as f:
  vocab = f.read().splitlines() 
vocab_dict = {}
index = 0
for word in sorted(vocab): 
    if word not in vocab_dict: 
        vocab_dict[word] = index  
        index += 1

In [8]:
len(vocab_dict)

22600

In [9]:
def preprocess(data, vocab_dict):
  all_tags = []
  all_words = []
  tagged_words = []
  for sent in data:
    all_tags.append("<s>")
    for word, tag in sent:
        all_tags.append(tag)
        if word not in vocab_dict:
          word = "<unk>"
        all_words.append(word)
        tagged_words.append((word, tag))
  return all_tags, all_words, tagged_words

In [10]:
train_tags, train_words, train_tagged_words = preprocess(train_set, vocab_dict)
test_tags, test_words, test_tagged_words = preprocess(test_set, vocab_dict)

In [11]:
train_words[:5]

['những', 'website', 'được', 'thiết_kế', 'đẹp_mắt']

In [12]:
train_tags[:5]

['<s>', 'D', 'N', 'V', 'V']

In [13]:
train_tagged_words[:5]

[('những', 'D'),
 ('website', 'N'),
 ('được', 'V'),
 ('thiết_kế', 'V'),
 ('đẹp_mắt', 'A')]

In [14]:
# transition count
def transition_count(all_tags):
  transition_count = defaultdict(int)
  prev_tag = all_tags[0]
  for i in range(1, len(all_tags)):
    tag = all_tags[i]
    transition_count[(prev_tag, tag)] += 1
    prev_tag = tag
  return transition_count
transition = transition_count(train_tags)

In [15]:
transition

defaultdict(int,
            {('<s>', 'D'): 4,
             ('D', 'N'): 18,
             ('N', 'V'): 42,
             ('V', 'V'): 27,
             ('V', 'A'): 17,
             ('A', 'C'): 4,
             ('C', 'N'): 5,
             ('N', 'A'): 15,
             ('A', 'R'): 3,
             ('R', 'V'): 24,
             ('V', 'R'): 8,
             ('R', 'N'): 2,
             ('N', 'E'): 30,
             ('E', 'N'): 32,
             ('N', '<s>'): 25,
             ('<s>', 'N'): 22,
             ('V', 'N'): 44,
             ('N', 'R'): 12,
             ('E', 'P'): 8,
             ('P', 'R'): 12,
             ('R', 'A'): 10,
             ('C', 'P'): 2,
             ('P', 'V'): 18,
             ('A', 'A'): 3,
             ('A', '<s>'): 13,
             ('V', 'P'): 6,
             ('A', 'V'): 10,
             ('V', 'D'): 6,
             ('R', 'R'): 5,
             ('C', 'I'): 1,
             ('I', 'P'): 1,
             ('A', 'E'): 5,
             ('P', '<s>'): 3,
             ('V', 'C'): 3,
    

In [16]:
# emission count
def emission_count(tagged_words):
  emission_count = defaultdict(int)
  for tagged_word in tagged_words:
    word, tag = tagged_word
    emission_count[(tag, word)] += 1
  return emission_count
emission = emission_count(train_tagged_words)

In [17]:
emission

defaultdict(int,
            {('D', 'những'): 10,
             ('N', 'website'): 1,
             ('V', 'được'): 6,
             ('V', 'thiết_kế'): 1,
             ('A', 'đẹp_mắt'): 1,
             ('C', 'với'): 1,
             ('N', 'bố_cục'): 1,
             ('A', 'hài_hoà'): 1,
             ('C', 'và'): 7,
             ('N', 'tỉ_lệ'): 1,
             ('A', 'cân_đối'): 1,
             ('R', 'sẽ'): 3,
             ('V', 'chiếm'): 1,
             ('R', 'được'): 1,
             ('N', 'thiện_cảm'): 1,
             ('E', 'của'): 14,
             ('N', 'người_dùng'): 2,
             ('N', 'việc'): 2,
             ('V', 'tối_ưu'): 1,
             ('N', 'tốc_độ'): 1,
             ('V', 'tải'): 1,
             ('N', 'trang'): 1,
             ('R', 'luôn'): 1,
             ('V', 'là'): 12,
             ('N', 'nhiệm_vụ'): 1,
             ('V', 'đặt'): 2,
             ('V', 'lên'): 1,
             ('N', 'hàng_đầu'): 1,
             ('N', 'tiếng_anh'): 1,
             ('P', 'tôi'): 13,
           

In [18]:
# tag count
def tag_count(all_tags):
  tag_count = defaultdict(int)
  for tag in all_tags:
    tag_count[tag] += 1
  return tag_count
tags = tag_count(train_tags)

In [19]:
tags

defaultdict(int,
            {'<s>': 49,
             'D': 18,
             'N': 187,
             'V': 138,
             'A': 51,
             'C': 15,
             'R': 43,
             'E': 57,
             'P': 39,
             'I': 2,
             'M': 12})

## only emission

In [20]:
states = sorted(tags.keys())

In [21]:
states

['<s>', 'A', 'C', 'D', 'E', 'I', 'M', 'N', 'P', 'R', 'V']

In [22]:
# predict with emission count
def predict_pos(tagged_words, emission, vocab, states):
  num_correct = 0
  valid_states = states.copy()
  valid_states.remove("<s>")
  for tagged_word in tagged_words:
    word, true_tag = tagged_word
    count_final = 0 
    count = 0
    tag_final = ''

    for tag in valid_states:
      if word not in vocab_dict:
        tag_final = valid_states[0]
        break
      if (tag, word) not in emission:
        continue
      count = emission[(tag, word)]

      if count > count_final:
        count_final = count
        tag_final = tag
    if tag_final == true_tag:
      num_correct += 1
  accuracy = num_correct / len(tagged_words)
  return accuracy


In [23]:
accuracy = predict_pos(train_tagged_words, emission, vocab_dict, states)
print('Độ chính xác trên tập train:', accuracy)

Độ chính xác trên tập train: 0.9750889679715302


In [24]:
accuracy = predict_pos(test_tagged_words, emission, vocab_dict, states)
print('Độ chính xác trên tập test:', accuracy)

Độ chính xác trên tập test: 0.45977011494252873


## hidden markov

In [25]:
def transition_matrix(alpha, transition, states, tags):
  len_tags = len(states)
  A = np.zeros((len_tags, len_tags))
  
  for i in range(len_tags):
    for j in range(len_tags):
      count = 0
      if (states[i], states[j]) in transition:
        count = transition[(states[i], states[j])]
      
      count_prev_tag = tags[states[i]]
      A[i, j] = (count + alpha) / (count_prev_tag + alpha * len_tags)
  return A
  

In [26]:
alpha = 0.01
A = transition_matrix(alpha, transition, states, tags)

In [27]:
def emission_matrix(alpha, emission, vocab_dict, states, tags):
  vocab = [v for v in vocab_dict]
  valid_states = states.copy()
  valid_states.remove("<s>")
  len_tags = len(valid_states)
  len_vocab = len(vocab)

  B = np.zeros((len_tags, len_vocab))
  
  for i in range(len_tags):
    for j in range(len_vocab):
      count = 0
      if (valid_states[i], vocab[j]) in emission:
        count = emission[(valid_states[i], vocab[j])]
      
      count_tag = tags[valid_states[i]]
      B[i,j] = (count + alpha) / (count_tag + alpha * len_vocab)
  return B

In [28]:
B = emission_matrix(alpha, emission,vocab_dict, states, tags)

In [29]:
A = np.array([sublist[1:].tolist() for sublist in A])

In [30]:
pd.DataFrame(A,
index = states,
columns = states[1:])

Unnamed: 0,A,C,D,E,I,M,N,P,R,V
<s>,0.020566,0.040929,0.081653,0.061291,0.000204,0.040929,0.448178,0.203828,0.061291,0.040929
A,0.058893,0.078458,0.019761,0.098024,0.000196,0.039327,0.137155,0.058893,0.058893,0.195852
C,0.000662,0.000662,0.066843,0.066843,0.066843,0.000662,0.331568,0.133024,0.000662,0.331568
D,0.000552,0.000552,0.000552,0.000552,0.000552,0.000552,0.994478,0.000552,0.000552,0.000552
E,0.035195,0.000175,0.087725,0.000175,0.000175,0.017685,0.560497,0.140256,0.000175,0.157766
I,0.004739,0.004739,0.004739,0.004739,0.004739,0.004739,0.004739,0.478673,0.004739,0.478673
M,0.000826,0.000826,0.000826,0.083402,0.000826,0.000826,0.909166,0.000826,0.000826,0.000826
N,0.08022,0.026776,0.005398,0.160387,5.3e-05,0.010742,0.240554,0.048153,0.064187,0.22452
P,0.076962,0.025825,0.000256,0.025825,0.000256,0.000256,0.025825,0.000256,0.307083,0.460496
R,0.232197,0.000232,0.000232,0.023428,0.000232,0.023428,0.046625,0.000232,0.116214,0.556947


In [31]:
def viterbi_initialize(states, tags, A, B, corpus, vocab_dict):
  valid_states = states.copy()
  valid_states.remove("<s>")
  len_tags = len(valid_states)

  best_probs = np.zeros((len_tags, len(corpus)))
  best_paths = np.zeros((len_tags, len(corpus)), dtype=int)

  for i in range(len_tags):
      index = vocab_dict[corpus[0]]
      best_probs[i,0] = math.log(A[0, i]) + math.log(B[i, index])
  return best_probs, best_paths

In [32]:
best_probs_train, best_paths_train = viterbi_initialize(states, train_tags, A, B, train_words, vocab_dict)
best_probs_test, best_paths_test = viterbi_initialize(states, tags, A, B, test_words, vocab_dict)

In [33]:
def viterbi_forward(A, B, corpus, best_probs, best_paths, vocabs_dict):
    num_tags = best_probs.shape[0]
    for i in range(1, len(corpus)): 
        for j in range(num_tags):
            best_prob_i = float('-inf')
            best_path_i = None
            for k in range(num_tags):
                index = vocabs_dict[corpus[i]]
                prob = best_probs[k, i - 1] + math.log(A[k, j]) + math.log(B[j, index])
                # prob = best_probs[k, i - 1] + math.log(A[k, j - 1]) + math.log(B[j - 1, index])

                if prob > best_prob_i:
                    best_prob_i = prob
                    best_path_i = k
                    
            best_probs[j, i] = best_prob_i
            best_paths[j, i] = best_path_i
            
    return best_probs, best_paths

In [34]:
best_probs_train, best_paths_train = viterbi_forward(A, B, train_words, best_probs_train, best_paths_train, vocab_dict)
best_probs_test, best_paths_test = viterbi_forward(A, B, test_words, best_probs_test, best_paths_test, vocab_dict)

In [35]:
def viterbi_backward(best_probs, best_paths, corpus, states):
    valid_states = states.copy()
    valid_states.remove("<s>")
    m = best_paths.shape[1] 
    z = [None] * m
    pred = [None] * m
    
    best_prob_for_last_word = float('-inf')
    num_tags = best_probs.shape[0]
    
    for k in range(num_tags):
        if best_probs[k, m - 1] > best_prob_for_last_word:
            best_prob_for_last_word = best_probs[k, m - 1]
            z[m - 1] = k
            
    pred[m - 1] = valid_states[z[m - 1]]
    for i in range(m - 1, -1, -1):
        z[i - 1] = best_paths[z[i], i]
        pred[i - 1] = valid_states[z[i - 1]]
    return pred

In [36]:
train_pred = viterbi_backward(best_probs_train, best_paths_train, train_words, states)
test_pred = viterbi_backward(best_probs_test, best_paths_test, test_words, states)

In [37]:
i = 0
for tag in test_tags:
  if tag == "<s>":
    print()
    continue
  else:
    print(test_words[i], end="/")
    print(test_pred[i], end=" ")
    i+=1


đà_lạt/N là/N thành_phố/N trực_thuộc/N tỉnh/N lâm_đồng/N nằm/N trên/E cao_nguyên/N <unk>/M thuộc/V vùng/V tây_nguyên/V việt_nam/V 
song_song/V là/V 2/A cửa_sổ/N 2/N người/N ngồi/N trong/E cửa_sổ/N song_song/N 
nó/P có/V đầy_đủ/A các/D nhân_vật/V được/V phát_triển/V chuẩn/A và/C 1/M bối_cảnh/V trò_chơi/V vô_cùng/R sống_động/R 
nhưng/C đó/P là/V chuyện/V đã/V qua/V 
nếu_như/V là/V trước_đây/V chắc/A tôi/P sẽ/R sợ/R không/R muốn/V ai/A nhìn/N thấy/N phần/N cơ_thể/N đó/N 
sau/N trận/N mưa_lũ/N lịch_sử/N đà_nẵng/N giá/N rau/N nhích/N lên/V chính_quyền/V cảnh_báo/V không/R lợi_dụng/V nâng/V giá/V 
gõ/A <unk>/D bằng/V 5/V <unk>/V sẽ/R giúp/V bạn/V có/V tốc_độ/V gõ/V nhanh/A hơn/A 
mỗi/D buổi/V chiều/A tôi/P thường/V dành/V thời_gian/N để/E tập/N gym/N hoặc/N đá/N bóng/N và/C tôi/P đang/R cố_gắng/V để/E duy_trì/N nó/N 
sứ_mệnh/N của/E <unk>/M là/V mang/V đến/A những/D trải_nghiệm/V thú_vị/A và/C độc_đáo/P cho/E người_dùng/N 
điều/N quan_trọng/N là/V bạn/V phải/V có/V mục_tiêu/V rõ_ràng/A và/C

In [38]:
import warnings
warnings.filterwarnings('ignore')

In [39]:
from sklearn.metrics import classification_report
def report(pred, gold):
    y_pred = pred
    y_true = [t for t in gold if t != "<s>"]
    
    print(classification_report(y_pred, y_true))
    return y_pred, y_true

In [40]:
print('Kết quả của HMM + Viterbi trên tập train:\n')
y_pred, y_true_train = report(train_pred, train_tags)

Kết quả của HMM + Viterbi trên tập train:

              precision    recall  f1-score   support

           A       0.94      0.83      0.88        58
           C       0.67      0.91      0.77        11
           D       0.50      0.90      0.64        10
           E       0.86      0.94      0.90        52
           I       0.00      0.00      0.00         0
           M       0.42      0.71      0.53         7
           N       0.82      0.72      0.77       213
           P       0.77      0.91      0.83        33
           R       0.86      0.97      0.91        38
           V       0.70      0.69      0.70       140

    accuracy                           0.78       562
   macro avg       0.65      0.76      0.69       562
weighted avg       0.79      0.78      0.78       562



In [41]:
print('Kết quả của HMM + Viterbi trên tập test:\n')
y_pred, y_true_test = report(test_pred, test_tags)

Kết quả của HMM + Viterbi trên tập test:

              precision    recall  f1-score   support

           A       0.47      0.50      0.48        16
           C       0.56      1.00      0.71         5
           D       1.00      0.75      0.86         4
           E       0.67      1.00      0.80         8
           M       0.40      0.40      0.40         5
           N       0.43      0.57      0.49        40
           P       0.62      0.73      0.67        11
           R       0.77      0.83      0.80        12
           V       0.67      0.44      0.53        73
           X       0.00      0.00      0.00         0

    accuracy                           0.57       174
   macro avg       0.56      0.62      0.57       174
weighted avg       0.60      0.57      0.57       174



# VNCoreNLP

In [42]:
import py_vncorenlp
model = py_vncorenlp.VnCoreNLP(annotators=["pos"], save_dir='./')

In [43]:
def predict(test_set, ifprint=True):
  pred = []
  true = []
  for sent in test_set:
    for w, t in sent:
      true.append(t)
      if "_" in w:
        w = w.replace("_", " ")
      model_pred = model.annotate_text(w)[0]
      for token in model_pred:
        if type(token) != str:
          word = token['wordForm']
          tag = token['posTag']
          if ifprint:
            print(word + '/' + tag, end=" ")
      if tag == "L":
        tag = "D"
      if tag == "T":
        tag = "X"
      pred.append(tag[0])
    if ifprint:
      print()
  return pred, true


In [44]:
vncore_pred, true = predict(train_set, ifprint=False)
print('Kết quả của VnCoreNLP trên tập train:\n')
y_pred, y_true_test = report(vncore_pred, true)

Kết quả của VnCoreNLP trên tập train:

              precision    recall  f1-score   support

           A       0.86      0.79      0.82        56
           C       0.87      0.76      0.81        17
           D       0.94      1.00      0.97        17
           E       0.95      0.90      0.92        60
           I       0.00      0.00      0.00         0
           M       0.92      0.92      0.92        12
           N       0.88      0.92      0.90       179
           P       0.87      0.94      0.91        36
           R       0.84      0.90      0.87        40
           V       0.86      0.83      0.85       143
           X       0.00      0.00      0.00         2

    accuracy                           0.88       562
   macro avg       0.73      0.72      0.72       562
weighted avg       0.88      0.88      0.87       562



In [45]:
vncore_pred, true = predict(test_set, ifprint=False)
print('Kết quả của VnCoreNLP trên tập test:\n')
y_pred, y_true_test = report(vncore_pred, true)

Kết quả của VnCoreNLP trên tập test:

              precision    recall  f1-score   support

           A       0.94      1.00      0.97        16
           C       0.89      1.00      0.94         8
           D       1.00      0.75      0.86         4
           E       1.00      1.00      1.00        12
           M       1.00      1.00      1.00         5
           N       0.91      0.91      0.91        53
           P       0.77      1.00      0.87        10
           R       1.00      1.00      1.00        13
           V       0.98      0.90      0.94        52
           X       0.00      0.00      0.00         1

    accuracy                           0.93       174
   macro avg       0.85      0.86      0.85       174
weighted avg       0.94      0.93      0.93       174



# Custom Test

In [46]:
sentence = ["kiến_thức","đó","rất", "phức_tạp"]

In [47]:
cidx  = ["kiến_thức"]
rvals = states[1:]
cols = [vocab_dict[word] for word in cidx]
rows = [rvals.index(tag) for tag in rvals]
pd.DataFrame(B[np.ix_(rows, cols)], index=rvals, columns=cidx)

Unnamed: 0,kiến_thức
A,3.6e-05
C,4.1e-05
D,4.1e-05
E,3.5e-05
I,4.4e-05
M,4.2e-05
N,0.004867
P,3.8e-05
R,3.7e-05
V,2.7e-05


In [48]:
best_probs_sent, best_paths_sent = viterbi_initialize(states, tags, A, B, sentence, vocab_dict)
pd.DataFrame(best_probs_sent,
index = states[1:],
columns = sentence)

Unnamed: 0,kiến_thức,đó,rất,phức_tạp
A,-14.1133,0.0,0.0,0.0
C,-13.285895,0.0,0.0,0.0
D,-12.60761,0.0,0.0,0.0
E,-13.04274,0.0,0.0,0.0
I,-18.533749,0.0,0.0,0.0
M,-13.273369,0.0,0.0,0.0
N,-6.127879,0.0,0.0,0.0
P,-11.775378,0.0,0.0,0.0
R,-12.992004,0.0,0.0,0.0
V,-13.698252,0.0,0.0,0.0


In [49]:
best_probs_sent, best_paths_sent = viterbi_forward(A, B, sentence, best_probs_sent, best_paths_sent, vocab_dict)

In [50]:
pd.DataFrame(best_probs_sent,
index = states[1:],
columns = sentence)

Unnamed: 0,kiến_thức,đó,rất,phức_tạp
A,-14.1133,-23.456268,-31.549028,-33.704598
C,-13.285895,-23.317048,-32.507088,-39.272487
D,-12.60761,-23.329419,-34.053264,-43.899979
E,-13.04274,-18.862577,-29.586422,-39.433137
I,-18.533749,-23.261596,-33.985442,-43.832157
M,-13.273369,-23.304521,-33.407863,-43.875082
N,-6.127879,-16.851724,-27.575569,-38.299415
P,-11.775378,-18.79686,-32.015122,-43.324406
R,-12.992004,-23.426962,-25.526092,-36.906613
V,-13.698252,-23.729404,-30.792973,-36.803867


In [51]:

pd.DataFrame(best_paths_sent,
index = states[1:],
columns = sentence)

Unnamed: 0,kiến_thức,đó,rất,phức_tạp
A,0,6,7,8
C,0,6,7,8
D,0,6,6,8
E,0,6,6,8
I,0,6,6,8
M,0,6,7,8
N,0,6,6,6
P,0,6,7,0
R,0,6,7,8
V,0,6,7,8


In [52]:
sent_pred = viterbi_backward(best_probs_sent, best_paths_sent, test_words, states)

In [53]:
sent_pred

['N', 'P', 'R', 'A']

In [54]:
for word, tag in zip(sentence,sent_pred):
  print(word + "/" + tag, end=" ")

kiến_thức/N đó/P rất/R phức_tạp/A 