# Load Data

In [None]:
# Mount Google Drive at /content/drive so the dataset folder can be read (Colab-only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import io
from collections import defaultdict
import codecs

In [None]:
# List data
data_path =  "/content/drive/MyDrive/School Projects/CS221 - Xử lý ngôn ngữ tự nhiên/Data"
# os.listdir() returns entries in arbitrary order; sort once so that any
# positional access into file_name / file_path is reproducible across runs.
file_name = sorted(os.listdir(data_path))
# Keep only regular files (sub-directories are dropped), as full paths.
file_path = [
    full_path
    for full_path in (os.path.join(data_path, entry) for entry in file_name)
    if os.path.isfile(full_path)
]

In [None]:
# Display the dataset file paths that were found.
file_path

['/content/drive/MyDrive/School Projects/CS221 - Xử lý ngôn ngữ tự nhiên/Data/easy_clean.txt',
 '/content/drive/MyDrive/School Projects/CS221 - Xử lý ngôn ngữ tự nhiên/Data/hard_clean.txt']

In [None]:
# Select each file by its name rather than by list position: the directory
# listing has no guaranteed order, so positional indexing can silently bind
# the easy sentences to data_hard and the hard ones to data_easy.
paths_by_name = {os.path.basename(p): p for p in file_path}

# `with` closes each file handle as soon as its lines have been read.
with io.open(paths_by_name["easy_clean.txt"], encoding="utf-8") as easy_file:
    data_easy = easy_file.readlines()
with io.open(paths_by_name["hard_clean.txt"], encoding="utf-8") as hard_file:
    data_hard = hard_file.readlines()

In [None]:
# Build a new list instead of extending data_easy in place: `data` no longer
# aliases data_easy, and re-running this cell cannot append data_hard twice.
data = data_easy + data_hard

In [None]:
# Number of lines loaded from both files combined.
len(data)

62

In [None]:
# Preview a few tagged lines instead of dumping the whole corpus into the output.
data[:5]

[' những/D website/N được/V thiết_kế/V đẹp_mắt/A với/C bố_cục/N hài_hòa/A và/C tỉ_lệ/N cân_đối/A sẽ/R chiếm/V được/R thiện_cảm/N của/E người/N dùng/V\n',
 ' gõ/V bàn_phím/N bằng/E 5/M ngón_tay/N sẽ/R giúp/V bạn/P có/V tốc_độ/N gõ/V nhanh/A hơn/A\n',
 ' ý_nghĩa/N của/E tiết_kiệm/V không/R chỉ/R là/V tiền/N mà/C khi/N gặp/V biến_cố/N cũng/R có_thể/R bình_tĩnh/A\n',
 ' thành_phố/N washington/N có/V một/M kiến_trúc/N rất/R đa_dạng/A\n',
 ' cà_phê/N ở/E đây/P làm/V tôi/P phê/V tận/V nóc/N\n',
 ' song_song/A là/V hai/M cửa_sổ/N hai/M người/N ngồi/V trong/E cửa_sổ/N song_song/A\n',
 ' người/N dùng/V có_thể/R xem/V và/C chỉ/V rõ/A thông_tin/N nào/P được/V thu_thập/V để/E thực_thi/V một_số/D chế_định/N về/E quyền/N riêng_tư/A\n',
 ' nếu_như/C là/V trước_đây/N chắc/V tôi/P sẽ/R sợ/V không/R muốn/V ai/P nhìn/V thấy/V phần/N cơ_thể/N đó/P\n',
 ' sứ_mệnh/N của/E visual_designer/N là/V mang/V đến/V những/D trải_nghiệm/N thú_vị/A và/C độc_đáo/A cho/E người/N dùng/V\n',
 ' tự/P đặt/V mình/P vào/E vị_t

In [None]:
import random

# Number of shuffled lines that go to the training set; the rest form the test set.
TRAIN_SIZE = 49

# Shuffle a copy so `data` keeps its load order. Shuffling `data` in place would
# make this cell non-idempotent: a second run would reshuffle already-shuffled
# lines and produce a different split despite the fixed seed.
shuffled = list(data)
random.seed(1)
random.shuffle(shuffled)
train = shuffled[:TRAIN_SIZE]
test = shuffled[TRAIN_SIZE:]

In [None]:
print(len(train), len(test))

49 13


In [None]:
# Tag inventory: each accepted tag symbol mapped to its integer id (0..10),
# in the order the symbols are listed here.
tagsetDict = {
    tag: tag_id
    for tag_id, tag in enumerate("N V A P M D R E C I X".split())
}

# Build count dictionaries

In [None]:
# Three independent counters; a key that has never been seen reads as 0.
emission_counts, transition_counts, tag_counts = (defaultdict(int) for _ in range(3))
# Created without a default factory, so it behaves like a plain dict
# (looking up a missing key raises KeyError).
wordBank = defaultdict()

In [None]:
# Accumulate tag, transition and emission counts from the training lines.
# Each token is expected to look like "word/TAG". Tokens without a tag, or with
# a tag outside tagsetDict, are skipped at every position -- including the first
# token of a line -- so they can neither raise IndexError nor add stray tags.
for line in train:
    # Every line starts from the start-of-sentence marker.
    prevPos = "--s--"
    for w in line.split():
        parts = w.split("/")
        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue
        word = parts[0]
        pos = parts[1]

        # Count the start marker once per line, when its first valid token is
        # reached, so tag_counts['--s--'] matches the transitions leaving it.
        if prevPos == "--s--":
            tag_counts['--s--'] += 1

        transition_counts[(prevPos, pos)] += 1
        emission_counts[(pos, word)] += 1
        tag_counts[pos] += 1
        if word not in wordBank:
            wordBank[word] = [pos]
        else:
            wordBank[word].append(pos)

        # The transition context is always the last *valid* tag seen on this line.
        prevPos = pos

In [None]:
# Show the first five (key, count) pairs of the transition table.
print("Transition examples: ")
for tag_pair in list(transition_counts)[:5]:
    print((tag_pair, transition_counts[tag_pair]))

Transition examples: 
(('--s--', 'N'), 21)
(('N', 'E'), 31)
(('E', 'N'), 32)
(('N', 'N'), 45)
(('N', 'R'), 14)


In [None]:
# Show the first five (key, count) pairs of the emission table.
print("Emission examples: ")
for tag_word in list(emission_counts)[:5]:
    print((tag_word, emission_counts[tag_word]))

Emission examples: 
(('N', 'internet'), 1)
(('E', 'ở'), 4)
(('N', 'kí_túc_xá'), 1)
(('N', 'khu'), 1)
(('N', 'b'), 1)


In [None]:
# Alphabetically ordered list of every tag counted so far.
states = sorted(tag_counts)

In [None]:
# Display the sorted tag list.
states

['--s--', 'A', 'C', 'D', 'E', 'I', 'M', 'N', 'P', 'R', 'V', 'X']

# Predict with only emission count

In [None]:
def predict_emission(data, emission_counts, states, tagset=None):
    """Tag each word with its most frequent training tag and return the accuracy.

    Args:
        data: iterable of lines made of whitespace-separated "word/TAG" tokens.
        emission_counts: mapping (tag, word) -> count.
        states: candidate tags, tried in order; on a count tie the earliest
            candidate wins.
        tagset: container of accepted tags. Defaults to the notebook-level
            ``tagsetDict`` when omitted.

    Returns:
        Fraction of scored tokens whose predicted tag equals the gold tag.
        Tokens without a tag, or with a tag outside ``tagset``, are not scored.
        A word with no entry in ``emission_counts`` is scored as wrong.
        Returns 0.0 when no token was scored (instead of dividing by zero).
    """
    if tagset is None:
        tagset = tagsetDict

    num_correct = 0
    num_scored = 0
    for line in data:
        for token in line.split():
            parts = token.split("/")
            if len(parts) == 1 or parts[1] not in tagset:
                continue
            word, correct_pos = parts[0], parts[1]
            num_scored += 1

            # Pick the tag with the strictly highest count for this word.
            # A word that was never counted keeps '' and so never matches.
            best_count = 0
            best_pos = ''
            for pos in states:
                # .get() avoids inserting keys when a defaultdict is passed in.
                count = emission_counts.get((pos, word), 0)
                if count > best_count:
                    best_count = count
                    best_pos = pos

            if best_pos == correct_pos:
                num_correct += 1

    if num_scored == 0:
        return 0.0
    return num_correct / num_scored

In [None]:
# Accuracy of the emission-only baseline on the lines it was counted from.
predict_emission(train, emission_counts,states)

0.9704433497536946

In [None]:
# Accuracy of the emission-only baseline on the held-out lines; words that are
# not in wordBank are counted as wrong by predict_emission.
predict_emission(test, emission_counts,states)

0.39285714285714285

# Hidden Markov Model

In [None]:
import numpy as np
import pandas as pd

In [None]:
def create_transition_matrix(alpha, tag_counts, transition_counts):
    """Build the add-alpha smoothed tag-to-tag transition matrix.

    Args:
        alpha: smoothing constant added to every count.
        tag_counts: mapping tag -> number of occurrences.
        transition_counts: mapping (previous_tag, tag) -> count.

    Returns:
        Square numpy array whose rows and columns follow the sorted tags of
        ``tag_counts``; entry [i, j] is
        (count(i, j) + alpha) / (count(i) + alpha * number_of_tags).
    """
    ordered_tags = sorted(tag_counts.keys())
    n_tags = len(ordered_tags)
    matrix = np.zeros((n_tags, n_tags))

    for row, prev_tag in enumerate(ordered_tags):
        # The denominator only depends on the previous tag.
        denominator = tag_counts[prev_tag] + alpha * n_tags
        for col, next_tag in enumerate(ordered_tags):
            pair = (prev_tag, next_tag)
            # Membership test first so a defaultdict is not grown by the lookup.
            pair_count = transition_counts[pair] if pair in transition_counts else 0
            matrix[row, col] = (pair_count + alpha) / denominator
    return matrix

In [None]:
def create_emission_matrix(alpha, tag_counts, emission_counts, vocabs):
    """Build the add-alpha smoothed tag-to-word emission matrix.

    Args:
        alpha: smoothing constant added to every count.
        tag_counts: mapping tag -> number of occurrences.
        emission_counts: mapping (tag, word) -> count.
        vocabs: sized iterable of words; iteration order fixes the columns.

    Returns:
        numpy array of shape (number_of_tags, number_of_words); rows follow the
        sorted tags and entry [i, j] is
        (count(tag_i, word_j) + alpha) / (count(tag_i) + alpha * number_of_words).
    """
    ordered_tags = sorted(tag_counts.keys())
    n_tags = len(tag_counts)
    n_words = len(vocabs)
    matrix = np.zeros((n_tags, n_words))

    for row in range(n_tags):
        tag = ordered_tags[row]
        # The denominator only depends on the tag.
        denominator = tag_counts[tag] + alpha * n_words
        for col, word in enumerate(vocabs):
            pair = (tag, word)
            # Membership test first so a defaultdict is not grown by the lookup.
            pair_count = emission_counts[pair] if pair in emission_counts else 0
            matrix[row, col] = (pair_count + alpha) / denominator
    return matrix

In [None]:
# Smoothing constant shared by both matrices.
alpha = 0.001

# Before each build, drop any integer keys 0..len(states)-1 from tag_counts.
# The tags themselves are strings, so these pops remove nothing unless such
# integer keys were inserted into the defaultdict beforehand.
for stale_key in range(len(states)):
    tag_counts.pop(stale_key, None)
A = create_transition_matrix(alpha, tag_counts, transition_counts)

for stale_key in range(len(states)):
    tag_counts.pop(stale_key, None)
B = create_emission_matrix(alpha, tag_counts, emission_counts, wordBank)

# Viterbi decoding

In [None]:
import math

In [None]:
# Drop the first column of A and the first row of B (the '--s--' entries, since
# '--s--' sorts first among the tags). The shape guards make the cell idempotent:
# without them every re-run would strip one more column/row from the matrices.
num_state_rows = A.shape[0]
if A.shape[1] == num_state_rows:
    A = A[:, 1:].copy()
if B.shape[0] == num_state_rows:
    B = B[1:]

In [None]:
def viterbi_initialize(states, tag_counts, A, B, corpus, vocabs_dict):
    """Allocate the Viterbi tables and fill the column for the first word.

    Args:
        states: list of tags; must contain '--s--'.
        tag_counts: mapping whose length gives the number of table rows.
        A: transition matrix, read as A[start_row, i - 1] for table row i.
        B: emission matrix, read as B[i - 1, word_column] for table row i.
        corpus: sequence of words to decode.
        vocabs_dict: mapping whose key order defines the columns of B.

    Returns:
        (best_probs, best_paths): float and int arrays of shape
        (len(tag_counts), len(corpus)). Column 0 of best_probs holds
        log(transition) + log(emission), or -inf where either factor is zero.
        Both arrays are returned empty-width for an empty corpus.

    NOTE(review): the ``i - 1`` offset means row 0 reads index -1, i.e. the last
    column of A / last row of B, so row 0 mirrors the last tag's score. This
    matches matrices whose first column/row has been removed -- confirm intended.
    """
    num_tags = len(tag_counts)
    s_idx = states.index('--s--')
    best_probs = np.zeros((num_tags, len(corpus)))
    best_paths = np.zeros((num_tags, len(corpus)), dtype=int)

    # Nothing to initialise for an empty corpus (corpus[0] would raise).
    if len(corpus) == 0:
        return best_probs, best_paths

    # Resolve the first word's column once, not once per tag. A word outside
    # the vocabulary has no column; it is scored on the transition alone
    # rather than raising ValueError.
    first_word = corpus[0]
    if first_word in vocabs_dict:
        word_col = list(vocabs_dict.keys()).index(first_word)
    else:
        word_col = None

    for i in range(num_tags):
        transition = A[s_idx, i - 1]
        emission = 1.0 if word_col is None else B[i - 1, word_col]
        if transition == 0 or emission == 0:
            # log(0) is undefined; mark the state as unreachable.
            best_probs[i, 0] = float('-inf')
        else:
            best_probs[i, 0] = math.log(transition) + math.log(emission)
    return best_probs, best_paths

In [None]:
def viterbi_forward(A, B, corpus, best_probs, best_paths, vocabs_dict):
    """Fill columns 1..len(corpus)-1 of the Viterbi tables in place.

    Args:
        A: transition matrix, read as A[k, j - 1] (previous row k, current row j).
        B: emission matrix, read as B[j - 1, word_column].
        corpus: sequence of words being decoded.
        best_probs: float table with column 0 already initialised.
        best_paths: int table of back-pointers, same shape as best_probs.
        vocabs_dict: iterable of words whose order defines the columns of B.

    Returns:
        The same (best_probs, best_paths) arrays after they have been updated.

    A word that is not in ``vocabs_dict`` contributes no emission term, so the
    step is decided by the transitions alone. Scoring it as -inf would wipe out
    every path and make all later columns -inf as well.

    NOTE(review): the ``j - 1`` offset means row 0 reads index -1, i.e. the last
    column of A / last row of B -- confirm this is intended.
    """
    num_tags = best_probs.shape[0]

    # Map each word to its column once; rebuilding a list and calling .index()
    # inside the loops costs a full vocabulary scan per (word, tag, tag) triple.
    word_to_col = {}
    for col, vocab_word in enumerate(vocabs_dict):
        word_to_col.setdefault(vocab_word, col)

    for i in range(1, len(corpus)):
        word_col = word_to_col.get(corpus[i])
        for j in range(num_tags):
            # The emission term is the same for every previous state k.
            log_emission = 0.0 if word_col is None else math.log(B[j - 1, word_col])

            best_prob_i = float('-inf')
            best_path_i = None
            for k in range(num_tags):
                prob = best_probs[k, i - 1] + math.log(A[k, j - 1]) + log_emission
                # '>=' keeps the highest-index predecessor on ties.
                if prob >= best_prob_i:
                    best_prob_i = prob
                    best_path_i = k

            best_probs[j, i] = best_prob_i
            best_paths[j, i] = best_path_i

    return best_probs, best_paths

In [None]:
def viterbi_backward(best_probs, best_paths, corpus, states):
    """Trace the best tag sequence back through the Viterbi tables.

    Args:
        best_probs: table of best log-scores, one row per state, one column
            per word.
        best_paths: table of back-pointers with the same shape.
        corpus: the decoded words (not read here; kept for the call signature).
        states: list mapping a row index to its tag.

    Returns:
        List with one predicted tag per column of ``best_paths``; an empty list
        when the tables have no columns.
    """
    m = best_paths.shape[1]
    if m == 0:
        return []

    z = [None] * m
    pred = [None] * m

    best_prob_for_last_word = float('-inf')
    num_tags = best_probs.shape[0]

    # Best final state; '>=' keeps the highest-index state on ties.
    for k in range(num_tags):
        if best_probs[k, m - 1] >= best_prob_for_last_word:
            best_prob_for_last_word = best_probs[k, m - 1]
            z[m - 1] = k

    pred[m - 1] = states[z[m - 1]]

    # Follow back-pointers from the last column down to column 1. The loop must
    # stop before i == 0: at i == 0 the write to index i - 1 == -1 would wrap
    # around and overwrite the prediction for the last word.
    for i in range(m - 1, 0, -1):
        z[i - 1] = best_paths[z[i], i]
        pred[i - 1] = states[z[i - 1]]
    return pred

In [None]:
# Vocabulary as a list, in wordBank's key order.
words = list(wordBank)

In [None]:
# Flatten the training lines into one word sequence: for every
# whitespace-separated token keep the text before its first "/".
train_words = [
    token.split("/")[0]
    for line in train
    for token in line.split()
]

In [None]:
# Initialise the tables for the training words, then run the forward pass.
train_tables = viterbi_initialize(states, tag_counts, A, B, train_words, wordBank)
best_probs_train, best_paths_train = viterbi_forward(
    A, B, train_words, *train_tables, wordBank
)

In [None]:
train_pred = viterbi_backward(best_probs_train, best_paths_train, train_words, states)
# Print the decoded words themselves: train_pred is aligned with train_words,
# not with the de-duplicated vocabulary list `words`.
print(train_words[0:10])
print(train_pred[0:10])

['internet', 'ở', 'kí_túc_xá', 'khu', 'b', 'rất', 'yếu', 'vừa', 'ngồi', 'vào']
['N', 'E', 'N', 'N', 'N', 'R', 'A', 'R', 'V', 'E']


In [None]:
# Flatten the test lines into one word sequence: for every
# whitespace-separated token keep the text before its first "/".
test_words = [
    token.split("/")[0]
    for line in test
    for token in line.split()
]
        

In [None]:
# Initialise the tables for the test words, then run the forward pass.
test_tables = viterbi_initialize(states, tag_counts, A, B, test_words, wordBank)
best_probs_test, best_paths_test = viterbi_forward(
    A, B, test_words, *test_tables, wordBank
)

In [None]:
# Decode the test words and print them followed by their predicted tags.
test_pred = viterbi_backward(best_probs_test, best_paths_test, test_words, states)
print(test_words, test_pred, sep="\n")

['nó', 'có', 'đầy_đủ', 'các', 'nhân_vật', 'được', 'phát_triển', 'chuẩn', 'và', 'một', 'bối_cảnh', 'trò_chơi', 'vô_cùng', 'sống_động', 'đám_cưới', 'của', 'cặp_đôi', 'tổ_chức', 'ấm_cúng', 'tại', 'nhà_riêng', 'ở', 'hà_nội', 'vào', 'chiều', 'nay', 'số_lượng', 'share', 'chính', 'là', 'đích_đến', 'tất_cả', 'thương_hiệu', 'công_việc', 'tôi', 'không', 'hẳn', 'dễ', 'nhưng', 'dường_như', 'đang', 'suy_nghĩ', 'nhiều', 'về', 'họ', 'đều', 'đồng_loạt', 'cho', 'rằng', 'đây', 'sự_kiện', 'đáng', 'chú_ý', 'nhất', 'bóng_đá', 'đông_nam_á', 'những', 'con_sâu', 'ẩn_mình', 'sâu', 'trong', 'vòm', 'cây', 'cơn_gió', 'căng_tràng', 'vi_vu', 'khắp', 'núi_rừng', 'đà_lạt', 'khiến', 'quả_hồng', 'uống', 'no_nê', 'giọt_sương', 'mờ', 'kiến', 'bò', 'lúc_nhúc', 'xung_quanh', 'đĩa', 'thịt', 'lúc_lắc', 'trận', 'banh', 'tối', 'diễn_ra', 'sân', 'thể_dục', 'thể_thao', 'tiếng_anh', 'rất', 'kém', 'vì_vậy', 'cần', 'cải_thiện', 'hơn', 'đó', 'thói_quen', 'chuyên_gia', 'công_trình', 'xanh', 'sẽ', 'tiết_kiệm', 'chủ', 'đầu_tư', 'vừa', 