In [1]:
import os
import io
from collections import defaultdict
from sklearn.model_selection import train_test_split


In [2]:
gold_corpus = io.open("./Data/gold.txt", encoding="utf-8").readlines()
train, test = train_test_split(gold_corpus, train_size=0.8, random_state=23, shuffle=True)

In [3]:
tagsetDict = {
    "N": 0,
    "V": 1,
    "A": 2,
    "P": 3,
    "M": 4,
    "D": 5,
    "R": 6,
    "E": 7,
    "C": 8,
    "I": 9,
    "X": 10
}

In [4]:
inverseTagsetDict = {tagsetDict[k]: k for k in tagsetDict}
wordBank = defaultdict()
bigramBank = defaultdict()
bigramDict = defaultdict(int)

In [5]:
for line in train:
    line_split = line.split()
    for i, w in enumerate(line_split):
        parts = w.split("/")
        if i >= 1:
            prevParts = line_split[i-1].split("/")
        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue

        word = parts[0]
        pos = parts[1]
        if i >= 1:
            prevWord = prevParts[0]
            prevPos = prevParts[1]
            bigramBank[word] = (pos, prevWord, prevPos)
            bigramDict[(pos, prevPos)] += 1

        if word not in wordBank:
            wordBank[word] = [pos]
        else:
            wordBank[word] += [pos]

In [6]:
bi_grams = []
tri_grams = []

for sentence in set(wordBank):
    temp = 0
    for s in list(sentence):
        if s == "_":
            temp += 1
    if temp == 1:
        bi_grams.append(sentence.replace("_", " "))
    elif temp == 2:
        tri_grams.append(sentence.replace("_", " "))

In [7]:
bigramFreq = {x: [0]*10 for x in tagsetDict}
mostCommonBigrams = defaultdict()
for k in bigramFreq:
    maxFreq = 0
    maxPos = "N"
    for i, x in enumerate(bigramFreq[k]):
        if x > maxFreq:
            maxFreq = x
            maxPos = inverseTagsetDict[i]
    mostCommonBigrams[k] = maxPos

In [8]:
def Viterbi_rule_based(word, wordIdx, lineSize, line):
    feat = [1]
    sentPercent = float(wordIdx)/float(lineSize)
    feat.append(sentPercent)

    if word[0].isupper() and wordIdx != 0:
        feat.append(1)
    else:
        feat.append(0)

    posIdx_array = ([0] * len(tagsetDict))
    posSet = []
    if word in wordBank:
        posSet = wordBank[word]
    else:
        if wordIdx == 0:
            posSet = list(tagsetDict.keys())[0]
            posIdx_array[tagsetDict[posSet]] = 1
            return feat + posIdx_array + [0]
        else:
            prevWord = line[wordIdx-1]
            if prevWord in wordBank:
                prevPos = wordBank[prevWord]
                maxPos = mostCommonBigrams[prevPos]
                if prevPos == "E":
                    maxPos = 3
                posIdx_array[tagsetDict[maxPos]] = 1
                feat += posIdx_array + [tagsetDict[maxPos]]
                return feat
            else:
                posSet = list(tagsetDict.keys())[0]
                posIdx_array[tagsetDict[posSet]] = 1
                feat += posIdx_array + [0]
                return feat

    for pos in posSet:
        posIdx = tagsetDict[pos]
        posIdx_array[posIdx] += 1.0 / len(wordBank[word])
    feat += (posIdx_array) + [0]
    return feat

In [9]:
y = []
X_train = []
for line in train:
    l_split = line.split()
    for i, w in enumerate(l_split):
        parts = w.split("/")
        word = parts[0]
        len_line = len(l_split)

        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue
        y.append(wordBank[word][0])
        X_train.append(Viterbi_rule_based(word, i, len_line, l_split))

In [10]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [11]:
train_fit = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, y)

In [12]:
import re
import unicodedata as ud

In [13]:
import string

In [14]:
X_test = []
correct_results = []
for line in test:
    l_split = line.split()
    for i,w in enumerate(l_split):
        len_line = len(l_split)
        parts = w.split("/")
        if len(parts) == 1 or parts[1] not in tagsetDict:
            continue
        word = parts[0]
        pos = parts[1]
        X_test.append(Viterbi_rule_based(word,i,len_line,l_split))
        correct_results.append(pos)

In [15]:
predicted_results = train_fit.predict(X_test)

In [16]:
from sklearn.metrics import classification_report
print('Results of the Markov hidden model combined with the Viterbi algorithm:\n')
print(classification_report(predicted_results, correct_results))

Results of the Markov hidden model combined with the Viterbi algorithm:

              precision    recall  f1-score   support

           A       0.29      1.00      0.45         5
           C       0.56      1.00      0.71         5
           D       1.00      1.00      1.00         3
           E       0.83      0.91      0.87        11
           M       0.40      1.00      0.57         2
           N       0.96      0.50      0.66       102
           P       0.92      1.00      0.96        12
           R       0.85      1.00      0.92        11
           V       0.44      0.91      0.59        23
           X       0.00      0.00      0.00         0

    accuracy                           0.69       174
   macro avg       0.63      0.83      0.67       174
weighted avg       0.84      0.69      0.70       174



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
def syllablize(sentence):
    word = '\w+'
    non_word = '[^\w\s]'
    digits = '\d+([\.,_]\d+)+'
    
    patterns = []
    patterns.extend([word, non_word, digits])
    patterns = f"({'|'.join(patterns)})"
    
    sentence = ud.normalize('NFC', sentence)
    tokens = re.findall(patterns, sentence, re.UNICODE)
    return [token[0] for token in tokens]

In [18]:
def longest_matching(sentence, bi_grams, tri_grams):
    syllables = syllablize(sentence)
    syl_len = len(syllables)
    
    curr_id = 0
    word_list = []
    done = False
    
    while (curr_id < syl_len) and (not done):
        curr_word = syllables[curr_id]
        if curr_id >= syl_len - 1:
            word_list.append(curr_word)
            done = True
        else:
            next_word = syllables[curr_id + 1]
            pair_word = ' '.join([curr_word.lower(), next_word.lower()])
            if curr_id >= (syl_len - 2):
                if pair_word in bi_grams:
                    word_list.append('_'.join([curr_word, next_word]))
                    curr_id += 2
                else:
                    word_list.append(curr_word)
                    curr_id += 1
            else:
                next_next_word = syllables[curr_id + 2]
                triple_word = ' '.join([pair_word, next_next_word.lower()])
                if triple_word in tri_grams:
                    word_list.append('_'.join([curr_word, next_word, next_next_word]))
                    curr_id += 3
                elif pair_word in bi_grams:
                    word_list.append('_'.join([curr_word, next_word]))
                    curr_id += 2
                else:
                    word_list.append(curr_word)
                    curr_id += 1
    return word_list

In [19]:
wl = ["vì","nó","rất","đặc_biệt","nên","tôi","đã","chú_ý"]

In [20]:
def toString(wl):
  wl=longest_matching(wl, bi_grams, tri_grams)
  X=[]
  A=[]
  text=""
  for i in set(wl):
    if i ==".":
      A="./."
    else:
      X.append(Viterbi_rule_based(i,1,1,1))
      A=str(train_fit.predict(X))
  #print(A)  
  for i in range(len(wl)):
    text+=wl[i]
    text+='/'
    text+=str(A[2])
    text+=' '
  return text

In [21]:
for i in wl:
    print(toString(i), end='')

vì/E nó/P rất/R đặc_biệt/A nên/C tôi/P đã/R chú_ý/V 