In [1]:
import re
import numpy as np
import pandas as pd
from keras.models import load_model
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# Maps (base letter, tone-mark code) -> accented Vietnamese letter.
#
# Code layout, as encoded by the entries below:
#   1 = circumflex (â ê ô), 2 = horn (ơ ư), 3 = breve (ă), 4 = stroke (đ)
#   5..24 = a tone on top of a letter shape: code = 4 * tone + shape + 1, with
#           tone  1 acute, 2 grave, 3 hook above, 4 tilde, 5 dot below
#           shape 0 plain, 1 circumflex, 2 horn, 3 breve
# Any (letter, code) pair missing from the table has no accented form.

# Letters that carry only a shape modifier and no tone.
vietnamese_char_dict = {
    ('a', 1): 'â', ('e', 1): 'ê', ('o', 1): 'ô',
    ('o', 2): 'ơ', ('u', 2): 'ư',
    ('a', 3): 'ă',
    ('d', 4): 'đ',
}

# Toned letters: each row gives the base letter, the code of its acute form and
# the five forms in tone order (acute, grave, hook, tilde, dot); consecutive
# tones are 4 codes apart.
vietnamese_char_dict.update(
    ((letter, acute_code + 4 * tone_step), form)
    for letter, acute_code, forms in (
        ('a', 5, 'áàảãạ'),
        ('a', 6, 'ấầẩẫậ'),
        ('a', 8, 'ắằẳẵặ'),
        ('e', 5, 'éèẻẽẹ'),
        ('e', 6, 'ếềểễệ'),
        ('i', 5, 'íìỉĩị'),
        ('o', 5, 'óòỏõọ'),
        ('o', 6, 'ốồổỗộ'),
        ('o', 7, 'ớờởỡợ'),
        ('u', 5, 'úùủũụ'),
        ('u', 7, 'ứừửữự'),
        ('y', 5, 'ýỳỷỹỵ'),
    )
    for tone_step, form in enumerate(forms)
)

# Input vocabulary of the model: the 26 lower-case English letters get indices
# 0-25 in alphabetical order and the blank space gets index 26.
alphabet_dict = {
    symbol: index
    for index, symbol in enumerate('abcdefghijklmnopqrstuvwxyz ')
}

# Inverse lookup: integer index -> character.
reverse_alphabet_dict = dict(
    (index, symbol) for symbol, index in alphabet_dict.items()
)

# Number of distinct input symbols (width of the one-hot encoding).
alphabet_size = len(alphabet_dict)

In [3]:
def represent(line_int_repr, tone_int_repr):
    """
    Rebuild a Vietnamese string from integer-encoded characters and tone codes.

    Args:
        line_int_repr: a list of int, each an index into reverse_alphabet_dict
            (english alphabet or blank space)
        tone_int_repr: a list of int, the tone-mark code predicted for each character
    Returns:
        str, a vietnamese unicode string; a character is kept unchanged when
        vietnamese_char_dict has no entry for its (character, tone) pair
    """
    pieces = []
    for char_index, tone in zip(line_int_repr, tone_int_repr):
        plain_char = reverse_alphabet_dict[char_index]
        # Fall back to the plain character when the pair has no accented form.
        pieces.append(vietnamese_char_dict.get((plain_char, tone), plain_char))
    return ''.join(pieces)

In [4]:
# Load the trained Keras model from 'model.h5' (path relative to the working directory).
model = load_model('model.h5')

In [5]:
# Read the test set; later cells use its 'id' and 'sentence' columns.
df = pd.read_csv('test.csv')

In [6]:
def predict_tone_mark(lowercase_sentence, model):
    """
    Predict tone marks for a tone-less sentence and return the accented string.

    Args:
        lowercase_sentence: a string consists of only characters from alphabet_dict (english alphabet + blank space)
        model: trained keras model, input shape (None, None, alphabet_size), output shape (None, None, tone_size)
    Returns:
        str, a vietnamese unicode string with predicted tone marks
        (the empty string for empty input)
    Raises:
        KeyError: if the sentence contains a character that is not in alphabet_dict
    """
    # Nothing to predict; also avoids feeding a zero-length sequence to the model.
    if not lowercase_sentence:
        return ''

    X_int = [alphabet_dict[char] for char in lowercase_sentence]
    # Batch of one sentence: shape (1, len(sentence), alphabet_size).
    X = np.array([to_categorical(X_int, num_classes=alphabet_size)])
    # Select the single batch element explicitly. squeeze() would also drop the
    # time axis for a one-character sentence, turning the argmax below into a
    # scalar that represent() cannot iterate over.
    y = model.predict(X)[0]
    tone_int = np.argmax(y, -1)
    return represent(X_int, tone_int)

In [7]:
# The helpers below are adapted from https://github.com/aivivn/vietnamese_tone_prediction_utils/blob/master/utils.py; `my_simplify` is a local variant of `simplify`, and `process_line` calls it instead of `simplify`.

def remove_tone_line(utf8_str):
    """
    Strip Vietnamese diacritics from a string.

    Every accented Vietnamese letter (lower or upper case) is replaced by its
    plain base letter; all other characters are returned unchanged.
    """
    accented_lower = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"
    accented_upper = "ẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ"

    # Base letters, repeated once per accented variant, in the same order as above.
    plain_lower = "".join(["a" * 17, "o" * 17, "e" * 11, "u" * 11, "i" * 5, "y" * 5, "d"])
    plain_upper = "".join(["A" * 17, "O" * 17, "E" * 11, "U" * 11, "I" * 5, "Y" * 5, "D"])

    # Character-for-character translation table.
    pairs = zip(accented_lower + accented_upper, plain_lower + plain_upper)
    table = str.maketrans(dict(pairs))
    return utf8_str.translate(table)


def normalize_tone_line(utf8_str):
    """
    Rewrite accented Vietnamese letters as base letter + digit codes.

    Each accented letter is replaced by its plain letter followed by digits:
    6 = circumflex, 7 = horn, 8 = breve, 9 = stroke (đ), and a trailing 1-5
    for the tone in the order the letters are listed in ``intab_l``.
    Characters that are not accented Vietnamese letters are left unchanged.

    Args:
        utf8_str: the string to convert
    Returns:
        str, the converted string (e.g. 'ố' -> 'o61', 'Đ' -> 'D9')
    """
    # NOTE(review): the 'e' group lists grave (è) before acute (é), unlike every
    # other vowel group, so è -> e1 and é -> e2. Left as-is because labels
    # produced elsewhere may rely on it -- confirm against the reference labels.
    intab_l = "áàảãạâấầẩẫậăắằẳẵặđèéẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵ"
    intab_u = "ÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶĐÈÉẺẼẸÊẾỀỂỄỆÍÌỈĨỊÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴ"
    intab = intab_l + intab_u

    outtab_l = [
        "a1", "a2", "a3", "a4", "a5",
        "a6", "a61", "a62", "a63", "a64", "a65",
        "a8", "a81", "a82", "a83", "a84", "a85",
        "d9",
        "e1", "e2", "e3", "e4", "e5",
        "e6", "e61", "e62", "e63", "e64", "e65",
        "i1", "i2", "i3", "i4", "i5",
        "o1", "o2", "o3", "o4", "o5",
        # Fixed: 'ố' used to map to "a61", which changed the base letter.
        "o6", "o61", "o62", "o63", "o64", "o65",
        "o7", "o71", "o72", "o73", "o74", "o75",
        "u1", "u2", "u3", "u4", "u5",
        "u7", "u71", "u72", "u73", "u74", "u75",
        "y1", "y2", "y3", "y4", "y5",
    ]

    # The upper-case codes mirror the lower-case ones exactly; deriving them
    # keeps the two tables from drifting apart.
    outtab_u = [code.upper() for code in outtab_l]

    r = re.compile("|".join(intab))
    replaces_dict = dict(zip(intab, outtab_l + outtab_u))

    return r.sub(lambda m: replaces_dict[m.group(0)], utf8_str)


def simplify(word):
    """
    Reduce a digit-coded (VNI-style) word to its digits only.

    * a purely alphabetic word yields '0'
    * digits 6-9 are kept in their original order
    * the single tone digit (1-5) is moved to the end
    * a word holding more than one tone digit yields '#' (caller should ignore it)
    """
    if word.isalpha():
        return '0'

    kept_digits = []
    tone_digit = ''
    for symbol in word:
        if symbol in '12345':
            if tone_digit:
                # A second tone digit makes the word ambiguous.
                return '#'
            tone_digit = symbol
        elif symbol in '6789':
            kept_digits.append(symbol)
    return ''.join(kept_digits) + tone_digit


def my_simplify(word):
    """
    Reduce a digit-coded (VNI-style) word to its digits only.

    * a purely alphabetic word yields '0'
    * digits 6-9 are kept in their original order
    * a tone digit (1-5) is moved to the end; when the word holds several,
      only the last one is kept
    """
    if word.isalpha():
        return '0'

    kept_digits = ''.join(symbol for symbol in word if '6' <= symbol <= '9')
    tone_digits = [symbol for symbol in word if '1' <= symbol <= '5']
    last_tone = tone_digits[-1] if tone_digits else ''
    return kept_digits + last_tone


def process_line(line):
    """
    Split a Vietnamese line into tone-less words and their digit labels.

    The line is converted twice -- once with diacritics stripped and once with
    diacritics rewritten as digit codes -- then both versions are split into
    words on every non-alphanumeric character. Words whose tone-less form is
    not purely alphabetic are dropped.

    :param line: a Vietnamese unicode string (trailing/leading newlines are ignored)
    :return: (filtered_no_tone_words, simplified_words), two lists of equal
        length: the kept tone-less words and the label my_simplify() gives each
    :raises AssertionError: if the two versions split into different word counts
    """
    utf8_line = line.strip('\n')

    no_tone_line = remove_tone_line(utf8_line)
    normalized_line = normalize_tone_line(utf8_line)

    # Split on the text itself rather than on repr(text): repr() turns control
    # characters into escape sequences (a tab becomes the two characters "\t"),
    # which previously leaked spurious words such as "t" into the output.
    non_alphanumeric = re.compile(r'[^a-zA-Z\d]')
    no_tone_words = non_alphanumeric.sub(' ', no_tone_line).split()
    normalized_words = non_alphanumeric.sub(' ', normalized_line).split()
    assert len(no_tone_words) == len(normalized_words)

    filtered_no_tone_words = []
    simplified_words = []
    for word, normalized_word in zip(no_tone_words, normalized_words):
        # Skip tokens that contain digits in the original text.
        if not word.isalpha():
            continue
        filtered_no_tone_words.append(word)
        simplified_words.append(my_simplify(normalized_word))

    return filtered_no_tone_words, simplified_words

In [None]:
def get_label(lowercase_sentence, model):
    """
    Predict the submission labels for every word of a tone-less sentence.

    Args:
        lowercase_sentence: a string consists of only characters from alphabet_dict (english alphabet + blank space)
        model: trained keras model, input shape (None, None, alphabet_size), output shape (None, None, tone_size)
    Returns:
        list of str, one label for each word (for submission)
    """
    # process_line returns (words, labels); only the labels are needed here.
    return process_line(predict_tone_mark(lowercase_sentence, model))[1]

In [81]:
# Spot-check a single test sentence (row label 1012), lower-cased like the model input.
x = df.loc[1012, 'sentence'].lower()
x

'tai xe dung xe phan doi tram thu bot can tho phung hiep trong may ngay qua dong thoi cac don vi chuc nang cu luc luong tuc truc dam bao an ninh trat tu tai khu vuc tranh de doi tuong xau loi dung xuyen tac chu truong cua dang chinh sach phap luat cua nha nuoc'

In [82]:
# Show the model's tone-marked version of the sample sentence `x`.
predict_tone_mark(x, model)

'tài xế dừng xe phản đối trạm thu bot cần thơ phụng hiệp trong mấy ngày qua đồng thời các đơn vị chức năng cử lực lượng túc trục đảm bảo an ninh trật tự tại khu vực tranh để đối tượng xấu lợi dụng xuyên tạc chủ trương của đảng chính sách pháp luật của nhà nước'

In [85]:
# Build one submission row per predicted word: the word id is the sentence id
# followed by the zero-padded (3 digits) position of the word in the sentence.
row_list = []
for idx, sentence_id, sentence in df[['id', 'sentence']].itertuples(name=None):
    labels = get_label(sentence.lower(), model)
    row_list.extend(
        {'id': '{}{:03}'.format(sentence_id, position), 'label': label}
        for position, label in enumerate(labels)
    )
    # Progress report every 100 rows.
    if idx % 100 == 0:
        print(idx)
submission = pd.DataFrame(row_list)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200


In [86]:
# Write the submission to 'submission.csv' without the DataFrame index column.
submission.to_csv("submission.csv", index=False)