In [3]:
%pip install --quiet pyvi

In [4]:
import pandas as pd
import numpy as np 
from pyvi import ViTokenizer
from tqdm.contrib.concurrent import thread_map

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
DATAPATH = "/Users/nampham/OneDrive - Đại học FPT- FPT University/Intern/Menu/label data/labels.xlsx"
df = pd.read_excel(DATAPATH)
df.head()

Unnamed: 0,ImageName,VietnameseName,EnglishName,Price
0,001.jpeg,COMBO 1,COMBO 1,169000
1,001.jpeg,COMBO 2,COMBO 2,169000
2,001.jpeg,COMBO 3,COMBO 3,169000
3,001.jpeg,RƯỢU SOJU,SOJU,NOT GIVEN
4,001.jpeg,RƯỢU VODKA,VODKA,NOT GIVEN


In [6]:
food_names = df['VietnameseName'].values
print(food_names)

['COMBO 1' 'COMBO 2' 'COMBO 3' ... 'CÁ SƠN NƯỚNG' 'CÁ CHỈ VÀNG'
 'CÁ LAO NƯỚNG ']


In [13]:
def clean_food_name(food_name):
    #Remove quantity for example 500ML 1L 20KG ....
    food_name = food_name.lower()
    food_name = re.sub(r'\d+\w+', ' ', food_name)
    #Remove purification and digits
    food_name = re.sub(r"[^\w\s]|\d", ' ', food_name)
    #Remove size entites
    token = r"\s+x{0,2}[xlms][\.:-]?\b|(nhỏ|vừa|lớn|bự|to|small|medium|big|large):?"
    food_name = re.sub(token, " ", food_name)
    #Remove specail token
    food_name = re.sub(r"_x", ' ', food_name)
    
    food_name = food_name.strip().split()
    return food_name
    

test = clean_food_name('BELGIAN DARK 8.1%/IBU 32 -    X 500ML (PHỞ ĐÙI + TRỨNG NON) X L M nhỏ vừa_x hải sản xuân')
test

['belgian', 'dark', 'ibu', 'phở', 'đùi', 'trứng', 'non', 'hải', 'sản', 'xuân']

In [14]:
def get_big_gram(text, n=2, m=2):
    words = clean_food_name(text)
    big_grams = []
    
    for k in range(n,m+1):
        for i in range(len(words)):
            big_gram = ''
            if i + k > len(words):
                continue
            
            for j in range(k):
                big_gram += words[i+j] + ' '
                
            big_grams.append(big_gram.strip())
            
    return big_grams

print(get_big_gram('BELGIAN DARK 8.1%/IBU 32 - 500ML (PHỞ ĐÙI + TRỨNG NON) hải sản'))

['belgian dark', 'dark ibu', 'ibu phở', 'phở đùi', 'đùi trứng', 'trứng non', 'non hải', 'hải sản']


In [15]:
food_vocabulary = thread_map(clean_food_name, food_names, max_workers=6)
food_vocabulary_big_grams = thread_map(get_big_gram, food_names, max_workers=6)

100%|██████████| 15211/15211 [00:00<00:00, 848872.48it/s]
100%|██████████| 15211/15211 [00:00<00:00, 822742.38it/s]


In [16]:
food_vocabulary = [item for sublist in food_vocabulary for item in sublist]
print(food_vocabulary[:10])

['combo', 'combo', 'combo', 'rượu', 'soju', 'rượu', 'vodka', 'tiger', 'lon', 'tiger']


In [17]:
food_vocabulary_big_grams = [item for sublist in food_vocabulary_big_grams for item in sublist]
print(food_vocabulary_big_grams[:20])

['rượu soju', 'rượu vodka', 'tiger lon', 'tiger chai', 'tiger bạc', 'bạc chai', 'tiger bạc', 'bạc lon', 'bò húc', 'bia quy', 'quy nhơn', 'bia bivina', 'strong bow', 'rau muống', 'muống xào', 'xào tỏi', 'mồng tơi', 'tơi xào', 'xào tỏi', 'cải xào']


In [12]:
#Save corpus
with open('food_vocabulary_tokenize.txt', 'w', encoding='utf-8') as f:
    for food, count in corpus.most_common():
        save_format = f"{food}${count}\n"
        f.write(save_format)

In [18]:
from collections import Counter

corpus = Counter(food_vocabulary)
corpus.most_common(10)

[('cá', 1513),
 ('trà', 1312),
 ('nướng', 1174),
 ('sữa', 1117),
 ('chiên', 1053),
 ('bò', 1014),
 ('xào', 823),
 ('gà', 779),
 ('tôm', 710),
 ('cơm', 620)]

In [19]:
corpus_big_grams = Counter(food_vocabulary_big_grams)
corpus_big_grams.most_common(10)

[('trà sữa', 472),
 ('hải sản', 374),
 ('phô mai', 322),
 ('trân châu', 252),
 ('cơm chiên', 245),
 ('cá hồi', 240),
 ('sữa chua', 212),
 ('chiên mắm', 177),
 ('thập cẩm', 166),
 ('muối ớt', 161)]

In [20]:
#Save corpus
with open('food_vocabulary.txt', 'w', encoding='utf-8') as f:
    for food, count in corpus.most_common():
        if count > 1:
            save_format = f"{food}${count}\n"
            f.write(save_format)

with open('food_vocabulary_big_grams.txt', 'w', encoding='utf-8') as f:
    for food, count in corpus_big_grams.most_common():
        if count > 1:
            save_format = f"{food}${count}\n"
            f.write(save_format)

In [21]:
import symspellpy
import random

import string

In [22]:
def random_delete(text, percent = 0.1):
    n = int(percent*len(text))+1
    text_aggumented = list(text)
    
    for i in range(n):
        k = random.randint(0, len(text_aggumented)-1)
        del text_aggumented[k]
        
    return "".join(text_aggumented)

def random_replace(text, percent = 0.2):
    n = int(percent*len(text))+1
    text_aggumented = list(text)
    
    for i in range(n):
        k = -1
        while k == -1 or text[k] == ' ':
            k = random.randint(0, len(text)-1)

        char = random.choice(string.ascii_uppercase)
        text_aggumented[k] = char
        
    return "".join(text_aggumented)

def random_swap(text, percent = 0.1):
    n = int(percent*len(text))+1
    text_aggumented = list(text)
    
    for i in range(n):
        k = -1
        while k == -1 or text[k] == ' ':
            k = random.randint(0, len(text)-1)
        h = k
        while h == k or text[h] == ' ':
            h = random.randint(0, len(text)-1)
        text_aggumented[k], text_aggumented[h] = text_aggumented[h], text_aggumented[k]
        
    return "".join(text_aggumented)

def lower(text):
    text = text.lower()
    return text

def random_text_aggument(text):
    k = random.randint(0, 2)
    random_agg = {0: random_delete, 1:random_replace, 2:random_swap}
    return random_agg[k](text, percent=0.125)
    
random_text_aggument('BELGIAN DARK 8.1%IBU 32 - 500ML (PHỞ ĐÙI + TRỨNG NON)')
    

'BLIAN DARK 8.1%IU 32 -500MLPHỞ ĐÙI + TRỨNG NO)'

In [23]:
TEST_CASE = random.choices(food_names, k = 500)
TEST_CASE = thread_map(lower, TEST_CASE, max_workers=6)
TEST_CASE_WRONG = thread_map(random_text_aggument, TEST_CASE, max_workers=6)

print(TEST_CASE[:7])
print(TEST_CASE_WRONG[:7])

100%|██████████| 500/500 [00:00<00:00, 430715.14it/s]
100%|██████████| 500/500 [00:00<00:00, 530387.46it/s]

['nước ép cà rốt', 'lẩu đuôi bò', 'nem chua rán', 'sake dassai (720ml)', 'dồi rán', 'bò húc ', 'sô cô la']
['ưc ép cà rốt', 'lẩu đôui bò', 'nem AhDa rán', 'sakG daAsaF (720ml)', 'dồi rá', 'ò húc ', 'sô Jô la']





In [24]:
def cer(pred, true):
    n = len(true)
    wrong = 0
    for c1, c2 in zip(pred, true):
        if c1 != c2:
            wrong += 1
    
    
    return (n - wrong)/n

def wer(pred, true):
    pred = pred.split()
    true = true.split()
    n = len(true)
    wrong = 0
    for c1, c2 in zip(pred, true):
        if c1 != c2:
            wrong += 1
    
    return (n - wrong)/n

print(cer("tomorrow now today and tomorrow", "tomorrow now today and tomoraow"))
wer("tomorrow now today and tomorrow", "tomorrow now today and tomoraow")

0.967741935483871


0.8

In [25]:
from symspellpy import SymSpell, Verbosity
from itertools import islice
import numpy as np

EDIT_DISTANCE = 3

spell_check = SymSpell(max_dictionary_edit_distance=EDIT_DISTANCE)
spell_check.load_dictionary('food_vocabulary.txt', 0, 1, 
                                    encoding='utf-8', separator='$')
spell_check.load_bigram_dictionary('food_vocabulary_big_grams.txt', 0, 1, 
                                    encoding='utf-8', separator='$')

print(list(islice(spell_check.words.items(), 5)))
print(list(islice(spell_check.bigrams.items(), 5)))

[('cá', 1513), ('trà', 1312), ('nướng', 1174), ('sữa', 1117), ('chiên', 1053)]
[('trà sữa', 472), ('hải sản', 374), ('phô mai', 322), ('trân châu', 252), ('cơm chiên', 245)]


In [26]:
def correct_spell(text):
    import re 
    
    text = re.sub(r'[.\?#@+,<>%~`!$^&\(\):;\\\/]', r' \g<0> ', text)
    
    suggestion = spell_check.lookup_compound(text, max_edit_distance=EDIT_DISTANCE,
                                             ignore_non_words=True, ignore_term_with_digits=True)
    
    return suggestion[0]._term

In [27]:
%%time
test_result = []
wer_result = []
cer_result = []
N = 7
for test_case, val in zip(TEST_CASE_WRONG, TEST_CASE):
    correct_text = correct_spell(test_case)
    
    test_result.append(correct_text)
        
        
    wer_result.append(
        wer(correct_text, val)
    )
    
    cer_result.append(
        cer(correct_text, val)
    )
    

print("Example: ")
print(TEST_CASE[:N])
print(TEST_CASE_WRONG[:N])
print(test_result[:N])

print("Metric:", f"WER: {np.mean(wer_result):.3f}", f"CER: {np.mean(cer_result):.3f}")

Example: 
['nước ép cà rốt', 'lẩu đuôi bò', 'nem chua rán', 'sake dassai (720ml)', 'dồi rán', 'bò húc ', 'sô cô la']
['ưc ép cà rốt', 'lẩu đôui bò', 'nem AhDa rán', 'sakG daAsaF (720ml)', 'dồi rá', 'ò húc ', 'sô Jô la']
['ốc ép cà rốt', 'lẩu đuôi bò', 'nem a da rán', 'sake dassai 720ml', 'dồi cá', 'bò húc', 'sô ô la']
Metric: WER: 0.677 CER: 0.725
CPU times: user 380 ms, sys: 5.4 ms, total: 386 ms
Wall time: 391 ms


In [49]:
correct_spell("nuoc ep cà rốt")

'nước ép cà rốt'