In [39]:
import pandas as pd
from pyvi import ViTokenizer
from tqdm.contrib.concurrent import thread_map

## Create dictionary

In [40]:
df = pd.read_excel('../data_sample/Data_Labeling.xlsx')

df.head()

Unnamed: 0,ImageName,VietnameseName,EnglishName,Price
0,001.jpeg,COMBO 1,COMBO 1,169000
1,001.jpeg,COMBO 2,COMBO 2,169000
2,001.jpeg,COMBO 3,COMBO 3,169000
3,001.jpeg,RƯỢU SOJU,SOJU,NOT GIVEN
4,001.jpeg,RƯỢU VODKA,VODKA,NOT GIVEN


In [41]:
df.tail()

Unnamed: 0,ImageName,VietnameseName,EnglishName,Price
15206,850.jpeg,SÒ ĐIỆP NƯỚNG,GRILLED SCALLOP,10000
15207,850.jpeg,KHÔ MỰC,DRIED SQUID,NOT GIVEN
15208,850.jpeg,CÁ SƠN NƯỚNG,GRILLED CARDINAL FISH,NOT GIVEN
15209,850.jpeg,CÁ CHỈ VÀNG,YELLOWSTRIPE SCAD,60000
15210,850.jpeg,CÁ LAO NƯỚNG,GRILLED RED CORNETFISH,99000


In [42]:
#get Vietnamese food names
food_names = df['VietnameseName'].values
type(food_names)

numpy.ndarray

In [43]:
#Clean text, remove punification number and etc
import re

def clean_food_name(food_name):
    #Remove quantity for example 500ML 1L 20KG ....
    food_name = re.sub(r'\d+\w+', '', food_name)
    #Remove purification and digits
    food_name = re.sub(r"[^\w\s]|\d", '', food_name)
    return food_name.lower().strip()

test = clean_food_name('BELGIAN DARK 8.1%/IBU 32 - 500ML (PHỞ ĐÙI + TRỨNG NON) mì trứng thành phố huế lòng đào')
test

'belgian dark ibu    phở đùi  trứng non mì trứng thành phố huế lòng đào'

In [44]:
def get_bag_words(text):
    tokens = ViTokenizer.tokenize(text).split(' ')
    return [t.replace('_', '_') for t in tokens]

get_bag_words(test)

['belgian',
 'dark',
 'ibu',
 'phở',
 'đùi',
 'trứng',
 'non',
 'mì',
 'trứng',
 'thành_phố',
 'huế',
 'lòng_đào']

In [45]:
food_vocabulary = thread_map(clean_food_name, food_names, max_workers=6)
food_vocabulary = thread_map(get_bag_words, food_vocabulary, max_workers=6)


100%|██████████| 15211/15211 [00:00<00:00, 338005.53it/s]
100%|██████████| 15211/15211 [00:00<00:00, 316896.77it/s]


In [46]:
food_vocabulary = [item for sublist in food_vocabulary for item in sublist]
print(food_vocabulary[:10])


['combo', 'combo', 'combo', 'rượu', 'soju', 'rượu', 'vodka', 'tiger', 'lon', 'tiger']


In [47]:
from collections import Counter

corpus = Counter(food_vocabulary)
corpus.most_common(20)

[('nướng', 1152),
 ('trà', 1140),
 ('bò', 951),
 ('xào', 809),
 ('chiên', 793),
 ('sữa', 765),
 ('gà', 712),
 ('cá', 619),
 ('tôm', 528),
 ('trứng', 505),
 ('hấp', 458),
 ('lẩu', 409),
 ('muối', 397),
 ('kem', 392),
 ('hải_sản', 372),
 ('cơm', 334),
 ('mực', 333),
 ('mì', 319),
 ('phô_mai', 315),
 ('chả', 310)]

In [48]:
#Save corpus
with open('food_vocabulary_tokenize.txt', 'w', encoding='utf-8') as f:
    for food, count in corpus.most_common():
        save_format = f"{food}${count}\n"
        f.write(save_format)




# Spell correct with symspell

In [49]:
import symspellpy
import random

import string

## Generate test cases

In [50]:
def random_delete(text, percent = 0.1):
    n = int(percent*len(text))+1
    text_aggumented = list(text)
    
    for i in range(n):
        k = random.randint(0, len(text_aggumented)-1)
        del text_aggumented[k]
        
    return "".join(text_aggumented)

def random_replace(text, percent = 0.2):
    n = int(percent*len(text))+1
    text_aggumented = list(text)
    
    for i in range(n):
        k = -1
        while k == -1 or text[k] == ' ':
            k = random.randint(0, len(text)-1)

        char = random.choice(string.ascii_uppercase)
        text_aggumented[k] = char
        
    return "".join(text_aggumented)

def random_swap(text, percent = 0.1):
    n = int(percent*len(text))+1
    text_aggumented = list(text)
    
    for i in range(n):
        k = -1
        while k == -1 or text[k] == ' ':
            k = random.randint(0, len(text)-1)
        h = k
        while h == k or text[h] == ' ':
            h = random.randint(0, len(text)-1)
        text_aggumented[k], text_aggumented[h] = text_aggumented[h], text_aggumented[k]
        
    return "".join(text_aggumented)

def lower(text):
    text = text.lower()
    return text

def random_text_aggument(text):
    k = random.randint(0, 2)
    random_agg = {0: random_delete, 1:random_replace, 2:random_swap}
    return random_agg[k](text, percent=0.2)
    
test = random_text_aggument('BELGIAN DARK 8.1%IBU 32 - 500ML (PHỞ ĐÙI + TRỨNG NON) thành pho')


In [51]:
TEST_CASE = random.choices(food_names, k = 500)
TEST_CASE = thread_map(lower, TEST_CASE, max_workers=6)
TEST_CASE_WRONG = thread_map(random_text_aggument, TEST_CASE, max_workers=6)

print(TEST_CASE[:7])
print(TEST_CASE_WRONG[:7])

100%|██████████| 500/500 [00:00<00:00, 246375.94it/s]
100%|██████████| 500/500 [00:00<00:00, 250915.53it/s]

['cơm trắng', 'sụn gà chiên mắm', 'nộm hoa chuối', 'bò tơ xào nén', 'cóc lắc', 'kimbap chiên', 'salad rong nho']
['cắm ơrtng', 'sụn à ciên m', 'nUm hoH Fhuối', 'Pò tơ xLo néJ', 'óc lc', 'kimbaR cQiên', 'ralag sonn dho']





## METRIC

In [52]:
def cer(pred, true):
    n = len(true)
    wrong = 0
    for c1, c2 in zip(pred, true):
        if c1 != c2:
            wrong += 1
    
    
    return (n - wrong)/n

def wer(pred, true):
    pred = pred.split()
    true = true.split()
    n = len(true)
    wrong = 0
    for c1, c2 in zip(pred, true):
        if c1 != c2:
            wrong += 1
    
    return (n - wrong)/n

print(cer("tomorrow now today and tomorrow", "tomorrow now today and tomoraow"))
wer("tomorrow now today and tomorrow", "tomorrow now today and tomoraow")


0.967741935483871


0.8

## Spell check

In [53]:
from symspellpy import SymSpell, Verbosity
from itertools import islice
import numpy as np

EDIT_DISTANCE = 3

spell_check = SymSpell(max_dictionary_edit_distance=EDIT_DISTANCE)
spell_check.load_dictionary('food_vocabulary_tokenize.txt', 0, 1, 
                                    encoding='utf-8', separator='$')


print(list(islice(spell_check.words.items(), 20)))



[('nướng', 1152), ('trà', 1140), ('bò', 951), ('xào', 809), ('chiên', 793), ('sữa', 765), ('gà', 712), ('cá', 619), ('tôm', 528), ('trứng', 505), ('hấp', 458), ('lẩu', 409), ('muối', 397), ('kem', 392), ('hải_sản', 372), ('cơm', 334), ('mực', 333), ('mì', 319), ('phô_mai', 315), ('chả', 310)]


In [63]:
def correct_spell(text):
    import re 
    
    text = re.sub(r'[.\?#@+,<>%~`!$^&\(\):;\\\/]', r' \g<0> ', text)
    
    suggestion = spell_check.lookup_compound(text, max_edit_distance=EDIT_DISTANCE,
                                             ignore_non_words=False, ignore_term_with_digits=False)
    
    return suggestion[0]._term

In [64]:
%%time
test_result = []
wer_result = []
cer_result = []
N = 7
for test_case, val in zip(TEST_CASE_WRONG, TEST_CASE):
    correct_text = correct_spell(test_case)
    
    test_result.append(correct_text)
        
        
    wer_result.append(
        wer(correct_text, val)
    )
    
    cer_result.append(
        cer(correct_text, val)
    )
    

print("Example: ")
print(TEST_CASE[:N])
print(TEST_CASE_WRONG[:N])
print(test_result[:N])

print("Metric:", f"WER: {np.mean(wer_result):.3f}", f"CER: {np.mean(cer_result):.3f}")



Example: 
['cơm trắng', 'sụn gà chiên mắm', 'nộm hoa chuối', 'bò tơ xào nén', 'cóc lắc', 'kimbap chiên', 'salad rong nho']
['cắm ơrtng', 'sụn à ciên m', 'nUm hoH Fhuối', 'Pò tơ xLo néJ', 'óc lc', 'kimbaR cQiên', 'ralag sonn dho']
['cơm trứng', 'sụn gà chiên m', 'nấm hoa chuối', 'bò tơ xào né', 'óc l', 'kimbap chiên', 'rang song kho']
Metric: WER: 0.538 CER: 0.636
CPU times: total: 750 ms
Wall time: 1.14 s


In [65]:
correct_spell("hoi_sả")

'hoa sả'