In [37]:
import pandas as pd
from tqdm.contrib.concurrent import thread_map

## Create dictionary

In [38]:
df = pd.read_excel('../data_sample/Data_Labeling.xlsx')

df.head()

Unnamed: 0,ImageName,VietnameseName,EnglishName,Price
0,001.jpeg,COMBO 1,COMBO 1,169000
1,001.jpeg,COMBO 2,COMBO 2,169000
2,001.jpeg,COMBO 3,COMBO 3,169000
3,001.jpeg,RƯỢU SOJU,SOJU,NOT GIVEN
4,001.jpeg,RƯỢU VODKA,VODKA,NOT GIVEN


In [39]:
df.tail()

Unnamed: 0,ImageName,VietnameseName,EnglishName,Price
15206,850.jpeg,SÒ ĐIỆP NƯỚNG,GRILLED SCALLOP,10000
15207,850.jpeg,KHÔ MỰC,DRIED SQUID,NOT GIVEN
15208,850.jpeg,CÁ SƠN NƯỚNG,GRILLED CARDINAL FISH,NOT GIVEN
15209,850.jpeg,CÁ CHỈ VÀNG,YELLOWSTRIPE SCAD,60000
15210,850.jpeg,CÁ LAO NƯỚNG,GRILLED RED CORNETFISH,99000


In [40]:
#get Vietnamese food names
food_names = df['VietnameseName'].values
type(food_names)

numpy.ndarray

In [68]:
#Clean text, remove punification number and etc
import re

def clean_food_name(food_name):
    #Remove quantity for example 500ML 1L 20KG ....
    food_name = re.sub(r'\d+\w+', '', food_name)
    #Remove purification and digits
    food_name = re.sub(r"[^\w\s]|\d", '', food_name)
    return food_name.lower().strip().split()

test = clean_food_name('BELGIAN DARK 8.1%/IBU 32 - 500ML (PHỞ ĐÙI + TRỨNG NON)')
test

['belgian', 'dark', 'ibu', 'phở', 'đùi', 'trứng', 'non']

In [70]:
def get_big_gram(text, n=2, m=4):
    words = clean_food_name(text)
    big_grams = []
    
    for k in range(n,m+1):
        for i in range(len(words)):
            big_gram = ''
            if i + k > len(words):
                continue
            
            for j in range(k):
                big_gram += words[i+j] + ' '
                
            big_grams.append(big_gram.strip())
            
    return big_grams

print(get_big_gram('BELGIAN DARK 8.1%/IBU 32 - 500ML (PHỞ ĐÙI + TRỨNG NON)'))
                

['belgian dark', 'dark ibu', 'ibu phở', 'phở đùi', 'đùi trứng', 'trứng non', 'belgian dark ibu', 'dark ibu phở', 'ibu phở đùi', 'phở đùi trứng', 'đùi trứng non', 'belgian dark ibu phở', 'dark ibu phở đùi', 'ibu phở đùi trứng', 'phở đùi trứng non']


In [71]:
food_vocabulary = thread_map(clean_food_name, food_names, max_workers=6)
food_vocabulary_big_grams = thread_map(get_big_gram, food_names, max_workers=6)


100%|██████████| 15211/15211 [00:00<00:00, 323559.99it/s]
100%|██████████| 15211/15211 [00:00<00:00, 334116.57it/s]


In [72]:
food_vocabulary = [item for sublist in food_vocabulary for item in sublist]
food_vocabulary[:10]

['combo',
 'combo',
 'combo',
 'rượu',
 'soju',
 'rượu',
 'vodka',
 'tiger',
 'lon',
 'tiger']

In [73]:
food_vocabulary_big_grams = [item for sublist in food_vocabulary_big_grams for item in sublist]
print(food_vocabulary_big_grams[:20])

['rượu soju', 'rượu vodka', 'tiger lon', 'tiger chai', 'tiger bạc', 'bạc chai', 'tiger bạc chai', 'tiger bạc', 'bạc lon', 'tiger bạc lon', 'bò húc', 'bia quy', 'quy nhơn', 'bia quy nhơn', 'bia bivina', 'strong bow', 'rau muống', 'muống xào', 'xào tỏi', 'rau muống xào']


In [74]:
from collections import Counter

corpus = Counter(food_vocabulary)
corpus.most_common(10)

[('cá', 1500),
 ('trà', 1310),
 ('nướng', 1157),
 ('sữa', 1114),
 ('chiên', 1052),
 ('bò', 1009),
 ('xào', 817),
 ('gà', 775),
 ('tôm', 706),
 ('cơm', 620)]

In [75]:
corpus_big_grams = Counter(food_vocabulary_big_grams)
corpus_big_grams.most_common(10)

[('trà sữa', 472),
 ('hải sản', 373),
 ('phô mai', 320),
 ('trân châu', 252),
 ('cơm chiên', 245),
 ('cá hồi', 240),
 ('sữa chua', 211),
 ('chiên mắm', 175),
 ('thập cẩm', 166),
 ('muối ớt', 159)]

In [76]:
#Save corpus
with open('../postprocessing/food_vocabulary.txt', 'w', encoding='utf-8') as f:
    for food, count in corpus.most_common():
        save_format = f"{food}${count}\n"
        f.write(save_format)

with open('../postprocessing/food_vocabulary_big_grams.txt', 'w', encoding='utf-8') as f:
    for food, count in corpus_big_grams.most_common():
        save_format = f"{food}${count}\n"
        f.write(save_format)


# Spell correct with symspell

In [13]:
import symspellpy
import random

import string

## Generate test cases

In [87]:
def random_delete(text, percent = 0.1):
    n = int(percent*len(text))+1
    text_aggumented = list(text)
    
    for i in range(n):
        k = random.randint(0, len(text_aggumented)-1)
        del text_aggumented[k]
        
    return "".join(text_aggumented)

def random_replace(text, percent = 0.2):
    n = int(percent*len(text))+1
    text_aggumented = list(text)
    
    for i in range(n):
        k = -1
        while k == -1 or text[k] == ' ':
            k = random.randint(0, len(text)-1)

        char = random.choice(string.ascii_lowercase)
        text_aggumented[k] = char
        
    return "".join(text_aggumented)

def random_swap(text, percent = 0.1):
    n = int(percent*len(text))+1
    text_aggumented = list(text)
    
    for i in range(n):
        k = -1
        while k == -1 or text[k] == ' ':
            k = random.randint(0, len(text)-1)
        h = k
        while h == k or text[h] == ' ':
            h = random.randint(0, len(text)-1)
        text_aggumented[k], text_aggumented[h] = text_aggumented[h], text_aggumented[k]
        
    return "".join(text_aggumented)

def lower(text):
    text = text.lower()
    return text

def random_text_aggument(text):
    #k = random.randint(2, 2)
    k = 1
    random_agg = {0: random_delete, 1:random_replace, 2:random_swap}
    return random_agg[k](text, percent=0.15)
    
random_text_aggument('BELGIAN DARK 8.1%IBU 32 - 500ML (PHỞ ĐÙI + TRỨNG NON)')
    

'BELGIzN DARK 8.1%IBU 3e - 500ML (PyỞ ĐÙf n TRỨNj NnN)'

In [88]:
TEST_CASE = random.choices(food_names, k = 500)
TEST_CASE = thread_map(lower, TEST_CASE, max_workers=5)
TEST_CASE_WRONG = thread_map(random_text_aggument, TEST_CASE, max_workers=6)

print(TEST_CASE[:7])
print(TEST_CASE_WRONG[:7])

100%|██████████| 500/500 [00:00<00:00, 166784.79it/s]
100%|██████████| 500/500 [00:00<00:00, 166427.43it/s]

['ốc hương xào me', 'cua bỏ lò phô mai', 'lịch huyết nướng giấy bạc', 'chình nướng muối ớt', 'phở trứng', 'ngọc dương hầm thuốc bắc', 'mì xào nấm chay']
['xc hưvrg xào me', 'nua yỏ lò phô mai', 'lfmh hcyết nướnk giấy bạc', 'chìhh nưbng guối ớt', 'phở lrứng', 'ngọc yzơng hvm thuốc bắl', 'mì xào nwm yeay']





## METRIC

In [89]:
def cer(pred, true):
    n = len(true)
    wrong = 0
    for c1, c2 in zip(pred, true):
        if c1 != c2:
            wrong += 1
    
    
    return (n - wrong)/n

def wer(pred, true):
    pred = pred.split()
    true = true.split()
    n = len(true)
    wrong = 0
    for c1, c2 in zip(pred, true):
        if c1 != c2:
            wrong += 1
    
    return (n - wrong)/n

print(cer("tomorrow now today and tomorrow", "tomorrow now today and tomoraow"))
wer("tomorrow now today and tomorrow", "tomorrow now today and tomoraow")


0.967741935483871


0.8

## Spell check

In [90]:
from symspellpy import SymSpell, Verbosity
from itertools import islice
import numpy as np

EDIT_DISTANCE = 3

spell_check = SymSpell(max_dictionary_edit_distance=EDIT_DISTANCE)
spell_check.load_dictionary('../postprocessing/food_vocabulary.txt', 0, 1, 
                                    encoding='utf-8')
spell_check.load_bigram_dictionary('../postprocessing/food_vocabulary_big_grams.txt', 0, 1, 
                                    encoding='utf-8', separator='$')

print(list(islice(spell_check.words.items(), 5)))
print(list(islice(spell_check.bigrams.items(), 5)))


[('cá', 1500), ('trà', 1310), ('nướng', 1157), ('sữa', 1114), ('chiên', 1052)]
[('trà sữa', 472), ('hải sản', 373), ('phô mai', 320), ('trân châu', 252), ('cơm chiên', 245)]


In [91]:
def correct_spell(text):
    suggestion = spell_check.lookup_compound(text, max_edit_distance=EDIT_DISTANCE, ignore_non_words=True)
    
    return suggestion[0]._term

In [92]:
%%time
test_result = []
wer_result = []
cer_result = []
N = 7
for test_case in TEST_CASE_WRONG:
    correct_text = correct_spell(test_case)
    
    test_result.append(correct_text)
        
        
    wer_result.append(
        wer(correct_text, test_case)
    )
    
    cer_result.append(
        cer(correct_text, test_case)
    )
    

print("Example: ")
print(TEST_CASE[:N])
print(TEST_CASE_WRONG[:N])
print(test_result[:N])

print("Metric:", f"WER: {np.mean(wer_result):.3f}", f"CER: {np.mean(cer_result):.3f}")



Example: 
['ốc hương xào me', 'cua bỏ lò phô mai', 'lịch huyết nướng giấy bạc', 'chình nướng muối ớt', 'phở trứng', 'ngọc dương hầm thuốc bắc', 'mì xào nấm chay']
['xc hưvrg xào me', 'nua yỏ lò phô mai', 'lfmh hcyết nướnk giấy bạc', 'chìhh nưbng guối ớt', 'phở lrứng', 'ngọc yzơng hvm thuốc bắl', 'mì xào nwm yeay']
['ốc hương xào me', 'cua đỏ lò phô mai', 'lạnh huyết nướng giấy bạc', 'chình nướng muối ớt', 'phở trứng', 'ngọc hương hầm thuốc bắp', 'mì xào nấm cay']
Metric: WER: 0.410 CER: 0.790
CPU times: total: 828 ms
Wall time: 1.88 s
