# DistilBERT feature matching : food typos to right food name

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

from rich.progress import track

# distilbert-base-multilingual-cased
# bert-base-multilingual-cased
tokenizer = BertTokenizer.from_pretrained("distilbert-base-multilingual-cased")
bert_model = TFBertModel.from_pretrained("distilbert-base-multilingual-cased")  

2023-10-02 16:25:40.276853: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
2023-10-02 16:25:42.912296: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0a:00.0/numa_node
Your kernel may have been built without NUMA support.
20

In [2]:
import tensorflow as tf
tf.random.set_seed(42)

In [3]:
def get_bert_hidden_state(tokenizer, bert_model, sequence, max_size=None):
    inputs = tokenizer(sequence, return_tensors="tf")
    outputs = bert_model(inputs)
    last_hidden_states = outputs.last_hidden_state
    bert_feat = tf.keras.layers.Flatten()(last_hidden_states).numpy()   
    
    if max_size:
        bert_fixed_feat = np.zeros((1,max_size))
        size = bert_feat.shape[1]
        bert_fixed_feat[0][:size] = bert_feat
    
        return bert_fixed_feat
    else:
        return bert_feat    

# Data load

## food nutrient data

In [4]:
food_df  = pd.read_csv('data/food_nutrient_info_finalized.csv')
food_df.head()

Unnamed: 0,food_item,serving_size,calories (kcal),protein (g),protein (g).1,protein (g).2,protein (g).3,carbohydrate (g),sugar (g)
0,닭갈비,400,595.61,45.9,25.8,45.9,25.8,44.9,21.2
1,닭꼬치,70,176.72,11.56,8.57,11.56,8.57,13.35,3.15
2,더덕구이,100,184.0,3.1,5.2,3.1,5.2,31.1,11.6
3,소양념갈비구이,300,989.15,60.1,71.6,60.1,71.6,26.2,13.9
4,양념장어구이,150,433.35,30.77,30.56,30.77,30.56,8.8,4.18


## food typo data
* train: test = 8:2

In [5]:
food_train = pd.read_csv('data/food_aging_train.csv')
food_test = pd.read_csv('data/food_aging_test.csv')

In [6]:
food_train.head()

Unnamed: 0,식품명,식품오타,food_line
0,호떡,하떡,하떡<input>호떡
1,달걀찜(새우젓),새우젓계란찜,새우젓계란찜<input>달걀찜(새우젓)
2,오징어덮밥,오징이덥밥,오징이덥밥<input>오징어덮밥
3,참나물무침,참나물,참나물<input>참나물무침
4,크림소스스파게티,크림스파게티,크림스파게티<input>크림소스스파게티


In [7]:
food_test.head()

Unnamed: 0,식품명,식품오타,test_input
0,달래나물무침,다래나물무침,다래나물무침<input>다래나물무침
1,깻잎찜,깬닙찜,깬닙찜<input>깬닙찜
2,등심돈가스,등심돈까쓰,등심돈까쓰<input>등심돈까쓰
3,게살죽,게살쭉,게살쭉<input>게살쭉
4,버섯전,버섯즌,버섯즌<input>버섯즌


# Read marked text inputs

In [8]:
train_lines = [str(s) for s in food_train['food_line']]
test_lines = [str(s) for s in food_test['test_input']]
print(train_lines[:10])
print(test_lines[:10])

['하떡<input>호떡', '새우젓계란찜<input>달걀찜(새우젓)', '오징이덥밥<input>오징어덮밥', '참나물<input>참나물무침', '크림스파게티<input>크림소스스파게티', '안심돈까쑤<input>안심돈가스', '쇠고기뭇국<input>쇠고기무국', '채소샌드이치<input>채소샌드위치', '채소보끈빱<input>채소볶음밥', '볶은밥<input>볶음밥']
['다래나물무침<input>다래나물무침', '깬닙찜<input>깬닙찜', '등심돈까쓰<input>등심돈까쓰', '게살쭉<input>게살쭉', '버섯즌<input>버섯즌', '고추튀기<input>고추튀기', '간장닭다리구이<input>간장닭다리구이', '버섯샤부<input>버섯샤부', '닥꼬기냉채<input>닥꼬기냉채', '소고기전골<input>소고기전골']


## check the number of UNK tokens of BertTokenizer

In [9]:
all_lines = [str(s) for s in food_train['식품오타']] + [str(s) for s in food_test['식품오타']]
all_lines[:10]

['하떡',
 '새우젓계란찜',
 '오징이덥밥',
 '참나물',
 '크림스파게티',
 '안심돈까쑤',
 '쇠고기뭇국',
 '채소샌드이치',
 '채소보끈빱',
 '볶은밥']

In [10]:
count = 0
for line in all_lines:
    tokens= tokenizer.tokenize(line)
    if tokens[0] == '[UNK]':
        count +=1
print('The number of UNK token : ', count)
print('The number of all food names : ', len(all_lines))
print('percentage of UNK token/all food names: ', ((count/len(all_lines))*100))

The number of UNK token :  718
The number of all food names :  2921
percentage of UNK token/all food names:  24.580623074289626


In [11]:
unk_tokens = []
for line in food_df['food_item'].tolist():
    tokens= tokenizer.tokenize(line)
    if tokens[0] == '[UNK]':
        unk_tokens.append(line)
print('the number of unk_token: ', len(unk_tokens))
print(unk_tokens)

the number of unk_token:  155
['떡만둣국', '만둣국', '수수팥떡', '쑥떡', '찹쌀떡', '굴짬뽕', '삼선짬뽕', '짬뽕라면', '쑥갓나물무침', '삼선볶음밥', '숯불갈비삼각김밥', '해물볶음밥', '깻잎나물볶음', '돼지고기볶음', '돼지껍데기볶음', '머위나물볶음', '멸치풋고추볶음', '소세지볶음', '순대볶음', '양송이버섯볶음', '주꾸미볶음', '표고버섯볶음', '해물볶음', '찹쌀도우넛', '간장마늘쫑장아찌', '고추장마늘쫑장아찌', '깻잎전', '동그랑땡(육원전)', '잣죽', '꽁치찌개', '새우젓두부찌개', '대구찜', '동태찜', '돼지등갈비찜', '북어찜', '소갈비찜', '아구찜', '조기찜', '해물찜', '쑥절편', '찐감자', '찐고구마', '찐옥수수', '팥시루떡', '팥찹쌀떡', '꽁치구이', '감잣국', '김칫국', '쇠고기배춧국', '쑥된장국', '얼갈이배춧국', '깍두기', '깻잎김치', '보쌈김치', '고춧잎나물무침', '깻잎나물무침', '애호박볶음', '콩나물고춧가루무침', '톳나물무침', '볶음우동', '월남쌈', '짬뽕', '쫄면', '계란볶음밥', '김치볶음밥', '돌솥비빔밥', '볶음밥', '새우볶음밥', '쇠고기볶음밥', '영양돌솥밥', '짬뽕밥', '참치볶음밥', '채소볶음밥', '팥밥', '표고버섯볶음밥', '감자볶음', '감자채소볶음', '건새우볶음', '고추볶음', '김치볶음', '꽈리고추멸치볶음', '낙지볶음', '느타리버섯볶음', '닭가슴살피망볶음', '닭발볶음', '닭볶음', '당근볶음', '돼지고기김치볶음', '돼지고기채소볶음', '돼지곱창순대볶음', '떡볶이', '라볶이', '마늘쫑볶음', '마늘쫑잔멸치볶음', '멸치견과류볶음', '멸치마늘쫑볶음', '멸치볶음', '묵은지삼겹살볶음', '문어고추장볶음', '미역줄기볶음', '브로콜리볶음', '새송이버섯볶음', '새우볶음', '소시지볶음', '쇠고기고추장볶음', '쇠고기버섯볶음', '쇠고기볶음', '쇠고기채소볶음'

## Set max fixed tokenized data size

Then get the fixed data

In [12]:
max_token_length = 384
bert_feats = np.zeros((len(train_lines), max_token_length * 768))

In [13]:
bert_feats.shape

(2337, 294912)

In [14]:
inputs = tokenizer(train_lines, max_length=max_token_length, padding='max_length', truncation=True, return_tensors='tf')

### Batch processing 

In [15]:
batch_size = 32
file_count = len(train_lines)
for start_index in track(list(range(0, file_count, batch_size))):

        end_index = min(start_index + batch_size, file_count)
        input_ids_ = inputs['input_ids'][start_index:end_index]
        token_type_ids_ = inputs['token_type_ids'][start_index:end_index]
        attention_mask_ = inputs['attention_mask'][start_index:end_index]

        outputs = bert_model({'input_ids' : input_ids_, 
                'token_type_ids' : token_type_ids_, 
                'attention_mask' : attention_mask_})

        bert_feats[start_index:end_index, :] = tf.keras.layers.Flatten()(outputs.last_hidden_state).numpy()  

Output()

In [16]:
bert_feats.shape

(2337, 294912)

In [17]:
#np.save("bert_food_feat1.npy", bert_feats)

In [18]:
max_size = bert_feats.shape[1]

### Set up a text search engine

In [19]:
x_bert = tf.keras.Input(shape=(max_size,))
#x = tf.keras.layers.BatchNormalization()(x_bert)
x = tf.keras.layers.Dense(units=bert_feats.shape[0], activation='linear', name='fc1', use_bias=False)(x_bert)   
x_bert_feat_matching = tf.keras.models.Model(inputs=x_bert, outputs=x)

In [20]:
bert_feats.shape

(2337, 294912)

In [21]:
bert_feats_T = bert_feats.T

In [22]:
bert_feats_T.shape

(294912, 2337)

In [23]:
temp_weights = x_bert_feat_matching.get_weights()

In [24]:
temp_weights[-1].shape

(294912, 2337)

In [25]:
temp_weights[-1] = bert_feats_T 
x_bert_feat_matching.set_weights(temp_weights)

## Define chosen topics the model can handle

In [26]:
total_text = []
for each in track(train_lines):
    total_text.append(each)

Output()

In [27]:
topics = []

for each in train_lines:
    topics.append(each.split('<input>')[1])
    
topics = np.unique(topics).tolist()

# store sample chats
topic_samples = {}
for each in track(train_lines):
    current_topic = each.split('<input>')[1]
    names = each.split('<input>')[0]
    if current_topic in topic_samples:
        topic_samples[current_topic].append(names)
    else:
        topic_samples[current_topic] = []
        topic_samples[current_topic].append(names) 

Output()

In [28]:
test_samples = {}
real_names = food_test['식품명'].tolist()
for i in range(len(real_names)):
    real_food = real_names[i]
    input_line = test_lines[i]
    test_samples[input_line] = real_food

print(test_samples)

{'다래나물무침<input>다래나물무침': '달래나물무침', '깬닙찜<input>깬닙찜': '깻잎찜', '등심돈까쓰<input>등심돈까쓰': '등심돈가스', '게살쭉<input>게살쭉': '게살죽', '버섯즌<input>버섯즌': '버섯전', '고추튀기<input>고추튀기': '고추튀김', '간장닭다리구이<input>간장닭다리구이': '간장양념닭다리구이', '버섯샤부<input>버섯샤부': '버섯샤브샤브', '닥꼬기냉채<input>닥꼬기냉채': '닭고기냉채', '소고기전골<input>소고기전골': '쇠고기전골', '삼겹살<input>삼겹살': '삼겹살구이', '딱뽀끔탕<input>딱뽀끔탕': '닭볶음탕', '냉이된장찌기<input>냉이된장찌기': '냉이된장찌개', '순두부김치찌기<input>순두부김치찌기': '순두부김치찌개', '가배기<input>가배기': '꽈배기', '쇠고기완자즌<input>쇠고기완자즌': '쇠고기완자전', '우렁된장구<input>우렁된장구': '우렁된장국', '불고기햄버거<input>불고기햄버거': '불고기버거', '소고기주먹밥<input>소고기주먹밥': '쇠고기주먹밥', '해덥밥<input>해덥밥': '회덮밥', '감저사라다<input>감저사라다': '감자샐러드', '소고기채소보끈<input>소고기채소보끈': '쇠고기채소볶음', '대지괴기산즉<input>대지괴기산즉': '돼지고기산적', '바지락조개궁<input>바지락조개궁': '바지락조개국', '오징어티김<input>오징어티김': '오징어튀김', '다시마멋침<input>다시마멋침': '다시마무침', '쇠고기고추장보끈<input>쇠고기고추장보끈': '쇠고기고추장볶음', '닥뽀끔<input>닥뽀끔': '닭볶음', '중국우동<input>중국우동': '우동(중식)', '오이장아지<input>오이장아지': '오이장아찌', '갓낌치<input>갓낌치': '갓김치', '계란찜(우유)<input>계란찜(우유)': '달걀찜(우유)', '우거질해장국<input>우거질해장국': '우거지해장국', '쌀국시<

# Text Search Engine response (for multiple items)

In [29]:
inputs = tokenizer(test_lines, max_length=max_token_length, padding='max_length', truncation=True, return_tensors='tf')

In [30]:
batch_size = 32
file_count = len(test_lines)
input_feats = np.zeros((len(test_lines), max_token_length*768))
for start_index in track(list(range(0, file_count, batch_size))):
    
    end_index = min(start_index + batch_size, file_count)
    input_ids_ = inputs['input_ids'][start_index:end_index]
    token_type_ids_ = inputs['token_type_ids'][start_index:end_index]
    attention_mask_ = inputs['attention_mask'][start_index:end_index]
    
    outputs = bert_model({'input_ids' : input_ids_, 
            'token_type_ids' : token_type_ids_, 
            'attention_mask' : attention_mask_})
    
    input_feats[start_index:end_index, :] = tf.keras.layers.Flatten()(outputs.last_hidden_state).numpy()  

Output()

In [31]:
input_feats.shape

(584, 294912)

In [32]:
x_bert_feat_matching.save_weights('x_bert-food-ko-featmatching1.h5')

In [33]:
json_model = x_bert_feat_matching.to_json()
#save the model architecture to JSON file
with open('/mnt/f/IEEE-Access-Code/x_bert_feat_matching.json', 'w') as json_file:
    json_file.write(json_model)

In [35]:
from tensorflow.keras.models import model_from_json

# Load JSON and create a model
with open('x_bert_feat_matching.json', "r") as json_file:
    loaded_model_json = json_file.read()

loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights('x_bert-food-ko-featmatching1.h5')


In [36]:
prediction = loaded_model.predict(input_feats)



In [37]:
prediction.shape

(584, 2337)

## Accuracy of text search engine response (for multiple items)

In [38]:
def response_acc(k,total_text, prediction):
    total_results = []
    for item in prediction:
        item_results = []
        for index in item.argsort()[-k::][::-1]:
            item_results.append(total_text[index].split('<input>')[1])
            item_set = set(item_results)
        total_results.append(item_set)
    
    count = 0
    for i in range(len(test_lines)):
        target = test_samples[test_lines[i]]
        for item in total_results:
            if target in item:
                    count +=1
                    break
    return (count/len(test_lines))*100

### There might be ~0.5% deviation of accuracy 

In [39]:
print('TOP 1 accuracy of test data response: ',response_acc(1,total_text, prediction))
print('TOP 3 accuracy of test data response: ',response_acc(3,total_text, prediction))
print('TOP 5 accuracy of test data response: ',response_acc(5,total_text, prediction))

TOP 1 accuracy of test data response:  69.86301369863014
TOP 3 accuracy of test data response:  78.42465753424658
TOP 5 accuracy of test data response:  82.1917808219178


## Text search engine response (for 1 item)

In [40]:
input_keyword = '옥시시샐러다'
input_feats = get_bert_hidden_state(tokenizer, bert_model, input_keyword, max_size=max_size)
one_prediction = x_bert_feat_matching.predict(input_feats).reshape(-1)



In [41]:
def find_food_info(df, target_column, chat_response):
    food_info = df[df[target_column] == chat_response]
    return food_info

In [42]:
chat_response = total_text[one_prediction.argsort()[-5:][::-1][0]].split('<input>')[1]
find_food_info(food_df,'food_item',chat_response)

Unnamed: 0,food_item,serving_size,calories (kcal),protein (g),protein (g).1,protein (g).2,protein (g).3,carbohydrate (g),sugar (g)
604,옥수수샐러드,160,265.65,3.13,19.11,3.13,19.11,20.28,9.61
