## Library

In [1]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertModel
from kobert_tokenizer import KoBERTTokenizer
import torch

from rich.progress import track

2024-02-28 11:46:33.531656: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-28 11:46:33.694578: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-28 11:46:33.694608: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-28 11:46:33.722234: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-28 11:46:33.778913: I tensorflow/core/platform/cpu_feature_guar

In [2]:
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bert_model = BertModel.from_pretrained('skt/kobert-base-v1')

In [3]:
device = torch.device("cuda:0")

## Load data

In [5]:
food_train = pd.read_csv('data/food_aging_train.csv')
food_test = pd.read_csv('data/food_aging_test.csv')
food_nutrient_info = pd.read_csv('data/food_nutrient_info_finalized.csv')

In [6]:
food_train.head()

Unnamed: 0,식품명,식품오타,food_line
0,호떡,하떡,하떡<input>호떡
1,달걀찜(새우젓),새우젓계란찜,새우젓계란찜<input>달걀찜(새우젓)
2,오징어덮밥,오징이덥밥,오징이덥밥<input>오징어덮밥
3,참나물무침,참나물,참나물<input>참나물무침
4,크림소스스파게티,크림스파게티,크림스파게티<input>크림소스스파게티


In [7]:
food_test.head()

Unnamed: 0,식품명,식품오타,test_input
0,달래나물무침,다래나물무침,다래나물무침<input>다래나물무침
1,깻잎찜,깬닙찜,깬닙찜<input>깬닙찜
2,등심돈가스,등심돈까쓰,등심돈까쓰<input>등심돈까쓰
3,게살죽,게살쭉,게살쭉<input>게살쭉
4,버섯전,버섯즌,버섯즌<input>버섯즌


# data prep

## read marked text inputs

In [8]:
train_lines = [str(s) for s in food_train['food_line']]
test_lines = [str(s) for s in food_test['test_input']]
print(train_lines[:10])
print(test_lines[:10])

['하떡<input>호떡', '새우젓계란찜<input>달걀찜(새우젓)', '오징이덥밥<input>오징어덮밥', '참나물<input>참나물무침', '크림스파게티<input>크림소스스파게티', '안심돈까쑤<input>안심돈가스', '쇠고기뭇국<input>쇠고기무국', '채소샌드이치<input>채소샌드위치', '채소보끈빱<input>채소볶음밥', '볶은밥<input>볶음밥']
['다래나물무침<input>다래나물무침', '깬닙찜<input>깬닙찜', '등심돈까쓰<input>등심돈까쓰', '게살쭉<input>게살쭉', '버섯즌<input>버섯즌', '고추튀기<input>고추튀기', '간장닭다리구이<input>간장닭다리구이', '버섯샤부<input>버섯샤부', '닥꼬기냉채<input>닥꼬기냉채', '소고기전골<input>소고기전골']


## Check the number of UNK tokens

In [9]:
tokenizer

PreTrainedTokenizer(name_or_path='skt/kobert-base-v1', vocab_size=8002, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [10]:
all_lines = [str(s) for s in food_train['식품오타']] + [str(s) for s in food_test['식품오타']]
all_lines[:10]

['하떡',
 '새우젓계란찜',
 '오징이덥밥',
 '참나물',
 '크림스파게티',
 '안심돈까쑤',
 '쇠고기뭇국',
 '채소샌드이치',
 '채소보끈빱',
 '볶은밥']

In [11]:
count = 0
for line in all_lines:
    tokens= tokenizer.tokenize(line)
    if tokens[0] == '[UNK]':
        count +=1
print('The number of UNK token : ', count)
print('The number of all food names : ', len(all_lines))
print('percentage of UNK token/all food names: ', (count/len(all_lines)*100))

The number of UNK token :  0
The number of all food names :  2921
percentage of UNK token/all food names:  0.0


# extracting embedding values

## set max fixed tokenized data size

In [12]:
max_token_length = 384
kobert_feats = np.zeros((len(train_lines), max_token_length*768))

## Batch processing

In [13]:
for i in track(range(len(train_lines))):
    encoded = tokenizer(train_lines[i], max_length=384, padding='max_length', truncation=True, return_tensors='pt')
    kobert_feats[i] = bert_model(**encoded).last_hidden_state.detach().numpy().reshape(max_token_length*768)

In [14]:
kobert_feats.shape

(2337, 294912)

In [15]:
np.save('kobert_food_feat1.npy',kobert_feats)

# Generate Text search engine

In [16]:
max_size = kobert_feats.shape[1]
print(max_size)

294912


In [17]:
x_kobert = tf.keras.Input(shape=(max_size,))
x = tf.keras.layers.Dense(units=kobert_feats.shape[0], activation='linear', name='fc1', use_bias=False)(x_kobert)
x_kobert_feat_matching = tf.keras.models.Model(inputs=x_kobert, outputs=x)

2024-02-28 11:52:03.332471: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-28 11:52:03.414803: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-28 11:52:03.414950: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-28 11:52:03.416119: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-28 11:52:03.416253: I external/local_xla/xla/stream_executor

In [18]:
temp_weights = x_kobert_feat_matching.get_weights()

In [19]:
temp_weights[-1] = kobert_feats.T
x_kobert_feat_matching.set_weights(temp_weights)

In [20]:
x_kobert_feat_matching.save_weights('kobert-food-ko-featmatching1.h5')

# Define chosen topics the model can handle

In [21]:
total_text = []
for each in track(train_lines):
    total_text.append(each)

In [22]:
topics = []

for each in train_lines:
    topics.append(each.split('<input>')[1])

topics = np.unique(topics).tolist()

# store sample chats
topic_samples = {}
for each in track(train_lines):
    current_topic = each.split('<input>')[1]
    names = each.split('<input>')[0]
    if current_topic in topic_samples:
        topic_samples[current_topic].append(names)
    else:
        topic_samples[current_topic] = []
        topic_samples[current_topic].append(names)

In [23]:
test_samples = {}
real_names = food_test['식품명'].tolist()
for i in range(len(real_names)):
    real_food = real_names[i]
    input_line = test_lines[i]
    test_samples[input_line] = real_food

print(test_samples)

{'다래나물무침<input>다래나물무침': '달래나물무침', '깬닙찜<input>깬닙찜': '깻잎찜', '등심돈까쓰<input>등심돈까쓰': '등심돈가스', '게살쭉<input>게살쭉': '게살죽', '버섯즌<input>버섯즌': '버섯전', '고추튀기<input>고추튀기': '고추튀김', '간장닭다리구이<input>간장닭다리구이': '간장양념닭다리구이', '버섯샤부<input>버섯샤부': '버섯샤브샤브', '닥꼬기냉채<input>닥꼬기냉채': '닭고기냉채', '소고기전골<input>소고기전골': '쇠고기전골', '삼겹살<input>삼겹살': '삼겹살구이', '딱뽀끔탕<input>딱뽀끔탕': '닭볶음탕', '냉이된장찌기<input>냉이된장찌기': '냉이된장찌개', '순두부김치찌기<input>순두부김치찌기': '순두부김치찌개', '가배기<input>가배기': '꽈배기', '쇠고기완자즌<input>쇠고기완자즌': '쇠고기완자전', '우렁된장구<input>우렁된장구': '우렁된장국', '불고기햄버거<input>불고기햄버거': '불고기버거', '소고기주먹밥<input>소고기주먹밥': '쇠고기주먹밥', '해덥밥<input>해덥밥': '회덮밥', '감저사라다<input>감저사라다': '감자샐러드', '소고기채소보끈<input>소고기채소보끈': '쇠고기채소볶음', '대지괴기산즉<input>대지괴기산즉': '돼지고기산적', '바지락조개궁<input>바지락조개궁': '바지락조개국', '오징어티김<input>오징어티김': '오징어튀김', '다시마멋침<input>다시마멋침': '다시마무침', '쇠고기고추장보끈<input>쇠고기고추장보끈': '쇠고기고추장볶음', '닥뽀끔<input>닥뽀끔': '닭볶음', '중국우동<input>중국우동': '우동(중식)', '오이장아지<input>오이장아지': '오이장아찌', '갓낌치<input>갓낌치': '갓김치', '계란찜(우유)<input>계란찜(우유)': '달걀찜(우유)', '우거질해장국<input>우거질해장국': '우거지해장국', '쌀국시<

# Text search engine response (for multiple items)

## Test data batch processing (multiple data)

In [24]:
max_token_length = 384
input_feats = np.zeros((len(test_lines), max_token_length*768))

for i in track(range(len(test_lines))):
    encoded = tokenizer(test_lines[i], max_length=384, padding='max_length', truncation=True, return_tensors='pt')
    input_feats[i] = bert_model(**encoded).last_hidden_state.detach().numpy().reshape(max_token_length*768)

In [25]:
# check inference time

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

# start the event
start.record()

# inference all at once
prediction = x_kobert_feat_matching.predict(input_feats)

end.record()

# wait until the event is done
torch.cuda.synchronize()

elapsed_time = start.elapsed_time(end)  # msec 
execution_per_sample = (elapsed_time / 1000) / len(test_lines)
print(f"Elapsed time: {execution_per_sample} seconds")  # convert to sec 


Elapsed time: 0.001440478703747057 seconds


## accuracy to test data

In [31]:
def response_acc(k,total_text, prediction):
    total_results = []
    for item in prediction:
        item_results = []
        for index in item.argsort()[-k::][::-1]:
            item_results.append(total_text[index].split('<input>')[1])
            item_set = set(item_results)
        total_results.append(item_set)

    count = 0
    for i in range(len(test_lines)):
        target = test_samples[test_lines[i]]
        for item in total_results:
            if target in item:
                    count +=1
                    break
    return (count/len(test_lines))*100

In [32]:
print('TOP 1 accuracy of response: ',response_acc(1,total_text, prediction))
print('TOP 3 accuracy of response: ',response_acc(3,total_text, prediction))
print('TOP 5 accuracy of response: ',response_acc(5,total_text, prediction))

TOP 1 accuracy of response:  79.7945205479452
TOP 3 accuracy of response:  90.23972602739725
TOP 5 accuracy of response:  93.83561643835617


# Text search engine response for 1 item

In [33]:
input_keyword = '노각머침<input>노각머침'

encoded = tokenizer(input_keyword, max_length=384, padding='max_length', truncation=True, return_tensors='pt')
temp_feats = bert_model(**encoded).last_hidden_state.detach().numpy().reshape(1,384*768)

one_prediction = x_kobert_feat_matching.predict(temp_feats).reshape(-1)
print(one_prediction.shape)

(2337,)


In [34]:
k=5
for index in one_prediction.argsort()[-k:][::-1]:
    print (total_text[index])

노각머침<input>노각무침
노각물침<input>노각무침
노각무침<input>노각무침
낙지무침<input>낙지무침
해초머침<input>해초무침


## chatbot response for 1 item

In [35]:
total_text[one_prediction.argsort()[-5:][::-1][0]].split('<input>')[1]

'노각무침'

## Find nutrient with matched food names (for chat response for 1 item)

In [36]:
def find_food_info(df, target_column, chat_response):
    food_info = df[df[target_column] == chat_response]
    return food_info

In [37]:
chat_response = total_text[one_prediction.argsort()[-5:][::-1][0]].split('<input>')[1]
find_food_info(food_nutrient_info,'food_item',chat_response)

Unnamed: 0,food_item,serving_size,calories (kcal),protein (g),protein (g).1,protein (g).2,protein (g).3,carbohydrate (g),sugar (g)
76,노각무침,150,84.0,2.6,1.8,2.6,1.8,14.3,7.1


In [42]:
def calculate_precision_recall_f1(k, total_text, label_dict):
    predictions = []
    for item in prediction:
        item_results = []
        for index in item.argsort()[-k::][::-1]:
            item_results.append(total_text[index].split('<input>')[1])
            item_set = set(item_results)
        predictions.append(item_set)
        
    # create truth label table 
    correct_labels_set = set(label_dict.values())

    # count correct prediction from the prediction
    correct_predictions_count = sum(1 for pred in predictions if any(label in pred for label in correct_labels_set))

    predictions_count = len(predictions)

    total_labels_count = len(correct_labels_set)

    # Precision
    precision = correct_predictions_count / predictions_count if predictions_count else 0

    # Recall and make sure it never go over 1.0
    recall = correct_predictions_count / total_labels_count if total_labels_count else 0
    recall = min(recall, 1.0)

    # F1-Score 
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0

    return precision, recall, f1_score

In [43]:
calculate_precision_recall_f1(1, total_text, test_samples)

(0.8458904109589042, 1.0, 0.9165120593692023)