## GPT2 Feature Matching Part 1
<br> 
The extracted features are very large and memory-hungry, so we save them to an `.npy` file for later use.

In [1]:
import numpy as np
import pandas as pd
import torch
from rich.progress import track
from transformers import AutoTokenizer, AutoModel

2023-10-02 18:45:43.178542: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### The pre-trained model

In [2]:
# Pre-trained Korean GPT-2 checkpoint from the HuggingFace Hub.
# Per the original author's note, "eaglewatch/gpt2-ko-wikipedia" was trained on the
# "eaglewatch/Korean_Wikipedia_Dataset_for_GPT2_August_2022" dataset (also on HuggingFace),
# which can be downloaded as follows:
#
#   from datasets import load_dataset
#   dataset = load_dataset("eaglewatch/Korean_Wikipedia_Dataset_for_GPT2_August_2022")

model_name = "eaglewatch/gpt2-ko-wikipedia"

# The tokenizer and the model are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# output_hidden_states=True is a config override requesting per-layer hidden states in
# the model output; the extraction loop in this notebook only reads `last_hidden_state`.
gpt_model = AutoModel.from_pretrained(model_name, output_hidden_states=True)

### Food Nutrient Data

In [3]:
# Read the nutrient table from disk, then preview its first five rows as the cell output.
nutrient_csv = 'data/food_nutrient_info_finalized.csv'
food_df = pd.read_csv(nutrient_csv)
food_df.head(n=5)

Unnamed: 0,food_item,serving_size,calories (kcal),protein (g),protein (g).1,protein (g).2,protein (g).3,carbohydrate (g),sugar (g)
0,닭갈비,400,595.61,45.9,25.8,45.9,25.8,44.9,21.2
1,닭꼬치,70,176.72,11.56,8.57,11.56,8.57,13.35,3.15
2,더덕구이,100,184.0,3.1,5.2,3.1,5.2,31.1,11.6
3,소양념갈비구이,300,989.15,60.1,71.6,60.1,71.6,26.2,13.9
4,양념장어구이,150,433.35,30.77,30.56,30.77,30.56,8.8,4.18


### Korean Food Data (Train and Test)

In [4]:
# Load the two splits of the food-name data (train first, then test).
train_csv = 'data/food_aging_train.csv'
test_csv = 'data/food_aging_test.csv'

food_train = pd.read_csv(train_csv)
food_test = pd.read_csv(test_csv)

In [5]:
# Column names: '식품오타' = misspelled food name, '식품명' = correct food name.
# Training text is "<misspelled><input><correct>"; test text stops after the
# "<input>" marker. Both columns are added to the frames in place.
typo_train = food_train['식품오타']
typo_test = food_test['식품오타']

food_train['food_line'] = typo_train + '<input>' + food_train['식품명']
food_test['test_input'] = typo_test + '<input>'

In [6]:
# Preview the first five training rows, including the new 'food_line' column.
food_train.head(n=5)

Unnamed: 0,식품명,식품오타,food_line
0,호떡,하떡,하떡<input>호떡
1,달걀찜(새우젓),새우젓계란찜,새우젓계란찜<input>달걀찜(새우젓)
2,오징어덮밥,오징이덥밥,오징이덥밥<input>오징어덮밥
3,참나물무침,참나물,참나물<input>참나물무침
4,크림소스스파게티,크림스파게티,크림스파게티<input>크림소스스파게티


In [7]:
# Preview the first five test rows, including the new 'test_input' column.
food_test.head(n=5)

Unnamed: 0,식품명,식품오타,test_input
0,달래나물무침,다래나물무침,다래나물무침<input>
1,깻잎찜,깬닙찜,깬닙찜<input>
2,등심돈가스,등심돈까쓰,등심돈까쓰<input>
3,게살죽,게살쭉,게살쭉<input>
4,버섯전,버섯즌,버섯즌<input>


In [8]:
# Report (rows, columns) for the train split and then the test split, one per line.
for split_df in (food_train, food_test):
    print(split_df.shape)

(2337, 3)
(584, 3)


### Check the Train and Test lines

In [9]:
# Turn both text columns into plain Python lists of str (every element goes through
# str()), then print the first ten entries of each list as a sanity check.
train_lines = list(map(str, food_train['food_line']))
test_lines = list(map(str, food_test['test_input']))

for preview in (train_lines, test_lines):
    print(preview[:10])

['하떡<input>호떡', '새우젓계란찜<input>달걀찜(새우젓)', '오징이덥밥<input>오징어덮밥', '참나물<input>참나물무침', '크림스파게티<input>크림소스스파게티', '안심돈까쑤<input>안심돈가스', '쇠고기뭇국<input>쇠고기무국', '채소샌드이치<input>채소샌드위치', '채소보끈빱<input>채소볶음밥', '볶은밥<input>볶음밥']
['다래나물무침<input>', '깬닙찜<input>', '등심돈까쓰<input>', '게살쭉<input>', '버섯즌<input>', '고추튀기<input>', '간장닭다리구이<input>', '버섯샤부<input>', '닥꼬기냉채<input>', '소고기전골<input>']


In [10]:
# Display the tokenizer's repr to inspect its settings (vocab size, padding side, special tokens).
tokenizer

GPT2TokenizerFast(name_or_path='eaglewatch/gpt2-ko-wikipedia', vocab_size=100000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True)

### Set the fixed tokenized sequence length
Then allocate the fixed-size feature array

In [11]:
# Every line is padded/truncated to this many tokens so that all feature rows have equal width.
max_token_length = 1024

# Size of one token's hidden vector, read from the model config instead of hard-coding it
# (768 for this checkpoint).
hidden_size = gpt_model.config.hidden_size

# One flattened (max_token_length * hidden_size) feature row per training line.
# float32 is requested explicitly: np.zeros defaults to float64, which doubles both the
# RAM footprint and the size of the saved .npy file without adding any precision, since
# the model (loaded with its default dtype) produces float32 activations.
gpt_feats = np.zeros((len(train_lines), max_token_length * hidden_size), dtype=np.float32)

In [12]:
# Confirm the buffer dimensions: (number of training lines, flattened feature width).
np.shape(gpt_feats)

(2337, 786432)

### Extract Features
* Feature extraction with PyTorch
* It takes about 30–60 minutes, depending on the machine


In [15]:
%%time 
for i in track(range(len(train_lines))):   
    encoded = tokenizer(train_lines[i], max_length=1024, padding='max_length', truncation=True, return_tensors='pt') 
    gpt_feats[i] = gpt_model(**encoded).last_hidden_state.detach().numpy().reshape(1024*768)

Output()

CPU times: user 6h 54min 52s, sys: 39min, total: 7h 33min 53s
Wall time: 28min 22s


In [16]:
# Re-check the feature matrix dimensions after extraction.
np.shape(gpt_feats)

(2337, 786432)

### Save the NPY file

In [19]:
# Write the feature matrix to disk in NumPy's binary .npy format.
np.save(file="gpt_feat_food4.npy", arr=gpt_feats)