In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import model_from_json
from transformers import BertTokenizer, TFBertModel

import tqdm

# distilbert-base-multilingual-cased
# bert-base-multilingual-cased
tokenizer = BertTokenizer.from_pretrained("distilbert-base-multilingual-cased")
bert_model = TFBertModel.from_pretrained("distilbert-base-multilingual-cased", output_hidden_states=True) 

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
2023-12-11 15:06:44.174151: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2023-12-11 15:06:44.174186: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2023-12-11 15:06:44.174192: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2023-12-11 15:06:44.174310: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-12-11 1

### Read Pre-generated WordNet train/test datasets

In [2]:
food_train = pd.read_csv('WordNet_train_for_distilBERT.csv')
food_test = pd.read_csv('WordNet_test_for_distilBERT.csv')

In [3]:
food_train.head()

Unnamed: 0,original_words,typos,name_line,label
0,intumescency,intudescency,intumescbency<input>intumescency,0
1,invasion_of_iwo,invasioan_of_iwo,invasicn_of_iwo<input>invasion_of_iwo,1
2,occupational_therapy,occupational_twherapy,occupationai_therapy<input>occupational_therapy,2
3,inconspicuous,inconspiuous,inconrspicuous<input>inconspicuous,3
4,highly_strung,highlystrung,highly_stzung<input>highly_strung,4


In [4]:
food_test.head()

Unnamed: 0,original_words,typos,name_line,label
0,cavalry_sword,cavalryfsword,cavalryisword<input>cavalry_sword,234
1,sweatbox,swetbox,swatbox<input>sweatbox,230
2,ischium,ischim,iscpium<input>ischium,2011
3,star-glory,star-mlory,star-gloy<input>star-glory,2128
4,prosciutto,prosciatto,prokciutto<input>prosciutto,1917


### Pre-process just like we did in Korean food dataset

In [5]:
train_lines = [str(s) for s in food_train['name_line']]
test_lines = [str(s) for s in food_test['name_line']]
print(train_lines[:10])
print(test_lines[:10])

['intumescbency<input>intumescency', 'invasicn_of_iwo<input>invasion_of_iwo', 'occupationai_therapy<input>occupational_therapy', 'inconrspicuous<input>inconspicuous', 'highly_stzung<input>highly_strung', 'ehoplift<input>shoplift', 'newtown_wondemr<input>newtown_wonder', 'deviled_tegg<input>deviled_egg', 'fohlktale<input>folktale', 'secondary_coib<input>secondary_coil']
['cavalryisword<input>cavalry_sword', 'swatbox<input>sweatbox', 'iscpium<input>ischium', 'star-gloy<input>star-glory', 'prokciutto<input>prosciutto', 'fwmily_megatheriidae<input>family_megatheriidae', 'cysually<input>casually', 'petaloia<input>petaloid', 'dashng<input>dashing', "pikeb's_peak<input>pike's_peak"]


In [6]:
all_lines = [str(s) for s in food_train['typos']] + [str(s) for s in food_test['typos']]
all_lines[:10]

['intudescency',
 'invasioan_of_iwo',
 'occupational_twherapy',
 'inconspiuous',
 'highlystrung',
 'shoplifs',
 'newgown_wonder',
 'deviled_hgg',
 'folktafle',
 'secondtry_coil']

### Check if DistilBERT produces UNK tokens for WordNet dataset

In [7]:
count = 0
for line in all_lines:
    tokens= tokenizer.tokenize(line)
    if tokens[0] == '[UNK]':
        count +=1
print('The number of UNK token : ', count)
print('The number of all food names : ', len(all_lines))
print('percentage of UNK token/all food names: ', ((count/len(all_lines))*100))

The number of UNK token :  0
The number of all food names :  2876
percentage of UNK token/all food names:  0.0


In [8]:
max_token_length = 384
bert_feats = np.zeros((len(train_lines), max_token_length * 768))

In [9]:
def get_hidden_states(model, inputs):
    with tf.GradientTape() as tape:
        tape.watch(inputs)
        outputs = model(inputs)
        hidden_states = outputs.last_hidden_state
    return hidden_states

### Extract hidden states from DistilBERT 

In [10]:
for idx, sentence in enumerate(tqdm.tqdm(train_lines)):
    inputs = tokenizer(sentence, return_tensors="tf", truncation=True, padding="max_length", max_length=max_token_length)
    hidden_states = get_hidden_states(bert_model, inputs["input_ids"])
    bert_feats[idx] = hidden_states.numpy().reshape(-1)

  0%|          | 0/2338 [00:00<?, ?it/s]



  0%|          | 1/2338 [00:00<15:41,  2.48it/s]



  0%|          | 2/2338 [00:00<09:02,  4.31it/s]



  0%|          | 3/2338 [00:00<06:52,  5.66it/s]



  0%|          | 4/2338 [00:00<05:49,  6.67it/s]



100%|██████████| 2338/2338 [04:09<00:00,  9.36it/s]


### Check the extracted feature size

In [11]:
bert_feats.shape

(2338, 294912)

### Save to an NPY file

In [39]:
np.save('WordNet_distilBERT.npy', bert_feats)