In [2]:
import pandas as pd
import json
import os
from tqdm import tqdm
from collections import defaultdict

In [3]:
def load_jsonl(filepath):
    with open(filepath, 'r', encoding='utf-8') as json_file:
        json_list = list(json_file)

    jsons = []

    for json_str in json_list:
        jsons.append(json.loads(json_str))

    return jsons

In [4]:
root_dir = "./MASSIVE/"

In [5]:
train_filelist = os.listdir(f"{root_dir}/train_data")
test_filelist = os.listdir(f"{root_dir}/test_data")
len(train_filelist), len(test_filelist)

(52, 51)

In [6]:
train_texts = []
train_labels = []

for item in train_filelist:
    data = load_jsonl(f"{root_dir}/train_data/{item}")

    for example in tqdm(data, f"loading {item}"):
        train_texts.append(example["utt"])
        train_labels.append(example["intent"])

loading af-ZA.jsonl: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16521/16521 [00:00<00:00, 1312587.07it/s]
loading am-ET.jsonl: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16521/16521 [00:00<00:00, 1301126.54it/s]
loading ar-SA.jsonl: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16521/16521 [00:00<00:00, 1395539.06it/s]
loading az-AZ.jsonl: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16521/16521 [00:00<00:00, 917826.91it/s]
loading bn-BD.jsonl: 100%|██████████████████████████████████████████████████

In [7]:
from tensorflow.keras.utils import to_categorical
from pandas import Series

2023-08-11 17:35:22.847531: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
unq = Series(train_labels).unique()
idx = Series(Series(train_labels).unique()).index
mapping = Series(idx, index=unq)
train_labels = to_categorical([mapping[item] for item in train_labels])

In [9]:
train_labels.shape

(859092, 60)

In [10]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [11]:
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text  # Needed for loading universal-sentence-encoder-cmlm/multilingual-preprocess
import numpy as np

preprocessor = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2")
encoder = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base-br/1", trainable=False)

# english_embeds = encoder(preprocessor(english_sentences))["default"]
# japanese_embeds = encoder(preprocessor(japanese_sentences))["default"]
# italian_embeds = encoder(preprocessor(italian_sentences))["default"]

2023-08-11 17:35:31.061994: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-11 17:35:31.278018: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-11 17:35:31.278387: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-11 17:35:31.280736: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-08-11 17:35:31.281482: I tensorflow/compile

In [12]:
BATCH_SIZE = 64

dataset = (tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
            .shuffle(buffer_size=1000)
            .batch(BATCH_SIZE))

DATASET_SIZE = len(dataset)

split = 0.8

train_size = int(split * DATASET_SIZE)
val_size = int(1-split * DATASET_SIZE)

train_ds = dataset.take(train_size)
validation_ds = dataset.skip(train_size)

len(train_ds), len(validation_ds)

(10739, 2685)

In [13]:
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from pandas import Series

In [14]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
x = preprocessor(text_input)
x = encoder(x)["default"]
x = layers.Dense(train_labels.shape[-1], activation='softmax')(x)

model = Model(inputs=text_input, outputs=x)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

2023-08-11 17:36:03.944936: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string and shape [?]
	 [[{{node inputs}}]]
2023-08-11 17:36:03.998703: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder' with dtype string and shape [?]
	 [[{{node Placeholder}}]]
2023-08-11 17:36:04.409075: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,128]
	 [[{{node inputs}}]]
2023-08-11 1

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['input_1[0][0]']                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

2023-08-11 17:36:04.625316: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_1' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_1}}]]
2023-08-11 17:36:04.625458: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_2' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_2}}]]


 keras_layer_1 (KerasLayer)     {'pooled_output': (  470926849   ['keras_layer[0][0]',            
                                None, 768),                       'keras_layer[0][1]',            
                                 'sequence_output':               'keras_layer[0][2]']            
                                 (None, 128, 768),                                                
                                 'default': (None,                                                
                                768)}                                                             
                                                                                                  
 dense (Dense)                  (None, 60)           46140       ['keras_layer_1[0][0]']          
                                                                                                  
Total params: 470,972,989
Trainable params: 46,140
Non-trainable params: 470,926,849
________________________

In [19]:
# ModelCheckpoint: Save the model's weights after every epoch
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='./chechkpoints/model_checkpoint.h5', save_best_only=True)
# EarlyStopping: Stop training when a monitored metric has stopped improving
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [28]:
history = model.fit(train_ds, validation_data=validation_ds, epochs=5, checkpoints=[model_checkpoint, early_stopping])

Epoch 1/5


2023-08-11 17:30:33.933132: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [859092]
	 [[{{node Placeholder/_0}}]]
2023-08-11 17:30:33.934146: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype float and shape [859092,60]
	 [[{{node Placeholder/_1}}]]


  102/10739 [..............................] - ETA: 3:26:17 - loss: 1.6250 - accuracy: 0.6011

KeyboardInterrupt: 