In [2]:
import os
import pickle
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_tuner as kt

In [3]:
import tensorflow_text
import tensorflow_hub as hub
from tensorflow.keras.models import load_model

In [4]:
from tensorflow import keras
from tensorflow.keras.layers import (
    Input, Dense, Dropout, BatchNormalization
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import (
    ModelCheckpoint, EarlyStopping
)
from transformers import TFAutoModel, AutoTokenizer
from sklearn.model_selection import StratifiedShuffleSplit

In [5]:
# run /kaggle/input/make-datasets
# run /kaggle/input/make-embeddings

In [6]:
get_embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
# bert_name = "neuralmind/bert-base-portuguese-cased"
# bert_model = TFAutoModel.from_pretrained(bert_name)
# tokenizer = AutoTokenizer.from_pretrained(bert_name)
# def get_embed(texts, max_length=128):
#     inputs = tokenizer(
#         texts, padding=True, 
#         truncation=True, 
#         max_length=max_length, 
#         return_tensors="tf")
#     outputs = bert_model(inputs)
#     embeds = outputs.last_hidden_state[:, 0, :]
#     return embeds

In [7]:
#df = pd.read_csv('/kaggle/input/datasets-snn/df.csv')
df = pd.read_csv('/kaggle/input/snn-datasets-0x1/df_fs.csv')
df.dropna(inplace=True)
df = df.reset_index(drop=True)
print(df.shape)
df.head()

(4020, 5)


Unnamed: 0,name,price,category,brand,target
0,aro de aco rolete polegada axion,24.9,complementos,AXION,2
1,luva soldavel com rosca em pvc pol fortlev,1.89,conexoes,FORTLEV,3
2,te reducao esgoto sn em pvc branco fortlev,16.9,conexoes,FORTLEV,3
3,te esgoto sn em pvc branco fortlev,2.59,conexoes,FORTLEV,3
4,ralo sifonado quadrado em pvc xxmm branco fortlev,10.9,conexoes,FORTLEV,3


In [8]:
def split_data(df):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_valid_idx, test_idx in sss.split(df, df['target']):
        train_valid = df.loc[train_valid_idx].reset_index(drop=True)
        test = df.loc[test_idx].reset_index(drop=True)

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
    for train_idx, valid_idx in sss.split(train_valid, train_valid['target']):
        train = train_valid.loc[train_idx].reset_index(drop=True)
        valid = train_valid.loc[valid_idx].reset_index(drop=True)

    return train, valid, test


train, test, valid = split_data(df)
embed_dim = get_embed(train.loc[:1, 'name'].values.tolist()).shape[1]
print(f"train: {train.shape}, test: {test.shape}, valid: {valid.shape}, embe: {embed_dim}")
print(train.shape, test.shape, valid.shape)

train: (2412, 5), test: (804, 5), valid: (804, 5), embe: 512
(2412, 5) (804, 5) (804, 5)


In [9]:
class L2NormalizeLayer(keras.Layer):
    def __init__(self, **kwargs):
        super(L2NormalizeLayer, self).__init__(**kwargs)

    def call(self, inputs):
        return tf.math.l2_normalize(inputs, axis=1)

class TripletLossBlock(keras.Layer):
    def __init__(self, alpha, **kwargs):
        self.alpha = alpha
        super(TripletLossBlock, self).__init__(**kwargs)
    
    def triplet_loss(self, inputs):
        a, p, n = inputs
        p_dist = keras.ops.sum(keras.ops.square(a - p), axis=-1)
        n_dist = keras.ops.sum(keras.ops.square(a - n), axis=-1)
        return keras.ops.sum(keras.ops.maximum(p_dist - n_dist + self.alpha, 0), axis=0)
    
    def call(self, inputs):
        loss = self.triplet_loss(inputs)
        self.add_loss(loss)
        return loss
        
def build_model(hp):
    input_one = Input(shape=(embed_dim,))
    
    dense1_units = hp.Int('dense1_units', min_value=128, max_value=512, step=64)
    dense2_units = hp.Int('dense2_units', min_value=64, max_value=256, step=32)
    dense3_units = hp.Int('dense3_units', min_value=64, max_value=256, step=16)
    dense_layer = hp.Int('dense_layer', min_value=64, max_value=128, step=32)
    
    x = Dense(units=dense1_units, activation='relu')(input_one)
    x = Dropout(hp.Float('dropout1', min_value=0.2, max_value=0.5, step=0.1))(x)
    x = BatchNormalization()(x)
    
    x = Dense(units=dense2_units, activation='relu', 
              kernel_regularizer=keras.regularizers.l2(
                  hp.Float('l2_reg', min_value=0.001, max_value=0.01, sampling='log')))(x)
    x = Dropout(hp.Float('dropout2', min_value=0.2, max_value=0.5, step=0.1))(x)

    x = Dense(units=dense3_units, activation='relu')(input_one)
    x = Dropout(hp.Float('dropout3', min_value=0.2, max_value=0.5, step=0.1))(x)
    x = BatchNormalization()(x)
    
    dense_layer = Dense(units=dense_layer, name='dense_layer')(x)
    norm_layer = L2NormalizeLayer(name='norm_layer')(dense_layer)
    
    base_model = Model(inputs=input_one, outputs=norm_layer, name='base_model')
    
    input_a = Input(shape=(embed_dim,))
    input_p = Input(shape=(embed_dim,))
    input_n = Input(shape=(embed_dim,))
    
    embed_a = base_model(input_a)
    embed_p = base_model(input_p)
    embed_n = base_model(input_n)
    
    alpha = hp.Float('alpha', min_value=0.2, max_value=0.8, step=0.2)
    triplet_loss = TripletLossBlock(
        alpha=alpha, name='triplet_loss_block')([embed_a, embed_p, embed_n])
    
    snn_model = Model([input_a, input_p, input_n], triplet_loss)
    
    snn_model.compile(
        optimizer=keras.optimizers.Adam(
            hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')),
        loss=None
    )
    
    return snn_model

tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=20,
    executions_per_trial=1,
    directory='hyperparam_tuning_bert',
    project_name='siamese_bert_tuning'
)

tuner.search_space_summary()

Search space summary
Default search space size: 10
dense1_units (Int)
{'default': None, 'conditions': [], 'min_value': 128, 'max_value': 512, 'step': 64, 'sampling': 'linear'}
dense2_units (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 32, 'sampling': 'linear'}
dense3_units (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 256, 'step': 16, 'sampling': 'linear'}
dense_layer (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 128, 'step': 32, 'sampling': 'linear'}
dropout1 (Float)
{'default': 0.2, 'conditions': [], 'min_value': 0.2, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
l2_reg (Float)
{'default': 0.001, 'conditions': [], 'min_value': 0.001, 'max_value': 0.01, 'step': None, 'sampling': 'log'}
dropout2 (Float)
{'default': 0.2, 'conditions': [], 'min_value': 0.2, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
dropout3 (Float)
{'default': 0.2, 'conditions': [], 'min_value': 0.2, 'max_value': 

In [10]:
def get_triplets(unique_labels, label_indices_map):
    label_l, label_r = np.random.choice(unique_labels, 2, replace=False)
    a, p = np.random.choice(label_indices_map[label_l], 2, replace=False)
    n = np.random.choice(label_indices_map[label_r])
    return a, p, n

def get_batch(batch_size, dataset, unique_labels, label_indices_map, get_embed):
    while True:
        idxs_a, idxs_p, idxs_n = [], [], []
        for _ in range(batch_size):
            a, p, n = get_triplets(unique_labels, label_indices_map)
            idxs_a.append(a)
            idxs_p.append(p)
            idxs_n.append(n)

        a = dataset.iloc[idxs_a].values.tolist()
        b = dataset.iloc[idxs_p].values.tolist()
        c = dataset.iloc[idxs_n].values.tolist()

        a = get_embed(a).numpy()
        p = get_embed(b).numpy()
        n = get_embed(c).numpy()

        yield a, p, n

def create_label_indices_map(dataset, collabel):
    unique_labels = np.array(dataset[collabel].unique().tolist())
    labels = np.array(dataset[collabel].tolist())
    label_indices_map = {
        label: np.flatnonzero(labels == label)
        for label in unique_labels
    }
    return unique_labels, label_indices_map

def triplet_generator(batch_size, dataset, unique_labels, label_indices_map, get_embed):
    while True:
        for a, p, n in get_batch(batch_size, dataset, unique_labels, label_indices_map, get_embed):
            yield (a, p, n), np.zeros((batch_size,))

def create_trip_dtset(batch_size, dataset, colfeat, collabel, get_embed, embed_dim=512):
    unique_labels, label_indices_map = create_label_indices_map(dataset, collabel)
    
    output_signature = (
        (
            tf.TensorSpec(shape=(None, embed_dim), dtype=tf.float32),
            tf.TensorSpec(shape=(None, embed_dim), dtype=tf.float32),
            tf.TensorSpec(shape=(None, embed_dim), dtype=tf.float32)
        ),
        tf.TensorSpec(shape=(None,), dtype=tf.float32)
    )
    
    dataset_tf = tf.data.Dataset.from_generator(
        lambda: triplet_generator(
            batch_size, dataset[colfeat], unique_labels, label_indices_map, get_embed),
        output_signature=output_signature
    )
    return dataset_tf

In [11]:
batch_size, colfeat, collabel = 64, 'name', 'target'
train = train
valid = valid
dataset_train = create_trip_dtset(batch_size, train, colfeat, collabel, get_embed, embed_dim)
dataset_val = create_trip_dtset(batch_size, valid, colfeat, collabel, get_embed, embed_dim)
steps = len(train) // batch_size
dataset_train

<_FlatMapDataset element_spec=((TensorSpec(shape=(None, 512), dtype=tf.float32, name=None), TensorSpec(shape=(None, 512), dtype=tf.float32, name=None), TensorSpec(shape=(None, 512), dtype=tf.float32, name=None)), TensorSpec(shape=(None,), dtype=tf.float32, name=None))>

In [12]:
checkpoint = ModelCheckpoint(
    filepath='besta_snn_model.keras',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1
)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    restore_best_weights=True
)

tuner.search(
    dataset_train,
    epochs=100,
    steps_per_epoch=steps,
    validation_data=dataset_val,
    validation_steps=steps,
    callbacks=[checkpoint, early_stopping]
)

Trial 20 Complete [00h 00m 39s]
val_loss: 0.49811220169067383

Best val_loss So Far: 0.3929402530193329
Total elapsed time: 00h 14m 21s


In [13]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Melhores hiperparâmetros encontrados:")
print(f" - dense1_units: {best_hps.get('dense1_units')}")
print(f" - dense2_units: {best_hps.get('dense2_units')}")
print(f" - dense3_units: {best_hps.get('dense3_units')}")
print(f" - dense_layer_final: {best_hps.get('dense_layer')}")
print(f" - dropout1: {best_hps.get('dropout1')}")
print(f" - dropout2: {best_hps.get('dropout2')}")
print(f" - dropout3: {best_hps.get('dropout2')}")
print(f" - l2_reg: {best_hps.get('l2_reg')}")
print(f" - alpha: {best_hps.get('alpha')}")
print(f" - learning_rate: {best_hps.get('learning_rate')}")

Melhores hiperparâmetros encontrados:
 - dense1_units: 256
 - dense2_units: 64
 - dense3_units: 256
 - dense_layer_final: 128
 - dropout1: 0.2
 - dropout2: 0.4
 - dropout3: 0.4
 - l2_reg: 0.007347869869871025
 - alpha: 0.2
 - learning_rate: 0.0028505050405801123


In [14]:
best_snn = tuner.hypermodel.build(best_hps)
best_snn.fit(
    dataset_train,
    epochs=100,
    steps_per_epoch=steps,
    validation_data=dataset_val,
    validation_steps=steps,
    callbacks=[checkpoint, early_stopping]
)

Epoch 1/100
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 28ms/step - loss: 6.3747
Epoch 1: val_loss improved from inf to 1.61747, saving model to besta_snn_model.keras
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 71ms/step - loss: 6.2752 - val_loss: 1.6175
Epoch 2/100
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 28ms/step - loss: 2.2423
Epoch 2: val_loss improved from 1.61747 to 1.39853, saving model to besta_snn_model.keras
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 58ms/step - loss: 2.2427 - val_loss: 1.3985
Epoch 3/100
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 26ms/step - loss: 1.6008
Epoch 3: val_loss improved from 1.39853 to 1.26818, saving model to besta_snn_model.keras
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - loss: 1.5975 - val_loss: 1.2682
Epoch 4/100
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 25ms/step - loss: 1.2004
Epoch 

<keras.src.callbacks.history.History at 0x7ef9d0361090>

In [15]:
best_snn.summary()

In [16]:
base_snn_model = load_model(
    'besta_snn_model.keras', 
    custom_objects={
        'L2NormalizeLayer': L2NormalizeLayer,
        'TripletLossBlock': TripletLossBlock
})
base_model = base_snn_model.get_layer('base_model')
base_model.summary()