In [2]:
import keras
import numpy as np
import pandas as pd
import xgboost as xgb
import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from tensorflow.keras.models import load_model
#%run make_datasets.ipynb

In [3]:
df = pd.read_csv('/kaggle/input/datasets-snn/df.csv')
df.dropna(inplace=True)
df = df.reset_index(drop=True)
print(df.shape)
df.head()
use_embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

(6867, 5)


In [4]:
class L2NormalizeLayer(keras.Layer):
    def __init__(self, **kwargs):
        super(L2NormalizeLayer, self).__init__(**kwargs)

    def call(self, inputs):
        return tf.math.l2_normalize(inputs, axis=1)

class TripletLossBlock(keras.Layer):
    def __init__(self, alpha, **kwargs):
        self.alpha = alpha
        super(TripletLossBlock, self).__init__(**kwargs)
    
    def triplet_loss(self, inputs):
        a, p, n = inputs
        p_dist = keras.ops.sum(keras.ops.square(a - p), axis=-1)
        n_dist = keras.ops.sum(keras.ops.square(a - n), axis=-1)
        return keras.ops.sum(keras.ops.maximum(p_dist - n_dist + self.alpha, 0), axis=0)
    
    def call(self, inputs):
        loss = self.triplet_loss(inputs)
        self.add_loss(loss)
        return loss

In [5]:
base_snn_model = load_model(
    '/kaggle/input/snn/keras/default/1/main_snn_model.keras', 
    custom_objects={
    'L2NormalizeLayer': L2NormalizeLayer,
    'TripletLossBlock': TripletLossBlock
})
base_model = base_snn_model.get_layer('base_model')
base_model.summary()

In [6]:
df.category.value_counts()

category
tintas                         547
cozinha                        435
torneiras                      422
acess_sanitarios               373
interruptores_tomadas          366
ferragens                      341
fechaduras                     334
complementos                   326
acess_ferramentas              246
conexoes                       215
pisos                          211
porcelanatos                   204
limpeza                        196
ferramentas_manuais            195
acess_conexoes_eletricas       183
registros                      173
moveis                         162
luminarias_refletores          152
acess_pintura                  150
argamassas_rejuntes            135
pinceis                        135
discos_rebolos                 127
revestimentos                  125
epi                            124
banho                          123
chuveiros                      118
lampadas                       116
cubas_pias_balcoes             116
acess_jardi

In [7]:
def split_train_test(df):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
    for train_valid_idx, test_idx in sss.split(df, df['target']):
        train = df.loc[train_valid_idx].reset_index(drop=True)
        test = df.loc[test_idx].reset_index(drop=True)

    return train, test

In [8]:
def get_embeddings(texts):
    return base_model.predict(use_embed(texts).numpy())

In [9]:
def get_weights(y):
    n_class = np.bincount(y)
    n_samples = len(y)
    class_weights = n_samples / (len(n_class) * n_class)
    weights = np.array([class_weights[label] for label in y])
    return weights

In [10]:
def get_model(X_train, y_train):
    weights = get_weights(y_train)

    model = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=len(np.unique(y_train)),
        max_depth=6,
        tree_method="hist", 
        device="cuda",
        learning_rate=0.1,
        n_estimators=100,
        eval_metric='mlogloss',
        random_state=42
    )
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    cv_results = {metric: cross_val_score(model, X_train, y_train, cv=cv, scoring=metric,
                                          fit_params={'sample_weight': weights})
                  for metric in scoring}
    
    for metric, scores in cv_results.items():
        print(f"{metric.capitalize()}: {scores.mean():.2f} ± {scores.std():.2f}")
    
    model.fit(X_train, y_train)
    return model

* # Testando a partir de categoria com 100 exemplos

In [11]:
categories = df.category.value_counts()
categories = categories[categories >= 100].index
new_df = df[df.category.isin(categories)]
new_df.reset_index(inplace=True)
new_df.category.value_counts()

train, test = split_train_test(new_df)
print(f"train: {train.shape}, test: {test.shape}")

train: (4806, 6), test: (2061, 6)


In [12]:
label_encoder = LabelEncoder()
y_train0 = label_encoder.fit_transform(train['target'].values)
y_test0 = label_encoder.transform(test['target'].values)

X_train0 = get_embeddings(train['name'].tolist())
X_test0 = get_embeddings(test['name'].tolist())

model0 = get_model(X_train0, y_train0)
y_pred0 = model0.predict(X_test0)
print(classification_report(y_test0, y_pred0))

y_pred_prob0 = model0.predict_proba(X_test0)
roc_auc0 = roc_auc_score(y_test0, y_pred_prob0, multi_class='ovr')
print(f"ROC-AUC Score: {roc_auc0:.4f}")

[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Accuracy: 0.82 ± 0.01
Precision_macro: 0.79 ± 0.02
Recall_macro: 0.76 ± 0.02
F1_macro: 0.77 ± 0.02
              precision    recall  f1-score   support

           0       0.72      0.53      0.61        55
           1       0.67      0.82      0.74        74
           2       0.86      0.60      0.71        30
           3       0.80      0.50      0.62        32
           4       0.63      0.49      0.55        45
           5       0.67      0.88      0.76       112
           6       0.79      0.35      0.49        31
           7       0.59      0.60      0.59        40
           8       0.97      0.84      0.90        37
           9       0.79      0.71      0.75        31
          10       0.81      0.74      0.78        35
          11       0.87      0.92      0.90        98
          12       0.90      0.98      0.94        65
          13       0.66      0.85      0.74       131
          14       0.80      0.80      0.80        35
          15       0.81      0.89   

* # Testando a partir de categoria com 150 exemplos

In [13]:
categories = df.category.value_counts()
categories = categories[categories >= 150].index
new_df = df[df.category.isin(categories)]
new_df.reset_index(inplace=True)
new_df.category.value_counts()

train, test = split_train_test(new_df)
print(f"train: {train.shape}, test: {test.shape}")

train: (3661, 6), test: (1570, 6)


In [14]:
label_encoder = LabelEncoder()
y_train1 = label_encoder.fit_transform(train['target'].values)
y_test1 = label_encoder.transform(test['target'].values)

X_train1 = get_embeddings(train['name'].tolist())
X_test1 = get_embeddings(test['name'].tolist())

model1 = get_model(X_train1, y_train1)
y_pred1 = model1.predict(X_test1)
print(classification_report(y_test1, y_pred1))

y_pred_prob1 = model1.predict_proba(X_test1)
roc_auc1 = roc_auc_score(y_test1, y_pred_prob1, multi_class='ovr')
print(f"ROC-AUC Score: {roc_auc1:.4f}")

[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Accuracy: 0.87 ± 0.01
Precision_macro: 0.84 ± 0.01
Recall_macro: 0.83 ± 0.01
F1_macro: 0.83 ± 0.01
              precision    recall  f1-score   support

           0       0.71      0.55      0.62        55
           1       0.61      0.84      0.71        74
           2       0.68      0.47      0.55        45
           3       0.77      0.94      0.84       112
           4       0.91      0.96      0.94        98
           5       0.91      0.92      0.91        64
           6       0.80      0.95      0.87       131
           7       1.00      0.96      0.98       100
           8       0.81      0.92      0.86       102
           9       0.64      0.43      0.52        58
          10       0.93      0.96      0.95       110
          11       0.82      0.63      0.71        59
          12       0.91      0.70      0.79       

In [15]:
categories = df.category.value_counts()
categories = categories[categories >= 200].index
new_df = df[df.category.isin(categories)]
new_df.reset_index(inplace=True)
new_df.category.value_counts()

train, test = split_train_test(new_df)
print(f"train: {train.shape}, test: {test.shape}")

train: (2814, 6), test: (1206, 6)


In [16]:
label_encoder = LabelEncoder()
y_train2 = label_encoder.fit_transform(train['target'].values)
y_test2 = label_encoder.transform(test['target'].values)

X_train2 = get_embeddings(train['name'].tolist())
X_test2 = get_embeddings(test['name'].tolist())

model2 = get_model(X_train2, y_train2)
y_pred2 = model2.predict(X_test2)
print(classification_report(y_test2, y_pred2))

y_pred_prob2 = model2.predict_proba(X_test2)
roc_auc2 = roc_auc_score(y_test2, y_pred_prob2, multi_class='ovr')
print(f"ROC-AUC Score: {roc_auc2:.4f}")

[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Accuracy: 0.97 ± 0.00
Precision_macro: 0.97 ± 0.00
Recall_macro: 0.97 ± 0.00
F1_macro: 0.97 ± 0.00
              precision    recall  f1-score   support

           0       0.99      0.91      0.94        74
           1       0.89      0.88      0.89       112
           2       0.93      0.96      0.94        98
           3       1.00      1.00      1.00        64
           4       0.92      0.94      0.93       131
           5       0.98      0.99      0.99       100
           6       0.90      0.94      0.92       102
           7       0.99      0.98      0.99       110
           8       1.00      1.00      1.00        63
           9       1.00      0.98      0.99        61
          10       1.00      0.97      0.98       164
          11       0.96      0.98      0.97       127

    accuracy                           0.96      12