In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
%run make_datasets.ipynb

In [8]:
df = pd.read_csv('../data/df_fs.csv')
df.dropna(inplace=True)
df = df.reset_index(drop=True)
print(df.shape)
df.head()
get_embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

(4020, 5)


In [9]:
df.category.value_counts()

category
tintas                   547
cozinha                  435
torneiras                422
acess_sanitarios         373
interruptores_tomadas    366
ferragens                341
fechaduras               334
complementos             326
acess_ferramentas        246
conexoes                 215
pisos                    211
porcelanatos             204
Name: count, dtype: int64

In [11]:
categories = df.category.value_counts()
categories = categories[categories >= 200].index
new_df = df[df.category.isin(categories)]
new_df.reset_index(inplace=True)
new_df.category.value_counts()

category
tintas                   547
cozinha                  435
torneiras                422
acess_sanitarios         373
interruptores_tomadas    366
ferragens                341
fechaduras               334
complementos             326
acess_ferramentas        246
conexoes                 215
pisos                    211
porcelanatos             204
Name: count, dtype: int64

In [12]:
train, test = split_train_test(new_df)
embed_dim = get_embed(train.loc[:1, 'name'].values.tolist()).shape[1]
print(f"train: {train.shape}, test: {test.shape},  embe: {embed_dim}")

train: (2814, 6), test: (1206, 6),  embe: 512


In [15]:
def get_use_embeddings(texts):
    return get_embed(texts).numpy()

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train['target'].values)
y_test = label_encoder.transform(test['target'].values)

X_train = get_use_embeddings(train['name'].tolist())
X_test = get_use_embeddings(test['name'].tolist())

In [16]:
def get_weights(y_train):
    n_class = np.bincount(y_train)
    n_samples = len(y_train)
    class_weights = n_samples / (len(n_class) * n_class)
    weights = np.array([class_weights[label] for label in y_train])
    return weights

In [17]:
weights = get_weights(y_train)

xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=len(np.unique(y_train)),
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    eval_metric='mlogloss',
    random_state=42
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_results = {
    metric: cross_val_score(
        xgb_model, X_train, y_train,
        cv=cv, scoring=metric,
        params={'sample_weight': weights})
              for metric in scoring
}

for metric, scores in cv_results.items():
    print(f"{metric.capitalize()}: {scores.mean():.2f} ± {scores.std():.2f}")

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

Accuracy: 0.92 ± 0.01
Precision_macro: 0.92 ± 0.01
Recall_macro: 0.92 ± 0.01
F1_macro: 0.92 ± 0.01
              precision    recall  f1-score   support

           0       0.78      0.78      0.78        74
           1       0.87      0.83      0.85       112
           2       0.87      0.88      0.87        98
           3       1.00      0.98      0.99        64
           4       0.88      0.92      0.90       131
           5       0.97      0.99      0.98       100
           6       0.94      0.92      0.93       102
           7       0.95      0.98      0.96       110
           8       1.00      0.97      0.98        63
           9       0.95      0.93      0.94        61
          10       0.99      0.96      0.97       164
          11       0.93      0.96      0.95       127

    accuracy                           0.93      1206
   macro avg       0.93      0.93      0.93      1206
weighted avg       0.93      0.93      0.93      1206



In [19]:
y_pred_prob = xgb_model.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
print(f"ROC-AUC Score: {roc_auc:.4f}")

ROC-AUC Score: 0.9971
