In [None]:
from keras import layers
from sklearn.model_selection import train_test_split

In [2]:
import keras
from keras import ops

In [3]:
import numpy as np
import pandas as pd
import unicodedata, re
import tensorrt as trt
import tensorflow as tf
# Sanity check: list the GPU devices TensorFlow can see.
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
def preprocess(tx):
    """Normalize a raw product name into a space-separated token string.

    Steps, in order:
      1. Coerce to ``str`` (so a missing value such as NaN becomes "nan"),
         strip accents and lowercase.
      2. Drop superscript-two characters.
      3. Remove reference codes such as "ref. 123" or "/ref abc".
      4. Turn decimal commas into dots ("1,5" -> "1.5").
      5. Insert a space at every digit/non-digit boundary, so "60x60"
         becomes "60 x 60" and "1.5" becomes "1 . 5".
      6. Separate "x" from an adjacent "mm"/"cm" unit.
      7. Drop every character that is not a word character, whitespace,
         "." or "/", then re-join the remaining tokens with single spaces.

    Args:
        tx: Raw product name; any object accepted by ``str()``.

    Returns:
        The cleaned, lowercase, single-space-separated string.
    """
    # NFD splits accented letters into base letter + combining mark, which
    # lets us discard the marks and keep the plain letter.
    txt = unicodedata.normalize('NFD', str(tx))
    txt = ''.join(char for char in txt if not unicodedata.combining(char))
    txt = txt.lower()
    txt = re.sub(r"²+", "", txt)
    # Only treat "ref" as a reference marker when it is not embedded in a
    # longer word; without the lookarounds, words such as "refratario" or
    # "prefeito" were partially or entirely deleted.
    txt = re.sub(r"/?\s*(?<![a-z])ref(?![a-z])\s*\.?\s*[a-zA-Z0-9]+", "", txt)
    txt = re.sub(r'(\d)\s*,\s*(\d)', r'\1.\2', txt)
    # Zero-width match at each digit/non-digit boundary; this already splits
    # dimensions like "60x60", so no dedicated "<digits>x<digits>" rule is
    # needed afterwards.
    txt = re.sub(r'(?<=\d)(?=\D)|(?<=\D)(?=\d)', ' ', txt)

    txt = re.sub(r'(x)(mm|cm)', r' \1 \2', txt)
    txt = re.sub(r'(mm|cm)(x)', r' \1 \2', txt)

    txt = re.sub(r'[^\w\s\./]', '', txt)

    # Tokenize into digit runs, word runs and lone "." / "/" separators.
    return ' '.join(re.findall(r'\d+|\w+|[./]', txt))

In [46]:
def load_data(root_path="../../data/"):
    """Load, filter and clean the product catalog.

    Reads ``df.csv`` and ``df_nondim.csv`` from ``root_path``, keeps only rows
    whose ``category`` matches the flooring / porcelain / wall-covering
    patterns (excluding flooring accessories), appends ``df_piso_leroy.csv``
    unfiltered, keeps the ``id``, ``name`` and ``price`` columns, runs
    ``preprocess`` on every name and drops duplicate rows. The head of the
    result is displayed as a side effect.

    Args:
        root_path: Directory containing the three CSV files.

    Returns:
        DataFrame with columns ``id``, ``name`` (cleaned) and ``price``. The
        index is the position in the concatenated frame, so it has gaps where
        duplicates were removed.
    """
    import os  # local import: keeps the function usable without touching the import cells

    catalog = pd.concat(
        [
            pd.read_csv(os.path.join(root_path, "df.csv")),
            pd.read_csv(os.path.join(root_path, "df_nondim.csv")),
        ],
        ignore_index=True,
    )

    # na=False: a missing category counts as "no match" instead of producing
    # NaN in the mask, which would make the `~` / boolean indexing fail.
    category = catalog["category"]
    is_flooring = category.str.contains(
        "PISOS >|PORCELANATOS >|REVESTIMENTOS >", case=False, na=False
    )
    is_accessory = category.str.contains(
        "ACESSÓRIOS PARA PISOS", case=False, na=False
    )
    catalog = catalog[is_flooring & ~is_accessory]

    df_leroy = pd.read_csv(os.path.join(root_path, "df_piso_leroy.csv"))

    # Build the result as one chain so no intermediate slice is mutated in place.
    df = (
        pd.concat([catalog, df_leroy], ignore_index=True)[["id", "name", "price"]]
        .assign(name=lambda frame: frame["name"].apply(preprocess))
        .drop_duplicates()
    )
    display(df.head())
    return df
# Build the cleaned catalog once; later cells read names from `df`.
df = load_data()

Unnamed: 0,id,name,price
0,999348.0,porcelanato calacatta gold 100 x 100 acetinado...,117.9
1,999707.0,piso esmaltado parquet brilhante 46 x 46 tipo ...,27.9
2,999100.0,porcelanato georgia bege cetim acetinado retif...,79.9
3,999467.0,porcelanato esmaltado hd fior di bosco acetina...,99.9
4,999090.0,porcelanato travertino bege cetim acetinado re...,79.9


In [66]:
# Hand-written English sample descriptions of tile products.
# NOTE(review): `texts` is reassigned from df["name"] in the next code cell, so
# when cells run in file order this list is not what the vectorizer is fit on.
texts = ['calacatta gold satin rectified porcelain tile 100 x 100 type a elizabeth',
         'parquet glossy enamel tile 46 x 46 type a incenor 67040',
         'georgia beige satin rectified porcelain tile 80 x 80 type a incesa',
         'fior di bosco satin rectified enamel porcelain tile 101 x 101 type a elizabeth',
         'travertine beige satin rectified porcelain tile 80 x 80 type a incesa',
         'capim dourado satin rectified porcelain tile 100 x 100 type a elizabeth',
         'tenerife natural outdoor rectified porcelain tile 26 x 106 type a incesa',
         'travertine beige outdoor rectified porcelain tile 80 x 80 type a incesa',
         'calacatta gold polished rectified porcelain tile 100 x 100 type a elizabeth',
         'tenerife natural satin rectified porcelain tile 26 x 106 type a incesa',
         'felicita satin rectified wall tile 45 x 90 type a incesa',
         'anni terra satin rectified wall tile 45 x 90 type a incesa',
         'urutu satin rectified porcelain tile 100 x 100 type a elizabeth',
         'ibiza white satin rectified porcelain tile 80 x 80 type a incesa',
         'etna gray outdoor rectified porcelain tile 90 x 90 type a incesa',
         'habitar nebbia satin rectified porcelain tile 80 x 80 type a incesa',
         'habitar gray satin rectified porcelain tile 80 x 80 type a incesa',
         'detroit gray outdoor rectified porcelain tile 90 x 90 type a incesa',
         'essence gray polished rectified porcelain tile 80 x 80 type a incesa',
         'georgia beige outdoor rectified porcelain tile 80 x 80 type a incesa']


In [98]:
# Fit the tokenizer on a small sample: the first 20 cleaned product names.
texts = df["name"][:20].to_list()

max_tokens = 3000  # Maximum vocabulary size
max_len = 15        # Maximum length of each sequence

vectorize_layer = keras.layers.TextVectorization(
    max_tokens=max_tokens,  # cap the vocabulary at the configured size
    output_mode='int',
    output_sequence_length=max_len,  # every output is padded/truncated to max_len ids
    # NOTE(review): this strips punctuation, so the "." and "/" tokens kept by
    # preprocess() will presumably not reach the vocabulary -- confirm intended.
    standardize="lower_and_strip_punctuation",
)

vectorize_layer.adapt(texts)

X = vectorize_layer(texts)
#print("Example tokenized sequences:\n", X.numpy())

In [99]:
def create_dataset(texts, vectorize_layer, max_len, num_classes=None):
    """Build next-token training pairs from a list of texts.

    Each text is tokenized with ``vectorize_layer``; for every position ``i``
    from 1 to the end of the token sequence, the prefix ``tokens[:i]`` becomes
    an input and ``tokens[i]`` its target.

    Args:
        texts: Iterable of strings to tokenize.
        vectorize_layer: Adapted text-vectorization layer; called on one
            string at a time and expected to return a 1-D tensor of token ids.
        max_len: Length every input prefix is padded (at the front) or
            truncated to.
        num_classes: Width of the one-hot targets. Defaults to
            ``vectorize_layer.vocabulary_size()`` so the targets always match
            the layer's vocabulary, independent of any notebook global.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` holds the padded prefixes with
        shape ``(n_samples, max_len)``; ``y`` holds the one-hot targets with
        shape ``(n_samples, num_classes)``.
    """
    if num_classes is None:
        num_classes = vectorize_layer.vocabulary_size()

    prefixes, next_tokens = [], []
    for text in texts:
        token_ids = vectorize_layer(text).numpy()
        # NOTE(review): if the layer pads its output to a fixed length, the
        # trailing padding ids also become prefixes/targets -- confirm this
        # is wanted (e.g. as an end-of-sequence signal).
        for i in range(1, len(token_ids)):
            prefixes.append(token_ids[:i])
            next_tokens.append(token_ids[i])

    # pad_sequences pads at the front by default, so the most recent tokens
    # sit at the end of each row.
    X = tf.keras.preprocessing.sequence.pad_sequences(prefixes, maxlen=max_len)
    y = tf.keras.utils.to_categorical(next_tokens, num_classes=num_classes)

    return np.asarray(X), np.asarray(y)


In [100]:
# Inspect the size of the vocabulary learned by adapt().
vectorize_layer.vocabulary_size()

55

In [109]:
# Model hyperparameters. `max_tokens` is set to the adapted vocabulary size so
# the embedding table and the softmax output have one slot per token id.
embedding_dim = 256
hidden_units = 256
max_tokens = vectorize_layer.vocabulary_size()

# Next-token predictor: embedding -> single LSTM (last state only) -> dropout
# -> softmax over the vocabulary.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(input_dim=max_tokens, output_dim=embedding_dim))
model.add(keras.layers.LSTM(units=hidden_units, return_sequences=False))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(units=max_tokens, activation='softmax'))

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(optimizer='adamw', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [105]:
# Expand the sample texts into (prefix, next-token) training pairs.
X_train, y_train = create_dataset(texts, vectorize_layer, max_len)

In [110]:
# Train on the prefix/next-token pairs: 20 epochs, mini-batches of 32.
model.fit(X_train, y_train, epochs=20, batch_size=32)

Epoch 1/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 0.1824 - loss: 3.9098
Epoch 2/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.2672 - loss: 3.0541
Epoch 3/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.3078 - loss: 2.5042
Epoch 4/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.4115 - loss: 2.2280
Epoch 5/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.4403 - loss: 2.0845
Epoch 6/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.5190 - loss: 1.9384
Epoch 7/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.5133 - loss: 1.8297
Epoch 8/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.5675 - loss: 1.6614
Epoch 9/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x75d64421b860>

In [114]:
# Export the first 100 cleaned rows to a CSV file in the working directory.
df[:100].to_csv("to_claude.csv", index=False)