* ## Carregar os pacotes

In [12]:
import torch
import numpy as np
import pandas as pd
import unicodedata, re
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from transformers import (BertTokenizer, 
                          BertModel)
from sklearn.metrics import (r2_score,
                             mean_absolute_error)
from sklearn.model_selection import train_test_split
%matplotlib inline
torch.cuda.is_available()

False

#### Carregar e inicializar o modelo e o tokenizador usando o modelo pré-treinado em português pela NeuralMind.
#### O modelo diferencia maiúsculas de minúsculas (*cased*); portanto, o construtor do tokenizador é inicializado com a conversão para minúsculas desativada (`do_lower_case=False`).

In [2]:
# Checkpoint name shared by the encoder and its tokenizer.
PRETRAINED_NAME = 'neuralmind/bert-base-portuguese-cased'

# do_lower_case=False: the tokenizer is asked not to lowercase its input.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME, do_lower_case=False)
model = BertModel.from_pretrained(PRETRAINED_NAME)

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

#### Função para pré-processar o texto, aplicando alguns ajustes ao documento.

In [3]:
def preprocess(tx):
    """Normalize a product name into a space-separated string of simple tokens.

    Steps, in order:
      1. cast the input to ``str``, strip diacritics (NFD decomposition minus
         combining marks) and lowercase;
      2. apply the regex substitutions listed in ``rewrites``, one after the
         other;
      3. keep only runs of digits, runs of word characters, ``.`` and ``/``,
         joined by single spaces.

    NOTE(review): the "ref" pattern has no word boundary, so it also fires on
    "ref" inside longer words (e.g. "preferido" becomes "p") -- confirm this
    is intended.
    NOTE(review): the text is lowercased and stripped of accents although the
    tokenizer in this notebook is loaded with ``do_lower_case=False`` --
    confirm this is intended.

    Args:
        tx: Value to clean; anything accepted by ``str()``.

    Returns:
        The cleaned text as a ``str``.
    """
    decomposed = unicodedata.normalize('NFD', str(tx))
    cleaned = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    ).lower()

    # (pattern, replacement) pairs; the order matters because later rules see
    # the output of earlier ones.
    rewrites = (
        # Drop superscript-two characters.
        (r"²+", ""),
        # Drop "ref" plus the alphanumeric run that follows it (optional
        # leading "/" and optional "." in between).
        (r"/?\s*ref\s*\.?\s*[a-zA-Z0-9]+", ""),
        # A comma between two digits becomes a dot.
        (r'(\d)\s*,\s*(\d)', r'\1.\2'),
        # Insert a space at every digit/non-digit boundary.
        (r'(?<=\d)(?=\D)|(?<=\D)(?=\d)', ' '),
        # Space out "<digits>x<digits>"; the rule above has already separated
        # digits from any adjacent "x" by the time this one runs.
        (r'(\d+)(x)(\d+)', r'\1 \2 \3'),
        # Separate an "x" glued to a "mm"/"cm" unit, on either side.
        (r'(x)(mm|cm)', r' \1 \2'),
        (r'(mm|cm)(x)', r' \1 \2'),
        # Remove everything except word characters, whitespace, "." and "/".
        (r'[^\w\s\./]', ''),
    )
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)

    return ' '.join(re.findall(r'\d+|\w+|[./]', cleaned))
# Folder holding the product CSV files, relative to this notebook.
root_path = "../../data/"
df_ = pd.read_csv(root_path+"df_nondim.csv")
df = pd.read_csv(root_path+"df.csv")
df = pd.concat([df, df_], ignore_index=True)

# Keep rows whose category matches one of the three patterns and is not
# "ACESSÓRIOS PARA PISOS". na=False makes a missing category count as
# "no match" instead of putting NaN into the boolean mask.
in_scope = df.category.str.contains(
    "PISOS >|PORCELANATOS >|REVESTIMENTOS >", case=False, na=False)
is_accessory = df.category.str.contains(
    "ACESSÓRIOS PARA PISOS", case=False, na=False)
df = df[in_scope & ~is_accessory]

df_leroy = pd.read_csv(root_path+"df_piso_leroy.csv")

df = pd.concat([df, df_leroy], ignore_index=True)
# .copy() so the column assignment below writes to an independent frame.
df = df[["id", "name", "price"]].copy()
df["name"] = df["name"].apply(preprocess)
# Reset the index only AFTER dropping duplicates: the index is then a gap-free
# 0..n-1 range, matching the row order of anything built from this frame.
df = df.drop_duplicates().reset_index(drop=True)
display(df.head())
df.shape

Unnamed: 0,id,name,price
0,999348.0,porcelanato calacatta gold 100 x 100 acetinado...,117.9
1,999707.0,piso esmaltado parquet brilhante 46 x 46 tipo ...,27.9
2,999100.0,porcelanato georgia bege cetim acetinado retif...,79.9
3,999467.0,porcelanato esmaltado hd fior di bosco acetina...,99.9
4,999090.0,porcelanato travertino bege cetim acetinado re...,79.9


(2541, 3)

In [None]:
# Cleaned product names, one entry per row of df.
docs = df.loc[:, "name"]

In [None]:
# Move the model to the selected device.
model.to(device)

In [7]:
# Display which device was selected.
device

device(type='cpu')

#### Aqui, cria-se um dicionário com dois elementos: os ids e as máscaras de atenção que serão gerados.
#### Da linha 2 à 7, criam-se os tokens usando o tokenizador inicializado, definindo o comprimento máximo, o truncamento e o preenchimento (*padding*) das sequências.
#### Nas linhas 9 e 10, adicionam-se os ids e as máscaras como valores para as chaves do dicionário.
#### As linhas 12 e 13 servem para empilhar os elementos, criando assim, para cada chave, um único tensor.

In [5]:
tokens = {"input_ids": [], "attention_mask": []}  # one tensor per product name, stacked after the loop
for sentence in docs:
    new_token = tokenizer.encode_plus(sentence,
                                      max_length=12,  # every name is truncated or padded to this length
                                      truncation=True,
                                      padding='max_length',
                                      return_tensors='pt')
    # [0] takes the first (only) row of each returned tensor.
    tokens['input_ids'].append(new_token['input_ids'][0])
    tokens['attention_mask'].append(new_token['attention_mask'][0])

tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
tokens = {key: value.to(device) for key, value in tokens.items()}  # inputs must live on the same device as the model

In [6]:
# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(input_ids=tokens['input_ids'],
                    attention_mask=tokens['attention_mask'])
# Show which fields the model returned.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [7]:
# Hidden states taken from the model output's last_hidden_state field.
embeddings = outputs.last_hidden_state
# Add a trailing axis to the attention mask and expand it to the embeddings'
# shape, so the two tensors can be multiplied element by element.
mask = tokens['attention_mask'].unsqueeze(-1).expand_as(embeddings)

In [8]:
# Sanity check: the mask was expanded to the same shape as the embeddings.
mask.shape

torch.Size([2541, 12, 768])

In [9]:
# Mean pooling along dim 1: positions where the mask is 0 contribute nothing
# to the sum and are not counted in the divisor.
mask_embeddings = embeddings * mask
# Named `summed`, not `sum`: assigning to `sum` would shadow the Python
# builtin for every later cell of the notebook.
summed = torch.sum(mask_embeddings, 1)
# Cast the counts to float before clamping so the 1e-9 floor is applied in
# floating point rather than against an integer tensor.
counts = torch.clamp(mask.sum(1).float(), min=1e-9)
mean = summed / counts
# .cpu() changes nothing for CPU tensors and is required before .numpy() when
# the tensors live on a GPU.
embeddings_mean = pd.DataFrame(mean.cpu().numpy())

In [13]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
split = train_test_split(
    embeddings_mean,
    df['price'],
    test_size=0.30,
    random_state=42,
)
X_train, X_test, y_train, y_test = split
print(f"train: {X_train.shape}, test: {X_test.shape}")

train: (1778, 768), test: (763, 768)


In [14]:
# Hyperparameters for the XGBoost regressor.
params = {
    'n_estimators': 1000,
    'max_depth': 8,
    'min_child_weight': 1,
    'gamma': 0.3,
    'colsample_bytree': 0.9,
    # reg_alpha / reg_lambda are the scikit-learn wrapper's own argument
    # names; the native aliases 'alpha' / 'lambda' would only reach the
    # booster through **kwargs.
    'reg_alpha': 0,
    'reg_lambda': 1}

xgb_tf = XGBRegressor(**params, random_state=42)
xgb_tf.fit(X_train, y_train)

In [15]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Computes ``mean(|(y_true - y_pred) / y_true|) * 100``. Both inputs are
    converted to float NumPy arrays first, so plain lists and tuples are
    accepted and pandas objects are compared by position rather than by
    index label.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predictions, matching ``y_true`` element for
            element.

    Returns:
        The error as a ``float`` percentage. A zero in ``y_true`` makes the
        result ``inf`` or ``nan``, because each error is divided by the
        corresponding ``y_true`` value.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs((actual - predicted) / actual)) * 100)

In [16]:
# Predict once and reuse the result for every metric (the original cell ran
# predict twice and discarded the first result).
y_pred = xgb_tf.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred)}")
print(f"mae: {mean_absolute_error(y_test, y_pred)}")
print(f"mape: {mape(y_test, y_pred)}")

r2: 0.637428965073331
mae: 101.49974951810324
mape: 51.83498959139349
