In [1]:
import os
import pandas as pd
import yaml
import pickle

from utils.training_utils import find_specific_variables

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, mean_squared_error, make_scorer

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# which also hides deprecation/data-quality warnings raised by any library used below.
warnings.filterwarnings('ignore')

In [5]:
# Parse the feature configuration YAML into `features`.
# The context manager closes the file handle as soon as parsing finishes instead of
# leaving an open descriptor around for the lifetime of the kernel.
with open(os.path.join('..', 'src', 'config', 'feature_config.yaml'), 'r') as config_file:
    features = yaml.safe_load(config_file)

# Modelo para classificação de um produto em promoção

In [3]:
# Read the encoded training table for the classification model and preview it.
clf_parquet_path = os.path.join('..', 'data', 'train_test', 'train_encoded_clf.parquet')
df = pd.read_parquet(clf_parquet_path)

print(df.shape)
df.head()

(212047, 19)


Unnamed: 0,attributes_color_value,attributes_main_color_value,buying_mode,condition,currency_id,listing_type_id,sale_price_payment_method_type,shipping_logistic_type,site_id,accepts_mercadopago,available_quantity,catalog_listing,installments_price,installments_quantity,price_tratado,sale_price_conditions_eligible,shipping_free_shipping,use_thumbnail_id,promotion_flag
0,1.0,1.0,0.0,1.0,14.0,5.0,1.0,6.0,9.0,1.0,50.0,0.0,-999.0,-999.0,1350.0,1.0,0.0,1.0,0
1,1.0,1.0,1.0,3.0,13.0,3.0,1.0,0.0,10.0,0.0,1.0,0.0,-999.0,-999.0,1200000.0,1.0,0.0,1.0,0
2,1.0,1.0,0.0,1.0,11.0,4.0,1.0,4.0,13.0,1.0,250.0,0.0,20.0,12.0,240.0,1.0,1.0,0.0,1
3,1.0,1.0,0.0,1.0,14.0,5.0,1.0,1.0,9.0,1.0,500.0,0.0,-999.0,-999.0,519.0,1.0,0.0,0.0,0
4,1.0,1.0,0.0,1.0,13.0,0.0,1.0,6.0,10.0,0.0,1.0,0.0,-999.0,-999.0,22.99,1.0,1.0,1.0,0


In [4]:
# Load the pickled artifacts used by the classifier cells:
#   - `seletor`: object whose `.features` attribute is used later to pick the model input columns
#   - `df_hyperparams`: table of tuning results (has a `value` column and `params_*` columns)
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
# `with` guarantees both file handles are closed right after loading.
with open(os.path.join('..', 'models', 'encoders', 'seletor_2.pkl'), 'rb') as seletor_file:
    seletor = pickle.load(seletor_file)

with open(os.path.join('..', 'models', 'df_metrics_results_tunning.pkl'), 'rb') as tuning_file:
    df_hyperparams = pickle.load(tuning_file)

In [12]:

# Ask the project helper for the 'target_clf' entry of the feature configuration.
# NOTE(review): the result is indexed with [0] and used as a DataFrame column selector
# further down, so it is presumably a list of column names — confirm against
# utils.training_utils.find_specific_variables.
feature_target = find_specific_variables(features, 'target_clf', specific_value=True)

In [8]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
df_treino, df_valid = train_test_split(
    df,
    test_size=0.2,
    random_state=98,
)

In [9]:
# Report the dimensions of both partitions, one per line.
print(
    f'Shape Treino: {df_treino.shape}',
    f'Shape Valid: {df_valid.shape}',
    sep='\n',
)

Shape Treino: (169637, 19)
Shape Valid: (42410, 19)


In [14]:
# Compare the mean of the target column in each partition to check that the
# split kept a similar positive rate on both sides.
print(
    f'% promo Treino: {df_treino[feature_target[0]].mean()}',
    f'% promo Valid: {df_valid[feature_target[0]].mean()}',
    sep='\n',
)

% promo Treino: 0.2325141331195435
% promo Valid: 0.23154916293327046


In [16]:
# Show the tuning row(s) with the highest `value`, transposed for easier reading.
# Series.max() skips NaN entries, whereas the builtin max() compares element by element
# and can return NaN (yielding an empty selection) when a NaN row comes first.
df_hyperparams[df_hyperparams.value == df_hyperparams.value.max()].T

Unnamed: 0,5
number,5
value,0.860428
datetime_start,2025-01-18 22:59:47.506096
datetime_complete,2025-01-18 22:59:51.836617
duration,0 days 00:00:04.330521
params_colsample_bytree,0.760253
params_gamma,0.107945
params_learning_rate,0.079548
params_max_depth,6
params_min_child_weight,13.336635


In [23]:
# Hyperparameters for the promotion classifier, copied by hand from the tuning results.
# NOTE(review): `subsample` and `scale_pos_weight` do not appear in the best-trial
# output captured above — confirm they belong to the same trial.
hyper_params = dict(
    learning_rate=0.079548,
    max_depth=6,
    min_child_weight=13.336635,
    gamma=0.107945,
    subsample=0.780523,
    colsample_bytree=0.760253,
    scale_pos_weight=99.518678,
    eval_metric='auc',
)

# Early stopping needs an eval_set at fit time; it is supplied in the fit cell.
model = xgb.XGBClassifier(**hyper_params, random_state=12, n_jobs=-1, early_stopping_rounds=4)

model

In [24]:
# Build the train/validation arrays once, then fit; the validation pair is the
# eval_set monitored for early stopping.
colunas_modelo = seletor.features
x_treino = df_treino[colunas_modelo].values
y_treino = df_treino[feature_target].values
x_valid = df_valid[colunas_modelo].values
y_valid = df_valid[feature_target].values

model.fit(x_treino, y_treino, eval_set=[(x_valid, y_valid)], verbose=True)

[0]	validation_0-auc:0.80364
[1]	validation_0-auc:0.80657
[2]	validation_0-auc:0.80675
[3]	validation_0-auc:0.80693
[4]	validation_0-auc:0.81709
[5]	validation_0-auc:0.81722
[6]	validation_0-auc:0.82053
[7]	validation_0-auc:0.82052
[8]	validation_0-auc:0.82255
[9]	validation_0-auc:0.82269
[10]	validation_0-auc:0.82375
[11]	validation_0-auc:0.82671
[12]	validation_0-auc:0.82787
[13]	validation_0-auc:0.82960
[14]	validation_0-auc:0.82923
[15]	validation_0-auc:0.82968
[16]	validation_0-auc:0.83228
[17]	validation_0-auc:0.83271
[18]	validation_0-auc:0.83248
[19]	validation_0-auc:0.83298
[20]	validation_0-auc:0.83299
[21]	validation_0-auc:0.83315
[22]	validation_0-auc:0.83306
[23]	validation_0-auc:0.83370
[24]	validation_0-auc:0.83393
[25]	validation_0-auc:0.83428
[26]	validation_0-auc:0.83592
[27]	validation_0-auc:0.83645
[28]	validation_0-auc:0.83779
[29]	validation_0-auc:0.83871
[30]	validation_0-auc:0.83921
[31]	validation_0-auc:0.84000
[32]	validation_0-auc:0.84006
[33]	validation_0-au

In [25]:
# Persist the classifier to disk.
# The context manager flushes and closes the file deterministically, so the pickle
# is complete on disk as soon as this cell finishes.
with open(os.path.join('..', 'models', 'predictors', 'model_clf.pkl'), 'wb') as model_file:
    pickle.dump(model, model_file)

# Modelo para predizer o preço do desconto

In [2]:
# Read the encoded training table for the regression model and preview it.
reg_parquet_path = os.path.join('..', 'data', 'train_test', 'train_encoded_reg.parquet')
df = pd.read_parquet(reg_parquet_path)

print(df.shape)
df.head()

(49407, 13)


Unnamed: 0,attributes_color_value,currency_id,listing_type_id,sale_price_metadata_promotion_type,shipping_logistic_type,site_id,available_quantity,catalog_listing,installments_price,installments_quantity,price_tratado,shipping_free_shipping,discount
0,1.0,6.0,3.0,1.0,1.0,5.0,1.0,0.0,-999.0,-999.0,590.0,0.0,0.1
1,0.0,5.0,3.0,1.0,1.0,5.0,1.0,1.0,-999.0,-999.0,305.0,1.0,0.1
2,1.0,2.0,2.0,1.0,5.0,0.0,1.0,1.0,4135.416667,12.0,49625.0,0.0,0.22001
3,1.0,2.0,2.0,0.0,4.0,0.0,50.0,0.0,12325.0,12.0,147900.0,1.0,0.33
4,1.0,1.0,2.0,1.0,8.0,3.0,1.0,1.0,11.633333,3.0,34.9,0.0,0.050143


In [3]:
# Load the pickled artifacts used by the regression cells:
#   - `seletor`: object whose `.features` attribute is used later to pick the model input columns
#   - `df_hyperparams`: table of tuning results (has a `value` column and `params_*` columns)
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
# `with` guarantees both file handles are closed right after loading.
with open(os.path.join('..', 'models', 'encoders_reg', 'seletor_2.pkl'), 'rb') as seletor_file:
    seletor = pickle.load(seletor_file)

with open(os.path.join('..', 'models', 'df_metrics_results_tunning_reg.pkl'), 'rb') as tuning_file:
    df_hyperparams = pickle.load(tuning_file)

In [15]:
# Ask the project helper for the 'target_reg' entry of the feature configuration.
# NOTE(review): the result is used as a DataFrame column selector when fitting, so it
# is presumably a list of column names — confirm against
# utils.training_utils.find_specific_variables.
feature_target = find_specific_variables(features, 'target_reg', specific_value=True)

In [16]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
df_treino, df_valid = train_test_split(
    df,
    test_size=0.2,
    random_state=98,
)

In [17]:
# Report the dimensions of both partitions, one per line.
print(
    f'Shape Treino: {df_treino.shape}',
    f'Shape Valid: {df_valid.shape}',
    sep='\n',
)

Shape Treino: (39525, 13)
Shape Valid: (9882, 13)


In [18]:
# Show the tuning row(s) with the lowest `value`, transposed for easier reading.
# Series.min() skips NaN entries, whereas the builtin min() compares element by element
# and can return NaN (yielding an empty selection) when a NaN row comes first.
df_hyperparams[df_hyperparams.value == df_hyperparams.value.min()].T

Unnamed: 0,59
number,59
value,0.014009
datetime_start,2025-01-19 12:46:53.366090
datetime_complete,2025-01-19 12:46:54.213279
duration,0 days 00:00:00.847189
params_colsample_bytree,0.76322
params_gamma,0.018385
params_learning_rate,0.098695
params_max_depth,6
params_min_child_weight,0.373927


In [21]:
# Hyperparameters for the discount regressor, copied by hand from the tuning results.
# NOTE(review): `subsample` does not appear in the best-trial output captured above —
# confirm it belongs to the same trial.
hyper_params = dict(
    learning_rate=0.098695,
    max_depth=6,
    min_child_weight=0.373927,
    gamma=0.018385,
    subsample=0.870589,
    colsample_bytree=0.76322,
    eval_metric='rmse',
)

# Early stopping needs an eval_set at fit time; it is supplied in the fit cell.
model = xgb.XGBRegressor(**hyper_params, random_state=12, n_jobs=-1, early_stopping_rounds=4)

model

In [22]:
# Build the train/validation arrays once, then fit; the validation pair is the
# eval_set monitored for early stopping.
colunas_modelo = seletor.features
x_treino = df_treino[colunas_modelo].values
y_treino = df_treino[feature_target].values
x_valid = df_valid[colunas_modelo].values
y_valid = df_valid[feature_target].values

model.fit(x_treino, y_treino, eval_set=[(x_valid, y_valid)], verbose=True)

[0]	validation_0-rmse:0.14780
[1]	validation_0-rmse:0.14513
[2]	validation_0-rmse:0.14298
[3]	validation_0-rmse:0.14113
[4]	validation_0-rmse:0.13943
[5]	validation_0-rmse:0.13782
[6]	validation_0-rmse:0.13648
[7]	validation_0-rmse:0.13497
[8]	validation_0-rmse:0.13416
[9]	validation_0-rmse:0.13329
[10]	validation_0-rmse:0.13231
[11]	validation_0-rmse:0.13160
[12]	validation_0-rmse:0.13100
[13]	validation_0-rmse:0.13046
[14]	validation_0-rmse:0.12970
[15]	validation_0-rmse:0.12925
[16]	validation_0-rmse:0.12868
[17]	validation_0-rmse:0.12842
[18]	validation_0-rmse:0.12809
[19]	validation_0-rmse:0.12778
[20]	validation_0-rmse:0.12753
[21]	validation_0-rmse:0.12715
[22]	validation_0-rmse:0.12683
[23]	validation_0-rmse:0.12644
[24]	validation_0-rmse:0.12600
[25]	validation_0-rmse:0.12574
[26]	validation_0-rmse:0.12561
[27]	validation_0-rmse:0.12544
[28]	validation_0-rmse:0.12524
[29]	validation_0-rmse:0.12485
[30]	validation_0-rmse:0.12455
[31]	validation_0-rmse:0.12442
[32]	validation_0-

In [23]:
# Persist the regressor to disk.
# The context manager flushes and closes the file deterministically, so the pickle
# is complete on disk as soon as this cell finishes.
with open(os.path.join('..', 'models', 'predictors', 'model_reg.pkl'), 'wb') as model_file:
    pickle.dump(model, model_file)