Para o Projeto 2, iremos tentar prever o tempo de entrega dos produtos através de modelos de regressão. Hoje na base original, já possuímos uma data de estimativa de entrega que podemos concluir ser tirada de algum sistema ou algoritmo do ecommerce. Com isso, podemos também testar se o modelo está obtendo resultados melhores que a solução já existente. A coluna target do modelo será a "tempo_entrega_dias" que possui o tempo, em dias, desde a compra do produto até a entrega ao cliente.

# Imports

In [13]:
import mlflow
import pandas as pd
from math import sqrt
from lightgbm import LGBMRegressor
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
# define opcao do pandas para visualizar todas as colunas do df
pd.set_option("display.max_columns", None)

In [3]:
mlflow.autolog()

2023/07/13 23:40:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/07/13 23:40:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.


# Carrega dados

In [4]:
df = pd.read_csv("data/df_etl.csv", sep=";")
df.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,payment_type_1,payment_type_2,payment_value_1,payment_value_2,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,seller_zip_code_prefix,seller_city,seller_state,review_id,review_score,review_creation_date,review_answer_timestamp,total_order_value,tempo_entrega_dias,tempo_aprovacao_dias,tempo_diff_previsto_entrega_dias,volume_cm_3,flag_customer_seller_mesmo_estado
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02,2017-10-02,2017-10-04,2017-10-10,2017-10-18,credit_card,voucher,18.12,20.59,1,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-10-06,29.99,8.72,utilidades_domesticas,40.0,268.0,4.0,500.0,19.0,8.0,13.0,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,9350,maua,SP,a54f0611adc9ed256b57ede6b6eb5114,4.0,2017-10-11,2017-10-12,38.71,8.0,0.0,8.0,1976.0,1
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24,2018-07-26,2018-07-26,2018-08-07,2018-08-13,boleto,,141.46,,1,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,2018-07-30,118.7,22.76,perfumaria,29.0,178.0,1.0,400.0,19.0,13.0,19.0,af07308b275d755c9edb36a90c618231,47813,barreiras,BA,31570,belo horizonte,SP,8d5266042046a06655c8db133d120ba5,4.0,2018-08-08,2018-08-08,141.46,14.0,2.0,6.0,4693.0,0
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08,2018-08-08,2018-08-08,2018-08-17,2018-09-04,credit_card,,179.12,,1,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,2018-08-13,159.9,19.22,automotivo,46.0,232.0,1.0,420.0,24.0,19.0,21.0,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO,14840,guariba,SP,e73b67b67587f7644d5bd1a52deb1b01,5.0,2018-08-18,2018-08-22,179.12,9.0,0.0,18.0,9576.0,0
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18,2017-11-18,2017-11-22,2017-12-02,2017-12-15,credit_card,,72.2,,1,d0b61bfb1de832b15ba9d266ca96e5b0,66922902710d126a0e7d26b0e3805106,2017-11-23,45.0,27.2,pet_shop,59.0,468.0,3.0,450.0,30.0,10.0,20.0,7c142cf63193a1473d2e66489a9ae977,59296,sao goncalo do amarante,RN,31842,belo horizonte,MG,359d03e676b3c069f62cadba8dd3f6e8,5.0,2017-12-03,2017-12-05,72.2,14.0,0.0,13.0,6000.0,0
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13,2018-02-13,2018-02-14,2018-02-16,2018-02-26,credit_card,,28.62,,1,65266b2da20d04dbe00c5c2d3bb7859e,2c9e548be18521d1c43cde1c582c6de8,2018-02-19,19.9,8.72,papelaria,38.0,316.0,4.0,250.0,51.0,15.0,15.0,72632f0f9dd73dfee390c9b22eb56dd6,9195,santo andre,SP,8752,mogi das cruzes,SP,e50934924e227544ba8246aeb3770dd4,5.0,2018-02-17,2018-02-18,28.62,3.0,0.0,10.0,11475.0,1


In [5]:
# remove colunas que nao serao consideradas para o modelo, deixando apenas identificadores e 
# possiveis features
df.drop(columns=[
    "customer_id", "order_approved_at", "order_delivered_carrier_date", "order_status",
    "order_delivered_customer_date", "order_estimated_delivery_date", "payment_type_2", 
    "payment_value_2", "product_id", "seller_id", "shipping_limit_date", "customer_unique_id",
    "customer_zip_code_prefix", "seller_zip_code_prefix", "review_id", "review_score", 
    "review_creation_date", "review_answer_timestamp", "total_order_value", "tempo_aprovacao_dias",
    "customer_city", "seller_city", "payment_type_1", "payment_value_1", "product_name_lenght",
    "product_description_lenght", "product_photos_qty", "product_length_cm", "product_height_cm",
    "product_width_cm"
    ], inplace=True
)

In [6]:
# deixa apenas registros com tempo de entrega registrado, ja que queremos prever esse tempo
df = df[df["tempo_entrega_dias"] > 0]

# deixa apenas registros com volume e pesos calculados
df = df[~df["volume_cm_3"].isnull()]
df = df[~df["product_weight_g"].isnull()]

# Separa dados

In [7]:
variavel_resposta = "tempo_entrega_dias"
X = df.drop(columns=[variavel_resposta])
y = df[variavel_resposta]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modelos

## Baseline através dos dados atuais

Hoje na base original, já possuímos uma data de estimativa de entrega que podemos concluir ser tirada de algum sistema ou algoritmo do ecommerce. Essa data de estimativa, em conjunto com a data de entrega, nos possibilita calcular a diferença entre a data prometida e a data realizada. Essa diferença está presente na coluna "tempo_diff_previsto_entrega_dias".

Através dos valores dessa coluna, podemos calcular o erro total da solução atual, utilizando a mesma métrica que iremos utilizar para avaliar os modelos.

A métrica escolhida é a RMSE (root mean squared error), que basicamente calcula o erro médio da previsões. O cálculo feito é:

RMSE = $\sqrt{\frac{1}{n} \Sigma_{i=1}^n({y}-\hat{y})^2}$

Vamos realizar esse cálculo então para esses dados atuais que possuimos


In [17]:
erro_quadrado = X_train["tempo_diff_previsto_entrega_dias"]**2
somatorio_erro_quadrado = erro_quadrado.sum()
rmse_baseline = sqrt(somatorio_erro_quadrado/len(X_train))
print(f"RMSE atual da base para a previsao de tempo de entrega em dias (baseline) = {rmse_baseline}")

RMSE atual da base para a previsao de tempo de entrega em dias (baseline) = 15.716723804017008


In [18]:
# define features a serem utilizadas nos modelos
features = [
    "price", "freight_value", "product_weight_g", "volume_cm_3", "flag_customer_seller_mesmo_estado",
    "product_category_name", "customer_state", "seller_state"
]

## Regressao Linear

In [10]:
column_transformers = ColumnTransformer(
    transformers=[
        ("robust_scaler", RobustScaler(), ["price", "freight_value"]),
        ("min_max_scaler", MinMaxScaler(), ["product_weight_g", "volume_cm_3"]),
        ("ordinal_encoder", OrdinalEncoder(), ["product_category_name", "customer_state", "seller_state"])
    ],
    remainder="passthrough"
)

pipe = Pipeline(
    steps=[
        ("preprocess", column_transformers),
        ("regressor", LinearRegression())
    ]
)

In [11]:
scores = cross_val_score(pipe, X_train[features], y_train, cv=5, scoring="neg_root_mean_squared_error")
print(f"Melhor score medio encontrado: {scores.mean()}")

2023/07/13 23:40:40 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '11dac9932c834be4b8d0cbe90e91303a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
                  transformers=[('robust_scaler', RobustScaler(),
                                 ['price', 'freight_value']),
                                ('min_max_scaler', MinMaxScaler(),
                                 ['product_weight_g', 'volume_cm_3']),
                                ('ordinal_encoder', OrdinalEncoder(),
                                 ['product_category_name', 'customer_state',
                         ...`
                  transformers=[('robust_scaler', RobustScaler(),
                                 ['price', 'freight_value']),
                                ('min_max_scaler', MinMaxScaler(),
                                 ['product_weight_g', 'volume_cm_3']),
                              

Melhor score medio encontrado: -8.632945641594489


## Ridge

In [23]:
column_transformers = ColumnTransformer(
    transformers=[
        ("robust_scaler", RobustScaler(), ["price", "freight_value"]),
        ("min_max_scaler", MinMaxScaler(), ["product_weight_g", "volume_cm_3"]),
        ("ordinal_encoder", OrdinalEncoder(), ["product_category_name", "customer_state", "seller_state"])
    ],
    remainder="passthrough"
)

pipe = Pipeline(
    steps=[
        ("preprocess", column_transformers),
        ("regressor", Ridge())
    ]
)

In [29]:
grid_param = [
    {
        "regressor": [Ridge()],
        "regressor__alpha": [0.5, 1, 1.5, 2, 2.5, 3, 3.5],
        "regressor__max_iter": [500, 1000, 5000, 10000, 15000, 20000]
    }
]

gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=3, scoring="neg_root_mean_squared_error", n_jobs=1)
best_model = gridsearch.fit(X_train[features], y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits


[CV 1/5] END regressor=Ridge(), regressor__alpha=0.5, regressor__max_iter=500;, score=-8.761 total time=   0.4s
[CV 2/5] END regressor=Ridge(), regressor__alpha=0.5, regressor__max_iter=500;, score=-8.591 total time=   0.4s
[CV 3/5] END regressor=Ridge(), regressor__alpha=0.5, regressor__max_iter=500;, score=-8.822 total time=   0.3s
[CV 4/5] END regressor=Ridge(), regressor__alpha=0.5, regressor__max_iter=500;, score=-8.294 total time=   0.3s
[CV 5/5] END regressor=Ridge(), regressor__alpha=0.5, regressor__max_iter=500;, score=-8.697 total time=   0.2s
[CV 1/5] END regressor=Ridge(), regressor__alpha=0.5, regressor__max_iter=1000;, score=-8.761 total time=   0.4s
[CV 2/5] END regressor=Ridge(), regressor__alpha=0.5, regressor__max_iter=1000;, score=-8.591 total time=   0.4s
[CV 3/5] END regressor=Ridge(), regressor__alpha=0.5, regressor__max_iter=1000;, score=-8.822 total time=   0.3s
[CV 4/5] END regressor=Ridge(), regressor__alpha=0.5, regressor__max_iter=1000;, score=-8.294 total t

In [30]:
print(
    f"Melhor score medio encontrado: {best_model.best_score_}\n"
    f"Hyperparametros utilizados: {best_model.best_params_}"
)

Melhor score medio encontrado: -8.632944297672939
Hyperparametros utilizados: {'regressor': Ridge(alpha=3.5, max_iter=500), 'regressor__alpha': 3.5, 'regressor__max_iter': 500}


## Random Forest

In [185]:
column_transformers = ColumnTransformer(
    transformers=[
        ("ordinal_encoder", OrdinalEncoder(), ["product_category_name", "customer_state", "seller_state"])
    ],
    remainder="passthrough"
)

pipe = Pipeline(
    steps=[
        ("preprocess", column_transformers),
        ("regressor", RandomForestRegressor())
    ]
)

In [187]:
grid_param = [
    {
        "regressor": [RandomForestRegressor()],
        "regressor__n_estimators": [100, 125, 150, 175, 200],
        "regresso__max_depth": [5, 6, 7, 8]
    }
]

gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=3, scoring="neg_root_mean_squared_error", n_jobs=1)
best_model = gridsearch.fit(X_train[features], y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END regressor=RandomForestRegressor(), regressor__n_estimators=75;, score=-8.301 total time=  45.5s
[CV 2/5] END regressor=RandomForestRegressor(), regressor__n_estimators=75;, score=-8.022 total time=  42.7s
[CV 3/5] END regressor=RandomForestRegressor(), regressor__n_estimators=75;, score=-8.278 total time=  46.7s
[CV 4/5] END regressor=RandomForestRegressor(), regressor__n_estimators=75;, score=-7.698 total time=  44.2s
[CV 5/5] END regressor=RandomForestRegressor(), regressor__n_estimators=75;, score=-8.257 total time=  47.9s
[CV 1/5] END regressor=RandomForestRegressor(), regressor__n_estimators=100;, score=-8.277 total time= 1.0min
[CV 2/5] END regressor=RandomForestRegressor(), regressor__n_estimators=100;, score=-8.011 total time= 1.0min
[CV 3/5] END regressor=RandomForestRegressor(), regressor__n_estimators=100;, score=-8.221 total time= 1.0min
[CV 4/5] END regressor=RandomForestRegressor(), regressor__n_esti

In [188]:
print(
    f"Melhor score medio encontrado: {best_model.best_score_}\n"
    f"Hyperparametros utilizados: {best_model.best_params_}"
)

{'regressor': RandomForestRegressor(n_estimators=125),
 'regressor__n_estimators': 125}

## LighGBM

In [13]:
column_transformers = ColumnTransformer(
    transformers=[
        ("robust_scaler", RobustScaler(), ["price", "freight_value"]),
        ("min_max_scaler", MinMaxScaler(), ["product_weight_g", "volume_cm_3"]),
        ("ordinal_encoder", OrdinalEncoder(), ["product_category_name", "customer_state", "seller_state"])
    ],
    remainder="passthrough"
)

pipe = Pipeline(
    steps=[
        ("preprocess", column_transformers),
        ("regressor", LGBMRegressor(objective="regression"))
    ]
)

In [14]:
grid_param = [
    {
        "regressor": [LGBMRegressor(objective="regression")],
        "regressor__n_estimators": [75, 100, 125, 150]
    }
]

gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=3, scoring="neg_root_mean_squared_error", n_jobs=1)
best_model = gridsearch.fit(X_train[features], y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END regressor=LGBMRegressor(objective='regression'), regressor__n_estimators=75;, score=-8.266 total time=   5.6s
[CV 2/5] END regressor=LGBMRegressor(objective='regression'), regressor__n_estimators=75;, score=-8.063 total time=   4.4s
[CV 3/5] END regressor=LGBMRegressor(objective='regression'), regressor__n_estimators=75;, score=-8.286 total time=   3.1s
[CV 4/5] END regressor=LGBMRegressor(objective='regression'), regressor__n_estimators=75;, score=-7.798 total time=   5.9s
[CV 5/5] END regressor=LGBMRegressor(objective='regression'), regressor__n_estimators=75;, score=-8.185 total time=   3.2s
[CV 1/5] END regressor=LGBMRegressor(objective='regression'), regressor__n_estimators=100;, score=-8.241 total time=   3.9s
[CV 2/5] END regressor=LGBMRegressor(objective='regression'), regressor__n_estimators=100;, score=-8.042 total time=   4.9s
[CV 3/5] END regressor=LGBMRegressor(objective='regression'), regressor__n_es

In [15]:
print(
    f"Melhor score encontrado: {best_model.best_score_}\n"
    f"Hyperparametros utilizados: {best_model.best_params_}"
)

Melhor score encontrado: -8.064782426359887
Hyperparametros utilizados: {'regressor': LGBMRegressor(n_estimators=150, objective='regression'), 'regressor__n_estimators': 150}
