# Imports

In [146]:
import pandas as pd
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score

In [106]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Carrega dados

In [107]:
# Load the ETL output (semicolon-separated) and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a row index
# written out by the ETL step — confirm, and consider reading it with index_col=0.
caminho_dados = "data/df_etl.csv"
df = pd.read_csv(caminho_dados, sep=";")
df.head(5)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,payment_type_1,payment_type_2,payment_value_1,payment_value_2,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,seller_zip_code_prefix,seller_city,seller_state,review_id,review_score,review_creation_date,review_answer_timestamp,total_order_value,tempo_entrega_dias,tempo_aprovacao_dias,tempo_diff_previsto_entrega_dias,volume_cm_3,flag_customer_seller_mesmo_estado
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02,2017-10-02,2017-10-04,2017-10-10,2017-10-18,credit_card,voucher,18.12,20.59,1,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-10-06,29.99,8.72,utilidades_domesticas,40.0,268.0,4.0,500.0,19.0,8.0,13.0,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,9350,maua,SP,a54f0611adc9ed256b57ede6b6eb5114,4.0,2017-10-11,2017-10-12,38.71,8.0,0.0,8.0,1976.0,1
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24,2018-07-26,2018-07-26,2018-08-07,2018-08-13,boleto,,141.46,,1,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,2018-07-30,118.7,22.76,perfumaria,29.0,178.0,1.0,400.0,19.0,13.0,19.0,af07308b275d755c9edb36a90c618231,47813,barreiras,BA,31570,belo horizonte,SP,8d5266042046a06655c8db133d120ba5,4.0,2018-08-08,2018-08-08,141.46,14.0,2.0,6.0,4693.0,0
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08,2018-08-08,2018-08-08,2018-08-17,2018-09-04,credit_card,,179.12,,1,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,2018-08-13,159.9,19.22,automotivo,46.0,232.0,1.0,420.0,24.0,19.0,21.0,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO,14840,guariba,SP,e73b67b67587f7644d5bd1a52deb1b01,5.0,2018-08-18,2018-08-22,179.12,9.0,0.0,18.0,9576.0,0
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18,2017-11-18,2017-11-22,2017-12-02,2017-12-15,credit_card,,72.2,,1,d0b61bfb1de832b15ba9d266ca96e5b0,66922902710d126a0e7d26b0e3805106,2017-11-23,45.0,27.2,pet_shop,59.0,468.0,3.0,450.0,30.0,10.0,20.0,7c142cf63193a1473d2e66489a9ae977,59296,sao goncalo do amarante,RN,31842,belo horizonte,MG,359d03e676b3c069f62cadba8dd3f6e8,5.0,2017-12-03,2017-12-05,72.2,14.0,0.0,13.0,6000.0,0
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13,2018-02-13,2018-02-14,2018-02-16,2018-02-26,credit_card,,28.62,,1,65266b2da20d04dbe00c5c2d3bb7859e,2c9e548be18521d1c43cde1c582c6de8,2018-02-19,19.9,8.72,papelaria,38.0,316.0,4.0,250.0,51.0,15.0,15.0,72632f0f9dd73dfee390c9b22eb56dd6,9195,santo andre,SP,8752,mogi das cruzes,SP,e50934924e227544ba8246aeb3770dd4,5.0,2018-02-17,2018-02-18,28.62,3.0,0.0,10.0,11475.0,1


In [108]:
# Remove the columns that will not be considered for the model, keeping only
# identifiers and candidate features.
# `df` is rebound instead of using `inplace=True`: the call stays chainable and the
# frame object displayed by the previous cell is not mutated behind the reader's back.
colunas_descartadas = [
    "customer_id", "order_approved_at", "order_delivered_carrier_date", "order_status",
    "order_delivered_customer_date", "order_estimated_delivery_date", "payment_type_2",
    "payment_value_2", "product_id", "seller_id", "shipping_limit_date", "customer_unique_id",
    "customer_zip_code_prefix", "seller_zip_code_prefix", "review_id", "review_score",
    "review_creation_date", "review_answer_timestamp", "total_order_value", "tempo_aprovacao_dias",
    "customer_city", "seller_city", "payment_type_1", "payment_value_1", "product_name_lenght",
    "product_description_lenght", "product_photos_qty", "product_length_cm", "product_height_cm",
    "product_width_cm",
]
df = df.drop(columns=colunas_descartadas)

In [109]:
# Keep only the rows that can be used for training:
#   * a positive delivery time, since that is the value we want to predict
#     (the `> 0` comparison also discards rows where it is missing);
#   * a computed volume and a product weight.
tem_tempo_entrega = df["tempo_entrega_dias"] > 0
tem_volume = df["volume_cm_3"].notnull()
tem_peso = df["product_weight_g"].notnull()
df = df[tem_tempo_entrega & tem_volume & tem_peso]

# Separa dados

In [110]:
# Separate the response variable from every other column.
variavel_resposta = "tempo_entrega_dias"
y = df[variavel_resposta]
X = df.drop(columns=variavel_resposta)

In [125]:
# Hold out 20% of the rows as a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preparacao dos dados

In [126]:
# Numeric values: fit one scaler per column on the training split and store the result
# in a new "<column>_scaled" column.
#
# X_train comes straight out of train_test_split, i.e. it is a selection of rows of X;
# adding columns to such a selection can raise pandas' SettingWithCopyWarning, so work
# on an explicit copy.
X_train = X_train.copy()

# The fitted scalers keep their own names so they can be reused later (e.g. on X_test).
price_scaler = RobustScaler()
freight_scaler = RobustScaler()
product_weight_scaler = MinMaxScaler()
product_volume_scaler = MinMaxScaler()

escaladores_por_coluna = [
    ("price", price_scaler),
    ("freight_value", freight_scaler),
    ("product_weight_g", product_weight_scaler),
    ("volume_cm_3", product_volume_scaler),
]

for coluna, escalador in escaladores_por_coluna:
    # fit_transform returns an (n, 1) array; flatten it to a plain column of values.
    valores = X_train[[coluna]].to_numpy()
    X_train[f"{coluna}_scaled"] = escalador.fit_transform(valores).ravel()

# Modelos

In [113]:
# Numeric columns used as model inputs in the cells below.
features_numericas = [
    "price",
    "freight_value",
    "product_weight_g",
    "volume_cm_3",
    "flag_customer_seller_mesmo_estado",
]

In [153]:
# Preprocessing: RobustScaler on the monetary columns, MinMaxScaler on the physical
# measures; any other input column is passed through unchanged.
escalonamento_por_grupo = [
    ("robust_scaler", RobustScaler(), ["price", "freight_value"]),
    ("min_max_scaler", MinMaxScaler(), ["product_weight_g", "volume_cm_3"]),
]
column_transformers = ColumnTransformer(
    transformers=escalonamento_por_grupo,
    remainder="passthrough",
)

# Full model: preprocessing followed by an ordinary linear regression.
etapas = [
    ("preprocess", column_transformers),
    ("regressor", LinearRegression()),
]
pipe = Pipeline(steps=etapas)

In [154]:
# 5-fold cross-validation of the whole pipeline on the training split.
# The scorer returns the negated RMSE, so values closer to zero are better.
scores = cross_val_score(
    estimator=pipe,
    X=X_train[features_numericas],
    y=y_train,
    scoring="neg_root_mean_squared_error",
    cv=5,
)
scores

array([-8.81822633, -8.64137372, -8.87187385, -8.35963873, -8.76654969])

In [137]:
# Fit the pipeline (scalers + linear regression) on the numeric training features.
pipe.fit(X=X_train[features_numericas], y=y_train)

In [138]:
# Predict on exactly the columns the pipeline was fitted on. Passing the whole X_train
# (identifiers, categorical columns, *_scaled columns) does not match the fitted input;
# with remainder="passthrough" that is at best dependent on column order and can fail
# or feed the wrong column to the regressor.
y_pred = pipe.predict(X_train[features_numericas])

In [139]:
# Mean squared error of the pipeline on the training split.
# Note this is MSE, whereas the cross-validation scores are (negated) RMSE.
mean_squared_error(y_true=y_train, y_pred=y_pred)

75.5636002594007

In [120]:
# Inspect what the preprocessing step produces. The transformer must receive the same
# columns it was fitted on (the numeric features), not the whole X_train.
# Relies on `pipe.fit` having been run first, so that `column_transformers` is fitted.
aux = column_transformers.transform(X_train[features_numericas])

In [122]:
# Display the array returned by the column transformer.
aux

array([[ 2.19226638e+00,  4.40768278e+00,  1.45330860e-01,
         2.47669234e-02],
       [ 9.14070892e-01,  5.17719950e+00,  3.87136673e-01,
         1.17038238e-01],
       [-6.32653061e-01, -1.09789343e+00,  6.18429190e-03,
         6.21537630e-04],
       ...,
       [-3.86680988e-01, -3.56877323e-01,  1.97897341e-02,
         4.09809485e-02],
       [ 1.62083781e-01,  8.30235440e-01,  6.18429190e-03,
         1.28090799e-02],
       [-5.69280344e-01,  2.75092937e-01,  4.94743352e-03,
         9.26901770e-03]])

In [127]:
# Preview the training frame, including the manually added *_scaled columns.
X_train.head(n=5)

Unnamed: 0,order_id,order_purchase_timestamp,order_item_id,price,freight_value,product_category_name,product_weight_g,customer_state,seller_state,tempo_diff_previsto_entrega_dias,volume_cm_3,flag_customer_seller_mesmo_estado,price_scaled,freight_value_scaled,product_weight_g_scaled,volume_cm_3_scaled
46411,175d5d1ff76c0ea865a58e125e22743d,2017-10-29,1,279.0,51.82,informatica_acessorios,5875.0,MG,RS,13.0,7500.0,0,2.192266,4.407683,0.145331,0.024767
77677,35990049382e07dba1a9ef3550cad655,2018-05-10,3,160.0,58.03,construcao_ferramentas_construcao,15650.0,RJ,SP,26.0,34816.0,0,0.914071,5.1772,0.387137,0.117038
106126,c950a394ee0369c853e3a1a55090e287,2018-05-31,1,16.0,7.39,fashion_bolsas_e_acessorios,250.0,SP,SP,23.0,352.0,1,-0.632653,-1.097893,0.006184,0.000622
68473,1e8ff8bb172260d3d1b60936246867e0,2017-03-05,1,14.0,14.11,ferramentas_jardim,300.0,MG,SP,17.0,11232.0,0,-0.654135,-0.26518,0.007421,0.037373
93891,442a8a32e7ffcfcc05413962b75de9c8,2018-03-01,1,225.91,19.15,informatica_acessorios,958.0,GO,MG,9.0,8008.0,0,1.622019,0.359356,0.023698,0.026483


## Regressao Linear

In [147]:
# Plain linear regression on the unscaled numeric features.
lr_model = LinearRegression()
lr_model.fit(X_train[features_numericas], y_train)

In [148]:
# Predictions of the linear regression on the training split.
X_train_numerico = X_train[features_numericas]
y_pred = lr_model.predict(X_train_numerico)

In [150]:
# Training RMSE of the linear regression.
# The `squared=False` argument of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6; taking the square root of the MSE gives the same value on every
# version.
mean_squared_error(y_train, y_pred) ** 0.5

8.692732611751078

In [152]:
# 5-fold cross-validation of a plain (unscaled) linear regression on the training split;
# the scorer returns the negated RMSE.
model = LinearRegression()
scores = cross_val_score(
    estimator=model,
    X=X_train[features_numericas],
    y=y_train,
    scoring="neg_root_mean_squared_error",
    cv=5,
)
scores

array([-8.81822633, -8.64137372, -8.87187385, -8.35963873, -8.76654969])

## Random Forest

In [95]:
# Random forest on the numeric features. The forest is built with random bootstrapping
# and feature sampling, so fix the seed (same value as the train/test split) to make the
# fitted model and its metrics reproducible across runs.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train[features_numericas], y_train)

In [96]:
# Predictions of the random forest on the training split.
X_train_numerico = X_train[features_numericas]
y_pred = rf_model.predict(X_train_numerico)

In [97]:
# Mean squared error of the random forest on the training split
# (MSE, not RMSE — the model was fitted on these same rows).
mean_squared_error(y_true=y_train, y_pred=y_pred)

19.613770159776948

# Resultados atuais

In [84]:
# Per-row squared value of "tempo_diff_previsto_entrega_dias" on the training split.
# NOTE(review): presumably this column is the gap between the estimated and the actual
# delivery, which would make this the squared error of the current estimate — confirm
# against the ETL step. Despite the name, this holds one value per row, not a sum.
somatorio_erro_quadrado = X_train["tempo_diff_previsto_entrega_dias"].pow(2)

In [86]:
# Preview the training frame used for the baseline computation.
X_train.head(n=5)

Unnamed: 0,order_id,order_purchase_timestamp,order_item_id,price,freight_value,product_category_name,product_weight_g,customer_state,seller_state,tempo_diff_previsto_entrega_dias,volume_cm_3,flag_customer_seller_mesmo_estado,price_scaled,freight_value_scaled,product_weight_g_scaled,volume_cm_3_scaled
46411,175d5d1ff76c0ea865a58e125e22743d,2017-10-29,1,279.0,51.82,informatica_acessorios,5875.0,MG,RS,13.0,7500.0,0,2.192266,4.407683,0.145331,0.185529
77677,35990049382e07dba1a9ef3550cad655,2018-05-10,3,160.0,58.03,construcao_ferramentas_construcao,15650.0,RJ,SP,26.0,34816.0,0,0.914071,5.1772,0.387137,0.861249
106126,c950a394ee0369c853e3a1a55090e287,2018-05-31,1,16.0,7.39,fashion_bolsas_e_acessorios,250.0,SP,SP,23.0,352.0,1,-0.632653,-1.097893,0.006184,0.008707
68473,1e8ff8bb172260d3d1b60936246867e0,2017-03-05,1,14.0,14.11,ferramentas_jardim,300.0,MG,SP,17.0,11232.0,0,-0.654135,-0.26518,0.007421,0.277848
93891,442a8a32e7ffcfcc05413962b75de9c8,2018-03-01,1,225.91,19.15,informatica_acessorios,958.0,GO,MG,9.0,8008.0,0,1.622019,0.359356,0.023698,0.198095


In [88]:
# Mean of the squared values computed above (an MSE-style figure, not an RMSE).
# `.mean()` averages over the rows that actually have a value; the previous
# `sum() / len(X_train)` skipped NaN in the numerator but still counted those rows in
# the denominator, which underestimates the figure whenever values are missing.
somatorio_erro_quadrado.mean()

247.0154071317548