In [37]:
import matplotlib.pyplot as plt
from dataProcessing import DataProcessor
import xgboost as xgb 
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from functions import *
import seaborn as sns
import pandas as pd
import numpy as np
import pickle

# Notebook-wide display and plotting defaults.
pd.options.display.max_columns = None  # None lifts the column cap when frames are rendered
plt.style.use('fivethirtyeight')
# Eight-colour palette kept available for plots.
color_pal = [
    "#F8766D", "#D39200", "#93AA00", "#00BA38",
    "#00C19F", "#00B9E3", "#619CFF", "#DB72FB",
]

In [2]:
# Read the train and test samples through the project's DataProcessor
# (train first, then test).
dt = DataProcessor()
df_train, df_test = (
    dt.read_data(csv_path)
    for csv_path in (
        'data/Desafio 2 - renner_sales_sample_train.csv',
        'data/Desafio 2 - renner_sales_sample_test.csv',
    )
)

In [3]:
# Clean the training frame, then derive its features.
# NOTE: df_train is rebound to the processed frame, so re-running this cell
# feeds already-processed data back through clean()/features().
df_train = dt.features(dt.clean(df_train))

In [4]:
# Clean the test frame, then build its features using per-store sales
# statistics computed from the training data only.
df_test = dt.clean(df_test, test=True)

train_sales_by_store = df_train.groupby('loja')['venda']
store_avgs = train_sales_by_store.mean()
store_stds = train_sales_by_store.std()

df_test = dt.features(
    df_test,
    test=True,
    store_avgs=store_avgs,
    store_stds=store_stds,
)

In [5]:
# Preview the first three rows of the processed train and test frames.
display(df_train.head(3))
display(df_test.head(3))

Unnamed: 0,loja,data,venda,loja_web,index,c_br,c_uy,s_al,s_am,s_ap,s_ba,s_ce,s_df,s_es,s_ex,s_go,s_ma,s_mg,s_ms,s_mt,s_pa,s_pb,s_pe,s_pi,s_pr,s_rj,s_rn,s_ro,s_rr,s_rs,s_sc,s_se,s_sp,s_to,week_of_year,year,store_avg,store_std,holiday
0,renner_ar_ex_1,2019-12-08,68,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,49,2019,46.319672,68.294485,0
1,renner_ar_ex_1,2019-12-15,376,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,50,2019,46.319672,68.294485,0
2,renner_ar_ex_1,2019-12-22,345,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,2019,46.319672,68.294485,1


Unnamed: 0,loja,data,venda,loja_web,index,c_br,c_uy,s_al,s_am,s_ap,s_ba,s_ce,s_df,s_es,s_ex,s_go,s_ma,s_mg,s_ms,s_mt,s_pa,s_pb,s_pe,s_pi,s_pr,s_rj,s_rn,s_ro,s_rr,s_rs,s_sc,s_se,s_sp,s_to,week_of_year,year,store_avg,store_std,holiday
0,renner_ar_ex_1,2022-04-10,12,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,2022,46.319672,68.294485,0
1,renner_ar_ex_1,2022-04-17,6,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,2022,46.319672,68.294485,1
2,renner_ar_ex_1,2022-04-24,4,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,2022,46.319672,68.294485,1


In [49]:
# Features are every column from position 3 onward; the target is 'venda'.
feature_cols = slice(3, None)
X_train, y_train = df_train.iloc[:, feature_cols], df_train['venda']
X_test, y_test = df_test.iloc[:, feature_cols], df_test['venda']

print(X_train.shape, X_test.shape)

(98627, 36) (2949, 36)


In [50]:
# Cast the 'index' feature to int64 in both feature matrices.
# DataFrame.astype returns a new frame, so the iloc slices taken from
# df_train/df_test are rebound instead of being written to in place
# (column assignment on such a slice is what triggers SettingWithCopyWarning).
X_train = X_train.astype({'index': 'int64'})
X_test = X_test.astype({'index': 'int64'})

## Um modelo geral não tunado

In [36]:
# Baseline: an XGBRegressor constructed with no arguments.
xgbreg = xgb.XGBRegressor()
# Both splits are handed to fit() as evaluation sets; verbose=False is passed through.
eval_pairs = [(X_train, y_train), (X_test, y_test)]
xgbreg.fit(X_train, y_train, eval_set=eval_pairs, verbose=False)

# Predict on the test features and report the diagnostics.
predictions = xgbreg.predict(X_test)
plot_diagnostics(y_test, predictions)

MSE: 6380.464862521517
RMSE: 79.87781207895918
MAE: 45.4925378674105


## Um modelo geral tunado

In [51]:
# Load the pickled regressor from disk and evaluate it on the test set.
file_name = 'xgb_reg.pkl'
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(file_name, "rb") as model_file:
    xgbreg_tuned = pickle.load(model_file)
predictions_tuned = xgbreg_tuned.predict(X_test)
plot_diagnostics(y_test, predictions_tuned)

MSE: 3786.894850268197
RMSE: 61.53775142356273
MAE: 32.86509882453823


In [58]:
# Place the tuned model's predictions next to the test rows; the unnamed
# Series becomes column 0.
# concat(axis=1) aligns on index labels, so the Series is built on df_test's
# own index to pair each prediction with its row by position. This also raises
# immediately if the number of predictions differs from the number of rows.
df = pd.concat(
    [df_test, pd.Series(predictions_tuned, index=df_test.index)],
    axis=1,
)
df[['venda', 0]]

Unnamed: 0,venda,0
0,12,6.445513
1,6,7.000585
2,4,7.000585
3,12,7.150594
4,2,6.064761
...,...,...
2944,1,7.975844
2945,1,7.114059
2946,4,13.553653
2947,6,12.402719


In [90]:
# Display the processed training frame (the notebook renders a truncated view).
df_train

Unnamed: 0,loja,data,venda,loja_web,index,c_br,c_uy,s_al,s_am,s_ap,s_ba,s_ce,s_df,s_es,s_ex,s_go,s_ma,s_mg,s_ms,s_mt,s_pa,s_pb,s_pe,s_pi,s_pr,s_rj,s_rn,s_ro,s_rr,s_rs,s_sc,s_se,s_sp,s_to,week_of_year,year,store_avg,store_std,holiday
0,renner_ar_ex_1,2019-12-08,68,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,49,2019,46.319672,68.294485,0
1,renner_ar_ex_1,2019-12-15,376,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,50,2019,46.319672,68.294485,0
2,renner_ar_ex_1,2019-12-22,345,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51,2019,46.319672,68.294485,1
3,renner_ar_ex_1,2019-12-29,219,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,52,2019,46.319672,68.294485,1
4,renner_ar_ex_1,2020-01-05,140,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2020,46.319672,68.294485,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98622,renner_uy_ex_9,2022-03-06,18,0,9,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,2022,125.803279,196.759686,0
98623,renner_uy_ex_9,2022-03-13,12,0,9,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,2022,125.803279,196.759686,0
98624,renner_uy_ex_9,2022-03-20,13,0,9,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,2022,125.803279,196.759686,0
98625,renner_uy_ex_9,2022-03-27,6,0,9,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,2022,125.803279,196.759686,0


## Um modelo para cada loja

In [92]:
# Train one XGBRegressor per store on a reduced feature set and collect its
# predictions for that store's test rows.
STORE_FEATURES = ['week_of_year', 'year', 'store_avg', 'store_std', 'holiday']

# One slot per df_test row, filled by row position, so the final list lines up
# with df_test['venda'] whatever order the stores are visited in. Rows whose
# store could not be predicted stay NaN instead of being silently dropped.
test_predictions = np.full(len(df_test), np.nan, dtype=np.float32)

for loja in df_train.loja.unique():
    test_rows = (df_test.loja == loja).to_numpy()
    if not test_rows.any():
        # This store has no test rows, so there is nothing to predict.
        continue

    df = df_train.loc[df_train.loja == loja]
    dft = df_test.loc[test_rows]
    X_train = df[STORE_FEATURES]
    X_test = dft[STORE_FEATURES]
    y_train = df['venda']
    y_test = dft['venda']

    model = xgb.XGBRegressor()
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)
    try:
        predictions = model.predict(X_test)
    except Exception as exc:
        # Still best-effort per store, but report the failure rather than hiding
        # it; unlike a bare `except`, this does not swallow KeyboardInterrupt.
        print(f'Skipping store {loja!r}: {type(exc).__name__}: {exc}')
        continue
    test_predictions[test_rows] = predictions

# Flat list of float32 values in df_test row order.
predictions_list = list(test_predictions)



In [93]:
# Show the collected per-store predictions as a Series.
pd.Series(predictions_list)

0        2.513319
1       21.979128
2       10.686388
3       17.333849
4        0.923043
          ...    
2944     2.535826
2945     1.261061
2946    23.260344
2947    20.659792
2948    18.564026
Length: 2949, dtype: float32

In [94]:
# Report diagnostics for the per-store predictions against the test sales.
# NOTE(review): this assumes predictions_list is in the same row order as
# df_test — confirm against the cell that builds it.
plot_diagnostics(df_test['venda'], predictions_list)

MSE: 6510.19559780995
RMSE: 80.68578312075772
MAE: 49.712301732108074
