In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from tqdm import tqdm, tqdm_notebook
import xgboost as xgb

from itertools import product

import warnings
warnings.filterwarnings('ignore')

In [112]:
def round_thresh(x, t):
    """Round ``x`` up when its fractional part exceeds ``t``, otherwise down.

    Parameters
    ----------
    x : scalar number to round.
    t : threshold compared (strictly) against ``x % 1``.

    Returns
    -------
    ``np.ceil(x)`` if ``x % 1 > t``, else ``np.floor(x)``.
    """
    fractional_part = x % 1
    return np.ceil(x) if fractional_part > t else np.floor(x)

def rsme(y1, y2):
    """Return the square root of ``mean_squared_error(y1, y2)`` (the RMSE)."""
    mse = mean_squared_error(y1, y2)
    return np.sqrt(mse)

In [113]:
# Load the interim sales table, decoding the file as ISO-8859-1.
vendas = pd.read_csv(
    '../../../data/interim/Vendas.csv',
    encoding="ISO-8859-1",
)

In [114]:
# Normalise inconsistent labels anywhere in the frame: each value in the first
# list is replaced by the value at the same position in the second list.
valores_originais = ['ST', 'CORSÁRIO SUPLEX', 'ESP', 'ESPECIAL']
valores_normalizados = ['00', 'CORSARIO SUPLEX', 'GG', 'GG']
vendas = vendas.replace(valores_originais, valores_normalizados)

In [115]:
# Build a "year + zero-padded month" string key per row and label-encode it
# into an integer DATE_BLOCK column.
chave_periodo = vendas.ANO.apply(str) + vendas.MES.apply(lambda x: str(x).zfill(5))
vendas['DATE_BLOCK'] = LabelEncoder().fit_transform(chave_periodo)

### Fazer as vendas mensais 

In [129]:
# Map each product description to a unit price taken from the non-2018 rows.
# When a description occurs several times, the last row's price is the one kept.
vendas_sem_2018 = vendas[vendas.ANO != 2018]
precos_aproximados = dict(zip(vendas_sem_2018.DESCRICAO, vendas_sem_2018.VLR_UNIT))

In [130]:
# Map each (description, size) pair to a unit price taken from the non-2018 rows.
vendas_sem_2018 = vendas[vendas.ANO != 2018]
chaves_produto = zip(vendas_sem_2018.DESCRICAO, vendas_sem_2018.TAMANHO)
precos = dict(zip(chaves_produto, vendas_sem_2018.VLR_UNIT))

In [6]:
# Group the raw sales rows by period, product, store and size.
a = vendas.groupby(
    ['ANO', 'MES', 'DESCRICAO', 'NOME', 'TAMANHO', 'DATE_BLOCK']
)

In [7]:
# Aggregate each group by summing, turn the group keys back into columns and
# drop the summed DIA and VLR_UNIT columns.
# `axis=1` is passed by keyword: the positional form `drop(labels, 1)` is
# rejected by newer pandas releases.
df = a.sum().reset_index().drop(['DIA', 'VLR_UNIT'], axis=1)
df = df.sort_values(by=['ANO', 'MES', 'DESCRICAO', 'TAMANHO', 'NOME'])

In [8]:
# Preview the first aggregated rows for 2018.
df.loc[df.ANO == 2018].head()

Unnamed: 0,ANO,MES,DESCRICAO,NOME,TAMANHO,DATE_BLOCK,QUANTIDADE
4399,2018.0,1.0,BERMUDA CICLISTA COTTON,ALDEIA DA SERRA,2,24,1.0
4402,2018.0,1.0,BERMUDA CICLISTA COTTON,VERBO DIVINO,2,24,3.0
4396,2018.0,1.0,BERMUDA CICLISTA COTTON,ACLIMAÇÃO,4,24,8.0
4400,2018.0,1.0,BERMUDA CICLISTA COTTON,ALDEIA DA SERRA,4,24,1.0
4403,2018.0,1.0,BERMUDA CICLISTA COTTON,VERBO DIVINO,4,24,13.0


In [9]:
# One row per (product, store, size); one column per DATE_BLOCK holding the
# summed quantity, with missing combinations filled with 0.
df_piv = (
    df.pivot_table(
        index=['DESCRICAO', 'NOME', 'TAMANHO'],
        columns=['DATE_BLOCK'],
        values='QUANTIDADE',
        aggfunc='sum',
        dropna=True,
    )
    .fillna(0)
    .reset_index()
)

In [10]:
# Preview the first five rows of the pivoted table.
df_piv.head(5)

DATE_BLOCK,DESCRICAO,NOME,TAMANHO,0,1,2,3,4,5,6,...,19,20,21,22,23,24,25,26,27,28
0,BERMUDA CICLISTA COLEGIAL,ACLIMAÇÃO,G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BERMUDA CICLISTA COLEGIAL,ACLIMAÇÃO,PP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,BERMUDA CICLISTA COLEGIAL,ALDEIA DA SERRA,P,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,BERMUDA CICLISTA COLEGIAL,ALDEIA DA SERRA,PP,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BERMUDA CICLISTA COLEGIAL,ITAIM,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Position -> (year, month) lookup: 0 -> (2016, 1), 1 -> (2016, 2), ..., 35 -> (2018, 12).
a = {
    bloco: ano_mes
    for bloco, ano_mes in enumerate(product([2016, 2017, 2018], range(1, 13)))
}

In [12]:
def migue(x):
    """Replace string values with NaN and pass every other value through.

    Used to blank out text cells (e.g. identifier columns) that slide into the
    numeric positions of a row when it is shifted.

    ``isinstance`` is used instead of ``type(x) == str`` so that ``str``
    subclasses (such as ``numpy.str_``) are also treated as text.

    Parameters
    ----------
    x : any value.

    Returns
    -------
    ``np.nan`` if ``x`` is a string, otherwise ``x`` unchanged.
    """
    if isinstance(x, str):
        return np.nan
    return x

In [13]:
# Reshape df_piv (one row per product/store/size, one column per DATE_BLOCK)
# into a long table with one row per (product, store, size, block), adding
# lagged quantities as features.
#
# Frames are collected in a list and concatenated once. The previous version
# started from `final_df = 0` and called `final_df.append(k)` inside a bare
# `except:`, which was quadratic and hid real errors. On pandas versions
# without `DataFrame.append` it silently kept only the last row's frame.

# Number of leading identifier columns in df_piv (DESCRICAO, NOME, TAMANHO);
# every column after them holds the quantity of one block.
N_ID_COLS = 3
n_blocks = df_piv.shape[1] - N_ID_COLS

# `a` maps a block position to its (year, month) pair; the lists are the same
# for every row, so they are built once outside the loop.
ano = [a[bloco][0] for bloco in range(n_blocks)]
mes = [a[bloco][1] for bloco in range(n_blocks)]

frames = []
for i in tqdm(range(len(df_piv))):
    row = df_piv.iloc[i]
    # Shifting the whole row moves identifier strings (or shifted-in NaN) into
    # the first quantity positions; `migue` turns those strings into NaN.
    k = pd.DataFrame({'QUANTIDADE': row.values[N_ID_COLS:],
                      'LAST_MONTH': row.shift(1).apply(migue).values[N_ID_COLS:],
                      'LAST_YEAR': row.shift(12).apply(migue).values[N_ID_COLS:],
                      'LAST_YEAR_2': row.shift(24).apply(migue).values[N_ID_COLS:]})
    k['DESCRICAO'] = row.DESCRICAO
    k['TAMANHO'] = row.TAMANHO
    k['NOME'] = row.NOME
    k['ANO'] = ano
    k['MES'] = mes
    frames.append(k)

# Each frame keeps its own 0..n_blocks-1 index, which is the block number and
# is copied into DATE_BLOCK below. Raises ValueError if df_piv has no rows.
final_df = pd.concat(frames)

final_df['LAST_YEAR_MEAN'] = (final_df['LAST_YEAR'] + final_df['LAST_YEAR_2']) / 2
final_df['DATE_BLOCK'] = final_df.index
final_df = final_df[['DATE_BLOCK', 'ANO', 'MES', 'DESCRICAO', 'TAMANHO', 'NOME', 'LAST_MONTH', 'LAST_YEAR', 'LAST_YEAR_2','LAST_YEAR_MEAN', 'QUANTIDADE']]

100%|███████████████████████████████████████████████████████████████████████████████| 670/670 [00:05<00:00, 116.77it/s]


In [14]:
# Order rows chronologically, then by product, size and store.
chaves_ordenacao = ['ANO', 'MES', 'DESCRICAO', 'TAMANHO', 'NOME']
final_df = final_df.sort_values(by=chaves_ordenacao)

#### Último ano

No último ano (2018) não temos registros da loja Itaim, e as outras 2 lojas (todas exceto Verbo Divino) só têm registros até o mês 3.

In [15]:
# Replace the repeated per-product index with a unique 0..n-1 index.
n_linhas = len(final_df)
final_df.index = np.arange(n_linhas)

In [16]:
# Drop 2018 rows after month 3 for every store except VERBO DIVINO.
# The three conditions are combined into one mask instead of chaining
# `df[m1][m2][m3]`, which applies full-length masks to an already filtered
# frame and relies on pandas reindexing them.
sem_registro_2018 = (
    (final_df.ANO == 2018)
    & (final_df.MES > 3)
    & (final_df.NOME != 'VERBO DIVINO')
)
final_df = final_df.drop(final_df[sem_registro_2018].index)

In [17]:
# Drop every 2018 row of the ITAIM store.
# A single combined mask replaces the chained `df[m1][m2]` indexing, which
# applies a full-length mask to an already filtered frame.
itaim_2018 = (final_df.ANO == 2018) & (final_df.NOME == 'ITAIM')
final_df = final_df.drop(final_df[itaim_2018].index)

In [18]:
# Train on the rows before 2018 and validate on the 2018 rows.
# `.copy()` makes both frames independent of final_df, so the column
# assignments done on them later do not target a slice of final_df.
treino = final_df[final_df.ANO < 2018].copy()
validacao = final_df[final_df.ANO == 2018].copy()

#### Data

In [19]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

In [20]:
# Encode product descriptions as integers: the mapping is learned from the
# training rows only and then reused for the validation rows.
le_DESC = LabelEncoder()
le_DESC.fit(treino.DESCRICAO)
treino['DESCRICAO'] = le_DESC.transform(treino.DESCRICAO)
validacao['DESCRICAO'] = le_DESC.transform(validacao.DESCRICAO)

In [21]:
# Encode sizes as integers: the mapping is learned from the training rows only
# and then reused for the validation rows.
le_TAM = LabelEncoder()
le_TAM.fit(treino.TAMANHO)
treino['TAMANHO'] = le_TAM.transform(treino.TAMANHO)
validacao['TAMANHO'] = le_TAM.transform(validacao.TAMANHO)

In [22]:
# Encode store names as integers: the mapping is learned from the training rows
# only and then reused for the validation rows.
le_NOME = LabelEncoder()
le_NOME.fit(treino.NOME)
treino['NOME'] = le_NOME.transform(treino.NOME)
validacao['NOME'] = le_NOME.transform(validacao.NOME)

In [23]:
# Model 1 uses only the calendar and categorical columns: DATE_BLOCK and every
# lag feature are removed. After the drop, the last remaining column is the
# target and all columns before it are the features.
# `axis=1` is passed by keyword (the positional form is rejected by newer
# pandas), and each frame is dropped once instead of once per X/y.
colunas_sem_uso = ['DATE_BLOCK', 'LAST_YEAR', 'LAST_MONTH', 'LAST_YEAR_2', 'LAST_YEAR_MEAN']

treino_base = treino.drop(colunas_sem_uso, axis=1)
X_train = treino_base.iloc[:, :-1]
y_train = treino_base.iloc[:, -1]

validacao_base = validacao.drop(colunas_sem_uso, axis=1)
X_test = validacao_base.iloc[:, :-1]
y_test = validacao_base.iloc[:, -1]

# Prevendo os 5 meses de uma vez

## Treino - Modelo 1

#### Apenas dados normais

In [24]:
# Hyper-parameters for the Model 1 XGBoost regressor.
param = dict(
    max_depth=7,
    subsample=0.85,
    colsample_bylevel=1,
    n_estimators=200,
    learning_rate=0.025,
    random_state=1,
    silent=0,
    reg_alpha=0.1,
    eval_metric='rmse',
)

clf_xgb = xgb.XGBRegressor(**param)

In [25]:
# Fit with early stopping (10 rounds) monitored on the 2018 validation set.
# NOTE(review): the same set is later used to report the score, so that score
# is likely optimistic. Confirm this is acceptable.
conjunto_avaliacao = [(X_test.values, y_test.values)]
clf_xgb.fit(
    X_train.values,
    y_train.values,
    eval_set=conjunto_avaliacao,
    early_stopping_rounds=10,
)

[0]	validation_0-rmse:10.907
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:10.6838
[2]	validation_0-rmse:10.4886
[3]	validation_0-rmse:10.2764
[4]	validation_0-rmse:10.0808
[5]	validation_0-rmse:9.88736
[6]	validation_0-rmse:9.69187
[7]	validation_0-rmse:9.51029
[8]	validation_0-rmse:9.33448
[9]	validation_0-rmse:9.15975
[10]	validation_0-rmse:8.98318
[11]	validation_0-rmse:8.83837
[12]	validation_0-rmse:8.6803
[13]	validation_0-rmse:8.5221
[14]	validation_0-rmse:8.36684
[15]	validation_0-rmse:8.2227
[16]	validation_0-rmse:8.05757
[17]	validation_0-rmse:7.92075
[18]	validation_0-rmse:7.77828
[19]	validation_0-rmse:7.64843
[20]	validation_0-rmse:7.52834
[21]	validation_0-rmse:7.39321
[22]	validation_0-rmse:7.27671
[23]	validation_0-rmse:7.15606
[24]	validation_0-rmse:7.03316
[25]	validation_0-rmse:6.92963
[26]	validation_0-rmse:6.82356
[27]	validation_0-rmse:6.71488
[28]	validation_0-rmse:6.60082
[29]	validation_0-rmse:6.49664
[30]	validation_0-r

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, eval_metric='rmse', gamma=0,
       learning_rate=0.025, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=1, reg_alpha=0.1,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=0,
       subsample=0.85)

In [26]:
# Model 1 predictions for the validation rows.
matriz_validacao = X_test.values
y_pred = clf_xgb.predict(matriz_validacao)

In [27]:
# RMSE of Model 1 after rounding predictions to whole units.
rsme(y1=y_test, y2=np.round(y_pred))

4.2670453630573304

In [28]:
# Side-by-side (actual, predicted) pairs for visual inspection.
[(real, previsto) for real, previsto in zip(y_test, y_pred)]

[(0.0, 0.47847217),
 (0.0, 1.3065271),
 (0.0, 2.0954742),
 (0.0, 0.45692986),
 (0.0, 3.9221282),
 (0.0, 0.63144958),
 (0.0, 0.66673332),
 (0.0, 4.1186838),
 (0.0, 3.0497985),
 (0.0, 3.0551116),
 (0.0, 4.9288516),
 (1.0, 1.5766906),
 (3.0, 4.625072),
 (8.0, 5.2807484),
 (1.0, 3.0953088),
 (13.0, 18.364851),
 (12.0, 5.683928),
 (2.0, 3.0605688),
 (20.0, 17.06307),
 (8.0, 5.4459391),
 (0.0, 3.0971889),
 (13.0, 19.316139),
 (12.0, 5.9631276),
 (0.0, 3.3026078),
 (8.0, 27.215607),
 (5.0, 3.5033627),
 (6.0, 1.8098735),
 (16.0, 15.503657),
 (0.0, 0.59955603),
 (1.0, 2.9220095),
 (0.0, 0.54896498),
 (0.0, 0.55116016),
 (0.0, 0.56177568),
 (4.0, 4.6761026),
 (1.0, 0.68277365),
 (0.0, 0.74143934),
 (2.0, 9.0367413),
 (3.0, 1.5060537),
 (0.0, 1.5916693),
 (8.0, 9.4237318),
 (5.0, 6.1946812),
 (3.0, 2.3954387),
 (11.0, 19.222607),
 (3.0, 6.899406),
 (2.0, 2.6700499),
 (27.0, 22.386118),
 (9.0, 7.0466871),
 (1.0, 4.1409411),
 (37.0, 39.490482),
 (9.0, 7.4088316),
 (4.0, 4.0651655),
 (26.0, 34.55720

## Treino - Modelo 2

#### Apenas dados normais + new_features

In [29]:
# Model 2 keeps DATE_BLOCK and the lag features. The last column of the frame
# is the target; everything before it (after the drop) is a feature.
# `axis=1` is passed by keyword: the positional form is rejected by newer pandas.
#
# NOTE(review): train and validation drop different columns, so the last
# feature position holds LAST_YEAR during training but LAST_YEAR_MEAN at
# prediction time. Since the model is fed `.values`, columns are matched by
# position only. This looks deliberate, but confirm it is intended.
X_train = treino.drop(['LAST_YEAR_2', 'LAST_YEAR_MEAN'], axis=1).iloc[:, :-1]
y_train = treino.iloc[:, -1]

X_test = validacao.drop(['LAST_YEAR_2', 'LAST_YEAR'], axis=1).iloc[:, :-1]
y_test = validacao.iloc[:, -1]

In [30]:
# Hyper-parameters for the Model 2 XGBoost regressor; NaN marks missing values.
param = dict(
    max_depth=4,
    subsample=0.7,
    colsample_bylevel=0.7,
    n_estimators=200,
    learning_rate=0.05,
    random_state=0,
    silent=1,
    reg_alpha=0.4,
    eval_metric='rmse',
    missing=np.nan,
)

clf_xgb = xgb.XGBRegressor(**param)

In [31]:
# Fit Model 2 with early stopping (10 rounds) monitored on the validation set.
conjunto_avaliacao = [(X_test.values, y_test.values)]
clf_xgb.fit(
    X_train.values,
    y_train.values,
    eval_set=conjunto_avaliacao,
    early_stopping_rounds=10,
)

[0]	validation_0-rmse:10.7833
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:10.6153
[2]	validation_0-rmse:10.275
[3]	validation_0-rmse:9.98436
[4]	validation_0-rmse:9.84564
[5]	validation_0-rmse:9.81169
[6]	validation_0-rmse:9.69375
[7]	validation_0-rmse:9.6235
[8]	validation_0-rmse:9.36605
[9]	validation_0-rmse:9.11431
[10]	validation_0-rmse:8.90043
[11]	validation_0-rmse:8.81958
[12]	validation_0-rmse:8.78285
[13]	validation_0-rmse:8.57949
[14]	validation_0-rmse:8.37482
[15]	validation_0-rmse:8.24504
[16]	validation_0-rmse:8.02325
[17]	validation_0-rmse:7.85202
[18]	validation_0-rmse:7.76011
[19]	validation_0-rmse:7.54835
[20]	validation_0-rmse:7.37405
[21]	validation_0-rmse:7.28465
[22]	validation_0-rmse:7.13609
[23]	validation_0-rmse:7.0404
[24]	validation_0-rmse:6.88986
[25]	validation_0-rmse:6.84396
[26]	validation_0-rmse:6.73296
[27]	validation_0-rmse:6.70186
[28]	validation_0-rmse:6.6005
[29]	validation_0-rmse:6.51428
[30]	validation_0-r

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
       colsample_bytree=1, eval_metric='rmse', gamma=0, learning_rate=0.05,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=200, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0.4, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=1, subsample=0.7)

In [32]:
# Model 2 predictions for the validation rows.
matriz_validacao = X_test.values
y_pred_2 = clf_xgb.predict(matriz_validacao)

In [88]:
# RMSE of the simple average of the Model 1 and Model 2 predictions.
rsme(y1=y_test, y2=(y_pred_2 + y_pred)/2)

4.2449427071194705

## Treino - Ex de retreino de tempos em tempos

#### Apenas dados normais 

In [34]:
# Hyper-parameters for the per-block retraining run without lag features.
param = dict(
    max_depth=7,
    subsample=0.85,
    colsample_bylevel=1,
    n_estimators=200,
    learning_rate=0.025,
    random_state=1,
    silent=0,
    reg_alpha=0.1,
    eval_metric='rmse',
)

In [45]:
# For each block 24..28: train on all earlier blocks, predict that block and
# append the predictions to `preds` in block order.
# Changes from the previous version: `.copy()` on the slices so the encoded
# columns are written to independent frames, `axis=1` passed by keyword (the
# positional form is rejected by newer pandas), and each frame dropped once.
colunas_sem_uso = ['DATE_BLOCK', 'LAST_YEAR', 'LAST_MONTH', 'LAST_YEAR_2', 'LAST_YEAR_MEAN']

preds = np.array([])
for i in range(24, 29):

    treino = final_df[final_df.DATE_BLOCK < i].copy()
    validacao = final_df[final_df.DATE_BLOCK == i].copy()

    # Encoders are re-fitted on each training window and reused for validation.
    treino['DESCRICAO'] = le_DESC.fit_transform(treino.DESCRICAO)
    validacao['DESCRICAO'] = le_DESC.transform(validacao.DESCRICAO)
    treino['TAMANHO'] = le_TAM.fit_transform(treino.TAMANHO)
    validacao['TAMANHO'] = le_TAM.transform(validacao.TAMANHO)
    treino['NOME'] = le_NOME.fit_transform(treino.NOME)
    validacao['NOME'] = le_NOME.transform(validacao.NOME)

    # After the drop, the last column is the target; the rest are features.
    treino_base = treino.drop(colunas_sem_uso, axis=1)
    X_train = treino_base.iloc[:, :-1]
    y_train = treino_base.iloc[:, -1]

    validacao_base = validacao.drop(colunas_sem_uso, axis=1)
    X_test = validacao_base.iloc[:, :-1]
    y_test = validacao_base.iloc[:, -1]

    clf_xgb = xgb.XGBRegressor(**param)

    # NOTE(review): the block being predicted is also the early-stopping
    # eval set, so the reported error is likely optimistic. Confirm.
    clf_xgb.fit(X_train.values, y_train.values, eval_set=[(X_test.values, y_test.values)], early_stopping_rounds=10)

    preds = np.append(preds, clf_xgb.predict(X_test.values))

[0]	validation_0-rmse:17.4074
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:17.0172
[2]	validation_0-rmse:16.674
[3]	validation_0-rmse:16.297
[4]	validation_0-rmse:15.9605
[5]	validation_0-rmse:15.6169
[6]	validation_0-rmse:15.2659
[7]	validation_0-rmse:14.9465
[8]	validation_0-rmse:14.636
[9]	validation_0-rmse:14.329
[10]	validation_0-rmse:14.0078
[11]	validation_0-rmse:13.7615
[12]	validation_0-rmse:13.4803
[13]	validation_0-rmse:13.1886
[14]	validation_0-rmse:12.9024
[15]	validation_0-rmse:12.6191
[16]	validation_0-rmse:12.3056
[17]	validation_0-rmse:12.057
[18]	validation_0-rmse:11.7729
[19]	validation_0-rmse:11.5248
[20]	validation_0-rmse:11.3057
[21]	validation_0-rmse:11.047
[22]	validation_0-rmse:10.8374
[23]	validation_0-rmse:10.5921
[24]	validation_0-rmse:10.3395
[25]	validation_0-rmse:10.145
[26]	validation_0-rmse:9.94427
[27]	validation_0-rmse:9.77714
[28]	validation_0-rmse:9.54057
[29]	validation_0-rmse:9.32548
[30]	validation_0-rmse

[160]	validation_0-rmse:4.33557
[161]	validation_0-rmse:4.33761
[162]	validation_0-rmse:4.33772
[163]	validation_0-rmse:4.33796
[164]	validation_0-rmse:4.33731
[165]	validation_0-rmse:4.33398
[166]	validation_0-rmse:4.33205
[167]	validation_0-rmse:4.32614
[168]	validation_0-rmse:4.32607
[169]	validation_0-rmse:4.32527
[170]	validation_0-rmse:4.32553
[171]	validation_0-rmse:4.32581
[172]	validation_0-rmse:4.32615
[173]	validation_0-rmse:4.3256
[174]	validation_0-rmse:4.3241
[175]	validation_0-rmse:4.32349
[176]	validation_0-rmse:4.32586
[177]	validation_0-rmse:4.32498
[178]	validation_0-rmse:4.32727
[179]	validation_0-rmse:4.32588
[180]	validation_0-rmse:4.32699
[181]	validation_0-rmse:4.3266
[182]	validation_0-rmse:4.32721
[183]	validation_0-rmse:4.32846
[184]	validation_0-rmse:4.33134
[185]	validation_0-rmse:4.33137
Stopping. Best iteration:
[175]	validation_0-rmse:4.32349

[0]	validation_0-rmse:4.73988
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-

[36]	validation_0-rmse:5.78769
[37]	validation_0-rmse:5.76931
[38]	validation_0-rmse:5.74971
[39]	validation_0-rmse:5.7168
[40]	validation_0-rmse:5.69587
[41]	validation_0-rmse:5.67584
[42]	validation_0-rmse:5.65528
[43]	validation_0-rmse:5.63144
[44]	validation_0-rmse:5.60355
[45]	validation_0-rmse:5.5968
[46]	validation_0-rmse:5.57172
[47]	validation_0-rmse:5.5791
[48]	validation_0-rmse:5.56216
[49]	validation_0-rmse:5.54932
[50]	validation_0-rmse:5.54197
[51]	validation_0-rmse:5.51977
[52]	validation_0-rmse:5.50547
[53]	validation_0-rmse:5.49469
[54]	validation_0-rmse:5.48001
[55]	validation_0-rmse:5.4626
[56]	validation_0-rmse:5.47248
[57]	validation_0-rmse:5.45647
[58]	validation_0-rmse:5.43006
[59]	validation_0-rmse:5.4269
[60]	validation_0-rmse:5.41611
[61]	validation_0-rmse:5.40159
[62]	validation_0-rmse:5.39088
[63]	validation_0-rmse:5.37504
[64]	validation_0-rmse:5.3723
[65]	validation_0-rmse:5.34699
[66]	validation_0-rmse:5.33749
[67]	validation_0-rmse:5.33461
[68]	validatio

In [48]:
# Ground truth for every block predicted by the retraining loop (blocks > 23).
validacao = final_df.loc[final_df.DATE_BLOCK > 23]
y_test = validacao['QUANTIDADE']

In [49]:
# RMSE of the per-block retraining predictions (no lag features).
rsme(y1=y_test, y2=preds)

4.094795013466805

## Treino - Retreino de tempos em tempos

## New_features

In [72]:
# Hyper-parameters for the per-block retraining run with lag features;
# NaN marks missing values.
param = dict(
    max_depth=6,
    subsample=0.7,
    colsample_bylevel=0.7,
    n_estimators=300,
    learning_rate=0.025,
    random_state=0,
    silent=1,
    reg_alpha=0.4,
    eval_metric='rmse',
    missing=np.nan,
)

In [73]:
# For each block 24..28: train on all earlier blocks using the lag features,
# predict that block and append the predictions to `preds_new_f` in block order.
# Changes from the previous version: `.copy()` on the slices so the encoded
# columns are written to independent frames, and `axis=1` passed by keyword
# (the positional form is rejected by newer pandas).
preds_new_f = np.array([])
for i in range(24, 29):

    treino = final_df[final_df.DATE_BLOCK < i].copy()
    validacao = final_df[final_df.DATE_BLOCK == i].copy()

    # Encoders are re-fitted on each training window and reused for validation.
    treino['DESCRICAO'] = le_DESC.fit_transform(treino.DESCRICAO)
    validacao['DESCRICAO'] = le_DESC.transform(validacao.DESCRICAO)
    treino['TAMANHO'] = le_TAM.fit_transform(treino.TAMANHO)
    validacao['TAMANHO'] = le_TAM.transform(validacao.TAMANHO)
    treino['NOME'] = le_NOME.fit_transform(treino.NOME)
    validacao['NOME'] = le_NOME.transform(validacao.NOME)

    # NOTE(review): training keeps LAST_YEAR while validation keeps
    # LAST_YEAR_MEAN in the same (last) feature position. The model is fed
    # `.values`, so columns are matched by position only. This looks
    # deliberate, but confirm it is intended.
    X_train = treino.drop(['DATE_BLOCK', 'LAST_YEAR_2', 'LAST_YEAR_MEAN'], axis=1).iloc[:, :-1]
    y_train = treino.iloc[:, -1]

    X_test = validacao.drop(['DATE_BLOCK', 'LAST_YEAR', 'LAST_YEAR_2'], axis=1).iloc[:, :-1]
    y_test = validacao.iloc[:, -1]

    clf_xgb = xgb.XGBRegressor(**param)

    # NOTE(review): the block being predicted is also the early-stopping
    # eval set, so the reported error is likely optimistic. Confirm.
    clf_xgb.fit(X_train.values, y_train.values, eval_set=[(X_test.values, y_test.values)], early_stopping_rounds=10)

    preds_new_f = np.append(preds_new_f, clf_xgb.predict(X_test.values))

[0]	validation_0-rmse:17.3636
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:16.9564
[2]	validation_0-rmse:16.6364
[3]	validation_0-rmse:16.2643
[4]	validation_0-rmse:15.9186
[5]	validation_0-rmse:15.8212
[6]	validation_0-rmse:15.5317
[7]	validation_0-rmse:15.163
[8]	validation_0-rmse:14.8428
[9]	validation_0-rmse:14.5115
[10]	validation_0-rmse:14.3472
[11]	validation_0-rmse:14.0488
[12]	validation_0-rmse:13.7704
[13]	validation_0-rmse:13.4408
[14]	validation_0-rmse:13.1268
[15]	validation_0-rmse:12.8084
[16]	validation_0-rmse:12.4839
[17]	validation_0-rmse:12.2235
[18]	validation_0-rmse:11.9595
[19]	validation_0-rmse:11.7746
[20]	validation_0-rmse:11.5604
[21]	validation_0-rmse:11.3074
[22]	validation_0-rmse:11.0383
[23]	validation_0-rmse:10.7901
[24]	validation_0-rmse:10.5653
[25]	validation_0-rmse:10.3738
[26]	validation_0-rmse:10.1617
[27]	validation_0-rmse:10.0305
[28]	validation_0-rmse:9.81073
[29]	validation_0-rmse:9.62004
[30]	validation_

[166]	validation_0-rmse:4.35503
[167]	validation_0-rmse:4.35362
[168]	validation_0-rmse:4.35153
[169]	validation_0-rmse:4.34749
[170]	validation_0-rmse:4.35083
[171]	validation_0-rmse:4.35088
[172]	validation_0-rmse:4.35032
[173]	validation_0-rmse:4.3499
[174]	validation_0-rmse:4.35099
[175]	validation_0-rmse:4.3496
[176]	validation_0-rmse:4.35431
[177]	validation_0-rmse:4.35316
[178]	validation_0-rmse:4.34571
[179]	validation_0-rmse:4.34538
[180]	validation_0-rmse:4.34567
[181]	validation_0-rmse:4.34515
[182]	validation_0-rmse:4.34416
[183]	validation_0-rmse:4.34576
[184]	validation_0-rmse:4.34474
[185]	validation_0-rmse:4.34537
[186]	validation_0-rmse:4.34166
[187]	validation_0-rmse:4.34244
[188]	validation_0-rmse:4.34089
[189]	validation_0-rmse:4.34976
[190]	validation_0-rmse:4.34882
[191]	validation_0-rmse:4.3445
[192]	validation_0-rmse:4.34349
[193]	validation_0-rmse:4.34646
[194]	validation_0-rmse:4.34209
[195]	validation_0-rmse:4.3438
[196]	validation_0-rmse:4.34367
[197]	valida

[93]	validation_0-rmse:3.04683
[94]	validation_0-rmse:3.0454
[95]	validation_0-rmse:3.04228
[96]	validation_0-rmse:3.03796
[97]	validation_0-rmse:3.03408
[98]	validation_0-rmse:3.03308
[99]	validation_0-rmse:3.02906
[100]	validation_0-rmse:3.02662
[101]	validation_0-rmse:3.02331
[102]	validation_0-rmse:3.0178
[103]	validation_0-rmse:3.01298
[104]	validation_0-rmse:3.01236
[105]	validation_0-rmse:3.00902
[106]	validation_0-rmse:3.0064
[107]	validation_0-rmse:3.00504
[108]	validation_0-rmse:3.00258
[109]	validation_0-rmse:2.99481
[110]	validation_0-rmse:2.99458
[111]	validation_0-rmse:2.98922
[112]	validation_0-rmse:2.98686
[113]	validation_0-rmse:2.98518
[114]	validation_0-rmse:2.98379
[115]	validation_0-rmse:2.97926
[116]	validation_0-rmse:2.97793
[117]	validation_0-rmse:2.97576
[118]	validation_0-rmse:2.97377
[119]	validation_0-rmse:2.97229
[120]	validation_0-rmse:2.97003
[121]	validation_0-rmse:2.96497
[122]	validation_0-rmse:2.96135
[123]	validation_0-rmse:2.9623
[124]	validation_0-

[137]	validation_0-rmse:5.02059
[138]	validation_0-rmse:5.00125
[139]	validation_0-rmse:4.99976
[140]	validation_0-rmse:4.99734
[141]	validation_0-rmse:4.99861
[142]	validation_0-rmse:4.9953
[143]	validation_0-rmse:4.9921
[144]	validation_0-rmse:4.99016
[145]	validation_0-rmse:4.98504
[146]	validation_0-rmse:4.98448
[147]	validation_0-rmse:4.98494
[148]	validation_0-rmse:4.98234
[149]	validation_0-rmse:4.98329
[150]	validation_0-rmse:4.9827
[151]	validation_0-rmse:4.98011
[152]	validation_0-rmse:4.97807
[153]	validation_0-rmse:4.97317
[154]	validation_0-rmse:4.97066
[155]	validation_0-rmse:4.96719
[156]	validation_0-rmse:4.96659
[157]	validation_0-rmse:4.96412
[158]	validation_0-rmse:4.96373
[159]	validation_0-rmse:4.96219
[160]	validation_0-rmse:4.95935
[161]	validation_0-rmse:4.95983
[162]	validation_0-rmse:4.95539
[163]	validation_0-rmse:4.95535
[164]	validation_0-rmse:4.95482
[165]	validation_0-rmse:4.95506
[166]	validation_0-rmse:4.95401
[167]	validation_0-rmse:4.9542
[168]	valida

In [74]:
# Ground truth for every block predicted by the retraining loop (blocks > 23).
validacao = final_df.loc[final_df.DATE_BLOCK > 23]
y_test = validacao['QUANTIDADE']

In [75]:
# RMSE of the per-block retraining predictions that use lag features.
rsme(y1=y_test, y2=preds_new_f)

4.2546024822687851

In [76]:
# RMSE of the simple average of both retraining runs.
rsme(y1=y_test, y2=(preds + preds_new_f)/2)

4.0808859940916546

### Comparação

In [77]:
# Naive baseline: quantity from 12 blocks earlier, for the 2018 rows.
y_last_year = final_df.loc[final_df.ANO == 2018, 'LAST_YEAR']

In [79]:
# RMSE of the "same month last year" baseline.
rsme(y1=y_test, y2=y_last_year)

5.6950639981669795

In [80]:
# Naive baseline: quantity from 24 blocks earlier, for the 2018 rows.
y_last_year_2 = final_df.loc[final_df.ANO == 2018, 'LAST_YEAR_2']

In [81]:
# RMSE of the "same month two years ago" baseline.
rsme(y1=y_test, y2=y_last_year_2)

6.140015447460847

In [82]:
# Naive baseline: mean of the 12- and 24-block lags, for the 2018 rows.
y_last_year_mean = final_df.loc[final_df.ANO == 2018, 'LAST_YEAR_MEAN']

In [83]:
# RMSE of the two-year-mean baseline after rounding to whole units.
rsme(y1=y_test, y2=np.round(y_last_year_mean))

4.789359042926411

In [84]:
# Naive baseline: quantity from the previous block, for the 2018 rows.
y_last_month = final_df.loc[final_df.ANO == 2018, 'LAST_MONTH']

In [85]:
# RMSE of the "previous month" baseline.
rsme(y1=y_test, y2=y_last_month)

11.100297928003155

In [136]:
# Attach every set of predictions to the validation rows, one column each.
# Columns are added in this order, which is also their order in the exported CSV.
# NOTE(review): assumes the rows of `validacao` are in the same order as the
# rows each prediction array was produced from. Confirm.
validacao['preds_all_no_features'] = y_pred  # single fit, no lag features
validacao['preds_all_features'] = y_pred_2  # single fit, with lag features
validacao['preds_retrain_no_features'] = preds  # per-block retraining, no lag features
validacao['preds_retrain_features'] = preds_new_f  # per-block retraining, with lag features
validacao['preds_all'] = (y_pred + y_pred_2) / 2  # average of the two single-fit models
validacao['preds_retrain'] = (preds + preds_new_f) / 2  # average of the two retraining runs

In [137]:
# Attach an approximate unit price to each validation row, looked up by
# product description in `precos_aproximados`.
# Fix: the lookup used to reference `preços_aproximados` (with "ç"), a
# different identifier that is never defined in this notebook, so it raised
# NameError.
# A description missing from the dictionary still raises KeyError.
validacao['PRECO_PRODUTO'] = validacao.DESCRICAO.apply(lambda x: precos_aproximados[x])

In [138]:
# Export the validation rows with all predictions, without the index column.
validacao.to_csv(
    './results/validacao.csv',
    index=False,
)