In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
from pandas_summary import DataFrameSummary

In [3]:
# Suppress noisy warnings
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)  # keep only absl messages at ERROR level

import warnings
warnings.filterwarnings('ignore')  # NOTE: this silences every Python warning, deprecations included

In [4]:
# Load the train and test feature tables from their Feather files.
df, df_test = (
    pd.read_feather(feather_path)
    for feather_path in ('train_normalized_data.fth', 'test_normalized_data.fth')
)

In [5]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,index,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,...,AfterStateHoliday_bool,BeforeStateHoliday_bool,AfterPromo,BeforePromo,SchoolHoliday_bw,StateHoliday_bool_bw,Promo_bw,SchoolHoliday_fw,StateHoliday_bool_fw,Promo_fw
0,0,0,4,2015-07-31,5263,555,1,1.273237,0,2.144211,...,0.644376,1.072424,0,0,5,0,5,1,0,1
1,1,1,4,2015-07-31,6064,625,1,1.273237,0,2.144211,...,0.965073,1.072424,0,0,5,0,5,1,0,1
2,2,2,4,2015-07-31,8314,821,1,1.273237,0,2.144211,...,0.644376,1.072424,0,0,5,0,5,1,0,1
3,3,3,4,2015-07-31,13995,1498,1,1.273237,0,2.144211,...,0.965073,1.072424,0,0,5,0,5,1,0,1
4,4,4,4,2015-07-31,4822,559,1,1.273237,0,2.144211,...,0.644376,1.072424,0,0,5,0,5,1,0,1


## Train / Test / Valid split

In [6]:
# (train min, train max, test min, test max) of the Date column.
tuple(
    bound
    for frame in (df, df_test)
    for bound in (frame["Date"].min(), frame["Date"].max())
)

(Timestamp('2013-01-01 00:00:00'),
 Timestamp('2015-07-31 00:00:00'),
 Timestamp('2015-08-01 00:00:00'),
 Timestamp('2015-09-17 00:00:00'))

In [7]:
# Time-based hold-out: rows dated before the cutoff train the model,
# rows on or after it are used for validation.
val_start = dt.datetime(2015, 7, 1)
df_train = df.loc[df.Date < val_start]
df_val = df.loc[df.Date >= val_start]

# Share of rows in each split, followed by the absolute sizes.
n_total = len(df)
round(len(df_train) / n_total, 2), round(len(df_val) / n_total, 2), n_total, len(df_val)

(0.96, 0.04, 844338, 30188)

In [8]:
# df_train[["Store", "Date", "Sales"]]

## Pre Processing

In [9]:
# max_sales = df_train['Sales'].max()
# 
# df_train.loc[:, 'Sales_norm'] = df_train['Sales'].values/max_sales
# df_val.loc[:, 'Sales_norm'] = df_val['Sales'].values/max_sales

In [10]:
# def get_metric(sales, sales_):
#     return np.sqrt((((sales - sales_)/sales)**2).sum()/len(sales))
# 
# sales_norm_mean = df_train['Sales_norm'].mean()
# 
# print(f"""
#     Train: {get_metric(df_train['Sales_norm'], sales_norm_mean)}; 
#     Val: {get_metric(df_val['Sales_norm'], sales_norm_mean)}
#     """)
# 

## Feature encoding

In [11]:
from tensorflow.keras.layers import Embedding, Input, Flatten, Concatenate

2023-02-18 21:13:53.595699: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
# Every categorical column available for embedding. Kept under its own name
# so the list is not silently discarded by the baseline selection below.
candidate_cat_vars = [
    'Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear',
    'Promo2SinceYear', 'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw',
    'StateHoliday_bool_fw', 'StateHoliday_bool_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw'
]

# Categorical columns actually fed to the model in this baseline run.
cat_vars = ['Store', 'DayOfWeek']

In [13]:
# Keep only the 'uniques' row of the per-column summary, as a one-row frame.
cat_summary = DataFrameSummary(df[cat_vars]).summary()
uniques = cat_summary.loc[['uniques']]

In [14]:
# contin_vars = [
#     'CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC', 
#     'Precipitationmm', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
#     'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE', 'AfterStateHoliday_bool', 
#     'BeforeStateHoliday_bool', 'Promo', 'SchoolHoliday', 'StateHoliday_bool'
# ]
# 
# contin_vars = ['BeforeStateHoliday_bool', 'Max_TemperatureC']

In [15]:
# Embedding dimension assigned to each categorical variable.
cat_var_dict = dict(
    Store=8,  # the original note here read "50" (presumably an alternative size)
    DayOfWeek=2,
)

In [16]:

def get_cat_vars_model(cat_vars, uniques, cat_var_dict):
    """Build one Input -> Embedding -> Flatten branch per categorical variable.

    Args:
        cat_vars: Iterable of categorical column names. One branch is created
            per name, in this order.
        uniques: Column-indexed container whose first entry per column is the
            number of distinct values of that variable (e.g. the one-row
            ``uniques`` DataFrame, or a dict of lists).
        cat_var_dict: Mapping from variable name to embedding output size.

    Returns:
        Tuple ``(cat_var_inputs, cat_vars_embed_outs)``: the Keras input
        tensors and the matching flattened embedding outputs, both ordered
        like ``cat_vars``.
    """
    cat_var_inputs = []
    cat_vars_embed_outs = []

    for cat_var in cat_vars:
        cat_var_in = Input(shape=(1,), name=f"{cat_var}_input")
        cat_var_inputs.append(cat_var_in)

        # Take the first entry by position explicitly: ``series[0]`` on a
        # label-indexed pandas Series relies on a deprecated positional
        # fallback. Plain sequences (no ``.iloc``) keep using ``[0]``.
        column = uniques[cat_var]
        first_entry = column.iloc[0] if hasattr(column, "iloc") else column[0]
        # The summary table may hold generic Python objects; Embedding needs an int.
        # NOTE(review): assumes the column is encoded as codes in [0, n_categories) — confirm.
        n_categories = int(first_entry)

        embed_out = Embedding(
            n_categories,           # input dimension (number of categories)
            cat_var_dict[cat_var],  # output dimension (embedding size)
            name=f'{cat_var}_Embed'
        )(cat_var_in)

        # Drop the length-1 sequence axis so the branches can be concatenated.
        flatten_out = Flatten(
            name=f"{cat_var}_flat"
        )(embed_out)

        cat_vars_embed_outs.append(flatten_out)

    return cat_var_inputs, cat_vars_embed_outs

# def get_cont_vars_input(contin_vars, dense_layer=False):
#     cont_vars_inputs = []
#     cont_vars_outputs = []
#     for cont_var in contin_vars:
#         cont_var_in = Input(shape=(1,), name=f"{cont_var}_input")
#         cont_vars_inputs.append(cont_var_in)
#         if dense_layer:
#             cont_var_out = Dense(1, name=f"{cont_var}_input", activation = 'linear')(cont_var_in)
#             cont_vars_outputs.append(cont_var_out)
#         else:
#             cont_vars_outputs.append(cont_var_in)
#     return cont_vars_inputs, cont_vars_outputs

In [17]:
# Create the per-variable embedding branches for the selected categorical columns.
cat_var_inputs, cat_vars_embed_outs = get_cat_vars_model(
    cat_vars=cat_vars,
    uniques=uniques,
    cat_var_dict=cat_var_dict,
)
# cont_vars_inputs,  cont_vars_outs= get_cont_vars_input(contin_vars)

2023-02-18 21:13:59.496425: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# Merge all flattened embedding outputs into a single feature vector.
concat_layer = Concatenate(name='All_Concatenate')
model_encoded_input = concat_layer(cat_vars_embed_outs)

## Dataset

In [19]:
# Inputs: one (n_rows, 1) array per model input, in the order of `all_vars`.
all_vars = cat_vars
n_inputs = len(all_vars)

X_train, X_val, X_test = (
    np.hsplit(frame[all_vars].values, n_inputs)
    for frame in (df_train, df_val, df_test)
)

# Target: sales scaled by the maximum seen in the training split.
y_out_columns = "Sales"
y_max = df_train[y_out_columns].max()
y_train, y_val = (
    frame[y_out_columns].values / y_max
    for frame in (df_train, df_val)
)

# y_train = np.hsplit(y_train, y_train.shape[0])
# y_val = np.hsplit(y_val, y_val.shape[0])

## Neural Network

In [22]:
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import (
    Embedding, Input, Flatten, Concatenate, Dense, 
    BatchNormalization, Activation, LeakyReLU, Dropout
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K

In [23]:
# Metrics
def rmspe(y_true, y_pred):
    """Root mean squared percentage error between targets and predictions.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))`` with Keras
    backend ops, so it can be passed to ``model.compile(metrics=...)``.

    NOTE(review): the ratio is undefined where ``y_true`` is 0; this assumes
    zero-target rows were removed upstream — confirm.
    """
    pct_error = (y_true - y_pred) / y_true
    mean_sq_pct = K.mean(K.square(pct_error))
    return K.sqrt(mean_sq_pct)

In [24]:
# EarlyStopping: stop once the validation loss has not improved for `patience`
# consecutive epochs (0 would stop at the first epoch that gets worse), and
# roll the model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=5,
)

In [25]:
# Tensorboard: per-epoch histograms, no graph or image dumps.
# The default log_dir below is replaced with a timestamped one right before training.
tensorboard = TensorBoard(
    log_dir="logs/fit/model-default",
    histogram_freq=1,
    write_images=False,
    write_graph=False,
)

In [26]:
# ModelCheckpoint: write the model to disk whenever the validation RMSPE
# reaches a new minimum.
mcp = ModelCheckpoint(
    'best_val_rmspe.hdf5',
    monitor='val_rmspe',
    save_best_only=True,
    mode='min',
    verbose=1,
)

In [27]:
# Model: a stack of Dense -> BatchNormalization -> ReLU blocks on top of the
# concatenated embeddings, followed by a single-unit output named 'Sales'.
output_activation = 'linear'
l2_lambda = 1e-3

# One entry per hidden block: width and L2 strength of its Dense kernel.
layers_config = [
    {"hidden_units": hidden_units, "l2_lambda": l2_lambda}
    for hidden_units in (100, 50)
]

# Thread the tensor through the blocks, starting from the encoded inputs.
x = model_encoded_input
for layer in layers_config:
    dense = Dense(
        layer["hidden_units"],
        kernel_initializer="uniform",
        kernel_regularizer=l2(layer["l2_lambda"]),
    )
    x = dense(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)

output_sales = Dense(1, name='Sales', activation=output_activation)(x)

In [28]:
# Wire the categorical inputs to the single sales output.
model = Model(inputs=cat_var_inputs, outputs=[output_sales])

In [29]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Store_input (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 DayOfWeek_input (InputLayer)   [(None, 1)]          0           []                               
                                                                                                  
 Store_Embed (Embedding)        (None, 1, 8)         8920        ['Store_input[0][0]']            
                                                                                                  
 DayOfWeek_Embed (Embedding)    (None, 1, 2)         14          ['DayOfWeek_input[0][0]']        
                                                                                              

In [30]:
# Optimise MSE on the scaled target; track RMSPE and plain MSE as metrics.
optimizer = Adam(learning_rate=0.001)  # alternatives noted by the author: RMSprop, Nadam
model.compile(
    optimizer=optimizer,
    loss="mse",
    metrics=[rmspe, 'mse'],
)

In [None]:
## Tensorboard
# Show the TensorBoard instances known to this notebook session.
from tensorboard import notebook
notebook.list() 

# Two ways to open the dashboard on the training logs (both left disabled):
# %tensorboard --logdir logs/fit/
# !tensorboard --logdir logs/fit/ --port 6007 -> Run in your cli

In [31]:
# assert False

model_name = 'rossman'

# Give every run its own timestamped TensorBoard directory.
run_stamp = dt.datetime.now().strftime('%Y%m%dT%H%M')
tensorboard.log_dir = f"logs/fit/model-{model_name}-{run_stamp}"

EPOCHS = 20

# The ModelCheckpoint callback (`mcp`) is not used for this run.
training_callbacks = [early_stopping, tensorboard]

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=32,
    callbacks=training_callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


In [35]:
# Predictions on the validation inputs, still in the scaled target space
# (targets were divided by y_max). `y_pred` is reassigned in a later cell.
y_pred = model.predict(X_val)
y_true = y_val



In [52]:
# print(f"rmspe: {round(float(rmspe(y_true, y_pred.T)), 3)}; ref value: 0.129")
# Returns the validation values in compile order: [loss, rmspe, mse].
# The loss is the MSE plus the L2 penalties on the Dense kernels, which is why
# it is slightly larger than the 'mse' metric.
model.evaluate(X_val, y_val)



[0.001397569663822651, 0.20941533148288727, 0.0013595435302704573]

In [53]:
# Predict in the scaled space, then multiply by y_max to get back to sales units.
val_scaled = model.predict(X_val, verbose=1)
y_pred = val_scaled[:, 0] * y_max

test_scaled = model.predict(X_test, verbose=1)
y_pred_test = test_scaled[:, 0] * y_max

# Rows flagged as not open get a prediction of zero.
closed_mask = (df_test['Open'] == 0).values
y_pred_test[closed_mask] = 0



In [55]:
# RMSPE over the whole validation split, computed in original sales units.
sales_val = df_val['Sales'].values
pct_err = (sales_val - y_pred) / sales_val
np.sqrt((pct_err ** 2).sum() / len(y_pred))

0.2190960850258629

In [57]:
# Smallest test prediction (rows flagged as closed were set to 0 above).
y_pred_test.min()

0.0

In [None]:
# sample_csv.to_csv(f'submision_baseline_1.csv', index=False)

In [60]:
# Fill the submission template with the test predictions.
# NOTE(review): assumes df_test rows are in the same order as the template's Id column — confirm.
submission_path = 'dataset/rossmann/sample_submission.csv'
sample_csv = pd.read_csv(submission_path).assign(Sales=y_pred_test)
sample_csv.head()

Unnamed: 0,Id,Sales
0,1,4443.552734
1,2,6995.912598
2,3,8655.329102
3,4,5666.340332
4,5,6373.983398


In [62]:
# Write the baseline submission without the index column.
# (An earlier, parameterised file-name scheme was left disabled by the author.)
sample_csv.to_csv('submision_baseline.csv', index=False)
