In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
from pandas_summary import DataFrameSummary

In [3]:
# Silence noisy warnings
import absl.logging
# Only let absl log messages at ERROR level through.
absl.logging.set_verbosity(absl.logging.ERROR)

import warnings
# Suppresses every Python warning from here on (including pandas ones).
warnings.filterwarnings('ignore')

In [4]:
# Load the labelled (train) and unlabelled (test) frames from Feather files.
# NOTE(review): the file names suggest the features were already normalized
# by an earlier step -- confirm where these files are produced.
df = pd.read_feather('train_normalized_data.fth')
df_test = pd.read_feather('test_normalized_data.fth')

In [5]:
# Peek at the first rows of the training frame.
df.head()

Unnamed: 0,index,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,...,AfterStateHoliday_bool,BeforeStateHoliday_bool,AfterPromo,BeforePromo,SchoolHoliday_bw,StateHoliday_bool_bw,Promo_bw,SchoolHoliday_fw,StateHoliday_bool_fw,Promo_fw
0,0,0,4,2015-07-31,5263,555,1,1.273237,0,2.144211,...,0.644376,1.072424,0,0,5,0,5,1,0,1
1,1,1,4,2015-07-31,6064,625,1,1.273237,0,2.144211,...,0.965073,1.072424,0,0,5,0,5,1,0,1
2,2,2,4,2015-07-31,8314,821,1,1.273237,0,2.144211,...,0.644376,1.072424,0,0,5,0,5,1,0,1
3,3,3,4,2015-07-31,13995,1498,1,1.273237,0,2.144211,...,0.965073,1.072424,0,0,5,0,5,1,0,1
4,4,4,4,2015-07-31,4822,559,1,1.273237,0,2.144211,...,0.644376,1.072424,0,0,5,0,5,1,0,1


## Train / Test / Valid split

In [6]:
# Date span of the labelled data and of the test data (min/max of each).
df["Date"].min(), df["Date"].max(), df_test["Date"].min(), df_test["Date"].max()

(Timestamp('2013-01-01 00:00:00'),
 Timestamp('2015-07-31 00:00:00'),
 Timestamp('2015-08-01 00:00:00'),
 Timestamp('2015-09-17 00:00:00'))

In [7]:
# Chronological hold-out: rows dated before the cutoff are used for training,
# rows on/after it for validation.
val_start = dt.datetime(2015, 7, 1)

# .copy() makes both frames independent of `df`; columns are assigned to them
# later, which would otherwise be a chained assignment on a slice.
df_train = df[df.Date < val_start].copy()
df_val = df[df.Date >= val_start].copy()

# (train fraction, validation fraction, total rows, validation rows)
round(len(df_train)/len(df), 2), round(len(df_val)/len(df), 2), len(df), len(df_val)

(0.96, 0.04, 844338, 30188)

In [8]:
# Remove outliers: drop training rows whose Sales fall strictly outside the
# 2.5th-97.5th percentile range (rows equal to a bound are kept).
# NOTE: this cell reassigns df_train from itself, so re-running it trims again.
sales_low = df_train.Sales.quantile(.025)
sales_high = df_train.Sales.quantile(.975)

# .copy() yields an independent frame, so the column assignments done later
# on df_train do not write into a filtered slice.
df_train = df_train[~(
        (df_train.Sales > sales_high) |
        (df_train.Sales < sales_low)
)].copy()

In [9]:
# sns.histplot(data=df_train, x="Sales")

# df_train["Sales_log"] = np.log(df_train.Sales)
# sns.histplot(data=df_train, x="Sales_log")

## Feature Engineering

In [10]:
# WeekOfYear: ISO week number of each row's Date, stored as a string.
# The same column is added to the train, validation, test and full frames.
for frame in (df_train, df_val, df_test, df):
    iso_week = frame['Date'].dt.isocalendar().week
    frame['WeekOfYear'] = iso_week.astype(str)

## Feature encoding

In [11]:
from tensorflow.keras.layers import Embedding, Input, Flatten, Concatenate

2023-02-20 02:47:33.992565: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
# Continuous features: each gets its own single-value Input in the network.
# Names left in trailing comments are alternatives that are currently disabled.
contin_vars = [
    'CompetitionDistance', 
    'Promo', 
    'SchoolHoliday',    
    'StateHoliday_bool',  # 'AfterStateHoliday_bool', 'BeforeStateHoliday_bool', 

    'CloudCover',
    'Precipitationmm',   
    'Max_TemperatureC',  # 'Mean_TemperatureC',   'Min_TemperatureC', 
    'Max_Humidity',  # 'Mean_Humidity', 'Min_Humidity', 
    'Max_Wind_SpeedKm_h',  # 'Mean_Wind_SpeedKm_h',

    # 'trend', 'trend_DE', 
]

In [13]:
# Categorical features: each one is fed through its own Embedding layer.
# Every name listed here needs an entry in cat_var_dict (embedding size).
# Commented-out names are currently disabled.
cat_vars = [
    'Store', 
    'StoreType',
    'Assortment',
    'CompetitionMonthsOpen', 
    'CompetitionOpenSinceYear',  
    # 'CompetitionOpenSinceMonth'
    'State', 
    'Events', 
    'Promo2SinceYear', 
    'Promo2Weeks',
    'PromoInterval', 
    'Promo_fw', 'Promo_bw', 
    'StateHoliday_bool_fw', 'StateHoliday_bool_bw', 
    'SchoolHoliday_fw', 'SchoolHoliday_bw',
    'Year', 
    'Month', 
    'Day', 
    'Week', 
    'DayOfWeek',  
    # 'WeekOfYear'
]  

In [14]:
# Summary of the categorical columns, keeping only the 'uniques' row; the
# value per column is later used as the Embedding input dimension.
# NOTE(review): computed on `df` only -- presumably the categories are encoded
# as 0..n-1 and df_test contains no unseen codes; confirm.
uniques = DataFrameSummary(df[cat_vars]).summary().loc[['uniques']]
# uniques

In [15]:
# Embedding output dimension assigned to each categorical variable.
# Trailing "# n" comments are alternative sizes.
# "StateHoliday" and "WeekOfYear" have entries here but are not in cat_vars.
cat_var_dict = {
    "Store":50,
    
    "State":6,
    "Events":4,    
 
    "StoreType":2,
    "Assortment": 3,
   
    "Promo_fw":1,
    "Promo_bw":1,
    "Promo2Weeks":1,
    "Promo2SinceYear":4,
    "PromoInterval":3,
    
    
    "StateHoliday":3,  # 2
    "StateHoliday_bool_fw":1,
    "StateHoliday_bool_bw":1,
    
    "SchoolHoliday_fw":1,
    "SchoolHoliday_bw":1,
    
    "CompetitionMonthsOpen":2,
    "CompetitionOpenSinceYear":4,

    "Year":2,
    "Month":6,  # 2 
    "Day":10,  # 4
    "DayOfWeek":6,  # 2
    "Week":4,
    "WeekOfYear":4
}

In [16]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2

# L2 penalty strength; currently only referenced by a commented-out
# embeddings_regularizer argument in get_cat_vars_model.
l2_lambda = 0.001


def get_cat_vars_model(cat_vars, uniques, cat_var_dict):
    """Build one Input -> Embedding -> Flatten branch per categorical variable.

    Args:
        cat_vars: iterable of categorical column names.
        uniques: mapping from column name to a one-element container holding
            the number of distinct values of that column (in this notebook, a
            one-row DataFrame indexed by 'uniques').
        cat_var_dict: mapping from column name to embedding output dimension.

    Returns:
        (cat_var_inputs, cat_vars_embed_outs): two lists aligned with
        ``cat_vars`` -- the Input tensors and the flattened embedding outputs.

    Raises:
        KeyError: if a name in ``cat_vars`` is missing from ``uniques`` or
            ``cat_var_dict``.
    """
    cat_var_inputs = []
    cat_vars_embed_outs = []

    for cat_var in cat_vars:

        cat_var_in = Input(shape=(1,), name=f"{cat_var}_input")
        cat_var_inputs.append(cat_var_in)

        # Read the count by position explicitly: `series[0]` on a
        # label-indexed Series relies on a deprecated positional fallback.
        # Plain sequences (no .iloc) keep working through ordinary indexing.
        column = uniques[cat_var]
        first_value = column.iloc[0] if hasattr(column, "iloc") else column[0]
        n_categories = int(first_value)

        embed_out = Embedding(
            n_categories,  # input dimension
            cat_var_dict[cat_var],  # output dimension
            # embeddings_regularizer=l2(l2_lambda),  # did not help
            name=f'{cat_var}_Embed'
        )(cat_var_in)

        flatten_out = Flatten(
            name=f"{cat_var}_flat"
        )(embed_out)

        cat_vars_embed_outs.append(flatten_out)

    return cat_var_inputs, cat_vars_embed_outs


def get_cont_vars_input(contin_vars, dense_layer=False):
    """Build one single-value Input per continuous variable.

    Args:
        contin_vars: iterable of continuous column names.
        dense_layer: when True, each Input is followed by its own linear
            ``Dense(1)`` layer and that layer's output is returned; when
            False, the Input tensor itself is returned as the output.

    Returns:
        (cont_vars_inputs, cont_vars_outputs): two lists aligned with
        ``contin_vars``.
    """
    cont_vars_inputs = []
    cont_vars_outputs = []

    for cont_var in contin_vars:

        cont_var_in = Input(shape=(1,), name=f"{cont_var}_input")
        cont_vars_inputs.append(cont_var_in)

        if dense_layer:
            # The Dense layer needs a name distinct from the Input layer's:
            # a Model that contains two layers with the same name is rejected.
            cont_var_out = Dense(
                1, name=f"{cont_var}_dense", activation='linear'
            )(cont_var_in)
            cont_vars_outputs.append(cont_var_out)
        else:
            cont_vars_outputs.append(cont_var_in)

    return cont_vars_inputs, cont_vars_outputs

In [17]:
# Build the network inputs: an Input/Embedding/Flatten branch per categorical
# variable, and an Input (plus a Dense(1), since dense_layer=True) per
# continuous variable.
cat_var_inputs, cat_vars_embed_outs = get_cat_vars_model(cat_vars, uniques, cat_var_dict)
cont_vars_inputs,  cont_vars_outs= get_cont_vars_input(contin_vars, True)

2023-02-20 02:47:41.349647: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# Join every flattened embedding with the continuous features into one vector.
# NOTE(review): this concatenates cont_vars_inputs (the raw Input tensors), so
# the Dense outputs held in cont_vars_outs are never connected to the model --
# confirm whether cont_vars_outs was intended here.
model_encoded_input = Concatenate(name='All_Concatenate')(
    cat_vars_embed_outs + cont_vars_inputs
)

## Dataset

In [19]:
# Toggle for the target transform: True trains on log(Sales) scaled by its
# maximum, False trains on Sales scaled by its maximum.
log_output = False

In [20]:
# Input
# The model takes one array per Input layer, so each frame is split
# column-wise into single-column arrays, in the order cat_vars + contin_vars.
all_vars = cat_vars + contin_vars

X_train = np.hsplit(df_train[all_vars].values, len(all_vars))
X_val = np.hsplit(df_val[all_vars].values, len(all_vars))
X_test = np.hsplit(df_test[all_vars].values, len(all_vars))

# Target
# Scaled by a maximum taken from the training split only; the same factor is
# applied to the validation target.
y_out_columns = ["Sales"]

if log_output:
    # Column-wise DataFrame.max(), same as the linear branch below.
    # np.max(DataFrame) can return a scalar, which has no `.values`.
    max_log_y = np.log(df_train[y_out_columns]).max().values
    y_train = np.log(df_train[y_out_columns].values)/max_log_y
    y_val = np.log(df_val[y_out_columns].values)/max_log_y
else:
    y_max = df_train[y_out_columns].max().values
    y_train = df_train[y_out_columns].values/y_max
    y_val = df_val[y_out_columns].values/y_max

## Neural Network

In [21]:
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from tensorflow.keras.optimizers import Adam, Nadam, RMSprop
from tensorflow.keras.layers import (
    Embedding, Input, Flatten, Concatenate, Dense, 
    BatchNormalization, Activation, Dropout
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K

In [22]:
# Metrics
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``.

    The error of each element is divided by the true value before squaring,
    so ``y_true`` is expected to contain no zeros.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = K.mean(K.square(relative_error))
    return K.sqrt(mean_squared)

In [23]:
# EarlyStopping
# Stops training once val_loss has gone 10 epochs without improving and
# restores the weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,                 # number of epochs with no improvement (0 means the training 
                                 # is terminated as soon as the performance measure gets worse 
                                 # from one epoch to the next)
    restore_best_weights=True
)

In [24]:
# Tensorboard
# Logging callback; log_dir here is a default that is overwritten with a
# per-run directory right before model.fit is called.
tensorboard = TensorBoard(
    log_dir="logs/fit/model-default",
    histogram_freq=1,
    write_graph=False,
    write_images=False
)

In [25]:
# ModelCheckpoint
# Saves the model to 'best_val_rmspe.hdf5' whenever val_rmspe reaches a new
# minimum. Currently not active: it is commented out of the callbacks list
# passed to model.fit.
mcp = ModelCheckpoint(
    'best_val_rmspe.hdf5', 
    monitor='val_rmspe', 
    mode='min', 
    verbose=1, 
    save_best_only=True
)

In [26]:
# Model
# Sigmoid output: the training target is scaled by its training maximum.
output_activation = "sigmoid"  # 'linear'

hidden_activation = "relu"
kernel_initializer = "uniform"
dropout_rate = 0.02  # applied in front of every hidden Dense layer

# One entry per hidden layer, from the input side to the output side.
layers_config = [
    {"hidden_units": 1000},
    {"hidden_units": 500},
    {"hidden_units": 100},
]

# Start from the concatenated features and stack Dropout -> Dense per entry.
# Seeding x with the input (instead of None) also keeps the output layer
# valid when layers_config is empty.
x = model_encoded_input

for layer in layers_config:

    x = Dropout(rate=dropout_rate)(x)

    x = Dense(
        layer["hidden_units"],
        kernel_initializer=kernel_initializer,
        activation=hidden_activation
    )(x)

output_sales = Dense(1, name='Sales', activation=output_activation)(x)

In [27]:
# Inputs are ordered categorical first, then continuous -- the same order in
# which the X_* arrays are split (cat_vars + contin_vars).
model = Model(cat_var_inputs + cont_vars_inputs, [output_sales])

In [28]:
# model.summary()

In [29]:
# Train on mean absolute error of the scaled target; rmspe is tracked as a
# metric only (early stopping monitors val_loss, i.e. the MAE).
model.compile(
    optimizer=Adam(), # RMSprop, Adam, Nadam
    loss="mean_absolute_error",
    metrics=[rmspe]
)

In [30]:
## Tensorboard
from tensorboard import notebook
# List the TensorBoard instances that are already running.
notebook.list() 

# Ways to launch TensorBoard on the log directory (disabled):
# %tensorboard --logdir logs/fit/
# !tensorboard --logdir logs/fit/ --port 6007 -> Run in your cli

Known TensorBoard instances:
  - port 6006: logdir logs/fit (started 2 days, 6:10:14 ago; pid 29744)
  - port 6007: logdir logs/fit/ (started 1 day, 5:27:57 ago; pid 50796)


In [31]:
# Label for this run; it becomes part of the TensorBoard log directory.
# NOTE(review): the label says 'NoDay' but 'Day' is listed in cat_vars --
# confirm the label matches the features actually used.
model_name = 'rossman-DateInfo-NoDay'
# Point the TensorBoard callback at a fresh, timestamped directory per run.
tensorboard.log_dir = f"logs/fit/model-{model_name}-{dt.datetime.now().strftime('%Y%m%dT%H%M')}"

EPOCHS = 20

# Fit on the training split and evaluate on the validation split each epoch.
history = model.fit(
    X_train, 
    y_train, 
    epochs=EPOCHS, 
    batch_size=128,
    validation_data=(X_val, y_val), 
    callbacks=[
        # mcp,
        early_stopping,
        tensorboard
    ]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [32]:
# Validation predictions and targets, both in the scaled target space.
y_pred = model.predict(X_val)
y_true = y_val



In [33]:
# print(f"rmspe: {round(float(rmspe(y_true, y_pred.T)), 3)}; ref value: 0.129")
# Returns [loss, rmspe] on the validation split (loss is the compiled MAE).
model.evaluate(X_val, y_val)



[0.042117372155189514, 0.11537401378154755]

In [34]:
# Raw network outputs for the validation and test splits (scaled space).
scaled_val = model.predict(X_val, verbose=1)
scaled_test = model.predict(X_test, verbose=1)

# Undo the target scaling so predictions are expressed in Sales units.
if log_output:
    y_pred = np.exp(scaled_val * max_log_y)[:,0]
    y_pred_test = np.exp(scaled_test * max_log_y)[:,0]
else:
    y_pred = scaled_val[:,0] * y_max
    y_pred_test = scaled_test[:,0] * y_max

# Stores flagged as closed (Open == 0) are forced to zero sales.
closed_mask = df_test['Open'] == 0
y_pred_test[closed_mask] = 0



In [35]:
# RMSPE on the validation split, computed in Sales units with numpy.
sales_true = df_val['Sales'].values
pct_error = (sales_true - y_pred)/sales_true
np.sqrt((pct_error**2).sum()/len(y_pred))

# 0.15541181993250597

0.1247471297854046

In [36]:
# Fill the sample submission's Sales column with the test predictions.
# NOTE(review): the assignment is positional -- it assumes df_test rows are in
# the same order as the Ids in sample_submission.csv; confirm.
sample_csv = pd.read_csv('dataset/rossmann/sample_submission.csv')
sample_csv['Sales'] = y_pred_test
sample_csv.head()

Unnamed: 0,Id,Sales
0,1,3998.825221
1,2,6921.223005
2,3,9056.508061
3,4,7005.629444
4,5,6937.571876


In [37]:
# Write the submission file, leaving out the DataFrame index column.
sample_csv.to_csv('submision_baseline.csv', index=False)
