# Trabajo Práctico - Rossmann 

* Alumnos:
    - Arribére, María Paz - 62280
    - Dávila, Manuel - 62099

In [1]:
import pandas as pd
import numpy as np
import datetime

from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, Input, Flatten, Concatenate, Dense, BatchNormalization, Activation, LeakyReLU, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras import optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import callbacks

In [2]:
# Read the training and test Feather files into DataFrames (train first, then test).
df, df_test = (
    pd.read_feather(feather_path)
    for feather_path in ('train_normalized_data.fth', 'test_normalized_data.fth')
)

In [3]:
# First day of the hold-out period: rows dated strictly before it go to the
# training split, rows dated on or after it go to the validation split.
VAL_START = datetime.datetime(2015, 7, 1)

# .copy() turns each split into an independent frame. Later cells add columns
# to df_train / df_val; on a bare slice of df that raises pandas'
# SettingWithCopyWarning and leaves it ambiguous whether df is modified too.
df_train = df[df.Date < VAL_START].copy()
df_val = df[df.Date >= VAL_START].copy()

# Share of rows in each split, followed by the absolute row counts.
len(df_train)/len(df), len(df_val)/len(df), len(df), len(df_val)

(0.9642465458145908, 0.035753454185409164, 844338, 30188)

In [4]:
# Training-mode switch read by later cells: when True, X_train / y_train are
# built from the full df and model.fit is called without validation_data; when
# False, only df_train is used for fitting and df_val is passed as validation data.
final_train = False

In [None]:
# Scale factor for the target, taken from the training split only. Predictions
# are multiplied by this same value later to get back to sales units.
max_sales = df_train['Sales'].max()

# assign() returns a new frame that carries the extra column instead of writing
# into a slice of df, so no SettingWithCopyWarning is raised and re-running the
# cell always yields the same result.
df = df.assign(Sales_norm=df['Sales'].values / max_sales)
df_train = df_train.assign(Sales_norm=df_train['Sales'].values / max_sales)
df_val = df_val.assign(Sales_norm=df_val['Sales'].values / max_sales)

In [6]:
def get_metric(sales, sales_):
    """Root mean square percentage error (RMSPE) of predicted vs. actual sales.

    Both inputs are flattened to 1-D float arrays and compared position by
    position, so an (n, 1) prediction array can be scored against an (n,)
    target without accidental broadcasting. Entries whose actual sales are
    zero are left out of the average because the relative error is undefined
    for them.

    Args:
        sales: Actual sales values (array-like).
        sales_: Predicted sales values (array-like) holding the same number of
            elements as ``sales``.

    Returns:
        The RMSPE over the entries where ``sales`` is non-zero, or ``nan`` when
        there is no such entry.

    Raises:
        ValueError: If the two inputs do not hold the same number of elements.
    """
    actual = np.asarray(sales, dtype=float).ravel()
    predicted = np.asarray(sales_, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"sales and sales_ must have the same number of elements, "
            f"got {actual.size} and {predicted.size}"
        )

    # Dividing by a zero target would turn the whole metric into inf/nan.
    scored = actual != 0
    if not scored.any():
        return float('nan')

    relative_error = (actual[scored] - predicted[scored]) / actual[scored]
    return np.sqrt(np.mean(relative_error ** 2))


$\textrm{RMSPE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left(\frac{\hat{y}_i - y_i}{y_i}\right)^2}$


In [7]:
def rmspe(y_true, y_pred):
    """Keras metric: root mean square percentage error of y_pred against y_true.

    The error is taken relative to y_true, squared, averaged over all
    elements, and the square root of that mean is returned.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_pct = K.mean(K.square(relative_error))
    return K.sqrt(mean_squared_pct)

In [8]:
def get_keras_LR(X_columns, hidden_units=1):
    """Build and compile a dense regression model over the given input columns.

    One Input layer is created per entry of X_columns; its width is read from
    the array at the same position in the notebook-level X_train list. The
    inputs are concatenated (when there is more than one) and fed to a Dense
    layer named 'Dense'. With hidden_units == 1 that layer is linear and is the
    model output; with hidden_units > 1 it uses ReLU and is followed by a
    single-unit 'Dense_out' layer.

    Args:
        X_columns: Column names, in the same order as the arrays in X_train.
        hidden_units: Number of units of the 'Dense' layer.

    Returns:
        A Model compiled with Adam (learning rate 0.001), MSE loss and the
        rmspe and 'mse' metrics.
    """
    has_hidden_layer = hidden_units > 1

    inputs = [
        Input(shape=(X_train[idx].shape[1],), name=f"{name}_input")
        for idx, name in enumerate(X_columns)
    ]

    # A single input needs no Concatenate layer; it feeds the Dense layer directly.
    features = Concatenate()(inputs) if len(X_columns) > 1 else inputs[0]

    outputs = Dense(
        hidden_units,
        name='Dense',
        activation='relu' if has_hidden_layer else 'linear',
    )(features)
    if has_hidden_layer:
        outputs = Dense(1, name='Dense_out')(outputs)

    model = Model(inputs, outputs)
    model.compile(optimizers.Adam(learning_rate=0.001), loss='mse', metrics=[rmspe, 'mse'])
    return model


def get_embedings_NN(X_columns, hidden_units = 32, activation = 'relu'):
    """Build and compile a regression network with per-column embeddings.

    Every column gets a width-1 Input. Columns listed in the notebook-level
    embed_outs_dict are passed through an Embedding (output size taken from
    that dict) followed by Flatten; all other columns are used as raw inputs.
    The resulting features are concatenated (when there is more than one
    column), fed to one hidden Dense layer and then to a single-unit output.

    Args:
        X_columns: Column names, in the same order as the arrays in the
            notebook-level X_train list.
        hidden_units: Number of units of the hidden Dense layer.
        activation: Activation of the hidden Dense layer.

    Returns:
        A Model compiled with Adam (learning rate 0.001), MSE loss and the
        rmspe and 'mse' metrics.
    """
    features = []
    inputs = []
    for i, col in enumerate(X_columns):
        inp = Input(shape=(1,), name=f"{col}_input")
        inputs.append(inp)
        if col in embed_outs_dict:
            # NOTE(review): the vocabulary size is the number of distinct
            # training values, which only matches Keras' "largest index + 1"
            # requirement if the column is coded 0..k-1 with no gaps and
            # validation/test contain no unseen codes - confirm upstream.
            n_categories = len(np.unique(X_train[i]))
            embedded = Embedding(n_categories, embed_outs_dict[col], name=f"{col}_embedding", mask_zero=False)(inp)
            features.append(Flatten(name=f"{col}_flatten")(embedded))
        else:
            features.append(inp)

    # With a single column, take the collected feature tensor itself. The
    # previous code read the loop-local `out`, which does not exist when that
    # column has no embedding and therefore raised a NameError.
    merged = Concatenate()(features) if len(X_columns) > 1 else features[0]
    dense_out = Dense(hidden_units, activation=activation)(merged)
    out = Dense(1)(dense_out)
    model = Model(inputs, out)
    model.compile(optimizers.Adam(learning_rate=0.001), loss='mse', metrics=[rmspe, 'mse'])
    return model

In [9]:
# True  -> columns stay integer-coded and the embedding model is built.
# False -> columns are one-hot encoded and the plain dense model is built.
with_embed = True

# Embedding output size for each feature column; its keys also define which
# columns are fed to the model and in which order.
embed_outs_dict = {'Store': 2, 'DayOfWeek': 2, 'Promo': 5, 'Year': 2, 'Month': 2, 'Week': 2, 'Day': 2, 
                   'StoreType': 3, 'Assortment': 3, 'CompetitionDistance': 5, 'Promo2': 3,'PromoInterval': 2,
                    'trend': 2, 'Precipitationmm': 2, 'Mean_TemperatureC':2, 'CloudCover':2, 'Events':2}

X_columns = list(embed_outs_dict.keys())

# Each X_* is a list with one (n_rows, 1) array per column, matching the
# one-Input-per-column layout of the models.
if final_train:
    X_train = np.hsplit(df[X_columns].values, len(X_columns))
    y_train = df['Sales_norm']
else:
    X_train = np.hsplit(df_train[X_columns].values, len(X_columns))
    y_train = df_train['Sales_norm']
    
X_val = np.hsplit(df_val[X_columns].values, len(X_columns))
X_test = np.hsplit(df_test[X_columns].values, len(X_columns))
y_val = df_val['Sales_norm']

if not with_embed:
    for i in range(len(X_train)):
        # Without an explicit num_classes, to_categorical sizes each output by
        # that array's own maximum, so train/val/test could end up with
        # different one-hot widths for the same column. Use one shared width.
        num_classes = int(max(X_train[i].max(), X_val[i].max(), X_test[i].max())) + 1
        X_train[i] = to_categorical(X_train[i], num_classes=num_classes)
        X_val[i] = to_categorical(X_val[i], num_classes=num_classes)
        X_test[i] = to_categorical(X_test[i], num_classes=num_classes)

In [10]:
# Build the architecture selected by with_embed: the embedding network, or the
# dense model with a 16-unit hidden layer.
model = get_embedings_NN(X_columns) if with_embed else get_keras_LR(X_columns, hidden_units=16)
model.summary()

In [11]:
epochs = 15

# Without validation data Keras logs no 'val_rmspe'. In the final-training run
# the callbacks therefore have to watch the training metric; otherwise the
# checkpoint file is never written and neither the learning-rate reduction nor
# early stopping can trigger.
monitor = 'rmspe' if final_train else 'val_rmspe'

# Re-compiling here gives every run of this cell a fresh optimizer state.
model.compile(optimizers.Adam(learning_rate=0.001), loss='mse', metrics=[rmspe, 'mse'])
cbs = [callbacks.ReduceLROnPlateau(monitor=monitor, mode='min', verbose=1, patience=2), 
       callbacks.ModelCheckpoint('best_val_rmspe_final.keras', monitor=monitor, mode='min', verbose=1, save_best_only=True),
       callbacks.EarlyStopping(monitor=monitor, mode='min', patience=5, verbose=1, restore_best_weights=True)]

# Validation data is only passed when part of the data was held out.
fit_kwargs = {} if final_train else {'validation_data': (X_val, y_val)}
history = model.fit(X_train, y_train, epochs=epochs, callbacks=cbs, **fit_kwargs)

Epoch 1/15




[1m25443/25443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0010 - mse: 0.0010 - rmspe: 0.2077




Epoch 1: val_rmspe improved from inf to 0.14845, saving model to best_val_rmspe_final.keras
[1m25443/25443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 9ms/step - loss: 0.0010 - mse: 0.0010 - rmspe: 0.2077 - val_loss: 7.1347e-04 - val_mse: 7.1347e-04 - val_rmspe: 0.1484 - learning_rate: 0.0010
Epoch 2/15
[1m25442/25443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - loss: 5.3835e-04 - mse: 5.3835e-04 - rmspe: 0.1472
Epoch 2: val_rmspe did not improve from 0.14845
[1m25443/25443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 8ms/step - loss: 5.3835e-04 - mse: 5.3835e-04 - rmspe: 0.1472 - val_loss: 7.2221e-04 - val_mse: 7.2221e-04 - val_rmspe: 0.1596 - learning_rate: 0.0010
Epoch 3/15
[1m25435/25443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - loss: 5.0635e-04 - mse: 5.0635e-04 - rmspe: 0.1433
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.

Epoch 3: val_rmspe did not improve from 0.14845
[1m25443

In [12]:
# Reload the weights from the file that the ModelCheckpoint callback
# (save_best_only=True) writes during training, so the following predictions
# come from the best monitored epoch it saved.
model.load_weights('best_val_rmspe_final.keras')

In [13]:
# The network ends in a single-unit Dense layer, so predict() yields one column
# per row; flatten it to one value per row and undo the max_sales scaling.
test_predictions = model.predict(X_test).ravel() * max_sales

# Rows flagged Open == 0 get a prediction of zero. The mask is converted to a
# plain boolean array so it is applied by position.
test_predictions[(df_test['Open'] == 0).to_numpy()] = 0

sample_csv = pd.read_csv('rossmann/sample_submission.csv')
sample_csv['Sales'] = test_predictions
sample_csv.to_csv('submision_Arribere_Davila_final.csv', index=False)

# Kept as the last expression of the cell so the preview is actually rendered.
sample_csv.head()

[1m1284/1284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 8ms/step
