In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# 1. Read data by pandas 
train = pd.read_csv('train.csv', parse_dates=['Date'], low_memory=False) 
test = pd.read_csv('test.csv', parse_dates=['Date'], low_memory=False)
store = pd.read_csv('store.csv', low_memory=False)
#train.count()
#train.head()
#test.head()
#test.count()
#store.head()
#store.count()

# 2. Check data to be sure that all the stores in test and train set have information in store.csv
print("Unique stores in Train:", train['Store'].nunique())
print("Unique stores in Store:", store['Store'].nunique())
print("Unique stores in Test:", test['Store'].nunique())
# T_train = set(pd.read_csv('train.csv', usecols=['Store'])['Store'].unique())
# T_test = set(pd.read_csv('test.csv', usecols=['Store'])['Store'].unique())
# T_store = set(pd.read_csv('store.csv', usecols=['Store'])['Store'].unique())
# is_train_equal_store = T_train.issubset(T_store) and T_store.issubset(T_train)
# is_test_in_train = T_test.issubset(T_train)
# is_test_in_store = T_test.issubset(T_store)


#3. Merge data, get the store information from store.csv into train and test set
train = pd.merge(train, store, on='Store', how='left')
test = pd.merge(test, store, on='Store', how='left')

#4. Handle with Missing Values
def handle_missing_values(df):
    df['CompetitionDistance'] = df['CompetitionDistance'].fillna(df['CompetitionDistance'].median())
    df['CompetitionOpenSinceMonth'] = df['CompetitionOpenSinceMonth'].fillna(0)
    df['CompetitionOpenSinceYear'] = df['CompetitionOpenSinceYear'].fillna(0)
    df['Promo2SinceWeek'] = df['Promo2SinceWeek'].fillna(0)
    df['Promo2SinceYear'] = df['Promo2SinceYear'].fillna(0)
    df['PromoInterval'] = df['PromoInterval'].fillna('0')
    if 'Open' in df.columns:
        df['Open'] = df['Open'].fillna(1)
    return df
# call instance
train = handle_missing_values(train)
test = handle_missing_values(test)

# handling with noise 
train = train[(train["Open"] != 0) & (train["Sales"] > 0)].copy()
y_train = np.log1p(train['Sales'])
test_ids = test['Id']



def feature_engineer_safe(df):
    # time
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['WeekOfYear'] = df['Date'].dt.isocalendar().week.astype(int)
    
    # Competition Open Time
    df['CompetitionOpen'] = 12 * (df['Year'] - df['CompetitionOpenSinceYear']) + \
        (df['Month'] - df['CompetitionOpenSinceMonth']) #How many months has the competitor been open?"
    df['CompetitionOpen'] = df['CompetitionOpen'].apply(lambda x: x if x > 0 else 0) 
    #If the result < 0 (ie the opponent has not opened at that time), set it to 0
    # --- FIXED ENCODING (MANUAL MAPPING) ---
    # Instead of letting the machine number itself, I  stipulate: a->1, b->2...
    
    # 1. StateHoliday
    df['StateHoliday'] = df['StateHoliday'].astype(str).replace({'0':0, 'a':1, 'b':2, 'c':3}).astype(int)
    
    # 2. StoreType
    df['StoreType'] = df['StoreType'].replace({'a':1, 'b':2, 'c':3, 'd':4}).astype(int)
    
    # 3. Assortment
    df['Assortment'] = df['Assortment'].replace({'a':1, 'b':2, 'c':3}).astype(int)
    
    # 4. PromoInterval
    intervals = {'0':0, 'Jan,Apr,Jul,Oct':1, 'Feb,May,Aug,Nov':2, 'Mar,Jun,Sep,Dec':3}
    # Use map to be safe, if there is a strange value it will become NaN -> fillna(0)
    df['PromoInterval'] = df['PromoInterval'].map(intervals).fillna(0).astype(int)

    #5. Store ID: Subtract 1 to run from 0 -> 1114 (suitable for Embedding's index)
    df['Store'] = df['Store'] - 1
    
    # 6. Year: Map year 2013->0, 2014->1, 2015->2
    year_map = {2013:0, 2014:1, 2015:2}
    df['Year'] = df['Year'].map(year_map).fillna(0).astype(int)
    
    # 7. Month: minus 1 to run from  0 -> 11
    df['Month'] = df['Month'] - 1
    
    #8. WeekOfYear: Subtract 1 to run from 0 -> 51
    df['WeekOfYear'] = df['WeekOfYear'] - 1

    return df

train = feature_engineer_safe(train)
test = feature_engineer_safe(test)

# --- PREPARATION OF INPUT FOR MODEL ---
cat_features = ['Store', 'DayOfWeek', 'Month', 'Year', 'WeekOfYear', 'StoreType', 
                'Assortment', 'PromoInterval', 'StateHoliday']

cont_features = ['CompetitionDistance', 'CompetitionOpen', 'Promo', 'SchoolHoliday', 
                 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'Day'] # fill out Open because Train = 1

# scaler
scaler = StandardScaler()
X_train_cont = scaler.fit_transform(train[cont_features]) #dealing with the problem of difference between 1 year vs 75000m
X_test_cont = scaler.transform(test[cont_features])

# --- Build MODEL (EMBEDDING) ---

inputs = []
embeddings = []

vocab_map = {
    'Store': 1115,
    'DayOfWeek': 7,
    'Month': 12,
    'Year': 3,
    'WeekOfYear': 53,
    'StoreType': 5,     # 1-4, add 1 
    'Assortment': 4,    # 1-3
    'PromoInterval': 4, # 0-3
    'StateHoliday': 4   # 0-3
}
#Keras TensorFlow
for col in cat_features:
    inp = Input(shape=(1,), name=f'in_{col}')
    inputs.append(inp)
    # Dim = min(50, size/2)~ Number of characteristics using half the value (but the maximum level is not more than 50).
    dim = min(50, (vocab_map[col]+1)//2) 
    emb = Embedding(vocab_map[col], dim, name=f'emb_{col}')(inp)
    embeddings.append(Flatten()(emb))

# Continuous Input
inp_cont = Input(shape=(len(cont_features),), name='in_cont')
inputs.append(inp_cont)

# concate
x = concatenate(embeddings + [inp_cont])
x = BatchNormalization()(x)

# Hidden Layers
x = Dense(256, activation='relu')(x) #256 neurons
x = Dropout(0.3)(x)
x = BatchNormalization()(x)

x = Dense(128, activation='relu')(x) #128 neurons
x = Dropout(0.3)(x)
x = BatchNormalization()(x)

output = Dense(1, activation='linear')(x) #1 nreuron sales

model = Model(inputs, output)
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse') #Adaptive Moment Estimation

# --- TRAIN ---
X_train_list = [train[col].values for col in cat_features] + [X_train_cont]
X_test_list = [test[col].values for col in cat_features] + [X_test_cont]

model.fit(X_train_list, y_train, 
          validation_split=0.1, # 10% for validation
          epochs=20, 
          batch_size=256, # 256 records/turn
          callbacks=[EarlyStopping(patience=4, restore_best_weights=True)]) #if after 4 epochs the val_loss doesn't improve->stop
# 1 epoch: 2969 turns, 256 each turn 


Unique stores in Train: 1115
Unique stores in Store: 1115
Unique stores in Test: 856


  df['StateHoliday'] = df['StateHoliday'].astype(str).replace({'0':0, 'a':1, 'b':2, 'c':3}).astype(int)
  df['StoreType'] = df['StoreType'].replace({'a':1, 'b':2, 'c':3, 'd':4}).astype(int)
  df['Assortment'] = df['Assortment'].replace({'a':1, 'b':2, 'c':3}).astype(int)
  df['StateHoliday'] = df['StateHoliday'].astype(str).replace({'0':0, 'a':1, 'b':2, 'c':3}).astype(int)
  df['StoreType'] = df['StoreType'].replace({'a':1, 'b':2, 'c':3, 'd':4}).astype(int)
  df['Assortment'] = df['Assortment'].replace({'a':1, 'b':2, 'c':3}).astype(int)


Epoch 1/20
[1m2969/2969[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 3.2391 - val_loss: 0.0559
Epoch 2/20
[1m2969/2969[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.0395 - val_loss: 0.0394
Epoch 3/20
[1m2969/2969[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.0251 - val_loss: 0.0355
Epoch 4/20
[1m2969/2969[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.0195 - val_loss: 0.0338
Epoch 5/20
[1m2969/2969[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 0.0172 - val_loss: 0.0339
Epoch 6/20
[1m2969/2969[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.0160 - val_loss: 0.0325
Epoch 7/20
[1m2969/2969[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.0153 - val_loss: 0.0311
Epoch 8/20
[1m2969/2969[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.0149 - val_loss: 0.0317
Epoch 9/20
[1m2969/29

<keras.src.callbacks.history.History at 0x185990e7980>

In [2]:
# --- Predict ---
preds_log = model.predict(X_test_list).flatten() # (41088, 1) (41088,) array
preds = np.expm1(preds_log) #e^x - 1

submission = pd.DataFrame({'Id': test_ids, 'Sales': preds})

# Handle closure: Retrieve original Open information from test file
test_origin = pd.read_csv('test.csv', usecols=['Id', 'Open'])
# Fill NaN Open = 1 
test_origin['Open'] = test_origin['Open'].fillna(1)
# Handling closure: Retrieve information # Assign Sales = 0 for closed days Open original from test file
submission.loc[test_origin['Open'] == 0, 'Sales'] = 0

submission['Sales'] = submission['Sales'].apply(lambda x: max(0, x))

submission.to_csv('submission.csv', index=False)

[1m1284/1284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 900us/step
