In [1]:
%%time
import fastai
from fastai.tabular.all import *
from pathlib import Path
import os
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
import IPython
from sklearn.ensemble import RandomForestRegressor
from IPython.display import Image, display_svg, SVG
from sklearn.tree import export_graphviz
import waterfall_chart
from treeinterpreter import treeinterpreter
from sklearn.inspection import plot_partial_dependence
from scipy.cluster import hierarchy as hc
import xgboost
from xgboost import XGBRegressor
import optuna

CPU times: user 1.83 s, sys: 1.04 s, total: 2.87 s
Wall time: 2.43 s


In [2]:
df = pd.read_csv("train.csv")

test = pd.read_csv("test.csv")

test["num_sold"] = 0

combined = pd.concat([df, test])

In [3]:
dep_var = "num_sold"

In [4]:
holiday = pd.read_csv("nordic_holidays.csv")

In [5]:
holiday.head()

Unnamed: 0.1,Unnamed: 0,date,holiday,country
0,0,2016-01-01,Uudenvuodenpäivä,Finland
1,1,2016-01-06,Loppiainen,Finland
2,2,2016-03-25,Pitkäperjantai,Finland
3,3,2016-03-27,Pääsiäispäivä,Finland
4,4,2016-03-28,2. pääsiäispäivä,Finland


In [6]:
holiday["holiday"] = str("in_") + holiday["country"]
display(holiday.head())

Unnamed: 0.1,Unnamed: 0,date,holiday,country
0,0,2016-01-01,in_Finland,Finland
1,1,2016-01-06,in_Finland,Finland
2,2,2016-03-25,in_Finland,Finland
3,3,2016-03-27,in_Finland,Finland
4,4,2016-03-28,in_Finland,Finland


In [7]:
combined = combined.merge(holiday[["date", "holiday", "country"]], left_on = ["date", "country"], right_on = ["date", "country"], how = "left")

In [8]:
combined["holiday"] = combined["holiday"].fillna("no_holiday")

In [9]:
country_map = pickle.load(open("embs/country_map.pkl", "rb"))
month_map = pickle.load(open("embs/month_map.pkl", "rb"))
product_map = pickle.load(open("embs/product_map.pkl", "rb"))
store_map = pickle.load(open("embs/store_map.pkl", "rb"))

In [10]:
emb_dim = country_map['Finland'].shape[0]
col_name = [f'country_emb_{i}' for i in range(1, emb_dim + 1)]
df_emb_country = pd.DataFrame(combined['country'].map(country_map).to_list(), columns = col_name)

emb_dim = product_map['Kaggle Mug'].shape[0]
col_name = [f'product_emb_{i}' for i in range(1, emb_dim + 1)]
df_emb_product = pd.DataFrame(combined['product'].map(product_map).to_list(), columns = col_name)

emb_dim = store_map['KaggleMart'].shape[0]
col_name = [f'store_emb_{i}' for i in range(1, emb_dim + 1)]
df_emb_store = pd.DataFrame(combined['store'].map(store_map).to_list(), columns = col_name)

In [11]:
combined.reset_index(drop = True, inplace = True)

In [12]:
combined = pd.concat([combined, df_emb_store, df_emb_product, df_emb_country], axis = 1)

In [13]:
combined[dep_var] = np.log(combined[dep_var])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [14]:
combined = add_datepart(combined, 'date')

In [15]:
emb_dim = month_map[1].shape[0]
col_name = [f'month_emb_{i}' for i in range(1, emb_dim + 1)]
df_emb_month = pd.DataFrame(combined['Month'].map(month_map).to_list(), columns = col_name)

In [16]:
combined = pd.concat([combined, df_emb_month], axis = 1)

In [17]:
gdp = pd.read_csv("GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv")

gdp = gdp.melt(id_vars = 'year', value_vars = ['Finland', 'Norway', 'Sweden'], var_name = 'country', value_name = 'gdp')

In [18]:
combined = combined.rename(columns = {"Year" : "year"})

In [19]:
combined = combined.merge(gdp, on = ["country", "year"], how = "left")

In [20]:
train_idx = combined.iloc[0:22986].index
valid_idx = combined.iloc[22986:26298].index
test_idx = combined.iloc[26298:].index

In [21]:
cont, cat = cont_cat_split(combined, 1, dep_var)

In [22]:
cont

['row_id',
 'store_emb_1',
 'store_emb_2',
 'store_emb_3',
 'product_emb_1',
 'product_emb_2',
 'product_emb_3',
 'country_emb_1',
 'country_emb_2',
 'country_emb_3',
 'year',
 'Month',
 'Week',
 'Day',
 'Dayofweek',
 'Dayofyear',
 'Elapsed',
 'month_emb_1',
 'month_emb_2',
 'month_emb_3',
 'month_emb_4',
 'month_emb_5',
 'month_emb_6',
 'month_emb_7',
 'gdp']

In [23]:
cont.remove("row_id")

In [24]:
cat

['country',
 'store',
 'product',
 'holiday',
 'Is_month_end',
 'Is_month_start',
 'Is_quarter_end',
 'Is_quarter_start',
 'Is_year_end',
 'Is_year_start']

In [25]:
procs = [Categorify, FillMissing]
#train_idx = combined.iloc[:26298]
splits = (list(train_idx), list(valid_idx))
to = TabularPandas(combined, procs, cat, cont, y_names = dep_var, splits = splits)

In [26]:
def SMAPE(preds, targs):
    denominator = (targs + np.abs(preds)) / 200.0
    diff = np.abs(preds - targs) / denominator
    diff[denominator == 0] = 0.0
    return np.mean(diff)

In [27]:
trainxs, trainy = to.train.xs, to.train.y
print (trainxs.shape, trainy.shape)

(22986, 34) (22986,)


In [28]:
validxs, validy = to.valid.xs, to.valid.y
print (validxs.shape, validy.shape)

(3312, 34) (3312,)


In [29]:
model_log = XGBRegressor(learning_rate = 0.44217554329478864, gamma = 0.008005901796296285,
                     alpha = 7.198845069865988, max_depth = 3,
                     reg_lambda = 0.002454298512769308)

In [30]:
model_log.fit(trainxs, trainy)

XGBRegressor(alpha=7.198845069865988, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             enable_categorical=False, gamma=0.008005901796296285, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.44217554329478864, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=7, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=7.19884491,
             reg_lambda=0.002454298512769308, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [32]:
SMAPE(model_log.predict(validxs), validy)

0.9098933320428614

In [33]:
model_exp = XGBRegressor(learning_rate = 0.03939193777334498, gamma = 4.0544219317998165,
                     alpha = 5.32579150349582, max_depth = 8,
                     reg_lambda = 20.69660325703409)

In [34]:
trainy_exp = np.exp(trainy)
validy_exp = np.exp(validy)

In [35]:
model_exp.fit(trainxs, trainy_exp)

XGBRegressor(alpha=5.32579150349582, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             enable_categorical=False, gamma=4.0544219317998165, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.03939193777334498, max_delta_step=0, max_depth=8,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=7, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=5.32579136, reg_lambda=20.69660325703409,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [36]:
SMAPE(model_exp.predict(validxs), validy_exp)

6.34229734015435

In [37]:
## loading neural network model

In [44]:
procs = [Categorify, FillMissing, Normalize]
#train_idx = combined.iloc[:26298]
splits = (list(train_idx), list(valid_idx))
to_nn = TabularPandas(combined, procs, cat, cont, y_names = dep_var, splits = splits)

In [45]:
dls = to_nn.dataloaders(1024, device = "cuda")

In [47]:
learn = load_learner("nn_21.pkl")

In [48]:
ynn, _ = learn.get_preds(dl = dls.valid)

In [49]:
ynn

tensor([[5.4392],
        [6.2024],
        [4.9063],
        ...,
        [6.8990],
        [7.2861],
        [5.9372]])

In [50]:
ynn = np.concatenate([ynn])
ynn = np.squeeze(ynn)
ynn.shape

(3312,)

In [51]:
SMAPE(ynn, validy)

0.6842138562382427

In [52]:
pred_exp = model_exp.predict(validxs)
pred_log = model_log.predict(validxs)

In [53]:
ynn = np.exp(ynn)
pred_log_exp = np.exp(pred_log)

In [54]:
final_pred = (pred_exp + pred_log_exp + ynn) / 3

In [55]:
final_pred[:5]

array([229.87091, 461.1084 , 135.02922, 397.93457, 770.3143 ],
      dtype=float32)

In [56]:
SMAPE(final_pred, validy_exp)

4.623243678663578

In [57]:
## Ensembling

In [58]:
from functools import partial
from scipy.optimize import fmin
class OptimizeSMAPE :
    def __init__(self):
        self.coef_ = 0
        
    def _smape(self, coef, X, y):
        '''
        coef = coef list, of the same length as the no of models
        X = predictions made by every model
        y = targets
        '''
        x_coef = X * coef
        predictions = np.sum(x_coef, axis = 1)
        # calculate the SMAPE score
        smape_score = SMAPE(predictions, y)
        
        return smape_score
    
    def fit(self, X, y):
        loss_partial = partial(self._smape, X = X, y = y)
        initial_coef = np.random.dirichlet(np.ones(X.shape[1]), size = 1)
        
        self.coef_ = fmin(loss_partial, initial_coef, disp = True)
        
    def predict(self, X):
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis = 1)
        return predictions

In [59]:
stack_preds = np.column_stack((pred_exp, pred_log_exp, ynn, final_pred))

In [60]:
stack_preds

array([[ 219.11063,  240.23576,  230.26634,  229.87091],
       [ 420.2454 ,  469.1584 ,  493.9214 ,  461.1084 ],
       [ 126.38105,  143.5669 ,  135.13972,  135.02922],
       ...,
       [ 919.86707, 1054.3684 ,  991.2897 ,  988.50836],
       [1430.302  , 1529.3706 , 1459.8527 , 1473.1752 ],
       [ 405.79718,  410.81213,  378.87787,  398.49573]], dtype=float32)

In [61]:
opt = OptimizeSMAPE()

In [62]:
opt.coef_

0

In [63]:
%%time
opt.fit(stack_preds[:, :-1], validy_exp)

Optimization terminated successfully.
         Current function value: 3.876234
         Iterations: 96
         Function evaluations: 171
CPU times: user 337 ms, sys: 0 ns, total: 337 ms
Wall time: 335 ms


In [64]:
opt.coef_

array([0.02141535, 0.05087608, 0.93324078])

In [65]:
ensemble_preds = opt.predict(stack_preds[:, :-1])

In [66]:
SMAPE(ensemble_preds, validy_exp)

3.876233964507752

In [67]:
test_idx

Int64Index([26298, 26299, 26300, 26301, 26302, 26303, 26304, 26305, 26306,
            26307,
            ...
            32858, 32859, 32860, 32861, 32862, 32863, 32864, 32865, 32866,
            32867],
           dtype='int64', length=6570)

In [68]:
combined_test = combined.iloc[test_idx]

In [69]:
combined_test.head()

Unnamed: 0,row_id,country,store,product,num_sold,holiday,store_emb_1,store_emb_2,store_emb_3,product_emb_1,...,Is_year_start,Elapsed,month_emb_1,month_emb_2,month_emb_3,month_emb_4,month_emb_5,month_emb_6,month_emb_7,gdp
26298,26298,Finland,KaggleMart,Kaggle Mug,-inf,in_Finland,0.96775,-0.929185,1.11733,-0.345529,...,True,1546301000.0,0.139456,0.072881,-0.318967,0.426407,0.22274,0.328248,0.634298,48712
26299,26299,Finland,KaggleMart,Kaggle Hat,-inf,in_Finland,0.96775,-0.929185,1.11733,0.764051,...,True,1546301000.0,0.139456,0.072881,-0.318967,0.426407,0.22274,0.328248,0.634298,48712
26300,26300,Finland,KaggleMart,Kaggle Sticker,-inf,in_Finland,0.96775,-0.929185,1.11733,-1.27834,...,True,1546301000.0,0.139456,0.072881,-0.318967,0.426407,0.22274,0.328248,0.634298,48712
26301,26301,Finland,KaggleRama,Kaggle Mug,-inf,in_Finland,-0.300273,0.358641,-0.296122,-0.345529,...,True,1546301000.0,0.139456,0.072881,-0.318967,0.426407,0.22274,0.328248,0.634298,48712
26302,26302,Finland,KaggleRama,Kaggle Hat,-inf,in_Finland,-0.300273,0.358641,-0.296122,0.764051,...,True,1546301000.0,0.139456,0.072881,-0.318967,0.426407,0.22274,0.328248,0.634298,48712


In [70]:
len(combined_test)

6570

In [71]:
procs = [Categorify, FillMissing]
#train_idx = combined.iloc[:26298]
splits = (list(train_idx), list(test_idx))
to = TabularPandas(combined, procs, cat, cont, y_names = dep_var, splits = splits)

In [72]:
testxs = to.valid.xs

In [73]:
testxs.shape

(6570, 34)

In [74]:
testxs

Unnamed: 0,country,store,product,holiday,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,...,Dayofyear,Elapsed,month_emb_1,month_emb_2,month_emb_3,month_emb_4,month_emb_5,month_emb_6,month_emb_7,gdp
26298,1,1,2,1,1,2,1,2,1,2,...,1,1.546301e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,48712
26299,1,1,1,1,1,2,1,2,1,2,...,1,1.546301e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,48712
26300,1,1,3,1,1,2,1,2,1,2,...,1,1.546301e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,48712
26301,1,2,2,1,1,2,1,2,1,2,...,1,1.546301e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,48712
26302,1,2,1,1,1,2,1,2,1,2,...,1,1.546301e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,48712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32863,3,1,1,3,2,1,2,1,2,1,...,365,1.577750e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,51687
32864,3,1,3,3,2,1,2,1,2,1,...,365,1.577750e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,51687
32865,3,2,2,3,2,1,2,1,2,1,...,365,1.577750e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,51687
32866,3,2,1,3,2,1,2,1,2,1,...,365,1.577750e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,51687


In [75]:
dl = learn.dls.test_dl(combined_test)

In [76]:
testpred, _ = learn.get_preds(dl = dl)

  diff = np.abs(preds - targs) / denominator


In [77]:
testpred

tensor([[5.9997],
        [6.3896],
        [5.2451],
        ...,
        [6.8322],
        [7.2280],
        [5.9246]])

In [78]:
np.exp(testpred)

tensor([[ 403.3218],
        [ 595.6145],
        [ 189.6268],
        ...,
        [ 927.2502],
        [1377.4723],
        [ 374.1382]])

In [79]:
testpred = np.concatenate([testpred])
testpred = np.squeeze(testpred)
testpred = np.exp(testpred)
testpred.shape

(6570,)

In [80]:
testpred_exp = model_exp.predict(testxs)

In [81]:
testpred_exp[:5]

array([371.77856, 473.92432, 159.85696, 579.4384 , 798.0326 ],
      dtype=float32)

In [82]:
testpred_log = model_log.predict(testxs)
testpred_log = np.exp(testpred_log)

In [83]:
testpred_log[:5]

array([345.60696, 551.2417 , 144.10938, 597.68536, 964.5431 ],
      dtype=float32)

In [84]:
testpred_stack = np.column_stack((testpred_exp, testpred_log, testpred))

In [85]:
testpred_stack

array([[ 371.77856,  345.60696,  403.32184],
       [ 473.92432,  551.2417 ,  595.6145 ],
       [ 159.85696,  144.10938,  189.62682],
       ...,
       [ 819.78033,  940.8123 ,  927.25024],
       [1277.4098 , 1351.1858 , 1377.4723 ],
       [ 361.28314,  368.18112,  374.13824]], dtype=float32)

In [86]:
ensemble_test_preds = opt.predict(testpred_stack)

In [87]:
ensemble_test_preds

array([ 401.9412856 ,  594.04601848,  187.72259174, ...,  930.76847333,
       1381.61253864,  375.62968717])

In [88]:
submission = pd.read_csv("sample_submission.csv")

In [89]:
submission.head()

Unnamed: 0,row_id,num_sold
0,26298,100
1,26299,100
2,26300,100
3,26301,100
4,26302,100


In [90]:
submission["num_sold"] = list(ensemble_test_preds)

In [91]:
submission.head()

Unnamed: 0,row_id,num_sold
0,26298,401.941286
1,26299,594.046018
2,26300,187.722592
3,26301,739.815826
4,26302,1124.907473


In [92]:
submission.to_csv("subm/submission.csv", index = False)

In [93]:
! kaggle competitions submit -c tabular-playground-series-jan-2022 -f subm/submission.csv -m "letsgoooo"

100%|████████████████████████████████████████| 157k/157k [00:04<00:00, 32.8kB/s]
Successfully submitted to Tabular Playground Series - Jan 2022

In [96]:
validxs["year"].value_counts()

2018    3312
Name: year, dtype: int64

In [97]:
trainxs["year"].value_counts()

2016    6588
2015    6570
2017    6570
2018    3258
Name: year, dtype: int64