In [1]:
%%time
import fastai
from fastai.tabular.all import *
from pathlib import Path
import os
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
import IPython
from sklearn.ensemble import RandomForestRegressor
from IPython.display import Image, display_svg, SVG
from sklearn.tree import export_graphviz
import waterfall_chart
from treeinterpreter import treeinterpreter
from sklearn.inspection import plot_partial_dependence
from scipy.cluster import hierarchy as hc
import xgboost
from xgboost import XGBRegressor
import optuna

CPU times: user 1.75 s, sys: 1.07 s, total: 2.82 s
Wall time: 2.06 s


In [2]:
# Labelled training rows; the preview below shows the target column `num_sold`.
df = pd.read_csv("train.csv")

In [3]:
# Rows to predict; same columns as train but without `num_sold` (see preview below).
test = pd.read_csv("test.csv")

In [4]:
df.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


In [5]:
test.head()

Unnamed: 0,row_id,date,country,store,product
0,26298,2019-01-01,Finland,KaggleMart,Kaggle Mug
1,26299,2019-01-01,Finland,KaggleMart,Kaggle Hat
2,26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker
3,26301,2019-01-01,Finland,KaggleRama,Kaggle Mug
4,26302,2019-01-01,Finland,KaggleRama,Kaggle Hat


In [6]:
# Placeholder target so `test` has the same columns as `df` and can be stacked with it.
# These zeros are not real labels.
test["num_sold"] = 0

In [7]:
# Stack train on top of test so feature engineering is applied to both at once.
# Each frame keeps its own index here (the tail below still shows 6565..6569);
# the index is reset in a later cell before column-wise concatenation.
combined = pd.concat([df, test])

In [8]:
combined.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


In [9]:
combined.tail()

Unnamed: 0,row_id,date,country,store,product,num_sold
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,0
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,0
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,0
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,0
6569,32867,2019-12-31,Sweden,KaggleRama,Kaggle Sticker,0


In [10]:
len(test)

6570

In [11]:
# Name of the target column, passed to cont_cat_split and TabularPandas below.
dep_var = "num_sold"

In [12]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Lookup tables (category label -> embedding vector) stored under embs/.
# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
country_map = _load_pickle("embs/country_map.pkl")
month_map = _load_pickle("embs/month_map.pkl")
product_map = _load_pickle("embs/product_map.pkl")
store_map = _load_pickle("embs/store_map.pkl")

In [13]:
def _embedding_frame(values, mapping, prefix):
    """Expand every label in `values` into its embedding vector, one column per dimension.

    Parameters
    ----------
    values : pandas Series of category labels; must be non-empty and every label
        must be a key of `mapping`.
    mapping : label -> 1-D vector lookup (indexable by label, vectors expose `.shape`).
    prefix : columns are named ``<prefix>_emb_1`` ... ``<prefix>_emb_<dim>``.

    Returns
    -------
    DataFrame with one row per entry of `values`, in the same order, carrying a
    fresh 0..n-1 index (the index of `values` is not preserved).
    """
    # Probe the dimensionality with a label that is actually present in the data
    # instead of a hard-coded key.
    emb_dim = mapping[values.iloc[0]].shape[0]
    col_name = [f'{prefix}_emb_{i}' for i in range(1, emb_dim + 1)]
    return pd.DataFrame(values.map(mapping).to_list(), columns = col_name)


df_emb_country = _embedding_frame(combined['country'], country_map, 'country')
df_emb_product = _embedding_frame(combined['product'], product_map, 'product')
df_emb_store = _embedding_frame(combined['store'], store_map, 'store')

In [14]:
# Give the stacked frame a clean 0..n-1 index so it lines up row-for-row with the
# embedding frames (which were built with a default index).
combined = combined.reset_index(drop = True)

In [15]:
# Attach the store / product / country embedding columns side by side.
# Alignment is by index label, which is why the index was reset beforehand.
combined = pd.concat(
    [combined, df_emb_store, df_emb_product, df_emb_country],
    axis = "columns",
)

In [16]:
combined.head()

Unnamed: 0,row_id,date,country,store,product,num_sold,store_emb_1,store_emb_2,store_emb_3,product_emb_1,product_emb_2,product_emb_3,country_emb_1,country_emb_2,country_emb_3
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329,0.96775,-0.929185,1.11733,-0.345529,-0.192691,-0.329195,-0.885037,0.88509,-0.866939
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520,0.96775,-0.929185,1.11733,0.764051,0.657293,0.70616,-0.885037,0.88509,-0.866939
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146,0.96775,-0.929185,1.11733,-1.27834,-1.089227,-1.059847,-0.885037,0.88509,-0.866939
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572,-0.300273,0.358641,-0.296122,-0.345529,-0.192691,-0.329195,-0.885037,0.88509,-0.866939
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911,-0.300273,0.358641,-0.296122,0.764051,0.657293,0.70616,-0.885037,0.88509,-0.866939


In [17]:
combined.tail()

Unnamed: 0,row_id,date,country,store,product,num_sold,store_emb_1,store_emb_2,store_emb_3,product_emb_1,product_emb_2,product_emb_3,country_emb_1,country_emb_2,country_emb_3
32863,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat,0,0.96775,-0.929185,1.11733,0.764051,0.657293,0.70616,-0.464027,0.427549,-0.515419
32864,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker,0,0.96775,-0.929185,1.11733,-1.27834,-1.089227,-1.059847,-0.464027,0.427549,-0.515419
32865,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug,0,-0.300273,0.358641,-0.296122,-0.345529,-0.192691,-0.329195,-0.464027,0.427549,-0.515419
32866,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat,0,-0.300273,0.358641,-0.296122,0.764051,0.657293,0.70616,-0.464027,0.427549,-0.515419
32867,32867,2019-12-31,Sweden,KaggleRama,Kaggle Sticker,0,-0.300273,0.358641,-0.296122,-1.27834,-1.089227,-1.059847,-0.464027,0.427549,-0.515419


In [18]:
# Replace `date` with calendar features. Judging by the column lists printed further down,
# this yields Year, Month, Week, Day, Dayofweek, Dayofyear, the Is_* boundary flags and
# Elapsed, and the original `date` column no longer appears afterwards.
combined = add_datepart(combined, 'date')

In [19]:
# Month embeddings are keyed by the integer month produced by add_datepart.
emb_dim = month_map[1].shape[0]
col_name = [f'month_emb_{i + 1}' for i in range(emb_dim)]
month_vectors = combined['Month'].map(month_map).to_list()
df_emb_month = pd.DataFrame(month_vectors, columns = col_name)

In [20]:
# Append the month embedding columns (row alignment relies on both frames having a 0..n-1 index).
combined = pd.concat([combined, df_emb_month], axis = "columns")

In [21]:
# Read the wide GDP table (one column per country) and reshape it to long form:
# one row per (year, country) with the value in `gdp`.
gdp = (
    pd.read_csv("GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv")
    .melt(
        id_vars = 'year',
        value_vars = ['Finland', 'Norway', 'Sweden'],
        var_name = 'country',
        value_name = 'gdp',
    )
)

In [22]:
# Lower-case the `Year` column so it matches the `year` key of the gdp table for the merge below.
combined = combined.rename(columns = {"Year" : "year"})

In [23]:
# Attach GDP per (country, year). Later cells slice `combined` by row position, so the
# merge must not add rows: validate="many_to_one" raises if gdp ever contains a duplicated
# (country, year) key instead of silently duplicating sales rows.
combined = combined.merge(
    gdp,
    on = ["country", "year"],
    how = "left",
    validate = "many_to_one",
)

In [24]:
# `combined` was built as [train rows, test rows]; the train/test boundary is therefore
# len(df) rather than a hard-coded row count.
n_train = len(df)
# Guard: none of the feature-engineering steps above may have added or dropped rows.
assert n_train + len(test) == len(combined), "row count of `combined` changed since train/test were stacked"
train_idx = combined.index[:n_train]
test_idx = combined.index[n_train:]

In [25]:
len(test_idx)

6570

In [26]:
len(test)

6570

In [29]:
# Split the feature columns into continuous and categorical name lists, leaving out the target.
# The second argument is 1 (presumably fastai's `max_card` -- confirm), which is consistent with
# integer date parts such as Month/Week/Day appearing in `cont` in the listing below.
cont, cat = cont_cat_split(combined, 1, dep_var)

In [30]:
cont

['row_id',
 'store_emb_1',
 'store_emb_2',
 'store_emb_3',
 'product_emb_1',
 'product_emb_2',
 'product_emb_3',
 'country_emb_1',
 'country_emb_2',
 'country_emb_3',
 'year',
 'Month',
 'Week',
 'Day',
 'Dayofweek',
 'Dayofyear',
 'Elapsed',
 'month_emb_1',
 'month_emb_2',
 'month_emb_3',
 'month_emb_4',
 'month_emb_5',
 'month_emb_6',
 'month_emb_7',
 'gdp']

In [31]:
# row_id is an identifier, not a feature. Rebuilding the list (instead of list.remove)
# keeps this cell safe to re-run: a second run is a no-op rather than a ValueError.
cont = [name for name in cont if name != "row_id"]

In [32]:
cat

['country',
 'store',
 'product',
 'Is_month_end',
 'Is_month_start',
 'Is_quarter_end',
 'Is_quarter_start',
 'Is_year_end',
 'Is_year_start']

In [33]:
# Preprocessing steps handed to TabularPandas.
procs = [Categorify, FillMissing]
# The test rows are passed in the "valid" slot of the split, so `to.valid` below is really
# the test set; the actual validation rows are carved out of `to.train` further down.
splits = (list(train_idx), list(test_idx))
to = TabularPandas(
    combined,
    procs = procs,
    cat_names = cat,
    cont_names = cont,
    y_names = dep_var,
    splits = splits,
)

In [34]:
# Processed features/target for the labelled rows (fit + validation together) ...
train_valid_xs = to.train.xs
train_valid_y = to.train.y
# ... and for the test rows, which live in the "valid" slot of the TabularPandas split.
testxs = to.valid.xs
testy = to.valid.y

In [35]:
testy

26298    0
26299    0
26300    0
26301    0
26302    0
        ..
32863    0
32864    0
32865    0
32866    0
32867    0
Name: num_sold, Length: 6570, dtype: int16

In [37]:
train_valid_xs

Unnamed: 0,country,store,product,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,store_emb_1,...,Dayofyear,Elapsed,month_emb_1,month_emb_2,month_emb_3,month_emb_4,month_emb_5,month_emb_6,month_emb_7,gdp
0,1,1,2,1,2,1,2,1,2,0.967750,...,1,1.420070e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,42802
1,1,1,1,1,2,1,2,1,2,0.967750,...,1,1.420070e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,42802
2,1,1,3,1,2,1,2,1,2,0.967750,...,1,1.420070e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,42802
3,1,2,2,1,2,1,2,1,2,-0.300273,...,1,1.420070e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,42802
4,1,2,1,1,2,1,2,1,2,-0.300273,...,1,1.420070e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,42802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26293,3,1,1,2,1,2,1,2,1,0.967750,...,365,1.546214e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,54589
26294,3,1,3,2,1,2,1,2,1,0.967750,...,365,1.546214e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,54589
26295,3,2,2,2,1,2,1,2,1,-0.300273,...,365,1.546214e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,54589
26296,3,2,1,2,1,2,1,2,1,-0.300273,...,365,1.546214e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,54589


In [38]:
# The first 22,986 labelled rows are used for fitting; the remainder is held out for validation.
trainxs, trainy = train_valid_xs.iloc[:22986], train_valid_y.iloc[:22986]

In [39]:
# Hold-out block: labelled rows 22,986 .. 26,297, i.e. the tail of the training data.
validxs, validy = (part.iloc[22986:26298] for part in (train_valid_xs, train_valid_y))

In [40]:
# Sanity check: features and target must have matching row counts in each split.
print(trainxs.shape, trainy.shape)
print(validxs.shape, validy.shape)

(22986, 33) (22986,)
(3312, 33) (3312,)


In [41]:
def SMAPE(preds, targs):
    """Symmetric mean absolute percentage error, in percent (range 0..200).

    Computes ``mean(200 * |preds - targs| / (|preds| + |targs|))``. A pair where
    both the prediction and the target are 0 contributes 0 to the mean.

    Parameters
    ----------
    preds, targs : array-likes of equal (or broadcastable) shape -- numpy arrays,
        pandas Series or plain sequences.

    Returns
    -------
    A numpy floating scalar (NaN for empty input, as with ``np.mean``).
    """
    preds = np.asarray(preds)
    targs = np.asarray(targs)
    # Absolute value on BOTH terms: with a bare `targs`, opposite-sign pairs could make the
    # denominator zero or negative and the metric would come out as 0 or below.
    denominator = (np.abs(targs) + np.abs(preds)) / 200.0
    abs_err = np.abs(preds - targs)
    # Divide only where defined; the 0/0 positions keep the pre-filled 0.0, so no
    # divide-by-zero RuntimeWarning is emitted.
    diff = np.zeros_like(denominator)
    np.divide(abs_err, denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

In [42]:
# Gradient-boosted tree regressor with fixed hyper-parameter values; everything not listed
# here (e.g. the number of trees) is left at the library default.
model = XGBRegressor(
    learning_rate = 0.03939193777334498,
    max_depth = 8,
    gamma = 4.0544219317998165,
    alpha = 5.32579150349582,
    reg_lambda = 20.69660325703409,
)

In [43]:
# Fit on the earlier labelled rows only; validxs/validy stay unseen for the SMAPE check below.
# No eval_set is passed, so all n_estimators trees (100 per the repr below) are built.
model.fit(trainxs, trainy)

XGBRegressor(alpha=5.32579150349582, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             enable_categorical=False, gamma=4.0544219317998165, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.03939193777334498, max_delta_step=0, max_depth=8,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=7, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=5.32579136, reg_lambda=20.69660325703409,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [44]:
# SMAPE (in percent) on the rows the model was fitted on.
SMAPE(model.predict(trainxs), trainy)

5.685738

In [45]:
# SMAPE (in percent) on the held-out rows that were not used for fitting.
SMAPE(model.predict(validxs), validy)

6.359276

In [46]:
testxs

Unnamed: 0,country,store,product,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,store_emb_1,...,Dayofyear,Elapsed,month_emb_1,month_emb_2,month_emb_3,month_emb_4,month_emb_5,month_emb_6,month_emb_7,gdp
26298,1,1,2,1,2,1,2,1,2,0.967750,...,1,1.546301e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,48712
26299,1,1,1,1,2,1,2,1,2,0.967750,...,1,1.546301e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,48712
26300,1,1,3,1,2,1,2,1,2,0.967750,...,1,1.546301e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,48712
26301,1,2,2,1,2,1,2,1,2,-0.300273,...,1,1.546301e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,48712
26302,1,2,1,1,2,1,2,1,2,-0.300273,...,1,1.546301e+09,0.139456,0.072881,-0.318967,0.426407,0.222740,0.328248,0.634298,48712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32863,3,1,1,2,1,2,1,2,1,0.967750,...,365,1.577750e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,51687
32864,3,1,3,2,1,2,1,2,1,0.967750,...,365,1.577750e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,51687
32865,3,2,2,2,1,2,1,2,1,-0.300273,...,365,1.577750e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,51687
32866,3,2,1,2,1,2,1,2,1,-0.300273,...,365,1.577750e+09,0.133122,-0.090387,0.225697,-0.282197,-0.173948,0.021887,0.936223,51687


In [47]:
# Predict the test rows (the split that was passed to TabularPandas in the "valid" slot).
preds_test = model.predict(testxs)

In [48]:
preds_test

array([ 379.579  ,  482.33713,  158.26404, ...,  830.3325 , 1272.2435 ,
        360.27972], dtype=float32)

In [49]:
# Use the sample submission file as the template for the output (presumably it already
# carries the id column in the expected layout -- confirm).
sub = pd.read_csv("sample_submission.csv")

In [50]:
# Overwrite the target column with the model output, matched purely by row position.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv -- confirm.
# Predictions are written as raw floats (no rounding or clipping).
sub["num_sold"] = list(preds_test)

In [51]:
# index = False keeps the DataFrame index out of the written CSV.
sub.to_csv("submission.csv", index = False)

In [52]:
! kaggle competitions submit -c tabular-playground-series-jan-2022 -f submission.csv -m "idk man"

100%|████████████████████████████████████████| 101k/101k [00:04<00:00, 25.1kB/s]
Successfully submitted to Tabular Playground Series - Jan 2022