In [2]:
%%time
import fastai
from fastai.tabular.all import *
from pathlib import Path
import os
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
import IPython
from sklearn.ensemble import RandomForestRegressor
from IPython.display import Image, display_svg, SVG
from sklearn.tree import export_graphviz
import waterfall_chart
from treeinterpreter import treeinterpreter
from sklearn.inspection import plot_partial_dependence
from scipy.cluster import hierarchy as hc

CPU times: user 20.4 ms, sys: 4.08 ms, total: 24.5 ms
Wall time: 28.7 ms


In [3]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/home/.kaggle’: File exists


In [4]:
# Copy kaggle.json from the working directory into ~/.kaggle/
# (presumably the Kaggle API credentials file -- confirm).
! cp kaggle.json ~/.kaggle/

In [5]:
# Make kaggle.json readable and writable by its owner only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
# List datasets through the Kaggle CLI; presumably a quick check that the CLI
# and the credentials work.
! kaggle datasets list

ref                                                         title                                              size  lastUpdated          downloadCount  voteCount  usabilityRating  
----------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
gpreda/reddit-vaccine-myths                                 Reddit Vaccine Myths                              237KB  2021-12-12 11:59:54          18544       1440  1.0              
crowww/a-large-scale-fish-dataset                           A Large Scale Fish Dataset                          3GB  2021-04-28 17:03:01          11215        786  0.9375           
imsparsh/musicnet-dataset                                   MusicNet Dataset                                   22GB  2021-02-18 14:12:19           5751        370  1.0              
dhruvildave/wikibooks-dataset                               Wikibooks Dataset             

In [7]:
# Download the data archive of the tabular-playground-series-jan-2022 competition.
! kaggle competitions download -c tabular-playground-series-jan-2022

Downloading tabular-playground-series-jan-2022.zip to /home/kaggle-tabular
100%|████████████████████████████████████████| 230k/230k [00:00<00:00, 1.55MB/s]
100%|████████████████████████████████████████| 230k/230k [00:00<00:00, 1.54MB/s]


In [9]:
! unzip /home/kaggle-tabular/tabular-playground-series-jan-2022.zip

Archive:  /home/kaggle-tabular/tabular-playground-series-jan-2022.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [10]:
%%time
# Load the training data from train.csv into df.
df = pd.read_csv("train.csv", low_memory = False)

CPU times: user 31.3 ms, sys: 1.32 ms, total: 32.6 ms
Wall time: 29.9 ms


In [11]:
# Preview the first rows of the raw training data.
df.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


In [12]:
%%time
# Count the distinct values in the country, store and product columns.
df[["country", "store", "product"]].nunique()

CPU times: user 14.6 ms, sys: 5.31 ms, total: 19.9 ms
Wall time: 16.6 ms


country    3
store      2
product    3
dtype: int64

In [13]:
# Show how many rows carry each value of country, store and product.
categorical_cols = ("country", "store", "product")
for col in categorical_cols:
    display(df[col].value_counts())

Finland    8766
Norway     8766
Sweden     8766
Name: country, dtype: int64

KaggleMart    13149
KaggleRama    13149
Name: store, dtype: int64

Kaggle Mug        8766
Kaggle Hat        8766
Kaggle Sticker    8766
Name: product, dtype: int64

In [14]:
%%time
df = add_datepart(df, 'date')

CPU times: user 75.6 ms, sys: 2.66 ms, total: 78.3 ms
Wall time: 75.2 ms


In [15]:
# Preview df after add_datepart.
df.head()

Unnamed: 0,row_id,country,store,product,num_sold,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,0,Finland,KaggleMart,Kaggle Mug,329,2015,1,1,1,3,1,False,True,False,True,False,True,1420070000.0
1,1,Finland,KaggleMart,Kaggle Hat,520,2015,1,1,1,3,1,False,True,False,True,False,True,1420070000.0
2,2,Finland,KaggleMart,Kaggle Sticker,146,2015,1,1,1,3,1,False,True,False,True,False,True,1420070000.0
3,3,Finland,KaggleRama,Kaggle Mug,572,2015,1,1,1,3,1,False,True,False,True,False,True,1420070000.0
4,4,Finland,KaggleRama,Kaggle Hat,911,2015,1,1,1,3,1,False,True,False,True,False,True,1420070000.0


In [16]:
# Earliest and latest year present in the training data.
df["Year"].min(), df["Year"].max()

(2015, 2018)

In [17]:
# Number of rows per year.
df["Year"].value_counts()

2016    6588
2015    6570
2017    6570
2018    6570
Name: Year, dtype: int64

In [18]:
# Time-based split: rows from years before 2018 form the training set,
# all remaining rows form the validation set.
cond = df.Year < 2018
train_idx = np.flatnonzero(cond)
valid_idx = np.flatnonzero(~cond)

In [19]:
%%time
print (len(train_idx))
print (len(valid_idx))

19728
6570
CPU times: user 318 µs, sys: 155 µs, total: 473 µs
Wall time: 260 µs


In [20]:
# Target column, (train, valid) index lists, and the continuous/categorical
# column lists as decided by cont_cat_split with max_card=1.
dep_var = "num_sold"
splits = (list(train_idx), list(valid_idx))
cont, cat = cont_cat_split(df, max_card=1, dep_var=dep_var)

In [21]:
# Columns that cont_cat_split classified as continuous.
cont

['row_id', 'Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Elapsed']

In [22]:
# Columns that cont_cat_split classified as categorical.
cat

['country',
 'store',
 'product',
 'Is_month_end',
 'Is_month_start',
 'Is_quarter_end',
 'Is_quarter_start',
 'Is_year_end',
 'Is_year_start']

In [23]:
# Keep row_id out of the continuous inputs (it looks like a plain row
# identifier -- confirm). Filtering instead of list.remove keeps this cell
# re-runnable: a second run is a no-op rather than a ValueError.
cont = [name for name in cont if name != 'row_id']

In [24]:
# Processors applied by TabularPandas, then the processed train/valid dataset.
procs = [Categorify, FillMissing]
to = TabularPandas(
    df,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    splits=splits,
)

In [25]:
# Sizes of the training and validation subsets of to.
len(to.train), len(to.valid)

(19728, 6570)

In [26]:
# Show three rows of to (categorical columns appear with their original labels in the output).
to.show(3)

Unnamed: 0,country,store,product,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Year,Month,Week,Day,Dayofweek,Dayofyear,Elapsed,num_sold
0,Finland,KaggleMart,Kaggle Mug,False,True,False,True,False,True,2015,1,1,1,3,1,1420070000.0,329
1,Finland,KaggleMart,Kaggle Hat,False,True,False,True,False,True,2015,1,1,1,3,1,1420070000.0,520
2,Finland,KaggleMart,Kaggle Sticker,False,True,False,True,False,True,2015,1,1,1,3,1,1420070000.0,146


In [27]:
# First three rows of the underlying frame to.items (categorical columns appear as integer codes in the output).
to.items.head(3)

Unnamed: 0,row_id,country,store,product,num_sold,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,0,1,1,2,329,2015,1,1,1,3,1,1,2,1,2,1,2,1420070000.0
1,1,1,1,1,520,2015,1,1,1,3,1,1,2,1,2,1,2,1420070000.0
2,2,1,1,3,146,2015,1,1,1,3,1,1,2,1,2,1,2,1420070000.0


In [28]:
# Pull the feature frames and target series out of each split.
xs = to.train.xs
y = to.train.y
valid_xs = to.valid.xs
valid_y = to.valid.y

In [29]:
# Decision tree regressor limited to at most 4 leaf nodes.
m = DecisionTreeRegressor(max_leaf_nodes = 4)

In [30]:
%%time
# Fit the tree on the training features and targets.
m.fit(xs, y)

CPU times: user 29.4 ms, sys: 4.3 ms, total: 33.7 ms
Wall time: 30 ms


DecisionTreeRegressor(max_leaf_nodes=4)

In [33]:
def calculate_smape(m, valid_xs, valid_y):
    """Return the symmetric mean absolute percentage error (SMAPE) of a model.

    For each row the error is ``|pred - actual| / ((|pred| + |actual|) / 2)``;
    the result is the mean of these errors times 100, so 0 is a perfect score
    and 200 is the largest possible value.

    Parameters
    ----------
    m : object
        Fitted model exposing ``predict(xs)``.
    valid_xs : object
        Features, passed unchanged to ``m.predict``.
    valid_y : array-like
        Actual target values, one per prediction.

    Returns
    -------
    float
        SMAPE as a percentage.

    Raises
    ------
    ValueError
        If there are no predictions, or if the number of predictions differs
        from the number of actual values.
    """
    preds = np.asarray(m.predict(valid_xs), dtype=float).ravel()
    actual = np.asarray(valid_y, dtype=float).ravel()

    if preds.size == 0:
        raise ValueError("calculate_smape needs at least one prediction")
    if preds.size != actual.size:
        # Without this check a mismatch would silently skew the score.
        raise ValueError(
            f"got {preds.size} predictions for {actual.size} actual values"
        )

    diff = np.abs(preds - actual)
    avg = (np.abs(preds) + np.abs(actual)) / 2
    # avg is 0 only when prediction and actual are both 0, i.e. a perfect hit:
    # score that row as 0 instead of letting 0/0 turn the whole mean into NaN.
    ratio = np.divide(diff, avg, out=np.zeros_like(diff), where=avg != 0)

    return float(ratio.mean() * 100)

In [34]:
# SMAPE of the fitted tree on the validation split.
calculate_smape(m, valid_xs, valid_y)

29.362668566080412

In [36]:
%%time
# Load the test data from test.csv.
test = pd.read_csv("test.csv", low_memory = False)

CPU times: user 12.9 ms, sys: 4.34 ms, total: 17.3 ms
Wall time: 14.2 ms


In [37]:
# Preview the first rows of the raw test data.
test.head()

Unnamed: 0,row_id,date,country,store,product
0,26298,2019-01-01,Finland,KaggleMart,Kaggle Mug
1,26299,2019-01-01,Finland,KaggleMart,Kaggle Hat
2,26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker
3,26301,2019-01-01,Finland,KaggleRama,Kaggle Mug
4,26302,2019-01-01,Finland,KaggleRama,Kaggle Hat


In [38]:
%%time
# Number of rows in the test set.
len(test)

CPU times: user 16 µs, sys: 7 µs, total: 23 µs
Wall time: 31 µs


6570

In [40]:
# Expand 'date' into the same calendar features as the training frame.
# add_datepart removes the 'date' column from the result, so guard on its
# presence: re-running this cell is then a no-op instead of an error.
if 'date' in test.columns:
    test = add_datepart(test, 'date')

In [41]:
# Preview the test frame after add_datepart.
test.head()

Unnamed: 0,row_id,country,store,product,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,26298,Finland,KaggleMart,Kaggle Mug,2019,1,1,1,1,1,False,True,False,True,False,True,1546301000.0
1,26299,Finland,KaggleMart,Kaggle Hat,2019,1,1,1,1,1,False,True,False,True,False,True,1546301000.0
2,26300,Finland,KaggleMart,Kaggle Sticker,2019,1,1,1,1,1,False,True,False,True,False,True,1546301000.0
3,26301,Finland,KaggleRama,Kaggle Mug,2019,1,1,1,1,1,False,True,False,True,False,True,1546301000.0
4,26302,Finland,KaggleRama,Kaggle Hat,2019,1,1,1,1,1,False,True,False,True,False,True,1546301000.0


In [42]:
# Add a num_sold column filled with 0 to the test frame; presumably a
# placeholder so the dep_var column exists when the test TabularPandas is built.
test["num_sold"] = 0

In [43]:
# Build the test set from the training TabularPandas so that the processors
# already fitted on the training data (Categorify's category codes,
# FillMissing's fill values) are applied to the test frame. Constructing a
# fresh TabularPandas would fit them again on the test data, and the resulting
# encoding would not be guaranteed to match what the model was trained on.
to_test = to.new(test)
to_test.process()

In [44]:
# Number of rows in to_test.
len(to_test)

6570

In [45]:
# Feature columns of the test set, taken from to_test's train subset.
test_xs = to_test.train.xs

In [46]:
# Number of rows in test_xs.
len(test_xs)

6570

In [48]:
# Predict the target for every test row with the fitted tree.
preds_test = m.predict(test_xs)

In [49]:
# Inspect the predictions.
preds_test

array([342.86237835, 444.15358881, 173.59367397, ..., 342.86237835,
       773.2810219 , 173.59367397])

In [52]:
# Assemble the submission frame: each test row_id paired with its prediction.
ids = list(test["row_id"].values)
submission = pd.DataFrame({"row_id": ids, "num_sold": list(preds_test)})

In [53]:
# Preview the first rows of the submission.
submission.head()

Unnamed: 0,row_id,num_sold
0,26298,342.862378
1,26299,444.153589
2,26300,173.593674
3,26301,342.862378
4,26302,773.281022


In [54]:
# Write the submission to submission.csv without the index column.
submission.to_csv("submission.csv", index = False)