In [1]:
%%time
import fastai
from fastai.tabular.all import *
from pathlib import Path
import os
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
import IPython
from sklearn.ensemble import RandomForestRegressor
from IPython.display import Image, display_svg, SVG
from sklearn.tree import export_graphviz
import waterfall_chart
from treeinterpreter import treeinterpreter
from sklearn.inspection import plot_partial_dependence
from scipy.cluster import hierarchy as hc
import xgboost
from xgboost import XGBRegressor
import optuna

CPU times: user 2.23 s, sys: 1.53 s, total: 3.76 s
Wall time: 4.09 s


In [2]:
df = pd.read_csv("train.csv")

test = pd.read_csv("test.csv")

test["num_sold"] = 0

combined = pd.concat([df, test])

In [3]:
dep_var = "num_sold"

holiday = pd.read_csv("nordic_holidays.csv")

holiday["holiday"] = str("in_") + holiday["country"]

In [4]:
combined = combined.merge(holiday[["date", "holiday", "country"]], left_on = ["date", "country"], right_on = ["date", "country"], how = "left")

combined["holiday"] = combined["holiday"].fillna("no_holiday")

In [5]:
country_map = pickle.load(open("embs/country_map.pkl", "rb"))
month_map = pickle.load(open("embs/month_map.pkl", "rb"))
product_map = pickle.load(open("embs/product_map.pkl", "rb"))
store_map = pickle.load(open("embs/store_map.pkl", "rb"))

In [6]:
emb_dim = country_map['Finland'].shape[0]
col_name = [f'country_emb_{i}' for i in range(1, emb_dim + 1)]
df_emb_country = pd.DataFrame(combined['country'].map(country_map).to_list(), columns = col_name)

emb_dim = product_map['Kaggle Mug'].shape[0]
col_name = [f'product_emb_{i}' for i in range(1, emb_dim + 1)]
df_emb_product = pd.DataFrame(combined['product'].map(product_map).to_list(), columns = col_name)

emb_dim = store_map['KaggleMart'].shape[0]
col_name = [f'store_emb_{i}' for i in range(1, emb_dim + 1)]
df_emb_store = pd.DataFrame(combined['store'].map(store_map).to_list(), columns = col_name)

In [7]:
combined.reset_index(drop = True, inplace = True)

In [8]:
combined = pd.concat([combined, df_emb_store, df_emb_product, df_emb_country], axis = 1)

In [9]:
combined[dep_var] = np.log(combined[dep_var])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [10]:
combined = add_datepart(combined, 'date')

In [11]:
def find_weekend(df):
    '''this function finds whether the day is saturday or sunday i.e weekend
    1 - if saturday or sunday else 0
    '''
    weekends = []
    for i in df["Dayofweek"].values :
        if (i == 5) | (i == 6) :
            weekends.append(1)
        else :
            weekends.append(0)
    
    return weekends

In [12]:
weekends = find_weekend(combined)

In [13]:
combined["is_weekend"] = weekends

In [14]:
emb_dim = month_map[1].shape[0]
col_name = [f'month_emb_{i}' for i in range(1, emb_dim + 1)]
df_emb_month = pd.DataFrame(combined['Month'].map(month_map).to_list(), columns = col_name)

In [15]:
combined = pd.concat([combined, df_emb_month], axis = 1)

In [16]:
gdp = pd.read_csv("GDP_per_capita_2015_to_2019_Finland_Norway_Sweden.csv")

gdp = gdp.melt(id_vars = 'year', value_vars = ['Finland', 'Norway', 'Sweden'], var_name = 'country', value_name = 'gdp')

In [17]:
combined = combined.rename(columns = {"Year" : "year"})

In [18]:
combined = combined.merge(gdp, on = ["country", "year"], how = "left")

In [19]:
combined.to_csv("models/final_data.csv")

In [20]:
cond_tr = (combined.year < 2018)
cond_val = (combined.year == 2018)
train_idx = np.where(cond_tr)[0]
valid_idx = np.where(cond_val)[0]
test_idx = combined.iloc[26298:].index

In [21]:
cont, cat = cont_cat_split(combined, 1, dep_var)

In [22]:
cont.remove("row_id")
cont.remove("Elapsed")

In [23]:
procs = [Categorify, FillMissing, Normalize]
#train_idx = combined.iloc[:26298]
splits = (list(train_idx), list(valid_idx))
to = TabularPandas(combined, procs, cat, cont, y_names = dep_var, splits = splits)

In [24]:
def SMAPE(preds, targs):
    denominator = (targs + np.abs(preds)) / 200.0
    diff = np.abs(preds - targs) / denominator
    diff[denominator == 0] = 0.0
    return np.mean(diff)

In [25]:
trainxs, trainy = to.train.xs, to.train.y
print (trainxs.shape, trainy.shape)

(19728, 34) (19728,)


In [26]:
validxs, validy = to.valid.xs, to.valid.y
print (validxs.shape, validy.shape)

(6570, 34) (6570,)


In [27]:
a,b,c,d,e = [],[],[],[],[]
score = []

g_index = 1
def hp_tuning(trial) :
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.5, log = True)
    gamma = trial.suggest_float("gamma", 0.001, 10)
    alpha = trial.suggest_float("alpha", 0.001, 10)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 0.001, 25)
    
    model = XGBRegressor(verbosity = 0, alpha = alpha, reg_lambda = reg_lambda, gamma = gamma, max_depth = max_depth, learning_rate = learning_rate)
    
    model.fit(trainxs, trainy)
                            
    preds_val = model.predict(validxs)
   
        
    a.append(learning_rate)
    b.append(gamma)
    c.append(alpha)
    d.append(max_depth)
    e.append(reg_lambda)
    score.append(SMAPE(preds_val, validy))
    
        
        
    return (SMAPE(preds_val, validy))

In [28]:
study = optuna.create_study(direction = "minimize")
study.optimize(hp_tuning, n_trials=500)
print('Number of finished trials:', len(study.trials))
print("Best Params: ",study.best_params)

[32m[I 2022-01-26 18:54:25,253][0m A new study created in memory with name: no-name-54caceba-61df-43db-99d2-29aabc65c5de[0m
[32m[I 2022-01-26 18:54:25,790][0m Trial 0 finished with value: 164.87555889667166 and parameters: {'learning_rate': 0.00010951278049881244, 'gamma': 8.07376473430963, 'alpha': 0.7342146481752688, 'max_depth': 10, 'reg_lambda': 3.877755838218292}. Best is trial 0 with value: 164.87555889667166.[0m
[32m[I 2022-01-26 18:54:26,124][0m Trial 1 finished with value: 161.8352869205724 and parameters: {'learning_rate': 0.00021423736344587744, 'gamma': 2.785231592727786, 'alpha': 9.144315528816767, 'max_depth': 9, 'reg_lambda': 22.32761759447454}. Best is trial 1 with value: 161.8352869205724.[0m
[32m[I 2022-01-26 18:54:26,615][0m Trial 2 finished with value: 157.5895066855269 and parameters: {'learning_rate': 0.0003656898416957749, 'gamma': 8.390282442251634, 'alpha': 5.85867267686364, 'max_depth': 15, 'reg_lambda': 0.1599561756411485}. Best is trial 2 with val

Number of finished trials: 500
Best Params:  {'learning_rate': 0.10938767703585212, 'gamma': 0.011030404391536446, 'alpha': 0.7444861151955272, 'max_depth': 5, 'reg_lambda': 2.5235750367083396}


In [29]:
model_log = XGBRegressor(learning_rate = study.best_params['learning_rate'], gamma = study.best_params['gamma'],
                     alpha = study.best_params['alpha'], max_depth = study.best_params['max_depth'],
                     reg_lambda = study.best_params['reg_lambda'])

In [30]:
model_log.fit(trainxs, trainy)

XGBRegressor(alpha=0.7444861151955272, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             enable_categorical=False, gamma=0.011030404391536446, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.10938767703585212, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=7, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0.744486094,
             reg_lambda=2.5235750367083396, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [31]:
SMAPE(model_log.predict(validxs), validy)

1.455758321462413

In [32]:
trainy_exp = np.exp(trainy)
validy_exp = np.exp(validy)

In [33]:
a,b,c,d,e = [],[],[],[],[]
score = []

g_index = 1
def hp_tuning(trial) :
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.5, log = True)
    gamma = trial.suggest_float("gamma", 0.001, 10)
    alpha = trial.suggest_float("alpha", 0.001, 10)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 0.001, 25)
    
    model = XGBRegressor(verbosity = 0, alpha = alpha, reg_lambda = reg_lambda, gamma = gamma, max_depth = max_depth, learning_rate = learning_rate)
    
    model.fit(trainxs, trainy_exp)
                            
    preds_val = model.predict(validxs)
   
        
    a.append(learning_rate)
    b.append(gamma)
    c.append(alpha)
    d.append(max_depth)
    e.append(reg_lambda)
    score.append(SMAPE(preds_val, validy_exp))
    
        
        
    return (SMAPE(preds_val, validy_exp))

In [34]:
study = optuna.create_study(direction = "minimize")
study.optimize(hp_tuning, n_trials=500)
print('Number of finished trials:', len(study.trials))
print("Best Params: ",study.best_params)

[32m[I 2022-01-26 19:02:34,605][0m A new study created in memory with name: no-name-7ce76a2e-bebe-468a-b071-5eb7264fbffc[0m
[32m[I 2022-01-26 19:02:36,548][0m Trial 0 finished with value: 187.48628961057352 and parameters: {'learning_rate': 0.0003320071092119321, 'gamma': 1.512443305176949, 'alpha': 9.161085460194917, 'max_depth': 14, 'reg_lambda': 0.022217034897369228}. Best is trial 0 with value: 187.48628961057352.[0m
[32m[I 2022-01-26 19:02:37,524][0m Trial 1 finished with value: 72.12054227652315 and parameters: {'learning_rate': 0.0071401515580421515, 'gamma': 7.152897007742262, 'alpha': 4.333917461031001, 'max_depth': 8, 'reg_lambda': 0.0013212720920532212}. Best is trial 1 with value: 72.12054227652315.[0m
[32m[I 2022-01-26 19:02:37,868][0m Trial 2 finished with value: 104.21553833873061 and parameters: {'learning_rate': 0.004062029183172311, 'gamma': 6.98497077740171, 'alpha': 1.0550285437268894, 'max_depth': 3, 'reg_lambda': 0.0505283980067197}. Best is trial 1 wit

Number of finished trials: 500
Best Params:  {'learning_rate': 0.08098677124265354, 'gamma': 6.247970987710917, 'alpha': 3.0488768437585794, 'max_depth': 6, 'reg_lambda': 9.581658914035454}


In [35]:
model_exp = XGBRegressor(learning_rate = study.best_params['learning_rate'], gamma = study.best_params['gamma'],
                     alpha = study.best_params['alpha'], max_depth = study.best_params['max_depth'],
                     reg_lambda = study.best_params['reg_lambda'])

In [36]:
model_exp.fit(trainxs, trainy_exp)

XGBRegressor(alpha=3.0488768437585794, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             enable_categorical=False, gamma=6.247970987710917, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.08098677124265354, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=7, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=3.04887676, reg_lambda=9.581658914035454,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [37]:
SMAPE(model_exp.predict(validxs), validy_exp)

8.104169354397023

In [38]:
learn = load_learner("models/latest_nn.pkl")

## Fitting the top models on the full dataset

In [39]:
model_exp_full = XGBRegressor(learning_rate = study.best_params['learning_rate'], gamma = study.best_params['gamma'],
                     alpha = study.best_params['alpha'], max_depth = study.best_params['max_depth'],
                     reg_lambda = study.best_params['reg_lambda'])

In [40]:
procs = [Categorify, FillMissing, Normalize]
train_idx = combined.iloc[:26298].index
splits = (list(train_idx), list(test_idx))
to = TabularPandas(combined, procs, cat, cont, y_names = dep_var, splits = splits)

In [41]:
fulltrainxs, fulltrainy = to.train.xs, to.train.y
print (fulltrainxs.shape, fulltrainy.shape)

(26298, 34) (26298,)


In [42]:
fulltrainy_exp = np.exp(fulltrainy)

In [43]:
model_exp_full.fit(fulltrainxs, fulltrainy_exp)

XGBRegressor(alpha=3.0488768437585794, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             enable_categorical=False, gamma=6.247970987710917, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.08098677124265354, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=7, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=3.04887676, reg_lambda=9.581658914035454,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [44]:
model_log_full = XGBRegressor(learning_rate = 0.10938767703585212, gamma = 0.011030404391536446,
                     alpha = 0.7444861151955272, max_depth = 5,
                     reg_lambda = 2.5235750367083396)

In [45]:
model_log_full.fit(fulltrainxs, fulltrainy)

XGBRegressor(alpha=0.7444861151955272, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             enable_categorical=False, gamma=0.011030404391536446, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.10938767703585212, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=7, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0.744486094,
             reg_lambda=2.5235750367083396, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [46]:
procs = [Categorify, FillMissing, Normalize]
train_idx = combined.iloc[:26298].index
combined_train = combined.iloc[train_idx]
to_full = TabularPandas(combined_train, procs, cat, cont, y_names = dep_var, splits = None)

In [47]:
dls_full = to_full.dataloaders(1024, device = "cuda")

In [49]:
smape = AccumMetric(SMAPE, to_np = True)

In [50]:
learn_full = tabular_learner(dls_full, y_range = (3, 8), layers = [500, 250, 64, 32, 16], n_out = 1,
                             metrics = [smape], loss_func = F.mse_loss)

In [51]:
learn_full.fit_one_cycle(10, slice(1e-4, 1e-1))

epoch,train_loss,valid_loss,SMAPE,time
0,0.069698,,,00:00
1,0.035717,,,00:00
2,0.024066,,,00:00
3,0.019041,,,00:00
4,0.015135,,,00:00
5,0.012425,,,00:00
6,0.010592,,,00:00
7,0.009135,,,00:00
8,0.008075,,,00:00
9,0.007229,,,00:00


  warn("Your generator is empty.")


In [52]:
combined_test = combined.iloc[test_idx]
display(combined_test.head())

Unnamed: 0,row_id,country,store,product,num_sold,holiday,store_emb_1,store_emb_2,store_emb_3,product_emb_1,...,Elapsed,is_weekend,month_emb_1,month_emb_2,month_emb_3,month_emb_4,month_emb_5,month_emb_6,month_emb_7,gdp
26298,26298,Finland,KaggleMart,Kaggle Mug,-inf,in_Finland,0.96775,-0.929185,1.11733,-0.345529,...,1546301000.0,0,0.139456,0.072881,-0.318967,0.426407,0.22274,0.328248,0.634298,48712
26299,26299,Finland,KaggleMart,Kaggle Hat,-inf,in_Finland,0.96775,-0.929185,1.11733,0.764051,...,1546301000.0,0,0.139456,0.072881,-0.318967,0.426407,0.22274,0.328248,0.634298,48712
26300,26300,Finland,KaggleMart,Kaggle Sticker,-inf,in_Finland,0.96775,-0.929185,1.11733,-1.27834,...,1546301000.0,0,0.139456,0.072881,-0.318967,0.426407,0.22274,0.328248,0.634298,48712
26301,26301,Finland,KaggleRama,Kaggle Mug,-inf,in_Finland,-0.300273,0.358641,-0.296122,-0.345529,...,1546301000.0,0,0.139456,0.072881,-0.318967,0.426407,0.22274,0.328248,0.634298,48712
26302,26302,Finland,KaggleRama,Kaggle Hat,-inf,in_Finland,-0.300273,0.358641,-0.296122,0.764051,...,1546301000.0,0,0.139456,0.072881,-0.318967,0.426407,0.22274,0.328248,0.634298,48712


In [53]:
dl = learn_full.dls.test_dl(combined_test)

In [54]:
ynn_full, _ = learn_full.get_preds(dl = dl)

  diff = np.abs(preds - targs) / denominator


In [55]:
learn_full.export("full_learner.pkl")

In [56]:
testxs = to.valid.xs

In [57]:
pred_log = model_log_full.predict(testxs)

In [58]:
pred_log = np.exp(pred_log)

In [59]:
pred_exp = model_exp_full.predict(testxs)

In [60]:
pred_exp

array([ 404.6835 ,  619.34033,  185.13258, ...,  828.2744 , 1226.2178 ,
        362.25034], dtype=float32)

In [61]:
pred_log

array([ 404.82336,  586.354  ,  186.75986, ...,  855.2202 , 1268.9943 ,
        394.7851 ], dtype=float32)

In [62]:
ynn_full = np.concatenate([ynn_full])
ynn_full = np.squeeze(ynn_full)
ynn_full = np.exp(ynn_full)
ynn_full.shape

(6570,)

In [63]:
ynn_full

array([ 404.1807 ,  636.3921 ,  182.71416, ...,  959.51624, 1458.6606 ,
        398.17352], dtype=float32)

In [64]:
basic_blend = (ynn_full + pred_exp + pred_log) / 3

In [65]:
basic_blend

array([ 404.5625 ,  614.0288 ,  184.86885, ...,  881.0036 , 1317.9575 ,
        385.06967], dtype=float32)

In [66]:
submission = pd.read_csv("sample_submission.csv")
submission["num_sold"] = list(basic_blend)

In [67]:
submission.to_csv("subm/submission.csv", index = False)

In [68]:
! kaggle competitions submit -c tabular-playground-series-jan-2022 -f subm/submission.csv -m "jgj"

100%|████████████████████████████████████████| 101k/101k [00:04<00:00, 21.9kB/s]
Successfully submitted to Tabular Playground Series - Jan 2022

## Finding the correct weights

In [70]:
from functools import partial
from scipy.optimize import fmin
class OptimizeSMAPE :
    def __init__(self):
        self.coef_ = 0
        
    def _smape(self, coef, X, y):
        '''
        coef = coef list, of the same length as the no of models
        X = predictions made by every model
        y = targets
        '''
        x_coef = X * coef
        predictions = np.sum(x_coef, axis = 1)
        # calculate the SMAPE score
        smape_score = SMAPE(predictions, y)
        
        return smape_score
    
    def fit(self, X, y):
        loss_partial = partial(self._smape, X = X, y = y)
        initial_coef = np.random.dirichlet(np.ones(X.shape[1]), size = 1)
        
        self.coef_ = fmin(loss_partial, initial_coef, disp = True)
        
    def predict(self, X):
        x_coef = X * self.coef_
        predictions = np.sum(x_coef, axis = 1)
        return predictions

In [71]:
pickle.dump(model_log, open("model_log_latest.pkl", "wb"))
pickle.dump(model_exp, open("model_exp_latest.pkl", "wb"))

In [72]:
pickle.dump(model_exp_full, open("models/model_exp_full_latest.pkl", "wb"))
pickle.dump(model_log_full, open("models/model_log_full_latest.pkl", "wb"))