In [1]:
%%time
import fastai
from fastai.tabular.all import *
from pathlib import Path
import os
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *
import IPython
from sklearn.ensemble import RandomForestRegressor
from IPython.display import Image, display_svg, SVG
from sklearn.tree import export_graphviz
import waterfall_chart
from treeinterpreter import treeinterpreter
from sklearn.inspection import plot_partial_dependence
from scipy.cluster import hierarchy as hc
import xgboost
from xgboost import XGBRegressor
import optuna

CPU times: user 1.82 s, sys: 946 ms, total: 2.77 s
Wall time: 2.11 s


In [2]:
# Load the training data. `dep_var` names the target column that later cells
# log-transform and pass to TabularPandas as `y_names`.
df = pd.read_csv("train.csv", low_memory = False)
dep_var = "num_sold"

In [3]:
%%time
import pickle

# NOTE(review): unpickling can execute arbitrary code; only load these files
# if they come from a trusted source.
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file once it is read."""
    with open(path, "rb") as handle:
        return pickle.load(handle)

# Lookup tables keyed by category value (e.g. 'Finland', 'Kaggle Mug',
# 'KaggleMart'); later cells read `.shape[0]` of an entry and map the
# corresponding DataFrame column through them.
country_map = _load_pickle("embs/country_map.pkl")
product_map = _load_pickle("embs/product_map.pkl")
store_map = _load_pickle("embs/store_map.pkl")

CPU times: user 0 ns, sys: 4.53 ms, total: 4.53 ms
Wall time: 6.18 ms


In [4]:
# Replace each training row's country with its looked-up vector, one column
# per component (country_emb_1 .. country_emb_<emb_dim>).
emb_dim = country_map['Finland'].shape[0]
col_name = [f'country_emb_{dim}' for dim in range(1, emb_dim + 1)]
df_emb_country = pd.DataFrame(
    df['country'].map(country_map).to_list(),
    columns=col_name,
)

In [5]:
# Replace each training row's product with its looked-up vector, one column
# per component (product_emb_1 .. product_emb_<emb_dim>).
emb_dim = product_map['Kaggle Mug'].shape[0]
col_name = [f'product_emb_{dim}' for dim in range(1, emb_dim + 1)]
df_emb_product = pd.DataFrame(
    df['product'].map(product_map).to_list(),
    columns=col_name,
)

In [6]:
# Replace each training row's store with its looked-up vector, one column
# per component (store_emb_1 .. store_emb_<emb_dim>).
emb_dim = store_map['KaggleMart'].shape[0]
col_name = [f'store_emb_{dim}' for dim in range(1, emb_dim + 1)]
df_emb_store = pd.DataFrame(
    df['store'].map(store_map).to_list(),
    columns=col_name,
)

In [7]:
# Assemble the modelling frame: date and target from the raw data, followed by
# the store, product and country vector columns (in that order).
df_tr = pd.concat(
    [df[["date", "num_sold"]], df_emb_store, df_emb_product, df_emb_country],
    axis=1,
)

In [8]:
# Derive calendar features from 'date'; later cells rely on the resulting
# columns (e.g. `df_tr.Year` for the train/validation split).
# NOTE(review): presumably not re-runnable on the same frame if add_datepart
# consumes the 'date' column — confirm before re-executing this cell.
df_tr = add_datepart(df_tr, 'date')

In [20]:
# Model the target on the log scale. The transform is taken from the untouched
# raw frame `df` (same row index as `df_tr`) rather than from `df_tr` itself, so
# re-running this cell cannot apply the logarithm twice.
df_tr[dep_var] = np.log(df[dep_var])

In [21]:
# Time-based hold-out: rows dated before 2018 train the model, the remaining
# rows validate it. Indices are positional row numbers.
cond = df_tr.Year < 2018
train_idx = np.flatnonzero(cond.to_numpy())
valid_idx = np.flatnonzero((~cond).to_numpy())

In [22]:
# Package the row indices for TabularPandas and let fastai sort the feature
# columns into continuous vs. categorical (the target column is excluded).
splits = (list(train_idx), list(valid_idx))
cont, cat = cont_cat_split(df_tr, max_card=1, dep_var=dep_var)

In [23]:
# Preprocess the frame with fastai's tabular processors, using the
# train/validation split defined above.
procs = [Categorify, FillMissing]
to = TabularPandas(
    df_tr,
    procs=procs,
    cat_names=cat,
    cont_names=cont,
    y_names=dep_var,
    splits=splits,
)

In [24]:
# Feature matrices and (log-scale) targets for the training and validation splits.
xs = to.train.xs
y = to.train.y
valid_xs = to.valid.xs
valid_y = to.valid.y

In [25]:
# Display the training and validation targets as a sanity check of the split.
y, valid_y

(0        5.796058
 1        6.253829
 2        4.983607
 3        6.349139
 4        6.814543
            ...   
 19723    6.944087
 19724    5.669881
 19725    7.080027
 19726    7.484931
 19727    6.272877
 Name: num_sold, Length: 19728, dtype: float32,
 19728    6.003887
 19729    6.431331
 19730    5.170484
 19731    6.570883
 19732    6.949856
            ...   
 26293    6.712956
 26294    5.521461
 26295    6.911747
 26296    7.273093
 26297    5.961005
 Name: num_sold, Length: 6570, dtype: float32)

In [26]:
def cal_smape(preds, targs):
    """Return the symmetric mean absolute percentage error (SMAPE), in percent.

    Each pair contributes ``|pred - targ| / ((|pred| + |targ|) / 2)``; the
    result is the mean of those terms multiplied by 100. A pair in which both
    values are zero contributes 0 instead of NaN.

    Parameters
    ----------
    preds, targs : array-like
        Equal-shaped numeric sequences. pandas objects are compared
        positionally (their index is ignored).

    Returns
    -------
    float
        SMAPE in percent.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    # Convert up front so pandas index alignment can never pair the wrong rows.
    preds = np.asarray(preds, dtype=np.float64)
    targs = np.asarray(targs, dtype=np.float64)

    if preds.shape != targs.shape:
        raise ValueError(
            f"preds and targs must have the same shape, got {preds.shape} and {targs.shape}"
        )
    if targs.size == 0:
        raise ValueError("cal_smape needs at least one prediction/target pair")

    diff = np.abs(preds - targs)
    avg = (np.abs(preds) + np.abs(targs)) / 2
    # Where avg == 0 both values are zero, so the error term is defined as 0.
    ratio = np.divide(diff, avg, out=np.zeros_like(diff), where=avg != 0)

    return float(ratio.mean() * 100)

In [31]:
# Per-trial logs filled in by hp_tuning: a=learning_rate, b=gamma, c=alpha,
# d=max_depth, e=reg_lambda; `score` collects each trial's validation SMAPE.
a,b,c,d,e = [],[],[],[],[]
score = []

# NOTE(review): g_index is not referenced by any cell visible here.
g_index = 1
def hp_tuning(trial) :
    """Optuna objective: fit an XGBRegressor and return its validation SMAPE.

    Samples learning_rate, gamma, alpha, max_depth and reg_lambda from `trial`,
    fits on the module-level `xs`/`y`, predicts `valid_xs`, and scores the
    predictions against `valid_y` after mapping both through `np.exp`.

    Side effects: appends the sampled values to the module-level lists
    `a`..`e` and the trial's score to `score`.

    Returns the SMAPE (percent) for the trial; the study minimises it.
    """
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.5, log = True)
    gamma = trial.suggest_float("gamma", 0.001, 10)
    alpha = trial.suggest_float("alpha", 0.001, 10)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    # Same log-uniform range as before, via the non-deprecated API that the
    # learning_rate line already uses.
    reg_lambda = trial.suggest_float("reg_lambda", 0.001, 25, log = True)

    model = XGBRegressor(verbosity = 0, alpha = alpha, reg_lambda = reg_lambda, gamma = gamma, max_depth = max_depth, learning_rate = learning_rate)
    model.fit(xs, y)
    preds_val = model.predict(valid_xs)

    # Score once and reuse the value for both the log and the return.
    trial_score = cal_smape(np.exp(preds_val), np.exp(valid_y))

    a.append(learning_rate)
    b.append(gamma)
    c.append(alpha)
    d.append(max_depth)
    e.append(reg_lambda)
    score.append(trial_score)

    return trial_score

In [32]:
# Seed the sampler so the search proposes the same trials on every run; TPE is
# Optuna's default sampler, so only the seed is new.
study = optuna.create_study(direction = "minimize", sampler = optuna.samplers.TPESampler(seed = 42))
study.optimize(hp_tuning, n_trials=200)
print('Number of finished trials:', len(study.trials))
print("Best Params: ",study.best_params)

[32m[I 2022-01-11 05:15:50,433][0m A new study created in memory with name: no-name-2691954a-2190-4657-ac9e-48d13795f00d[0m
[32m[I 2022-01-11 05:15:50,904][0m Trial 0 finished with value: 179.78842134534273 and parameters: {'learning_rate': 0.005903798870820417, 'gamma': 5.02961790547787, 'alpha': 0.8015520360567749, 'max_depth': 3, 'reg_lambda': 0.0010584861548731644}. Best is trial 0 with value: 179.78842134534273.[0m
[32m[I 2022-01-11 05:15:51,284][0m Trial 1 finished with value: 197.54174278500034 and parameters: {'learning_rate': 0.00012051532040623042, 'gamma': 5.213486086662463, 'alpha': 7.399879026138942, 'max_depth': 3, 'reg_lambda': 0.002710010039869358}. Best is trial 0 with value: 179.78842134534273.[0m
[32m[I 2022-01-11 05:15:51,915][0m Trial 2 finished with value: 197.42004522957615 and parameters: {'learning_rate': 0.00022321486487225845, 'gamma': 5.915797923006861, 'alpha': 0.9638344713291868, 'max_depth': 13, 'reg_lambda': 0.9810665519885435}. Best is trial 

Number of finished trials: 200
Best Params:  {'learning_rate': 0.17004187706883167, 'gamma': 0.009638640724377645, 'alpha': 1.928728380121548, 'max_depth': 14, 'reg_lambda': 0.8119670489027239}


In [33]:
# Show the hyperparameters of the best trial found by the study.
study.best_params

{'learning_rate': 0.17004187706883167,
 'gamma': 0.009638640724377645,
 'alpha': 1.928728380121548,
 'max_depth': 14,
 'reg_lambda': 0.8119670489027239}

In [36]:
# Build the final regressor from the best trial. `study.best_params` holds
# exactly the tuned keys: learning_rate, gamma, alpha, max_depth, reg_lambda.
model = XGBRegressor(**study.best_params)

In [37]:
# Fit the tuned model on the training split (targets are on the log scale).
model.fit(xs, y)

XGBRegressor(alpha=1.928728380121548, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             enable_categorical=False, gamma=0.009638640724377645, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.17004187706883167, max_delta_step=0, max_depth=14,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=7, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=1.92872834,
             reg_lambda=0.8119670489027239, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [38]:
# Training-set SMAPE of the fitted model.
# NOTE(review): predictions and targets are both on the log scale here, whereas
# hp_tuning scores np.exp(...) of both — this number is not comparable with the
# study's objective values.
cal_smape(model.predict(xs), y)

0.732540580094194

In [39]:
# Validation-set SMAPE of the fitted model.
# NOTE(review): predictions and targets are both on the log scale here, whereas
# hp_tuning scores np.exp(...) of both — this number is not comparable with the
# study's objective values.
cal_smape(model.predict(valid_xs), valid_y)

1.1831831605186387

In [40]:
# Load the test data that predictions will be generated for.
test = pd.read_csv("test.csv")

In [41]:
# Placeholder target so the test frame has the `num_sold` column that is later
# passed to TabularPandas as `y_names`; the zeros are presumably never used as labels.
test["num_sold"] = 0

In [42]:
# Replace each test row's country with its looked-up vector, using the same
# column names as the training frame.
emb_dim = country_map['Finland'].shape[0]
col_name = [f'country_emb_{dim}' for dim in range(1, emb_dim + 1)]
test_emb_country = pd.DataFrame(
    test['country'].map(country_map).to_list(),
    columns=col_name,
)

In [43]:
# Replace each test row's product with its looked-up vector, using the same
# column names as the training frame.
emb_dim = product_map['Kaggle Mug'].shape[0]
col_name = [f'product_emb_{dim}' for dim in range(1, emb_dim + 1)]
test_emb_product = pd.DataFrame(
    test['product'].map(product_map).to_list(),
    columns=col_name,
)

In [44]:
# Replace each test row's store with its looked-up vector, using the same
# column names as the training frame.
emb_dim = store_map['KaggleMart'].shape[0]
col_name = [f'store_emb_{dim}' for dim in range(1, emb_dim + 1)]
test_emb_store = pd.DataFrame(
    test['store'].map(store_map).to_list(),
    columns=col_name,
)

In [45]:
# Column names of the training feature matrix.
# NOTE(review): `cols` is not referenced by any later cell visible here.
cols = xs.columns

In [46]:
# Assemble the test frame from date, placeholder target and the vector columns.
# NOTE(review): the blocks are concatenated country -> product -> store, the
# reverse of the training frame; presumably harmless because features are
# selected by name downstream — confirm.
test_fn = pd.concat(
    [test[["date", "num_sold"]], test_emb_country, test_emb_product, test_emb_store],
    axis=1,
)

In [47]:
# Derive the same calendar features from 'date' as was done for the training frame.
test_fn = add_datepart(test_fn, 'date')

In [48]:
# Apply the processors already fitted on the training data instead of fitting
# fresh ones on the test frame, so category codes and fill values are
# guaranteed to match what the model was trained on.
to_test = to.new(test_fn)
to_test.process()

In [49]:
# to_test was built without a train/valid split; `.train.xs` pulls its feature
# columns (presumably covering every test row — confirm).
test_xs = to_test.train.xs

In [50]:
# Predict on the processed test features; the model was fitted on log targets,
# so these values are on the log scale.
preds_test = model.predict(test_xs)

In [51]:
# Inspect the raw (log-scale) test predictions.
preds_test

array([5.6141477, 6.108438 , 4.81558  , ..., 6.5863504, 7.2239237,
       5.988612 ], dtype=float32)

In [52]:
# Undo the log transform so predictions are back on the num_sold scale.
preds_test_inv = np.exp(preds_test)

In [53]:
# Pair each test row id with its back-transformed prediction.
ids = list(test["row_id"].values)
submission = pd.DataFrame({
    "row_id": ids,
    "num_sold": list(preds_test_inv),
})

In [54]:
# Write the submission file without the DataFrame index column.
submission.to_csv("submission.csv", index = False)

In [55]:
def rf_feat_importance(m, df):
  """Tabulate a fitted model's feature importances, most important first.

  m  -- fitted estimator exposing `feature_importances_`
  df -- DataFrame whose columns are the features `m` was fitted on

  Returns a DataFrame with columns 'cols' (feature name) and 'imp'
  (importance), sorted by 'imp' in descending order; the original positional
  index of each feature is kept.
  """
  importance = pd.DataFrame({'cols' : df.columns, 'imp' : m.feature_importances_})
  ranked = importance.sort_values(by = 'imp', ascending = False)
  return ranked

In [56]:
# Rank the features of the tuned model and show the ten most important.
fi = rf_feat_importance(model, xs)
fi.head(10)

Unnamed: 0,cols,imp
6,store_emb_1,0.466867
9,product_emb_1,0.460176
12,country_emb_1,0.037249
19,Dayofweek,0.018492
20,Dayofyear,0.007207
17,Week,0.004519
21,Elapsed,0.001693
15,Year,0.001455
16,Month,0.00076
4,Is_year_end,0.000384


In [57]:
# Keep only the features whose importance exceeds the 0.005 cut-off.
to_keep = fi.loc[fi['imp'] > 0.005, 'cols']

len(to_keep)

5

In [58]:
# List the names of the retained features.
to_keep

6       store_emb_1
9     product_emb_1
12    country_emb_1
19        Dayofweek
20        Dayofyear
Name: cols, dtype: object

In [59]:
# Restrict the training and validation features to the retained columns.
xs_imp = xs.loc[:, to_keep]
valid_xs_imp = valid_xs.loc[:, to_keep]

In [60]:
# Reset the per-trial logs before the second search: a=learning_rate, b=gamma,
# c=alpha, d=max_depth, e=reg_lambda; `score` collects each trial's validation SMAPE.
a,b,c,d,e = [],[],[],[],[]
score = []

# NOTE(review): g_index is not referenced by any cell visible here.
g_index = 1
def hp_tuning(trial) :
    """Optuna objective on the reduced feature set: return the validation SMAPE.

    Samples learning_rate, gamma, alpha, max_depth and reg_lambda from `trial`,
    fits an XGBRegressor on the module-level `xs_imp`/`y`, predicts
    `valid_xs_imp`, and scores the predictions against `valid_y` after mapping
    both through `np.exp`.

    Side effects: appends the sampled values to the module-level lists
    `a`..`e` and the trial's score to `score`. This definition replaces the
    earlier `hp_tuning` that used the full feature set.

    Returns the SMAPE (percent) for the trial; the study minimises it.
    """
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.5, log = True)
    gamma = trial.suggest_float("gamma", 0.001, 10)
    alpha = trial.suggest_float("alpha", 0.001, 10)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    # Same log-uniform range as before, via the non-deprecated API that the
    # learning_rate line already uses.
    reg_lambda = trial.suggest_float("reg_lambda", 0.001, 25, log = True)

    model = XGBRegressor(verbosity = 0, alpha = alpha, reg_lambda = reg_lambda, gamma = gamma, max_depth = max_depth, learning_rate = learning_rate)
    model.fit(xs_imp, y)
    preds_val = model.predict(valid_xs_imp)

    # Score once and reuse the value for both the log and the return.
    trial_score = cal_smape(np.exp(preds_val), np.exp(valid_y))

    a.append(learning_rate)
    b.append(gamma)
    c.append(alpha)
    d.append(max_depth)
    e.append(reg_lambda)
    score.append(trial_score)

    return trial_score

In [61]:
# Seed the sampler so the search proposes the same trials on every run; TPE is
# Optuna's default sampler, so only the seed is new.
study = optuna.create_study(direction = "minimize", sampler = optuna.samplers.TPESampler(seed = 42))
study.optimize(hp_tuning, n_trials=200)
print('Number of finished trials:', len(study.trials))
print("Best Params: ",study.best_params)

[32m[I 2022-01-11 05:29:06,945][0m A new study created in memory with name: no-name-1774d8a1-4b93-4bd9-9a36-f0c36eff9433[0m
[32m[I 2022-01-11 05:29:07,316][0m Trial 0 finished with value: 16.416385348234424 and parameters: {'learning_rate': 0.048886935197622106, 'gamma': 5.269651194818462, 'alpha': 7.435744996004705, 'max_depth': 6, 'reg_lambda': 0.017380582811614163}. Best is trial 0 with value: 16.416385348234424.[0m
[32m[I 2022-01-11 05:29:07,624][0m Trial 1 finished with value: 197.4696255692913 and parameters: {'learning_rate': 0.0001819269100325073, 'gamma': 2.439671150252262, 'alpha': 5.252774520303946, 'max_depth': 12, 'reg_lambda': 0.03018725903625502}. Best is trial 0 with value: 16.416385348234424.[0m
[32m[I 2022-01-11 05:29:07,878][0m Trial 2 finished with value: 194.8515177928342 and parameters: {'learning_rate': 0.0017963612227292008, 'gamma': 8.049537575772485, 'alpha': 5.6822424742987705, 'max_depth': 4, 'reg_lambda': 0.0010955539424089096}. Best is trial 0 w

Number of finished trials: 200
Best Params:  {'learning_rate': 0.40089542009081797, 'gamma': 0.0018082548930865158, 'alpha': 0.04409617347640854, 'max_depth': 9, 'reg_lambda': 0.001354450034496302}


In [62]:
# Print the best hyperparameters of the reduced-feature study again for reference.
print("Best Params: ",study.best_params)

Best Params:  {'learning_rate': 0.40089542009081797, 'gamma': 0.0018082548930865158, 'alpha': 0.04409617347640854, 'max_depth': 9, 'reg_lambda': 0.001354450034496302}


In [63]:
# Build the reduced-feature regressor from the best trial. `study.best_params`
# holds exactly the tuned keys: learning_rate, gamma, alpha, max_depth, reg_lambda.
model = XGBRegressor(**study.best_params)

In [65]:
# Fit the tuned model on the reduced training features (targets are on the log scale).
model.fit(xs_imp, y)

XGBRegressor(alpha=0.04409617347640854, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             enable_categorical=False, gamma=0.0018082548930865158, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.40089542009081797, max_delta_step=0, max_depth=9,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=7, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0.0440961719,
             reg_lambda=0.001354450034496302, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [66]:
# Training SMAPE on the log scale.
# NOTE(review): not comparable with the study objective, which scores np.exp(...) values.
cal_smape(model.predict(xs_imp), y)

0.5416211080642811

In [67]:
# Validation SMAPE on the log scale.
# NOTE(review): not comparable with the study objective, which scores np.exp(...) values.
cal_smape(model.predict(valid_xs_imp), valid_y)

1.7524731854916282

In [68]:
# Validation SMAPE after undoing the log transform — the same quantity that
# hp_tuning returns to the study.
cal_smape(np.exp(model.predict(valid_xs_imp)), np.exp(valid_y))

10.045394924282206

In [69]:
# Training SMAPE after undoing the log transform, for comparison with the
# validation figure computed the same way.
cal_smape(np.exp(model.predict(xs_imp)), np.exp(y))

3.0608827082544843