## Initialize

In [54]:
%reload_ext autoreload
%autoreload 2

In [1]:
# connect to google drive first
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Collab settings

In [59]:
name_notebook = 'Rossman_collab'

from pathlib import  Path
data = Path(f'/content/drive/MyDrive/data/data_{name_notebook}')
print(data.exists())
src_path = Path(f'/content/drive/MyDrive/src/src_{name_notebook}')
print(src_path.exists())

True
True


In [60]:
!cp -r {src_path} .

In [None]:
!pip -q install -r {src_path.name}/requirements.txt

## Main


In [62]:
import pandas as pd
import numpy as np

from src_Rossman_collab.utils import add_datepart, apply_cats, get_cv_idxs, proc_df
from src_Rossman_collab.datasets import ColumnarDataLoader

## 01 Create Feaures for modelling

In [10]:
joined      = pd.read_parquet(data / 'joined.p')
joined_test = pd.read_parquet(data / 'joined_test.p')

In [11]:
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw']

contin_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']

n = len(joined); print(n)

844338


In [12]:
dep = 'Sales'
joined = joined[cat_vars+contin_vars+[dep, 'Date']].copy()

joined_test[dep] = 0
joined_test = joined_test[cat_vars+contin_vars+[dep, 'Date', 'Id']].copy()

In [13]:
for v in cat_vars: 
    joined[v] = joined[v].astype('category').cat.as_ordered()
apply_cats(joined_test, joined)

for v in contin_vars:
    joined[v] = joined[v].fillna(0).astype('float32')
    joined_test[v] = joined_test[v].fillna(0).astype('float32')

In [14]:
# SAMPLE DATA
idx_samp = get_cv_idxs(n, val_pct=150000/n)
joined_samp = joined.iloc[idx_samp].set_index("Date")
samp_size = len(joined_samp); 
print(samp_size)

150000


In [None]:
# FULL DATA
samp_size = n
joined_samp = joined.set_index("Date")

In [15]:
df, y, nas, mapper = proc_df(joined_samp, 'Sales', do_scale=True)
yl = np.log(y)

joined_test = joined_test.set_index("Date")
df_test, _, nas, mapper = proc_df(joined_test, 'Sales', do_scale=True, skip_flds=['Id'],
                                  mapper=mapper, na_dict=nas)

In [16]:
train_ratio = 0.75
# train_ratio = 0.9
train_size = int(samp_size * train_ratio); print(train_size)
val_idx = list(range(train_size, len(df)))

112500


In [None]:
# val_idx = np.flatnonzero(
#    (df.index<=datetime.datetime(2014,9,17)) & (df.index>=datetime.datetime(2014,8,1)))

In [None]:
val_idx = [0] # for training on all data

## DL

In [70]:
from src_Rossman_collab.tabular_learner import Learner, MixedInputModel, to_gpu

In [65]:
torch.cuda.is_available()

True

In [66]:
def exp_rmspe(y_pred, targ):
    targ   = np.exp(targ)
    y_pred = np.exp(y_pred)
    pct_var = (targ - y_pred)/targ
    return np.sqrt((pct_var**2).mean())

max_log_y = np.max(yl)
y_range = (0, max_log_y*1.2)

In [67]:
cat_sz = [(c, len(joined_samp[c].cat.categories)+1) for c in cat_vars]
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]

In [68]:
def get_learner(data, semb_szs, n_cont, emb_drop, out_sz, szs, drops,
                    y_range=None, use_bn=False):
        model = MixedInputModel(emb_szs, n_cont, emb_drop, out_sz, szs, drops)
        return Learner(data, to_gpu(model), opt_fn=optim.Adam,metrics=[exp_rmspe])


In [71]:
md = ColumnarDataLoader.from_data_frame(df, val_idx, 
                                        yl.astype(np.float32), 
                                        cat_flds=cat_vars, 
                                        bs=128,
                                        test_df=df_test)

m = get_learner(md, emb_szs, len(df.columns)-len(cat_vars),
                   0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)


## Fitting


In [72]:
lr = 1e-3
m.fit(lr, 1)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

loss: 0.03643504558298772


Epoch: 100%|██████████| 1/1 [00:09<00:00,  9.62s/it]

[0.       0.036435 0.0332   0.210902]





In [73]:
m.fit(lr, 5)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

loss: 0.026294755806733856


Epoch:  20%|██        | 1/5 [00:09<00:37,  9.49s/it]

[0.       0.026295 0.025577 0.17422 ]
loss: 0.021735352803326495


Epoch:  40%|████      | 2/5 [00:18<00:28,  9.50s/it]

[1.       0.021735 0.020561 0.156475]
loss: 0.01924942725629197


Epoch:  60%|██████    | 3/5 [00:28<00:19,  9.51s/it]

[2.       0.019249 0.019354 0.145997]
loss: 0.017770700473689147


Epoch:  80%|████████  | 4/5 [00:38<00:09,  9.54s/it]

[3.       0.017771 0.017582 0.142784]
loss: 0.019477576190744535


Epoch: 100%|██████████| 5/5 [00:47<00:00,  9.52s/it]

[4.       0.019478 0.020409 0.142678]





In [75]:
y_pred,y_targ = m.predict()

In [76]:
exp_rmspe(y_pred,y_targ)

0.14711055

# Light GBM

In [None]:
import lightgbm as lgb

In [None]:
!pip install bayesian-optimization

In [None]:
from bayes_opt import BayesianOptimization

In [None]:
from src_Rossman.datasets import split_by_idx

In [None]:
((val_df, trn_df), (val_y, trn_y)) = split_by_idx(val_idx, df, yl)

In [None]:
train_data = lgb.Dataset(data=trn_df, label=trn_y)
eval_data = lgb.Dataset(data=val_df, label=val_y)

In [None]:
def lgb_exp_rms(pred, train_data):
    label = train_data.get_label()
    val_ = exp_rmspe(pred,label)
    return "exp_rmspe", val_, False

In [None]:
def lgbm_evaluate(**params):
    params_lgbm = {
        #"device_type":"gpu",
        "objective": "mse",
        "num_iterations": 250,
        "learning_rate": params["learning_rate"],
        "max_depth": int(params["max_depth"]),
        "min_data_in_leaf": int(params["min_data_in_leaf"]),
        "num_leaves": int(params["num_leaves"]),
        "reg_lambda": params["reg_lambda"],
        "reg_alpha": params["reg_alpha"],
        "feature_fraction": params["feature_fraction"],
        "bagging_fraction": params["bagging_fraction"],
        "min_gain_to_split": params["min_gain_to_split"],
        "verbose": -1,
        # "num_threads": 1,
        "seed": 42,
    }
    model = lgb.train(
        params_lgbm,
        train_data,
        num_boost_round = 200,
        early_stopping_rounds = 5,
        categorical_feature=cat_vars,
        verbose_eval=0,
        valid_sets=[train_data, eval_data],
        feval=lgb_exp_rms
    )
    
    return -model.best_score["valid_1"]["exp_rmspe"]

In [None]:
params = {
    "num_leaves": (20, 100),
    "min_data_in_leaf": (50, 150),
    "max_depth": (1, 50),
    "learning_rate": (0.01, 0.4),
    "feature_fraction": (0.2, 1),
    "bagging_fraction": (0.5, 1),
    "reg_lambda": (0, 20),
    "reg_alpha": (0, 20),
    "min_gain_to_split": (0, 1),
}
bo = BayesianOptimization(lgbm_evaluate, params,verbose=1)


In [None]:
bo.maximize(init_points=100,n_iter=50)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_da... | min_ga... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------



Found `num_iterations` in params. Will use it instead of argument



In [None]:
print(bo.max)

{'target': -7726224.502622837, 'params': {'bagging_fraction': 1.0, 'feature_fraction': 0.2, 'learning_rate': 0.2, 'max_depth': 10.0, 'min_data_in_leaf': 138.29384659085886, 'min_gain_to_split': 0.6709405675640878, 'num_leaves': 86.40759153780651, 'reg_alpha': 7.50327124771608, 'reg_lambda': 6.80854198081176}}
