In [1]:
import numpy as np
import pandas as pd

import category_encoders as ce
from tsforest.forecast import LightGBMForecaster

import matplotlib.pyplot as plt
import seaborn as sns

# Render every column of a DataFrame instead of eliding the middle ones.
pd.options.display.max_columns = None

  import pandas.util.testing as tm


***
## data preparation

In [2]:
# Load the raw dataset from the repository-relative data directory and
# preview it (pandas truncates the rendered rows automatically).
dataset = pd.read_parquet(path="../data/dataset.parquet")
dataset

Unnamed: 0,date,sku,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available,item_domain_id,item_id,site_id,product_id,product_family_id
0,2021-02-01,0,0,172.789993,REA,classic,cross_docking,free_shipping,0.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
1,2021-02-02,0,0,172.789993,REA,classic,cross_docking,free_shipping,0.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
2,2021-02-03,0,0,179.990005,REA,classic,cross_docking,free_shipping,872.650024,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
3,2021-02-04,0,0,179.990005,REA,classic,cross_docking,free_shipping,1440.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
4,2021-02-05,0,0,179.990005,REA,classic,cross_docking,free_shipping,1440.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38994039,2021-03-27,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,
38994040,2021-03-28,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,
38994041,2021-03-29,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,
38994042,2021-03-30,660915,0,99.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,


In [3]:
# Summary statistics of the raw target column.
dataset["sold_quantity"].describe()

count    3.899404e+07
mean     9.562279e-01
std      9.818856e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.000000e+00
max      6.951000e+03
Name: sold_quantity, dtype: float64

In [4]:
# Fold id = calendar month shifted by two, i.e. February -> 0 and March -> 1.
# NOTE(review): the offset is hard-coded; it assumes the data starts in
# February and stays within a single year -- confirm before reusing.
dataset["fold"] = dataset["date"].dt.month.sub(2)

# Columns declared as categorical for the forecaster. Each one maps to the
# same "default" setting, so the mapping is built from the list of names.
categorical_features = dict.fromkeys(
    [
        "sku",
        "currency",
        "listing_type",
        "shipping_logistic_type",
        "shipping_payment",
        "item_domain_id",
        "item_id",
        "site_id",
        "product_id",
        "product_family_id",
    ],
    "default",
)

# Columns that must not be used as model inputs.
# "fold" is the cross-validation bookkeeping column added to `dataset` in this
# notebook; it only encodes the train/validation split, so it is excluded
# alongside the original entries.
# NOTE(review): assumes the forecaster ignores names listed here that are not
# present in the frame ("month" / "sold_quantity" already rely on that).
exclude_features = ["month", "sold_quantity", "fold"]

#for col in categorical_features:
#    dataset[col] = pd.Categorical(dataset[col])

In [5]:
# Rename the timestamp and target columns to "ds" / "y"
# (presumably the names LightGBMForecaster expects -- confirm against tsforest).
# The rename happens in place, so cells that read `dataset.date` or
# `dataset.sold_quantity` cannot be re-run after this one.
dataset.rename(columns={"date": "ds", "sold_quantity": "y"}, inplace=True)
dataset

Unnamed: 0,ds,sku,y,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,available,item_domain_id,item_id,site_id,product_id,product_family_id,fold
0,2021-02-01,0,0,172.789993,REA,classic,cross_docking,free_shipping,0.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732,0
1,2021-02-02,0,0,172.789993,REA,classic,cross_docking,free_shipping,0.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732,0
2,2021-02-03,0,0,179.990005,REA,classic,cross_docking,free_shipping,872.650024,1,MLB-SNEAKERS,492155,MLB,,MLB15832732,0
3,2021-02-04,0,0,179.990005,REA,classic,cross_docking,free_shipping,1440.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732,0
4,2021-02-05,0,0,179.990005,REA,classic,cross_docking,free_shipping,1440.000000,1,MLB-SNEAKERS,492155,MLB,,MLB15832732,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38994039,2021-03-27,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,,1
38994040,2021-03-28,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,,1
38994041,2021-03-29,660915,0,79.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,,1
38994042,2021-03-30,660915,0,99.989998,MEX,classic,fulfillment,paid_shipping,0.000000,1,MLM-SURGICAL_MASKS,423179,MLM,,,1


***
## model training

In [6]:
# LightGBM hyper-parameters passed to the forecaster.
model_params = dict(
    # Tweedie loss for the target, evaluated with the L1 metric.
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='l1',
    num_iterations=500,
    # Histogram construction.
    max_bin=127,
    bin_construct_sample_cnt=20_000_000,
    # Tree size: 2**10 - 1 for both the leaf count and the minimum leaf size.
    num_leaves=1023,
    min_data_in_leaf=1023,
    learning_rate=0.05,
    # Column / row subsampling (rows are resampled every iteration).
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=0.1,
    boost_from_average=False,
)

# Calendar features requested from the forecaster.
# Currently disabled candidates: year, month, year_week, year_day and the
# sin/cos variants of year_day, year_week and month.
time_features = [
    "week_day", "week_day_cos", "week_day_sin",
    "month_progress",
]

In [7]:
# Constructor arguments for the forecaster, collected in one place.
model_kwargs = dict(
    model_params=model_params,
    time_features=time_features,
    exclude_features=exclude_features,
    categorical_features=categorical_features,
    # Each SKU is treated as its own time series.
    ts_uid_columns=["sku"],
)

In [8]:
# Build the forecaster from the keyword arguments assembled in `model_kwargs`.
model = LightGBMForecaster(**model_kwargs)

In [9]:
# Folds that can serve as the validation set, in evaluation order.
# Only the first one is used here; this replaces a `for ... break` loop whose
# sole effect was to bind `valid_fold` to the first element.
valid_folds = [1, 0]
valid_fold = valid_folds[0]

In [10]:
# Index labels of the rows that belong to the validation fold.
# A boolean mask is applied directly to the index, so no filtered copy of the
# (very large) frame is materialised just to read its index.
valid_idx = dataset.index[(dataset["fold"] == valid_fold).to_numpy()]

In [None]:
%%time
# Build the model's feature tables from `dataset`, passing the validation-fold
# index as the second argument. The trailing `;` suppresses the cell's output.
model.prepare_features(dataset, valid_idx);

In [None]:
# Inspect the training features held by the model
# (presumably populated by `prepare_features` -- confirm against tsforest).
model.train_features

In [None]:
# Inspect the validation features held by the model
# (presumably the rows selected by `valid_idx` -- confirm against tsforest).
model.valid_features

In [None]:
# NOTE(review): exploratory introspection of the `fit` signature; consider
# dropping this cell from the final notebook.
help(model.fit)

In [None]:
%%time
# Train the model, passing `verbose_eval=50` through `fit_kwargs`.
# NOTE(review): `verbose_eval` is no longer accepted by recent LightGBM
# releases (replaced by the `log_evaluation` callback) -- confirm that the
# installed lightgbm / tsforest versions still support it.
model.fit(fit_kwargs={"verbose_eval":50,})

***