deepar

# Load data

In [1]:
import pandas as pd
import numpy as np
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
all = pd.concat([train, test], axis = 0)
transactions = pd.read_csv("data/transactions.csv")
holiday_events = pd.read_csv("data/holidays_events.csv")
stores = pd.read_csv("data/stores.csv")
oil = pd.read_csv("data/oil.csv")

# feature engineering

In [2]:
# https://www.kaggle.com/code/ekrembayar/holiday_events-events-data-manipulation-time-features/notebook
# Deal multiple holiday
# 확인 후 리팩토링 필요

tr1 = holiday_events[(holiday_events.type == "Holiday") & (holiday_events.transferred == True)].drop("transferred", axis = 1).reset_index(drop = True)
tr2 = holiday_events[(holiday_events.type == "Transfer")].drop("transferred", axis = 1).reset_index(drop = True)
tr = pd.concat([tr1,tr2], axis = 1)
tr = tr.iloc[:, [5,1,2,3,4]]

holiday_events = holiday_events[(holiday_events.transferred == False) & (holiday_events.type != "Transfer")].drop("transferred", axis = 1)
holiday_events = holiday_events.append(tr).reset_index(drop = True)


# Additional Holidays
# ------------------------------------------------------
holiday_events["description"] = holiday_events["description"].str.replace("-", "").str.replace("+", "").str.replace('\d+', '')
holiday_events["type"] = np.where(holiday_events["type"] == "Additional", "Holiday", holiday_events["type"])

# Bridge Holidays
# ------------------------------------------------------
holiday_events["description"] = holiday_events["description"].str.replace("Puente ", "")
holiday_events["type"] = np.where(holiday_events["type"] == "Bridge", "Holiday", holiday_events["type"])

 
# Work Day Holidays, that is meant to payback the Bridge.
# ------------------------------------------------------
work_day = holiday_events[holiday_events.type == "Work Day"]  
holiday_events = holiday_events[holiday_events.type != "Work Day"]  


# Split
# ------------------------------------------------------

# Events are national
events = holiday_events[holiday_events.type == "Event"].drop(["type", "locale", "locale_name"], axis = 1).rename({"description":"events"}, axis = 1)

holiday_events = holiday_events[holiday_events.type != "Event"].drop("type", axis = 1)
regional = holiday_events[holiday_events.locale == "Regional"].rename({"locale_name":"state", "description":"holiday_regional"}, axis = 1).drop("locale", axis = 1).drop_duplicates()
national = holiday_events[holiday_events.locale == "National"].rename({"description":"holiday_national"}, axis = 1).drop(["locale", "locale_name"], axis = 1).drop_duplicates()
local = holiday_events[holiday_events.locale == "Local"].rename({"description":"holiday_local", "locale_name":"city"}, axis = 1).drop("locale", axis = 1).drop_duplicates()

# EVENTS
events["events"] =np.where(events.events.str.contains("futbol"), "Futbol", events.events)

# Merge
holiday_events = pd.concat([events, regional, national, local], axis = 0).reset_index(drop = True)

# convert datetime to str
# some additional date features
date_info = pd.DataFrame({"date": pd.date_range(start=all["date"].min(), end=all["date"].max())})

date_info["year"] = date_info["date"].dt.year
date_info["month"] = date_info["date"].dt.month
date_info["day"] = date_info["date"].dt.day
date_info["dayofweek"] = date_info["date"].dt.dayofweek
date_info["weekend"] = (date_info["dayofweek"] >= 5).astype(int)
date_info["week"] = date_info["date"].dt.week
date_info["quarter"] = date_info["date"].dt.quarter
date_info["season"] = date_info["month"] % 12 // 3 + 1

date_info["date"] = date_info["date"].astype(str)

  holiday_events = holiday_events.append(tr).reset_index(drop = True)
  holiday_events["description"] = holiday_events["description"].str.replace("-", "").str.replace("+", "").str.replace('\d+', '')
  holiday_events["description"] = holiday_events["description"].str.replace("-", "").str.replace("+", "").str.replace('\d+', '')
  date_info["week"] = date_info["date"].dt.week


# make train

In [3]:
all

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0
...,...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,,1
28508,3029396,2017-08-31,9,PREPARED FOODS,,0
28509,3029397,2017-08-31,9,PRODUCE,,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,,9


In [4]:
# make train
merged = pd.merge(all, date_info, on=["date"], how="outer")
merged = pd.merge(merged, transactions, on=["date", "store_nbr"], how="left")
merged = pd.merge(merged, stores, on=["store_nbr"], how="left")
merged = pd.merge(merged, holiday_events, on=["date", "city", "state"], how="left")
merged = pd.merge(merged, oil, on=["date"], how="left")

# 없는 값은 거래가 발생하지 않은 것으로 가정
merged["transactions"] = merged["transactions"].fillna(0)

In [5]:
# fill na
# TODO: Normal은 0으로 취급될 수 있게 변경 필요
merged["events"] = merged["events"].fillna("Normal")
merged["holiday_regional"] = merged["holiday_regional"].fillna("Normal")
merged["holiday_national"] = merged["holiday_national"].fillna("Normal")
merged["holiday_local"] = merged["holiday_local"].fillna("Normal")
merged["id"] = merged["id"].fillna(-1)
merged = merged.fillna(0)

In [6]:
pd.isna(merged).sum()

id                  0
date                0
store_nbr           0
family              0
sales               0
onpromotion         0
year                0
month               0
day                 0
dayofweek           0
weekend             0
week                0
quarter             0
season              0
transactions        0
city                0
state               0
type                0
cluster             0
events              0
holiday_regional    0
holiday_national    0
holiday_local       0
dcoilwtico          0
dtype: int64

In [7]:
merged

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,year,month,day,dayofweek,...,transactions,city,state,type,cluster,events,holiday_regional,holiday_national,holiday_local,dcoilwtico
0,0.0,2013-01-01,1.0,AUTOMOTIVE,0.0,0.0,2013,1,1,1,...,0.0,Quito,Pichincha,D,13.0,Normal,Normal,Normal,Normal,0.00
1,1.0,2013-01-01,1.0,BABY CARE,0.0,0.0,2013,1,1,1,...,0.0,Quito,Pichincha,D,13.0,Normal,Normal,Normal,Normal,0.00
2,2.0,2013-01-01,1.0,BEAUTY,0.0,0.0,2013,1,1,1,...,0.0,Quito,Pichincha,D,13.0,Normal,Normal,Normal,Normal,0.00
3,3.0,2013-01-01,1.0,BEVERAGES,0.0,0.0,2013,1,1,1,...,0.0,Quito,Pichincha,D,13.0,Normal,Normal,Normal,Normal,0.00
4,4.0,2013-01-01,1.0,BOOKS,0.0,0.0,2013,1,1,1,...,0.0,Quito,Pichincha,D,13.0,Normal,Normal,Normal,Normal,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3029399,3029399.0,2017-08-31,9.0,SEAFOOD,0.0,0.0,2017,8,31,3,...,0.0,Quito,Pichincha,B,6.0,Normal,Normal,Normal,Normal,47.26
3029400,-1.0,2013-12-25,0.0,0,0.0,0.0,2013,12,25,2,...,0.0,0,0,0,0.0,Normal,Normal,Navidad,Normal,0.00
3029401,-1.0,2014-12-25,0.0,0,0.0,0.0,2014,12,25,3,...,0.0,0,0,0,0.0,Normal,Normal,Navidad,Normal,0.00
3029402,-1.0,2015-12-25,0.0,0,0.0,0.0,2015,12,25,4,...,0.0,0,0,0,0.0,Normal,Normal,Navidad,Normal,0.00


In [8]:
# FIXME: for simplicity
# use only family=AUTOMOTIVE, store_nbr=1
# idx = (merged["family"] == "AUTOMOTIVE") & (merged["store_nbr"] == 1)
# merged = merged[idx]

In [9]:
# make item_id
merged["item_id"] = merged["family"].astype(str) + "_" + merged["store_nbr"].astype(str)

# separate static and covariates
drop_cols = []
static_cols = ["store_nbr","family","type","cluster", "state", "city"]

static_features = merged[static_cols + ["item_id"]].drop_duplicates()
static_features.set_index("item_id", inplace=True)
merged = merged.drop(columns=drop_cols + static_cols)

merged["date"] = pd.to_datetime(merged["date"], format="%Y-%m-%d")
merged.sort_values(by=["item_id","date"], inplace=True)

In [10]:
# fill na
# TODO: Normal은 0으로 취급될 수 있게 변경 필요
merged["events"] = merged["events"].fillna("Normal")
merged["holiday_regional"] = merged["holiday_regional"].fillna("Normal")
merged["holiday_national"] = merged["holiday_national"].fillna("Normal")
merged["holiday_local"] = merged["holiday_local"].fillna("Normal")
merged["dcoilwtico"] = merged["dcoilwtico"].fillna(value = 0)

In [11]:
# one hot encoding
categorical_col = ["events", "holiday_regional", "holiday_national", "holiday_local"]
encoded = pd.get_dummies(merged[categorical_col + ["item_id", "date"]], columns=categorical_col)
merged = pd.merge(merged, encoded, on=["item_id", "date"], how="left")
merged.drop(columns=categorical_col, inplace=True)

In [12]:
merged.head()

Unnamed: 0,id,date,sales,onpromotion,year,month,day,dayofweek,weekend,week,quarter,season,transactions,dcoilwtico,item_id,events_Normal,holiday_regional_Normal,holiday_national_Navidad,holiday_national_Normal,holiday_local_Normal
0,-1.0,2013-12-25,0.0,0.0,2013,12,25,2,0,52,4,1,0.0,0.0,0_0.0,1,1,1,0,1
1,-1.0,2014-12-25,0.0,0.0,2014,12,25,3,0,52,4,1,0.0,0.0,0_0.0,1,1,1,0,1
2,-1.0,2015-12-25,0.0,0.0,2015,12,25,4,0,52,4,1,0.0,0.0,0_0.0,1,1,1,0,1
3,-1.0,2016-12-25,0.0,0.0,2016,12,25,6,1,51,4,1,0.0,0.0,0_0.0,1,1,1,0,1
4,0.0,2013-01-01,0.0,0.0,2013,1,1,1,0,1,1,1,0.0,0.0,AUTOMOTIVE_1.0,1,1,0,1,1


# Make TimeSeriesDataFrame

In [13]:
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

timeseries_df = TimeSeriesDataFrame.from_data_frame(
    merged,
    id_column="item_id",
    timestamp_column="date"
)
timeseries_df = timeseries_df.to_regular_index(freq="D")

categorical_col = ["store_nbr", "family", "type", "cluster", "city", "state"]
for col in categorical_col:
    # encoding to number
    static_features[col] = static_features[col].astype("category")
    
# for continuous timeseries
timeseries_df = timeseries_df.fillna(0)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
train_data, test_data = timeseries_df.split_by_time(pd.Timestamp("2017-08-16"))

train_data.static_features = static_features
test_data.static_features = static_features

train_data["sales"] = np.log1p(train_data["sales"])

In [15]:
known_covariates_cols = list(train_data.columns)
known_covariates_cols.remove("sales")
known_covariates_cols.remove("id")
known_covariates_cols

['onpromotion',
 'year',
 'month',
 'day',
 'dayofweek',
 'weekend',
 'week',
 'quarter',
 'season',
 'transactions',
 'dcoilwtico',
 'events_Normal',
 'holiday_regional_Normal',
 'holiday_national_Navidad',
 'holiday_national_Normal',
 'holiday_local_Normal']

# Model test
* DeepAR

context_length (int, optional) – lag값을 몇 개 참조할 것인지

disable_static_features (bool, default = False)

disable_known_covariates (bool, default = False)

num_layers (int, default = 2) – Number of RNN layers

hidden_size (int, default = 40) – Number of RNN cells for each layer

dropout_rate (float, default = 0.1) – Dropout regularization parameter

embedding_dimension (int, optional) – Dimension of the embeddings for categorical features (if None, defaults to [min(50, (cat+1)//2) for cat in cardinality])

distr_output (gluonts.torch.distributions.DistributionOutput, default = StudentTOutput()) – Distribution to use to evaluate observations and sample predictions

scaling (bool, default = True) – Whether to automatically scale the target values

epochs (int, default = 100) – Number of epochs the model will be trained for

batch_size (int, default = 64) – Size of batches used during training

num_batches_per_epoch (int, default = 50) – Number of batches processed every epoch

learning_rate (float, default = 1e-3,) – Learning rate used during training

* 이어서 재학습 가능한지? => 안되는듯..?
* 먀지막에 저장되는 게 val_loss 기준 best인지 last epoch인지? => best인듯

In [17]:
import autogluon.core as ag
from autogluon.timeseries.splitter import MultiWindowSplitter
splitter = MultiWindowSplitter(num_windows=1)
predictor = TimeSeriesPredictor(known_covariates_names=known_covariates_cols, target="sales", prediction_length=16, validation_splitter=splitter,
                                eval_metric="RMSE", verbosity = 4)
# predictor.fit(train_data=train_data,
#               hyperparameters={
#             "DeepAR": {
#                 "hidden_size": ag.space.Int(20, 100),
#                 "dropout_rate": ag.space.Categorical(0.1, 0.3),
#                 "context_length": ag.space.Categorical(16, 32, 64),
#                 "scaling": ag.space.Categorical(True, False),
#                 "learning_rate": ag.space.Real(1e-4, 1e-2, log=True),
#                 "batch_size": ag.space.Categorical(16, 32, 64, 128),
#             }
#             },hyperparameter_tune_kwargs={
#                 "scheduler": "local",
#                 "searcher": "auto",
#                 "num_trials": 30,
#             }, enable_ensemble=False)

In [18]:
predictor.fit(train_data=train_data,
              hyperparameters={
            "DeepAR": {"epochs": 10}
            })

TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': {'DeepAR': {'epochs': 10}},
 'prediction_length': 16,
 'random_seed': None,
 'target': 'sales',
 'time_limit': None}
Provided training data set with 3009113 rows, 1783 items (item = single time series). Average time series length is 1687.7.
Training artifacts will be saved to: /home/mysunk/PJT/automl_study/msun/AutogluonModels/ag-20230606_103643
Beginning AutoGluon training with TimeSeriesLearner 
AutoGluon will save models to AutogluonModels/ag-20230606_103643/
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'sales'
	known covariates: ['onpromotion', 'year', 'month', 'day', 'dayofweek'

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7f1818f162e0>

In [20]:
predictor = TimeSeriesPredictor(known_covariates_names=known_covariates_cols, target="sales", prediction_length=16, validation_splitter=splitter,
                                eval_metric="RMSE", verbosity = 4)
predictor_saved = predictor.load(path = "AutogluonModels/ag-20230606_103643/")
predictor_saved.fit(train_data=train_data,
              hyperparameters={
            "DeepAR": {"epochs": 10}
            })

No path specified. Models will be saved in: "AutogluonModels/ag-20230606_104214/"
Loading predictor from path AutogluonModels/ag-20230606_103643/
Loading: AutogluonModels/ag-20230606_103643/learner.pkl
Loading: AutogluonModels/ag-20230606_103643/predictor.pkl


AssertionError: Predictor is already fit! To fit additional models create a new `Predictor`.