In [1]:
import gc
import numpy as np 
import pandas as pd 
from category_encoders.ordinal import OrdinalEncoder
import matplotlib.pyplot as plt
from tsforest.utils import make_time_range

import matplotlib.pyplot as plt
import seaborn as sns


# local modules
import sys
sys.path.append("../lib/")
from utils import compute_scaling, reduce_mem_usage

  import pandas.util.testing as tm


***
## data loading

In [2]:
# Wide sales table. Per the info() output below it has 30490 rows, 1941 int64
# day columns (ending at d_1941) and six object-typed columns starting with "id".
sales_train = pd.read_csv("../input/sales_train_evaluation.csv")
sales_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1947 entries, id to d_1941
dtypes: int64(1941), object(6)
memory usage: 452.9+ MB


In [3]:
# Calendar table. "date" is parsed to datetime64 at load time; the "d" column
# holds the day label that the data-wrangling cell later joins on.
calendar = pd.read_csv("../input/calendar.csv", parse_dates=["date"])
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          1969 non-null   datetime64[ns]
 1   wm_yr_wk      1969 non-null   int64         
 2   weekday       1969 non-null   object        
 3   wday          1969 non-null   int64         
 4   month         1969 non-null   int64         
 5   year          1969 non-null   int64         
 6   d             1969 non-null   object        
 7   event_name_1  162 non-null    object        
 8   event_type_1  162 non-null    object        
 9   event_name_2  5 non-null      object        
 10  event_type_2  5 non-null      object        
 11  snap_CA       1969 non-null   int64         
 12  snap_TX       1969 non-null   int64         
 13  snap_WI       1969 non-null   int64         
dtypes: datetime64[ns](1), int64(7), object(6)
memory usage: 215.5+ KB


***
## hierarchy

In [5]:
# Remove the literal "_evaluation" marker from every series id.
# The vectorised .str accessor is used with regex=False so the pattern is
# treated as plain text and missing ids pass through instead of raising.
sales_train["id"] = sales_train["id"].str.replace("_evaluation", "", regex=False)

# One row per distinct combination of the id columns; this is the lookup table
# the encoders are fitted on.
hierarchy = (
    sales_train
    .loc[:, ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]]
    .drop_duplicates()
)

In [7]:
# hierarchy encoder
# One OrdinalEncoder per hierarchy column, fitted on the de-duplicated
# `hierarchy` table. Each encoder stays bound to its own module-level name
# because the categorical-encoding cell re-applies it to `sales_train`.
#
# The codes are written back with `hierarchy[col] = <Series>` rather than
# `hierarchy.loc[:, col] = ...`: since pandas 2.0 the `.loc[:, col]` form sets
# values into the existing object-dtype column in place, so the integer codes
# would remain stored as Python objects instead of becoming an integer column.
id_encoder = OrdinalEncoder()
id_encoder.fit(hierarchy.loc[:, ["id"]])
# The id codes go into a new column; the original string "id" is kept.
hierarchy["ts_id"] = id_encoder.transform(hierarchy.loc[:, ["id"]]).iloc[:, 0]

item_encoder = OrdinalEncoder()
item_encoder.fit(hierarchy.loc[:, ["item_id"]])
hierarchy["item_id"] = item_encoder.transform(hierarchy.loc[:, ["item_id"]]).iloc[:, 0]

dept_encoder = OrdinalEncoder()
dept_encoder.fit(hierarchy.loc[:, ["dept_id"]])
hierarchy["dept_id"] = dept_encoder.transform(hierarchy.loc[:, ["dept_id"]]).iloc[:, 0]

cat_encoder = OrdinalEncoder()
cat_encoder.fit(hierarchy.loc[:, ["cat_id"]])
hierarchy["cat_id"] = cat_encoder.transform(hierarchy.loc[:, ["cat_id"]]).iloc[:, 0]

store_encoder = OrdinalEncoder()
store_encoder.fit(hierarchy.loc[:, ["store_id"]])
hierarchy["store_id"] = store_encoder.transform(hierarchy.loc[:, ["store_id"]]).iloc[:, 0]

state_encoder = OrdinalEncoder()
state_encoder.fit(hierarchy.loc[:, ["state_id"]])
hierarchy["state_id"] = state_encoder.transform(hierarchy.loc[:, ["state_id"]]).iloc[:, 0]

***
## categorical encoding

In [8]:
# Re-use the encoders fitted on `hierarchy` so the wide sales table carries the
# same integer codes. "ts_id" is a new column derived from "id"; the other five
# columns are overwritten with their codes.
#
# Plain column assignment is used instead of `sales_train.loc[:, col] = ...`:
# since pandas 2.0 the `.loc[:, col]` form writes into the existing object-dtype
# column in place, which would leave the codes stored as Python objects.
sales_train["ts_id"] = id_encoder.transform(sales_train.loc[:, ["id"]]).iloc[:, 0]
sales_train["item_id"] = item_encoder.transform(sales_train.loc[:, ["item_id"]]).iloc[:, 0]
sales_train["dept_id"] = dept_encoder.transform(sales_train.loc[:, ["dept_id"]]).iloc[:, 0]
sales_train["cat_id"] = cat_encoder.transform(sales_train.loc[:, ["cat_id"]]).iloc[:, 0]
sales_train["store_id"] = store_encoder.transform(sales_train.loc[:, ["store_id"]]).iloc[:, 0]
sales_train["state_id"] = state_encoder.transform(sales_train.loc[:, ["state_id"]]).iloc[:, 0]

***
## data wrangling

In [11]:
# Wide -> long: every d_1 ... d_1941 column becomes rows with the day label in
# "d" and the cell value in "q". The day label is then swapped for the calendar
# date via a left join, after which "d" is no longer needed.
data = (
    sales_train
    .melt(
        id_vars=["ts_id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
        value_vars=[f"d_{day}" for day in range(1, 1942)],
        var_name="d",
        value_name="q",
    )
    .merge(calendar.loc[:, ["d", "date"]], how="left", on="d")
    .drop(columns="d")
)

In [12]:
# Shrink the long table with the project helper from ../lib/utils.
# NOTE(review): presumably it downcasts numeric columns (the info() output that
# follows shows int8/int16 dtypes) - confirm against the helper's source.
data = reduce_mem_usage(data)
# Force a collection pass; as the cell's last expression, the number of
# unreachable objects found is what gets displayed.
gc.collect()

110

In [13]:
# Check dtypes and memory footprint of the long table after reduce_mem_usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59181090 entries, 0 to 59181089
Data columns (total 8 columns):
 #   Column    Dtype         
---  ------    -----         
 0   ts_id     int16         
 1   item_id   int16         
 2   dept_id   int8          
 3   cat_id    int8          
 4   store_id  int8          
 5   state_id  int8          
 6   q         int16         
 7   date      datetime64[ns]
dtypes: datetime64[ns](1), int16(3), int8(4)
memory usage: 1.4 GB


***
## cleaning


### removes zeros at the start of the time series

In [14]:
def remove_starting_zeros(dataframe):
    """Drop the rows that precede the first strictly positive ``q`` value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows of a single time series with a numeric ``q`` column. The cut is a
        label slice starting at the smallest index label whose ``q`` is
        positive, so the index labels are expected to increase in time order.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` from that label onward; zeros occurring later in the
        series are kept. If no row has ``q > 0`` (or the frame is empty), an
        empty frame with the same columns is returned.
    """
    has_positive = (dataframe["q"] > 0).to_numpy()
    if not has_positive.any():
        # Without any positive value the minimum label would be NaN, which is
        # not a meaningful slice bound; return an explicit empty result.
        return dataframe.iloc[0:0]
    idxmin = dataframe.index[has_positive].min()
    return dataframe.loc[idxmin:, :]

In [15]:
# Trim every (item_id, store_id) series independently: groupby.apply passes each
# group's rows to remove_starting_zeros and concatenates the returned frames.
# reset_index(drop=True) then discards the index produced by apply and renumbers
# the rows from 0.
# NOTE(review): recent pandas releases deprecate handing the grouping columns to
# the applied function - confirm item_id/store_id still survive after an upgrade.
data = (data
        .groupby(["item_id","store_id"])
        .apply(remove_starting_zeros)
        .reset_index(drop=True)
       )

In [16]:
# Re-check row count and memory footprint after the leading zeros were removed.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46796220 entries, 0 to 46796219
Data columns (total 8 columns):
 #   Column    Dtype         
---  ------    -----         
 0   ts_id     int16         
 1   item_id   int16         
 2   dept_id   int8          
 3   cat_id    int8          
 4   store_id  int8          
 5   state_id  int8          
 6   q         int16         
 7   date      datetime64[ns]
dtypes: datetime64[ns](1), int16(3), int8(4)
memory usage: 803.3 MB


In [17]:
# Persist the trimmed long table; index=False leaves the DataFrame index out of the file.
data.to_parquet("../input/scaling_input.parquet", index=False)

***
### Precomputation of scaling for all levels

In [14]:
# Grouping columns that identify one series at each level, keyed 1..12 in the
# order listed. The empty list (level 1) means "no grouping columns".
ts_id_columns_by_level = dict(enumerate(
    [
        [],                          # level 1
        ["state_id"],                # level 2
        ["store_id"],                # level 3
        ["cat_id"],                  # level 4
        ["dept_id"],                 # level 5
        ["state_id", "cat_id"],      # level 6
        ["state_id", "dept_id"],     # level 7
        ["store_id", "cat_id"],      # level 8
        ["store_id", "dept_id"],     # level 9
        ["item_id"],                 # level 10
        ["item_id", "state_id"],     # level 11
        ["item_id", "store_id"],     # level 12
    ],
    start=1,
))

In [15]:
# Write one scale file per level. Level 1 has no grouping columns, so it is
# only printed here and its scale is computed separately after the loop.
for level, ts_uid_columns in ts_id_columns_by_level.items():
    print(level, ts_uid_columns)
    if level != 1:
        scales = (
            compute_scaling(data, agg_columns=ts_uid_columns)
            .rename(columns={"q": "s"})
        )
        scales.to_parquet(f"../input/scales_level{level}.parquet", index=False)

# scaling factor for root level
# Sum q over all rows per date, then take the root-mean-square of the
# date-to-date differences; nanmean skips the NaN that diff() yields for the
# first date.
_data = data.groupby(["date"])["q"].sum().reset_index()
scales = pd.DataFrame({"s": [np.sqrt(np.nanmean(np.square(_data["q"].diff(1))))]})
scales.to_parquet("../input/scales_level1.parquet", index=False)

1 []
2 ['state_id']
3 ['store_id']
4 ['cat_id']
5 ['dept_id']
6 ['state_id', 'cat_id']
7 ['state_id', 'dept_id']
8 ['store_id', 'cat_id']
9 ['store_id', 'dept_id']
10 ['item_id']
11 ['item_id', 'state_id']
12 ['item_id', 'store_id']


***