In [24]:
import numpy as np  
import pandas as pd  

from datetime import datetime, timedelta
import gc
import lightgbm as lgb
import os

In [25]:
# dtype overrides handed to pd.read_csv for calendar.csv, grouped by target dtype.
c_col = {
    **dict.fromkeys(
        ('event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'weekday'),
        'category',
    ),
    **dict.fromkeys(('wm_yr_wk', 'wday', 'month', 'year'), 'int16'),
    **dict.fromkeys(('snap_CA', 'snap_TX', 'snap_WI'), 'float32'),
}

# dtype overrides handed to pd.read_csv for sell_prices.csv.
sp_col = dict(
    store_id='category',
    item_id='category',
    wm_yr_wk='int16',
    sell_price='float32',
)

In [26]:
# Single place to change where the raw M5 CSV files live (previously repeated
# in every path literal). The resulting path strings are unchanged.
DATA_DIR = "~/Desktop/DemandLocalGlobal/empirical/data/m5_content"

path0 = f"{DATA_DIR}/sales_train_validation.csv"
path1 = f"{DATA_DIR}/calendar.csv"
path2 = f"{DATA_DIR}/sample_submission.csv"
path3 = f"{DATA_DIR}/sell_prices.csv"

# The wide sales file (path0) is not loaded here: create_df() reads it itself,
# restricted to the day columns it needs.
#stv_df= pd.read_csv(path0)
c_df = pd.read_csv(path1, dtype=c_col)    # calendar, typed via c_col
ss_df = pd.read_csv(path2)                # sample submission, default dtypes
sp_df = pd.read_csv(path3, dtype=sp_col)  # weekly sell prices, typed via sp_col

In [27]:
# Confirm the c_col dtype overrides were applied and see how sparse the event columns are.
c_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   date          1969 non-null   object  
 1   wm_yr_wk      1969 non-null   int16   
 2   weekday       1969 non-null   category
 3   wday          1969 non-null   int16   
 4   month         1969 non-null   int16   
 5   year          1969 non-null   int16   
 6   d             1969 non-null   object  
 7   event_name_1  162 non-null    category
 8   event_type_1  162 non-null    category
 9   event_name_2  5 non-null      category
 10  event_type_2  5 non-null      category
 11  snap_CA       1969 non-null   float32 
 12  snap_TX       1969 non-null   float32 
 13  snap_WI       1969 non-null   float32 
dtypes: category(5), float32(3), int16(4), object(2)
memory usage: 81.0+ KB


In [28]:
# Confirm the sp_col dtype overrides were applied and check the memory footprint of the price table.
sp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 4 columns):
 #   Column      Dtype   
---  ------      -----   
 0   store_id    category
 1   item_id     category
 2   wm_yr_wk    int16   
 3   sell_price  float32 
dtypes: category(2), float32(1), int16(1)
memory usage: 58.8 MB


In [29]:
# Peek at the first calendar rows; the `d` column (d_1, d_2, ...) is the key create_df() merges on.
c_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0.0,0.0,0.0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0.0,0.0,0.0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0.0,0.0,0.0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1.0,1.0,0.0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1.0,0.0,1.0


In [30]:
# Index of the last `d_<n>` column create_df() reads from the sales file.
last_training_day = 1913
# Days of history create_df() keeps before last_training_day when is_train=False.
# Presumably sized to cover the 28-day lag plus the 28-day rolling window in create_fea() - TODO confirm.
max_lags = 57
# Calendar date of day d_(last_training_day + 1): d_1 is 2011-01-29, so d_1914 is 2016-04-25.
fday = datetime(year=2016, month=4, day=25)


In [31]:
# Display the constant defined in the previous cell as a sanity check.
fday

datetime.datetime(2016, 4, 25, 0, 0)

In [32]:
def create_df(is_train=True, nrows=None, first_day=1200):
    """Build the long-format frame: one row per (series id, day).

    Reads the wide sales file at ``path0``, integer-encodes the categorical
    columns, melts the ``d_<n>`` columns into rows, and inner-joins the
    calendar (``c_df``) and the weekly prices (``sp_df``).

    Relies on module-level names: ``sp_df``, ``sp_col``, ``c_df``, ``c_col``,
    ``path0``, ``max_lags`` and ``last_training_day``.

    Parameters
    ----------
    is_train : bool, default True
        True: keep days from ``max(1, first_day)`` to ``last_training_day``.
        False: keep days from ``max(last_training_day - max_lags, first_day)``
        and append 28 future days whose ``sales`` are NaN.
    nrows : int or None, default None
        Forwarded to ``pd.read_csv``; limits how many rows (series) of the
        wide sales file are read.
    first_day : int, default 1200
        Lower bound on the first day index that is loaded.

    Returns
    -------
    pandas.DataFrame
        Melted sales with calendar and price columns attached. Both merges use
        the default inner join, so rows without a calendar or price match are
        dropped.
    """
    # --- prices: category -> int16 codes ------------------------------------
    # LightGBM: for a categorical feature with high cardinality it often works
    # best to treat the feature as numeric.
    sp_df2 = sp_df.copy()
    price_code_offset = {}  # per column: the minimum code subtracted below
    for col, col_dtype in sp_col.items():
        if col_dtype == "category":
            sp_df2[col] = sp_df2[col].cat.codes.astype("int16")
            price_code_offset[col] = sp_df2[col].min()
            sp_df2[col] -= price_code_offset[col]

    # --- calendar: parse dates, category -> int16 codes ---------------------
    c_df2 = c_df.copy()
    c_df2["date"] = pd.to_datetime(c_df["date"])
    for col, col_dtype in c_col.items():
        if col_dtype == "category":
            c_df2[col] = c_df2[col].cat.codes.astype("int16")
            # Missing values get code -1; subtracting the minimum shifts every
            # code so the smallest becomes 0 (missing -> 0, real labels -> 1..).
            c_df2[col] -= c_df2[col].min()

    # --- sales: read only the needed day columns ----------------------------
    start_day = max(1 if is_train else last_training_day - max_lags, first_day)
    dcols = [f"d_{day}" for day in range(start_day, last_training_day + 1)]
    catcols = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]
    dtype = {dcol: "float32" for dcol in dcols}
    dtype.update({col: "category" for col in catcols if col != "id"})
    stv_df = pd.read_csv(path0, nrows=nrows, usecols=catcols + dcols, dtype=dtype)

    # Convert the categorical columns of stv_df to integer codes ("id" stays as is).
    for col in catcols:
        if col == "id":
            continue
        if col in price_code_offset:
            # Join keys shared with the price table. Codes are positions in a
            # frame's own category list, so encoding the two frames separately
            # only agrees when both contain exactly the same labels. That stops
            # being true when `nrows` truncates the sales file, and the merge
            # below would then pair items with the wrong prices. Encode against
            # the price table's categories and apply the same offset instead.
            stv_df[col] = stv_df[col].cat.set_categories(sp_df[col].cat.categories)
            stv_df[col] = stv_df[col].cat.codes.astype("int16")
            stv_df[col] -= price_code_offset[col]
        else:
            stv_df[col] = stv_df[col].cat.codes.astype("int16")
            stv_df[col] -= stv_df[col].min()  # shift so the smallest code is 0

    if not is_train:
        # Placeholder columns for the 28 days after the last training day.
        for day in range(last_training_day + 1, last_training_day + 28 + 1):
            stv_df[f"d_{day}"] = np.nan

    # Wide -> long: one row per (series, day), day label in "d", value in "sales".
    stv_df2 = pd.melt(stv_df,
                      id_vars=catcols,
                      value_vars=[col for col in stv_df.columns if col.startswith("d_")],
                      var_name="d",
                      value_name="sales"
                     )

    stv_df2 = stv_df2.merge(c_df2, on="d", copy=False)
    stv_df2 = stv_df2.merge(sp_df2, on=["store_id", "item_id", "wm_yr_wk"], copy=False)
    return stv_df2
    
    
        

In [33]:
def create_fea(df):
    """Add lag, rolling-mean and calendar features to ``df`` in place.

    NOTE: a later cell in this notebook defines ``create_fea`` again; once
    both cells have run, the later definition is the one that gets called.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs columns ``id``, ``sales`` and a datetime ``date``. Rows of one
        ``id`` are shifted in their existing order, so they must already be in
        chronological order.

    Returns
    -------
    None
        ``df`` is modified in place. New columns: ``d_7``, ``d_28`` (sales
        shifted by 7/28 rows within each id), ``rmean_<lag>_<win>`` (rolling
        mean of a lag column over ``win`` rows within each id), and the date
        features ``wday, week, month, quarter, year, mday`` as int16.
    """
    lags = [7, 28]
    lag_cols = [f"d_{lag}" for lag in lags]

    for lag, lag_col in zip(lags, lag_cols):
        df[lag_col] = df[["id", "sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            df[f"rmean_{lag}_{win}"] = df[["id", lag_col]].groupby("id")[lag_col].transform(lambda x: x.rolling(win).mean())

    # Feature name -> extractor applied to the "date" column.
    # "week" uses dt.isocalendar().week because the former dt.weekofyear
    # accessor was removed in pandas 2.0 and raises AttributeError there.
    date_features = {
        "wday": lambda dates: dates.dt.weekday,
        "week": lambda dates: dates.dt.isocalendar().week,
        "month": lambda dates: dates.dt.month,
        "quarter": lambda dates: dates.dt.quarter,
        "year": lambda dates: dates.dt.year,
        "mday": lambda dates: dates.dt.day,
    }

    for date_feature_name, date_feature_func in date_features.items():
        if date_feature_name in df.columns:
            # Column already present (e.g. merged from the calendar): only normalise the dtype.
            df[date_feature_name] = df[date_feature_name].astype("int16")
        else:
            df[date_feature_name] = date_feature_func(df["date"]).astype("int16")
        
        

In [34]:
#del df_test
def create_fea(df):
    """Append lag, rolling-mean and calendar columns to ``df`` (mutates it, returns None).

    Expects columns ``id``, ``sales`` and a datetime ``date``. Shifts and
    rolling windows are computed per ``id`` over the rows in their current order.
    """
    lag_days = (7, 28)
    windows = (7, 28)

    # Lagged sales: d_7 and d_28 hold the value 7 / 28 rows earlier in the same series.
    for lag in lag_days:
        df[f'd_{lag}'] = df[['id', 'sales']].groupby('id')['sales'].shift(lag)

    # Rolling means of every lag column, for every window length.
    for window in windows:
        for lag in lag_days:
            shifted = f'd_{lag}'
            df[f'rmean_{lag}_{window}'] = (
                df[['id', shifted]]
                .groupby('id')[shifted]
                .transform(lambda series: series.rolling(window).mean())
            )

    # (feature column, attribute of `.dt`); None marks the ISO week, which
    # is read through dt.isocalendar().week rather than a plain attribute.
    calendar_attrs = (
        ('wday', 'weekday'),
        ('week', None),
        ('month', 'month'),
        ('quarter', 'quarter'),
        ('year', 'year'),
        ('mday', 'day'),
    )

    # Existing columns are only cast to int16; missing ones are derived from "date".
    for feature, attr in calendar_attrs:
        if feature in df.columns:
            values = df[feature]
        elif attr is None:
            values = df["date"].dt.isocalendar().week
        else:
            values = getattr(df['date'].dt, attr)
        df[feature] = values.astype('int16')

In [35]:
# Passed to create_df() as `first_day`; 1 keeps every day column starting at d_1.
FIRST_DAY=1

In [36]:
# create_df() calls `.cat` on a copy of sp_df, so store_id / item_id must still be category here.
sp_df.dtypes

store_id      category
item_id       category
wm_yr_wk         int16
sell_price     float32
dtype: object

In [37]:
%%time

# Build the full long-format training frame (all days from FIRST_DAY onward).
df = create_df(is_train=True, first_day= FIRST_DAY)

CPU times: user 45.6 s, sys: 33.2 s, total: 1min 18s
Wall time: 1min 39s


In [38]:
# Rows whose encoded event_name_1 is above 3 (code 0 is "no event" after create_df's shift).
df.loc[df["event_name_1"] > 3, ["event_name_1"]]

Unnamed: 0,event_name_1
88499,27
88500,27
88501,27
88502,27
88503,27
...,...
45174232,5
45174233,5
45174234,5
45174235,5


In [39]:
# Number of distinct event_name_1 codes in the merged frame (includes the code used for "no event").
len(df["event_name_1"].unique())

31

In [40]:
# Same column in the raw calendar: the original labels plus NaN, for comparison with the encoded count above.
c_df["event_name_1"].unique()

[NaN, 'SuperBowl', 'ValentinesDay', 'PresidentsDay', 'LentStart', ..., 'Chanukah End', 'NewYear', 'OrthodoxChristmas', 'MartinLutherKingDay', 'Easter']
Length: 31
Categories (30, object): ['Chanukah End', 'Christmas', 'Cinco De Mayo', 'ColumbusDay', ..., 'SuperBowl', 'Thanksgiving', 'ValentinesDay', 'VeteransDay']

In [41]:
    # NOTE(review): module-level scratch copy of the mapping that create_fea()
    # builds locally; no other cell visible here reads it - candidate for removal.
    # "weekofyear" is the old `.dt` accessor name, removed in pandas 2.0.
    date_features = {
        "wday":"weekday",
        "week":"weekofyear",
        "month":"month",
        "quarter":"quarter",
        "year":"year",
        "mday":"day"
    }

In [42]:
%%time

# create_fea() mutates df in place (returns None), so the new columns show up in df.shape.
create_fea(df)
print(df.shape)

(46027957, 31)
CPU times: user 2min 59s, sys: 38.4 s, total: 3min 37s
Wall time: 4min 12s


In [43]:
# Drop every row with a NaN in any column, which includes the leading rows of each
# series where the lag / rolling features are undefined.
# NOTE(review): in-place on the shared frame, so the pre-drop data is gone once this cell has run.
df.dropna(inplace=True)
df.shape

(44351007, 31)

In [44]:
#del sp_df, c_df

In [45]:
# Columns that hold integer category codes (kept for LightGBM's categorical_feature argument).
cat_features = [
    'item_id', 'store_id', 'cat_id', 'dept_id', 'state_id',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
]
# Identifiers, the target, and columns left out of the feature matrix.
useless_cols = ['id', 'date', 'sales', 'd', 'wm_yr_wk', 'weekday']
# Every remaining column, in its original order, is a model feature.
train_cols = df.columns[[name not in useless_cols for name in df.columns]]
X_train = df.loc[:, train_cols]
y_train = df.loc[:, 'sales']

In [46]:
import sys
import pyarrow


In [47]:
# Show which Python interpreter the kernel runs on (check that it is the project venv).
print(sys.executable)

/Users/ericklopez/Desktop/DemandLocalGlobal/venv/bin/python


In [54]:
# Persist the full feature frame (features, target and id columns) as snappy-compressed parquet.
des_path = "../data/m5_content"
des_path2 = des_path + "/m5_data.parquet"
df.to_parquet(path=des_path2, engine="pyarrow", compression="snappy")
print("Successfully saved to " + des_path2)

Successfully saved to ../data/m5_content//m5_data.parquet


In [55]:
des_path = "../data/m5_content"
des_path2 = f"{des_path}/X_train.parquet"
# Write the feature matrix itself. This cell previously wrote the full `df`,
# so X_train.parquet was a duplicate of m5_data.parquet and still contained
# the target and id columns.
X_train.to_parquet(des_path2, engine="pyarrow", compression="snappy")
print(f"Successfully saved to {des_path2}")

Successfully saved to ../data/m5_content//X_train.parquet


In [56]:
# No trailing slash here: the f-string below already adds the separator
# (the old value produced ".../m5_content//y_train.parquet").
des_path = "../data/m5_content"
des_path2 = f"{des_path}/y_train.parquet"
# Write the target only. This cell previously wrote the full `df`. A Series has
# no to_parquet(), so it is wrapped in a one-column frame named "sales".
y_train.to_frame().to_parquet(des_path2, engine="pyarrow", compression="snappy")
print(f"Successfully saved to {des_path2}")

Successfully saved to ../data/m5_content//y_train.parquet


In [49]:

# %%time

# np.random.seed(777)

# # This is a random sample, we're not gonna apply any time series train-test-split tricks here!
# fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace=False) # Validation dataset
# train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds) # Training dataset

# train_data = lgb.Dataset(X_train.loc[train_inds], label=y_train.loc[train_inds], categorical_feature=cat_features, free_raw_data=False)
# fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label=y_train.loc[fake_valid_inds], categorical_feature=cat_features, free_raw_data=False)

In [50]:
# del df, X_train, y_train, fake_valid_inds,train_inds ; gc.collect()


In [51]:
# params={
# #    'device':'gpu',
#     'objective':'poisson',
#     'metric':['rmse'],
#     'force_row_wise':True,
#     'learning_rate':0.075,
#     'sub_row': 0.75,
#     'bagging_freq': 1,
#     'lambda_l2':0.1,
#     'verbosity':1,
#     'num_iterations':1200,
#     'num_leaves':2**11-1,
#     'min_data_in_leaf':2**12-1
# }

In [52]:
# %%time

# m_lgb = lgb.train(params, train_data, valid_sets=[fake_valid_data], verbose_eval=20)

In [53]:
# Working directory of the kernel; the relative "../data/m5_content" output paths resolve against it.
!pwd

/Users/ericklopez/Desktop/DemandLocalGlobal/empirical/ii_notebooks


In [57]:
#https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html

In [61]:
# Convert the calendar's date column to datetime on the shared c_df itself.
# NOTE(review): create_df() already does this conversion on its own copy, so this only
# affects later direct uses of c_df; consider moving it next to the load cell.
c_df["date"]= pd.to_datetime(c_df["date"])