## Import Libraries and Load data

In [3]:
from datetime import date, timedelta

import pandas as pd
import numpy as np

In [4]:
df_train = pd.read_csv(
    '../project_2/favorita-grocery-sales-forecasting/train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)

df_test = pd.read_csv(
    "../project_2/favorita-grocery-sales-forecasting/test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

items = pd.read_csv(
    "../project_2/favorita-grocery-sales-forecasting/items.csv",
).set_index("item_nbr")

In [5]:
df_train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2016-01-01,25,105574,2.564949,False
1,2016-01-01,25,105575,2.302585,False
2,2016-01-01,25,105857,1.386294,False
3,2016-01-01,25,108634,1.386294,False
4,2016-01-01,25,108701,1.098612,True


In [6]:
df_train.shape

(59038132, 5)

In [7]:
df_train.isnull().sum()

date           0
store_nbr      0
item_nbr       0
unit_sales     0
onpromotion    0
dtype: int64

In [8]:
df_train['onpromotion'].value_counts(normalize=True, dropna=False)

False    0.893895
True     0.106105
Name: onpromotion, dtype: float64

In [9]:
df_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,False
1,99197,2017-08-16,125497041,False
1,103501,2017-08-16,125497042,False
1,103520,2017-08-16,125497043,False
1,103665,2017-08-16,125497044,False


In [10]:
df_test.shape

(3370464, 2)

In [11]:
df_test['onpromotion'].value_counts(normalize=True, dropna=False)

False    0.941077
True     0.058923
Name: onpromotion, dtype: float64

In [12]:
items.head()

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103501,CLEANING,3008,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1


In [13]:
items.shape

(4100, 3)

In [None]:
### Create dataframe df_2017 contains data from May 31st 2017 to upto 11 weeks i.e., May 31st 2017 - August 15 2017

In [14]:
df_2017 = df_train[df_train.date.isin(
    pd.date_range("2017-05-31", periods=7 * 11))].copy()
del df_train

In [15]:
df_2017.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
50912462,2017-05-31,1,96995,0.693147,False
50912463,2017-05-31,1,99197,0.693147,False
50912464,2017-05-31,1,103520,1.386294,False
50912465,2017-05-31,1,103665,2.197225,False
50912466,2017-05-31,1,105574,1.386294,False


In [16]:
df_2017.shape

(8125670, 5)

In [None]:
## Data Preprocessing

In [None]:
#### Create promo_2017_train dataframe with promotion data from May 31st 2017 to August 15th 2017 data

In [17]:
promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr","date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)

In [18]:
promo_2017_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion,onpromotion
Unnamed: 0_level_1,date,2017-05-31,2017-06-01,2017-06-02,2017-06-03,2017-06-04,2017-06-05,2017-06-06,2017-06-07,2017-06-08,2017-06-09,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

In [20]:
promo_2017_train.head()

Unnamed: 0_level_0,date,2017-05-31,2017-06-01,2017-06-02,2017-06-03,2017-06-04,2017-06-05,2017-06-06,2017-06-07,2017-06-08,2017-06-09,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
#### Create promo_2017_test dataframe with promotion data from August 16th 2017 to August 31st, 2017(test data)

In [21]:
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)

In [22]:
promo_2017_test.head()

Unnamed: 0_level_0,date,2017-08-16,2017-08-17,2017-08-18,2017-08-19,2017-08-20,2017-08-21,2017-08-22,2017-08-23,2017-08-24,2017-08-25,2017-08-26,2017-08-27,2017-08-28,2017-08-29,2017-08-30,2017-08-31
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103501,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [23]:
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

In [24]:
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)# Combine both train and test data of promotions

In [25]:
promo_2017.head()# May 31st - August 31st 

Unnamed: 0_level_0,date,2017-05-31,2017-06-01,2017-06-02,2017-06-03,2017-06-04,2017-06-05,2017-06-06,2017-06-07,2017-06-08,2017-06-09,...,2017-08-22,2017-08-23,2017-08-24,2017-08-25,2017-08-26,2017-08-27,2017-08-28,2017-08-29,2017-08-30,2017-08-31
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [26]:
promo_2017.shape

(156790, 93)

In [27]:
del promo_2017_test, promo_2017_train 

In [28]:
promo_2017.columns

DatetimeIndex(['2017-05-31', '2017-06-01', '2017-06-02', '2017-06-03',
               '2017-06-04', '2017-06-05', '2017-06-06', '2017-06-07',
               '2017-06-08', '2017-06-09', '2017-06-10', '2017-06-11',
               '2017-06-12', '2017-06-13', '2017-06-14', '2017-06-15',
               '2017-06-16', '2017-06-17', '2017-06-18', '2017-06-19',
               '2017-06-20', '2017-06-21', '2017-06-22', '2017-06-23',
               '2017-06-24', '2017-06-25', '2017-06-26', '2017-06-27',
               '2017-06-28', '2017-06-29', '2017-06-30', '2017-07-01',
               '2017-07-02', '2017-07-03', '2017-07-04', '2017-07-05',
               '2017-07-06', '2017-07-07', '2017-07-08', '2017-07-09',
               '2017-07-10', '2017-07-11', '2017-07-12', '2017-07-13',
               '2017-07-14', '2017-07-15', '2017-07-16', '2017-07-17',
               '2017-07-18', '2017-07-19', '2017-07-20', '2017-07-21',
               '2017-07-22', '2017-07-23', '2017-07-24', '2017-07-25',
      

In [29]:
df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)
df_2017.shape

(156790, 77)

In [30]:
df_2017.head()

Unnamed: 0_level_0,date,2017-05-31,2017-06-01,2017-06-02,2017-06-03,2017-06-04,2017-06-05,2017-06-06,2017-06-07,2017-06-08,2017-06-09,...,2017-08-06,2017-08-07,2017-08-08,2017-08-09,2017-08-10,2017-08-11,2017-08-12,2017-08-13,2017-08-14,2017-08-15
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.693147,1.386294,1.098612,1.94591,1.098612,1.098612,0.0,0.0,0.693147,0.693147,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,1.386294,1.098612,1.098612,0.693147,0.0,0.693147,1.609438,0.693147,0.693147,1.098612,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
1,103665,2.197225,0.0,1.791759,1.791759,1.098612,1.386294,1.791759,1.386294,0.0,1.098612,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
1,105574,1.386294,2.484907,1.791759,1.386294,1.386294,1.386294,2.079442,2.397895,1.94591,2.079442,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


In [31]:
items = items.reindex(df_2017.index.get_level_values(1))
items.head()

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1
105574,GROCERY I,1045,0


In [32]:
items.shape

(156790, 3)

In [33]:
def get_timespan(df, dt, minus, periods):
    return df[
        pd.date_range(dt - timedelta(days=minus), periods=periods)
    ]

In [34]:
def prepare_dataset(t2017, is_train=True):
    X = pd.DataFrame({  # Mean target for different retrospective timespans & total # promotions
        "mean_3_2017": get_timespan(df_2017, t2017, 3, 3).mean(axis=1).values,
        "mean_7_2017": get_timespan(df_2017, t2017, 7, 7).mean(axis=1).values,
        "mean_14_2017": get_timespan(df_2017, t2017, 14, 14).mean(axis=1).values,
        "promo_14_2017": get_timespan(promo_2017, t2017, 14, 14).sum(axis=1).values
    })
    for i in range(16):  # Promotions on future days
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)
    if is_train:
        y = df_2017[  # Target values for future days
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X

In [35]:
print("Preparing dataset...")
t2017 = date(2017, 6, 21)
X_l, y_l = [], []
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

Preparing dataset...


In [36]:
X_train.shape

(627160, 20)

In [37]:
X_train.head()

Unnamed: 0,mean_3_2017,mean_7_2017,mean_14_2017,promo_14_2017,promo_0,promo_1,promo_2,promo_3,promo_4,promo_5,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,0.0,0.099021,0.099021,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.24589,0.98796,0.835944,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.462098,0.773092,0.840554,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.059351,1.243926,1.14142,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.416165,1.505723,1.645124,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [38]:
y_train.shape

(627160, 16)

In [39]:
y_train

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.69314718, 0.        , 0.        , ..., 0.        , 0.        ,
        1.38629436],
       [0.69314718, 1.38629436, 1.38629436, ..., 0.        , 0.69314718,
        1.94591015],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.69314718, 5.30330491, 2.19722458, ..., 2.48490665, 1.94591015,
        1.38629436],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [40]:
X_val.shape

(156790, 20)

In [41]:
X_val.head()

Unnamed: 0,mean_3_2017,mean_7_2017,mean_14_2017,promo_14_2017,promo_0,promo_1,promo_2,promo_3,promo_4,promo_5,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,0.0,0.354987,0.177493,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.0,0.610952,0.709973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1.059351,0.850092,0.840554,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.229626,0.881969,0.853895,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.866141,1.892588,1.82682,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [42]:
y_val.shape

(156790, 16)

In [43]:
y_val

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.69314718],
       [0.        , 0.        , 0.69314718, ..., 0.        , 1.09861229,
        0.        ],
       [0.69314718, 1.09861229, 1.09861229, ..., 1.38629436, 0.        ,
        1.38629436],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.69314718],
       [1.94591015, 1.38629436, 1.09861229, ..., 2.39789527, 2.39789527,
        1.60943791],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [44]:
X_test.shape

(156790, 20)

In [45]:
X_test.head()

Unnamed: 0,mean_3_2017,mean_7_2017,mean_14_2017,promo_14_2017,promo_0,promo_1,promo_2,promo_3,promo_4,promo_5,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,0.0,0.099021,0.334438,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.0,0.156945,0.206455,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.231049,0.495105,0.573577,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.462098,0.98099,1.031388,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.998577,1.560437,1.629185,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
## Random Forest Regressor

In [62]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=4, min_samples_split=2,
                      min_weight_fraction_leaf=0.0,n_estimators=50,n_jobs=-1,
                      oob_score=True, random_state=7,verbose=0,
                      warm_start=True)

In [63]:
model.fit(X_train, y_train)

RandomForestRegressor(min_samples_leaf=4, n_estimators=50, n_jobs=-1,
                      oob_score=True, random_state=7, warm_start=True)

In [64]:
y_pred = model.predict(X_test)

In [65]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_val, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_val, y_pred)))

Mean Absolute Error: 0.5006102166396864
Mean Squared Error: 0.44332774868915104
Root Mean Squared Error: 0.6658286181061542


In [None]:
## GridSearch Random Forest Regressor

In [66]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [67]:
rfc=RandomForestRegressor(n_jobs=1,random_state=7)

In [68]:
grid = {'n_estimators': [30, 50]}

In [69]:
CV_rfc = GridSearchCV(estimator=rfc,param_grid=grid, cv= 5)

In [70]:
CV_rfc=CV_rfc.fit(X_train, y_train)

KeyboardInterrupt: 

In [60]:
y_pred = CV_rfc.predict(X_test)

In [None]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_val, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_val, y_pred)))