# Forecasting the data using an ARIMA model

- [About the data](#data)

In [1]:
import pandas as pd

## About the data

In [2]:
# Load the processed CA_1 sales data; the first CSV column is used as the
# index and parsed as dates.
df = pd.read_csv(
    '../data/processed/CA_1_sales_data.csv',
    index_col=0,
    parse_dates=True,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1951 entries, 2011-01-29 to 2016-06-01
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   wm_yr_wk           1951 non-null   int64  
 1   wday               1951 non-null   int64  
 2   month              1951 non-null   int64  
 3   year               1951 non-null   int64  
 4   event_name_1       1951 non-null   object 
 5   event_type_1       1951 non-null   object 
 6   event_name_2       1951 non-null   object 
 7   event_type_2       1951 non-null   object 
 8   cpi                1951 non-null   float64
 9   unemployment_rate  1951 non-null   float64
 10  gas_price          1951 non-null   float64
 11  snap_ca            1951 non-null   int64  
 12  ca_walmart         1951 non-null   float64
 13  ca_hobbies         1951 non-null   float64
 14  ca_household       1951 non-null   float64
 15  ca_foods           1951 non-null   float64
 16  store_

In [3]:
# Count the missing values in each column.
df.isna().sum()

wm_yr_wk              0
wday                  0
month                 0
year                  0
event_name_1          0
event_type_1          0
event_name_2          0
event_type_2          0
cpi                   0
unemployment_rate     0
gas_price             0
snap_ca               0
ca_walmart            0
ca_hobbies            0
ca_household          0
ca_foods              0
store_sales          10
median_price          0
min_price             0
max_price             0
dtype: int64

In [4]:
# Display every row that contains at least one missing value.
df[df.isna().any(axis='columns')]

Unnamed: 0,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,cpi,unemployment_rate,gas_price,snap_ca,ca_walmart,ca_hobbies,ca_household,ca_foods,store_sales,median_price,min_price,max_price
2016-05-23,11617,3,5,2016,,,,,239.557,4.8,2.403,0,23.76,26.79,46.8,52.08,,3.48,0.23,29.97
2016-05-24,11617,4,5,2016,,,,,239.557,4.8,2.403,0,24.48,30.21,43.55,49.84,,3.48,0.23,29.97
2016-05-25,11617,5,5,2016,,,,,239.557,4.8,2.403,0,23.4,37.62,34.45,54.88,,3.48,0.23,29.97
2016-05-26,11617,6,5,2016,,,,,239.557,4.8,2.403,0,23.76,27.36,48.75,53.76,,3.48,0.23,29.97
2016-05-27,11617,7,5,2016,,,,,239.557,4.8,2.403,0,25.92,26.79,43.55,47.04,,3.48,0.23,29.97
2016-05-28,11618,1,5,2016,,,,,239.557,4.8,2.403,0,32.4,33.63,33.15,45.92,,3.48,0.23,29.97
2016-05-29,11618,2,5,2016,,,,,239.557,4.8,2.403,0,32.76,57.0,25.35,48.72,,3.48,0.23,29.97
2016-05-30,11618,3,5,2016,MemorialDay,National,,,239.557,4.8,2.44,0,36.0,43.32,36.4,54.32,,3.48,0.23,29.97
2016-05-31,11618,4,5,2016,,,,,239.557,4.8,2.44,0,26.28,47.88,44.85,48.16,,3.48,0.23,29.97
2016-06-01,11618,5,6,2016,,,,,240.222,4.9,2.44,1,24.85,37.38,39.52,53.0,,3.48,0.23,29.97


In [5]:
# Inspect the index that read_csv built from the first CSV column (parse_dates=True).
df.index

DatetimeIndex(['2011-01-29', '2011-01-30', '2011-01-31', '2011-02-01',
               '2011-02-02', '2011-02-03', '2011-02-04', '2011-02-05',
               '2011-02-06', '2011-02-07',
               ...
               '2016-05-23', '2016-05-24', '2016-05-25', '2016-05-26',
               '2016-05-27', '2016-05-28', '2016-05-29', '2016-05-30',
               '2016-05-31', '2016-06-01'],
              dtype='datetime64[ns]', length=1951, freq=None)

## Data Processing

In [6]:
# Drop every row that has a missing value. The null count shown earlier reports
# missing values only in store_sales. Note that this rebinds `df`, so the
# unfiltered frame is no longer available after this cell.
df = df.dropna()

In [7]:
def get_day_value(df):
    """Return a copy of ``df`` with an extra ``day`` column.

    The new column holds the day of the month read from the frame's
    datetime index. The input frame itself is not modified.
    """
    day_of_month = df.index.day
    return df.assign(day=day_of_month)

# Add the day-of-month column and preview the last rows of the result.
df_processed = get_day_value(df)
df_processed.tail()

Unnamed: 0,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,cpi,unemployment_rate,...,snap_ca,ca_walmart,ca_hobbies,ca_household,ca_foods,store_sales,median_price,min_price,max_price,day
2016-05-18,11616,5,5,2016,,,,,239.557,4.8,...,0,23.04,34.2,50.05,52.64,3995.0,3.48,0.23,29.97,18
2016-05-19,11616,6,5,2016,,,,,239.557,4.8,...,0,23.4,26.79,50.7,49.84,4136.0,3.48,0.23,29.97,19
2016-05-20,11616,7,5,2016,,,,,239.557,4.8,...,0,23.76,40.47,37.7,51.52,4433.0,3.48,0.23,29.97,20
2016-05-21,11617,1,5,2016,,,,,239.557,4.8,...,0,30.24,44.46,39.0,48.72,5764.0,3.48,0.23,29.97,21
2016-05-22,11617,2,5,2016,,,,,239.557,4.8,...,0,31.68,29.07,42.9,53.76,6289.0,3.48,0.23,29.97,22


In [8]:
event_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']

# One-hot encode the event columns of df_processed itself (instead of pulling
# them from the separate `df` frame). get_dummies(columns=...) drops the
# original columns, appends the dummies in the order of `event_cols`, and
# prefixes each dummy with its source column name. All column names are then
# lower-cased.
df_processed = (pd.get_dummies(df_processed, columns=event_cols)
                  .rename(columns=str.lower))

df_processed.tail()

Unnamed: 0,wm_yr_wk,wday,month,year,cpi,unemployment_rate,gas_price,snap_ca,ca_walmart,ca_hobbies,...,event_type_1_religious,event_type_1_sporting,event_name_2_cinco de mayo,event_name_2_easter,event_name_2_father's day,event_name_2_none,event_name_2_orthodoxeaster,event_type_2_cultural,event_type_2_none,event_type_2_religious
2016-05-18,11616,5,5,2016,239.557,4.8,2.345,0,23.04,34.2,...,0,0,0,0,0,1,0,0,1,0
2016-05-19,11616,6,5,2016,239.557,4.8,2.345,0,23.4,26.79,...,0,0,0,0,0,1,0,0,1,0
2016-05-20,11616,7,5,2016,239.557,4.8,2.345,0,23.76,40.47,...,0,0,0,0,0,1,0,0,1,0
2016-05-21,11617,1,5,2016,239.557,4.8,2.345,0,30.24,44.46,...,0,0,0,0,0,1,0,0,1,0
2016-05-22,11617,2,5,2016,239.557,4.8,2.345,0,31.68,29.07,...,0,0,0,0,0,1,0,0,1,0


In [9]:
# Print the first row of df_processed with all columns displayed
pd.options.display.max_columns = None
print(df_processed.iloc[:1])

            wm_yr_wk  wday  month  year      cpi  unemployment_rate  \
2011-01-29     11101     1      1  2011  221.187                9.1   

            gas_price  snap_ca  ca_walmart  ca_hobbies  ca_household  \
2011-01-29      3.163        0        39.5       20.14          36.5   

            ca_foods  store_sales  median_price  min_price  max_price  day  \
2011-01-29      63.0       4337.0          3.18        0.2       20.0   29   

            event_name_1_chanukah end  event_name_1_christmas  \
2011-01-29                          0                       0   

            event_name_1_cinco de mayo  event_name_1_columbusday  \
2011-01-29                           0                         0   

            event_name_1_easter  event_name_1_eid al-fitr  \
2011-01-29                    0                         0   

            event_name_1_eidaladha  event_name_1_father's day  \
2011-01-29                       0                          0   

            event_name_1_hallowee

In [10]:
# Column groups used to select subsets of the data.
# Note: the event_* entries are the pre-encoding names; df_processed only
# holds their one-hot encoded counterparts.
cat_cols = [
    'snap_ca',
    'event_name_1',
    'event_type_1',
    'event_name_2',
    'event_type_2',
]
timeseries_cols = [
    'cpi',
    'unemployment_rate',
    'gas_price',
    'ca_walmart',
    'ca_hobbies',
    'ca_household',
    'ca_foods',
]
time_cols = [
    'wm_yr_wk',
    'wday',
    'day',
    'month',
    'year',
]
target_col = [
    'store_sales',
    'median_price',
    'min_price',
    'max_price',
]

In [11]:
from statsmodels.tsa.stattools import grangercausalitytests
import numpy as np

maxlag = 12
test = 'ssr_chi2test'


def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False, maxlag=maxlag):
    """Check Granger causality for every ordered pair of time series.

    Rows of the returned table are the response variables (suffix ``_y``),
    columns are the predictors (suffix ``_x``). Each cell is the smallest
    p-value, rounded to 4 decimals, found over lags ``1..maxlag``. A p-value
    below the significance level (0.05) means the null hypothesis "the past
    values of X do not help explain Y" can be rejected.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the time series variables.
    variables : iterable of str
        Names of the columns of ``data`` to compare.
    test : str
        Key of the test whose p-value is read from the
        ``grangercausalitytests`` result.
    verbose : bool
        When true, print the per-lag p-values of every pair.
    maxlag : int
        Highest lag to test. Defaults to the module-level ``maxlag`` value
        at the time this function is defined.

    Returns
    -------
    pandas.DataFrame
        Square table of minimum p-values.
    """
    variables = list(variables)
    p_matrix = pd.DataFrame(np.zeros((len(variables), len(variables))),
                            columns=variables, index=variables)
    for predictor in variables:
        for response in variables:
            # Column order matters: the response comes first, the predictor second.
            # On the diagonal the same column is selected twice, so those
            # cells compare a series with itself.
            test_result = grangercausalitytests(data[[response, predictor]],
                                                maxlag=maxlag, verbose=False)
            # The result is keyed by lag (1..maxlag); element [1] of the test
            # tuple is the p-value.
            p_values = [round(test_result[lag][0][test][1], 4)
                        for lag in range(1, maxlag + 1)]
            if verbose:
                print(f'Y = {response}, X = {predictor}, P Values = {p_values}')
            p_matrix.loc[response, predictor] = np.min(p_values)
    p_matrix.columns = [var + '_x' for var in variables]
    p_matrix.index = [var + '_y' for var in variables]
    return p_matrix

# Keep only the time-series and target columns, then compute the pairwise
# Granger-causality p-value matrix for them.
df_used = df_processed[timeseries_cols + target_col]
grangers_causation_matrix(df_used, variables=df_used.columns)



Unnamed: 0,cpi_x,unemployment_rate_x,gas_price_x,ca_walmart_x,ca_hobbies_x,ca_household_x,ca_foods_x,store_sales_x,median_price_x,min_price_x,max_price_x
cpi_y,1.0,0.2332,0.0,0.4841,0.0134,0.3423,0.2106,0.3963,0.2985,0.0543,0.3846
unemployment_rate_y,0.0422,1.0,0.6723,0.3568,0.3788,0.0329,0.7919,0.4673,0.2356,0.0016,0.2194
gas_price_y,0.0,0.0003,1.0,0.0,0.0,0.0008,0.0006,0.0331,0.0263,0.0152,0.0
ca_walmart_y,0.0,0.0,0.0,1.0,0.0,0.0002,0.0,0.0,0.0028,0.0402,0.0018
ca_hobbies_y,0.0048,0.004,0.0,0.0,1.0,0.0026,0.0008,0.0,0.004,0.1526,0.0009
ca_household_y,0.17,0.0487,0.0608,0.0,0.0009,1.0,0.001,0.0,0.0,0.1977,0.0051
ca_foods_y,0.0005,0.0005,0.0004,0.0,0.0002,0.0005,1.0,0.0,0.0994,0.0841,0.0008
store_sales_y,0.0,0.0,0.0,0.0,0.0002,0.0,0.0,1.0,0.0,0.0052,0.0
median_price_y,0.8121,0.5962,0.2461,0.6079,0.0745,0.063,0.116,0.3905,1.0,0.2079,0.7623
min_price_y,0.715,0.4057,0.1166,0.3147,0.3441,0.0648,0.9341,0.778,0.6225,1.0,0.2808


In [12]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen

def cointegration_test(df, alpha=0.05):
    """Perform Johansen's cointegration test and print a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are the series to test.
    alpha : float
        Significance level. Only 0.10, 0.05 and 0.01 are supported, because
        the critical-value table has one column for each of the 90%, 95%
        and 99% levels.

    Raises
    ------
    ValueError
        If ``alpha`` is not one of the supported significance levels.
    """
    # Map the significance level to (critical-value column, confidence %).
    # A numeric lookup is used because str(1 - 0.10) is '0.9', not '0.90'.
    levels = {0.10: (0, 90), 0.05: (1, 95), 0.01: (2, 99)}
    key = round(alpha, 10)
    if key not in levels:
        raise ValueError(f"alpha must be one of 0.10, 0.05 or 0.01, got {alpha!r}")
    cvt_column, confidence = levels[key]

    # Called with det_order=-1 and k_ar_diff=5.
    out = coint_johansen(df, -1, 5)
    traces = out.lr1
    cvts = out.cvt[:, cvt_column]

    def adjust(val, length=6):
        return str(val).ljust(length)

    # Summary
    # NOTE(review): rows are labelled with column names purely by position in
    # the zip; confirm that this matches how the statistics should be read.
    print(f'Name   ::  Test Stat > C({confidence}%)    =>   Signif  \n', '--'*20)
    for col, trace, cvt in zip(df.columns, traces, cvts):
        print(adjust(col), ':: ', adjust(round(trace, 2), 9), ">", adjust(cvt, 8), ' =>  ', trace > cvt)

# Run the cointegration test on df_used with the default alpha (0.05).
cointegration_test(df_used)

Name   ::  Test Stat > C(95%)    =>   Signif  
 ----------------------------------------
cpi    ::  695.29    > 263.2603  =>   True
unemployment_rate ::  512.92    > 219.4051  =>   True
gas_price ::  382.28    > 179.5199  =>   True
ca_walmart ::  256.38    > 143.6691  =>   True
ca_hobbies ::  174.26    > 111.7797  =>   True
ca_household ::  103.38    > 83.9383   =>   True
ca_foods ::  59.74     > 60.0627   =>   False
store_sales ::  28.45     > 40.1749   =>   False
median_price ::  11.86     > 24.2761   =>   False
min_price ::  4.59      > 12.3212   =>   False
max_price ::  0.03      > 4.1296    =>   False


## Stationary Analysis

In [13]:
from statsmodels.tsa.stattools import adfuller

def adfuller_test(series, sig=0.05, name=''):
    """Run the ADF test on ``series`` and print the stationarity verdict.

    Parameters
    ----------
    series : array-like
        Series passed to ``adfuller`` (lag selection uses ``autolag='AIC'``).
    sig : float
        Significance level; a p-value at or below it is reported as stationary.
    name : str
        Label printed in front of the result.
    """
    res = adfuller(series, autolag='AIC')
    # Decide on the unrounded p-value; rounding first could push a value
    # slightly above `sig` (e.g. 0.0504) down onto the threshold.
    p_value = res[1]
    shown = round(p_value, 3)

    if p_value <= sig:
        print(f" {name} : P-Value = {shown} => Stationary. ")
    else:
        print(f" {name} : P-Value = {shown} => Non-stationary.")

# Run the ADF test on each time-series column. DataFrame.items() replaces
# iteritems(), which was removed in pandas 2.0.
for name, column in df_processed[timeseries_cols].items():
    adfuller_test(column, name=name)

 cpi : P-Value = 0.092 => Non-stationary.
 unemployment_rate : P-Value = 0.91 => Non-stationary.
 gas_price : P-Value = 0.638 => Non-stationary.
 ca_walmart : P-Value = 0.002 => Stationary. 
 ca_hobbies : P-Value = 0.0 => Stationary. 
 ca_household : P-Value = 0.0 => Stationary. 
 ca_foods : P-Value = 0.0 => Stationary. 


In [14]:
non_stationary_cols = ['cpi', 'unemployment_rate', 'gas_price']

# First-difference the listed columns and re-run the ADF test on the result.
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
data_differenced = df_processed[non_stationary_cols].diff().dropna()
for name, column in data_differenced.items():
    adfuller_test(column, name=name)


 cpi : P-Value = 0.0 => Stationary. 
 unemployment_rate : P-Value = 0.0 => Stationary. 
 gas_price : P-Value = 0.0 => Stationary. 


In [15]:
# Overwrite the listed columns with their first differences.
# NOTE(review): the right-hand side has one row fewer than df_processed (diff
# leaves the first row NaN and dropna removes it); presumably the assignment
# aligns on the index and leaves NaN in that first row - confirm.
# NOTE(review): this mutates df_processed, so re-running the cell differences
# the columns a second time.
df_processed[non_stationary_cols] = df_processed[non_stationary_cols].diff().dropna()

In [16]:
# Drop any rows of df_processed that still contain missing values.
df_processed = df_processed.dropna()