# SVD Components

This notebook fits an ARIMA(1,1,1) model with between 1 and 4 SVD components as exogenous regressors, in order to find the optimal number of SVD components for each store based on AIC.

In [1]:
import module.util_functions as utf
import module.constants as const
from module.sarima_model import SARIMAX_Model
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import numpy as np
import pandas as pd
import warnings
# Silence every warning (all categories, all modules) for the rest of the session.
# NOTE(review): this presumably also hides any convergence warnings raised by the
# model fits later in the notebook — consider narrowing the filter; confirm intent.
warnings.filterwarnings('ignore')

In [2]:
# Read the weekly sales records for every store into one DataFrame,
# then preview the first five rows.
store_sales = pd.read_csv(filepath_or_buffer='data/store_sales.csv')
store_sales.head(5)

Unnamed: 0,Store,Date,Type,Week,Weekly Sales (Million),Holiday,Super Bowl,Labor Day,Thanksgiving,Before Christmas,...,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Log of Weekly Sales (Million),Scaled_Week
0,1,2010-02-05,A,5,1.643691,,0,0,0,0,...,2.572,,,,,,211.096358,8.106,0.496944,0.096154
1,1,2010-02-12,A,6,1.641957,Super Bowl,1,0,0,0,...,2.548,,,,,,211.24217,8.106,0.495889,0.115385
2,1,2010-02-19,A,7,1.611968,,0,0,0,0,...,2.514,,,,,,211.289143,8.106,0.477456,0.134615
3,1,2010-02-26,A,8,1.409728,,0,0,0,0,...,2.561,,,,,,211.319643,8.106,0.343396,0.153846
4,1,2010-03-05,A,9,1.554807,,0,0,0,0,...,2.625,,,,,,211.350143,8.106,0.441351,0.173077


In [3]:
from statsmodels.tsa.arima.model import ARIMA

# Columns fed to TruncatedSVD (exogenous regressors) and the series being modelled.
FEATURE_COLS = ['Scaled_Week', 'Super Bowl', 'Labor Day', 'Thanksgiving',
                'Before Christmas', 'Christmas']
TARGET_COL = 'Log of Weekly Sales (Million)'

SVD_CANDIDATES = range(1, 5)   # candidate numbers of SVD components: 1..4
ARIMA_ORDER = (1, 1, 1)        # (p, d, q) order used for every fit
SVD_SEED = 42                  # fixed seed so the SVD projection is reproducible

# Per-store results; one entry is appended to each list per store.
store_list = []       # store identifiers
n_components = []     # number of SVD components with the lowest AIC
mse_list = []         # test-set MSE of that lowest-AIC model
aic_list = []         # the lowest AIC itself

# iterate through every store present in the sales data
for store_num in tqdm(store_sales['Store'].unique()):
    # get sales data for the store, then hold out the last TEST_SIZE rows as test data
    data = utf.get_store_sales(store_sales, store_num)
    train = data[:-const.TEST_SIZE]
    test = data.tail(const.TEST_SIZE)
    X_train, Y_train = train[FEATURE_COLS], train[TARGET_COL]
    X_test, Y_test = test[FEATURE_COLS], test[TARGET_COL]

    # Forecast exactly as many steps as there are test rows. This was previously
    # hard-coded to 52, which only lines up with Y_test and the test exog rows
    # when const.TEST_SIZE happens to be 52.
    horizon = test.shape[0]
    first_step = train.shape[0]

    test_mse = []    # test MSE for each candidate
    n_list = []      # candidate number of components
    aic = []         # in-sample AIC for each candidate

    for n in SVD_CANDIDATES:
        # The SVD is fitted on the training features only and then applied to
        # the test features, so no test information leaks into the projection.
        svd = TruncatedSVD(n_components=n, random_state=SVD_SEED)
        model = ARIMA(Y_train, exog=svd.fit_transform(X_train),
                      order=ARIMA_ORDER).fit()
        forecast = model.get_prediction(start=first_step,
                                        end=first_step + horizon - 1,
                                        exog=svd.transform(X_test))
        predictions = forecast.predicted_mean
        test_mse.append(mean_squared_error(Y_test, predictions))
        n_list.append(n)
        aic.append(model.aic)

    # select the number of components with the lowest AIC
    # (np.argmin returns the first index on ties, i.e. the smaller model)
    min_index = np.argmin(aic)
    aic_list.append(aic[min_index])           # lowest AIC score
    mse_list.append(test_mse[min_index])      # corresponding test MSE
    n_components.append(n_list[min_index])    # corresponding number of components
    store_list.append(store_num)

100%|██████████| 45/45 [00:36<00:00,  1.24it/s]


In [4]:
# Assemble the per-store selections into one table:
# store id, chosen number of SVD components, its AIC and its test MSE.
svd_results = pd.DataFrame(
    dict(
        Store=store_list,
        SVD=n_components,
        AIC=aic_list,
        MSE=mse_list,
    )
)
svd_results

Unnamed: 0,Store,SVD,AIC,MSE
0,1,4,-200.395425,0.022945
1,2,4,-200.661492,0.036443
2,3,4,-183.875306,0.032015
3,4,4,-209.410248,0.018302
4,5,4,-175.648151,0.044697
5,6,4,-172.702454,0.061619
6,7,1,-129.257477,0.068464
7,8,4,-180.54693,0.020196
8,9,4,-184.173159,0.024153
9,10,4,-179.043505,0.045313


In [5]:
# Persist the selection table as CSV, leaving out the DataFrame index column.
svd_results.to_csv(path_or_buf='results/svd_results.csv', index=False)