In [3]:
# Numerical / tabular stack.
import pandas as pd
import numpy as np

# Plotting; figures are rendered inline in the notebook.
import matplotlib.pylab as plt
%matplotlib inline

# Default figure size and font size for every plot in this notebook.
plt.rcParams['figure.figsize'] = (8, 5)
plt.rcParams["font.size"] = 14

# Project-local helpers; reload so edits to utils.py are picked up without restarting the kernel.
import utils
import importlib
importlib.reload(utils)

import os
import sys
import subprocess

In [4]:
# Load the "debug3" data set through the project helper.
# NOTE(review): the meaning of `info` and `index` is defined in utils.ReadData, not in this notebook.
train, test, info, index = utils.ReadData("debug3")

In [5]:
# Plain NumPy arrays behind the test and train tables (one series per row is assumed below, where rows are indexed by position).
val = test.values
train_for_val = train.values

In [6]:
# Replace NaNs with 0, then apply log(1 + x); this is the array ARIMA_for_row reads when log=True.
log_train_for_val = np.log1p(np.nan_to_num(train_for_val))

In [7]:
# Recover the row positions of the "weekly seasonal" and "many zeros" series
# from the identifiers that were saved to disk.
def _rows_matching(saved_ids_path):
    """Return the positions in the flattened `index` whose value appears in the saved .npy file.

    The result comes from np.argwhere on a 1-D mask, so it has shape (k, 1).
    """
    saved_ids = np.load(saved_ids_path)
    flat_index = index.values.reshape(-1)
    return np.argwhere(np.isin(flat_index, saved_ids))


weekly_indices = _rows_matching("../data/google_wtts/weekly_index_for_score.npy")
zeros_indices = _rows_matching("../data/google_wtts/zeros_index_for_score.npy")

# Model

In [8]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima_model import ARIMA

In [9]:
import warnings

# Helper that does nothing except emit a DeprecationWarning.
def fxn():
    warnings.warn("deprecated", DeprecationWarning)

# Silence the warning raised by fxn() for the duration of the `with` block.
# NOTE(review): catch_warnings() restores the previous filters on exit, so this
# cell does not suppress warnings raised by later cells (e.g. during model fitting).
# If that was the intent, a filter has to be installed outside a context manager.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

In [10]:
def ARIMA_for_row(i, lags_train=100,
                  forecast=True, log=True, d=1, return_params=False,
                  steps=60):
    """Fit an ARIMA(p, d, q) model to one training series and predict past its end.

    The AR order ``p`` and MA order ``q`` are first chosen from the PACF/ACF of
    the whole series: the number of lags (out of 30, plus lag 0) whose lower
    95% confidence bound is positive, capped at 27 and 10 respectively. The
    model itself is fitted only on the last ``lags_train`` observations. If the
    fit raises, the fixed orders (9, 2), (7, 2) and (2, 2) are tried in turn.

    Parameters
    ----------
    i : int
        Row position of the series in ``log_train_for_val`` / ``train_for_val``.
    lags_train : int
        Number of trailing observations used for fitting.
    forecast : bool
        If True, return only the out-of-sample forecast. If False, return the
        in-sample prediction followed by the out-of-sample part, together with
        the offset needed to align it with the training series.
    log : bool
        If True, use the log1p-transformed series; otherwise the raw series.
    d : int
        Differencing order.
    return_params : bool
        If True, additionally return the order actually used as ``[p, d, q]``
        (``[0, 0, 0]`` when every fit attempt failed).
    steps : int
        Number of future steps to predict (60 by default, as before).

    Returns
    -------
    forecast=True  : ``prediction`` or ``(prediction, [p, d, q])``
    forecast=False : ``(prediction, sync_index)`` or
                     ``(prediction, sync_index, [p, d, q])``

    ``sync_index`` is ``-lags_train + d``: slicing the training series with
    ``row[sync_index:]`` gives the part the prediction starts on. When every
    attempt fails, ``prediction`` is an array filled with -1 and the tuple has
    the same arity as on success.
    """
    print("called")
    if log:
        row_train = log_train_for_val[i]
    else:
        # `train` is used as a pandas object elsewhere (`train.values`), so
        # `train[i]` would select by column label. Take row i of the raw array
        # instead, zero-filling NaNs the same way the log path does.
        row_train = np.nan_to_num(train_for_val[i])

    lags = 30

    # Data-driven starting orders: count lags whose lower confidence bound is > 0.
    p = min((pacf(row_train, nlags=lags, alpha=0.05)[1][:, 0] > 0).sum(),
            27)
    q = min((acf(row_train, nlags=lags, alpha=0.05)[1][:, 0] > 0).sum(),
            10)

    params = {"maxiter": 70, "tol": 5e-4}
    fit_window = row_train[-lags_train:]
    sync_index = -lags_train + d

    # Try the data-driven order first, then progressively smaller fixed orders.
    # `except Exception` (instead of a bare `except`) keeps KeyboardInterrupt and
    # SystemExit working while still treating any fitting error as "try the next order".
    model_fit = None
    for p, q in [(p, q), (9, 2), (7, 2), (2, 2)]:
        try:
            model_fit = ARIMA(fit_window, order=(p, d, q)).fit(**params)
            break
        except Exception:
            continue

    if model_fit is None:
        print('ARIMA failed')
        failed_order = [0, 0, 0]
        if forecast:
            failed = np.full((steps,), -1)
            return (failed, failed_order) if return_params else failed
        failed = np.full((lags_train + steps,), -1)
        if return_params:
            return failed, sync_index, failed_order
        return failed, sync_index

    if forecast:
        # forecast() returns several arrays in the legacy statsmodels API; [0] is the point forecast.
        prediction = model_fit.forecast(steps=steps)[0]
        print(i, 'prediction ready')
        if return_params:
            return prediction, [p, d, q]
        return prediction

    if d == 1:
        # With differencing, request predictions on the original scale rather than as differences.
        prediction = model_fit.predict(d, lags_train + steps, typ='levels')
    else:
        prediction = model_fit.predict(d, lags_train + steps)

    if return_params:
        return prediction, sync_index, [p, d, q]
    return prediction, sync_index

        

In [11]:
def ARIMA_plot(row_train, idx, row_val, preds, title):
    """Plot the tail of a training series, its validation window and a prediction.

    row_train : training series; only ``row_train[idx:]`` is drawn.
    idx       : start offset into ``row_train`` (a negative value keeps the tail).
    row_val   : validation values, drawn right after the training tail.
    preds     : predicted values, drawn from x = 0.
    title     : figure title.
    """
    train_tail = row_train[idx:]
    split_at = train_tail.shape[0]

    # Draw order matters for colours and legend: combined series, split marker,
    # training tail on top of it, then the prediction.
    combined = np.concatenate((train_tail, row_val))
    plt.plot(combined, label='validation')
    plt.vlines(split_at, row_val.min(), row_val.max(), linestyles='--')
    plt.plot(train_tail, label='train')
    plt.plot(preds, label='prediction')

    plt.xlabel('days')
    plt.ylabel('log')
    plt.legend()
    plt.title(title)
    plt.show()

# Single row

In [None]:
# Row position of the series inspected in the following cells, and its log1p-transformed history.
i = 43
row_train = log_train_for_val[i]

In [None]:
# d = 0. stationary
preds_0, idx, pdq = ARIMA_for_row(i, forecast=False, d=0, return_params=True)
print(pdq)

In [None]:
# Compare the d=0 prediction with the log1p-transformed validation values of row i.
ARIMA_plot(
    row_train,
    idx,
    np.log1p(np.array(test.iloc[i])),
    preds_0,
    'Index: %i \nARIMA, p=%i, d=%i, q=%i' % (i, *pdq),
)

In [None]:
# d = 1
preds_1, idx, pdq = ARIMA_for_row(i, forecast=False, d=1, return_params=True)
print(pdq)

In [None]:
# Compare the d=1 prediction with the log1p-transformed validation values of row i.
ARIMA_plot(
    row_train,
    idx,
    np.log1p(np.array(test.iloc[i])),
    preds_1,
    'Index: %i \nARIMA, p=%i, d=%i, q=%i' % (i, *pdq),
)

# Weekly seasonal

In [None]:
# Fit and plot both differencing orders for the first six weekly-seasonal series.
for i in weekly_indices[:6].ravel():
    row_train = log_train_for_val[i]
    row_val = np.log1p(np.array(test.iloc[i]))

    # d = 0. stationary
    preds_0, idx, pdq = ARIMA_for_row(i, forecast=False, d=0, return_params=True)
    # ARIMA_for_row fills the prediction with -1 when every fit attempt failed.
    # Check this fit's own result (the original tested preds_1, i.e. a stale
    # value from the previous iteration or an earlier cell).
    if preds_0[0] != -1:
        ARIMA_plot(row_train, idx, row_val,
                   preds_0, 'Index: %i \nARIMA for weekly seasonal data, p=%i, d=%i, q=%i' % (i, pdq[0], pdq[1], pdq[2]))

    # d = 1
    preds_1, idx, pdq = ARIMA_for_row(i, forecast=False, d=1, return_params=True)
    if preds_1[0] != -1:
        ARIMA_plot(row_train, idx, row_val,
                   preds_1, 'Index: %i \nARIMA for weekly seasonal data, p=%i, d=%i, q=%i' % (i, pdq[0], pdq[1], pdq[2]))

# Parallel

In [27]:
# Number of "many zeros" series found in the loaded data (np.argwhere output, hence the trailing axis of length 1).
zeros_indices.shape

(200, 1)

In [12]:
from joblib import Parallel, delayed

In [28]:
%%time
# Run ARIMA_for_row with d=0 for row positions 0..1999 on 10 joblib workers;
# with the default forecast=True each call returns one forecast array.
# NOTE(review): the saved output of this cell lists only 20 completed rows with
# ids such as 178, 182, 187, which does not match range(2000) — the output
# appears to come from an earlier version of the cell; re-run to confirm.
predictions_0 = Parallel(n_jobs=10, max_nbytes=None)(
    delayed(ARIMA_for_row)(k, d=0) 
    for k in range(2000)
)
print('end')
# Stack the per-row forecasts into a single 2-D array.
predictions_0 = np.array(predictions_0)

called
called
called
called
called
called
called
called
called
called
178 prediction ready
called
182 prediction ready
called
187 prediction ready
called
31 prediction ready
called
34 prediction ready
called
177 prediction ready
called
132 prediction ready
called
188 prediction ready
called
197 prediction ready
called
198 prediction ready
called
194 prediction ready
195 prediction ready
192 prediction ready
191 prediction ready
196 prediction ready
199 prediction ready
200 prediction ready
190 prediction ready
143 prediction ready
33 prediction ready
end
CPU times: user 160 ms, sys: 76 ms, total: 236 ms
Wall time: 2min 31s


In [29]:
# Convert the d=0 forecasts with the project helper (presumably back from the log1p scale — see utils).
# NOTE(review): this overwrites predictions_0, so re-running the cell applies the helper twice.
predictions_0 = utils.from_log_and_correction(predictions_0) 

In [None]:
# NOTE(review): presumably a runtime estimate for the full data set (the SaveModel call for this model passes "4833 hours"); confirm against utils.total_time.
utils.total_time(40, 10) * 10

In [31]:
# Score the d=0 forecasts against the full test table.
# NOTE(review): assumes predictions_0 has one row per row of test; verify that the row count used in the Parallel cell matches.
x = utils.SMAPE_score(test.values, predictions_0)

SMAPE:  96.68639


In [None]:
# Persist the d=0 forecasts under the name "arima_d_0" together with the test table,
# two free-text descriptions and the flattened index (argument meaning is defined by utils.SaveModel).
utils.SaveModel("arima_d_0",
                predictions_0, test,
                "4833 hours", "no transformation",
                index.values.reshape(-1))
# SMAPE:  56.118244

In [36]:
%%time
# Run ARIMA_for_row with d=1 for the first 20 "many zeros" rows on 10 joblib workers;
# with the default forecast=True each call returns one forecast array.
predictions_1 = Parallel(n_jobs=10, max_nbytes=None)(
    delayed(ARIMA_for_row)(k, d=1) 
    for k in zeros_indices[:20].ravel()
)
print('end')
# Stack the per-row forecasts into a single 2-D array.
predictions_1 = np.array(predictions_1)

called
called
called
called
called
called
called
called
called
called
177 prediction ready
called
132 prediction ready
called
178 prediction ready
called
34 prediction ready
called
143 prediction ready
called
33 prediction ready
called
31 prediction ready
called
197 prediction ready
called
187 prediction ready
called
182 prediction ready
called
195 prediction ready
188 prediction ready
191 prediction ready
192 prediction ready
196 prediction ready
194 prediction ready
199 prediction ready
200 prediction ready
198 prediction ready
190 prediction ready
end
CPU times: user 136 ms, sys: 80 ms, total: 216 ms
Wall time: 1min 56s


In [37]:
# Convert the d=1 forecasts with the project helper (presumably back from the log1p scale — see utils).
# NOTE(review): this overwrites predictions_1, so re-running the cell applies the helper twice.
predictions_1 = utils.from_log_and_correction(predictions_1) 

In [38]:
# Score the d=1 forecasts against the same 20 "many zeros" rows they were computed for.
x = utils.SMAPE_score(test.iloc[zeros_indices[:20].ravel()].values, predictions_1)

SMAPE:  87.88188


In [None]:
# Persist the d=1 forecasts under the name "arima_d_1" together with the test table,
# two free-text descriptions and the flattened index (argument meaning is defined by utils.SaveModel).
# NOTE(review): if predictions_1 still holds only the 20 rows computed above, it will not line up with the full `test` / `index`; confirm before saving.
utils.SaveModel("arima_d_1",
                predictions_1, test,
                "2900 hours", "no transformation",
                index.values.reshape(-1))
# SMAPE:  50.363083

In [13]:
# NOTE(review): presumably a runtime estimate for the full data set; the displayed 2900.0 matches the "2900 hours" string passed to SaveModel for the d=1 model.
utils.total_time(24, 10) * 10

2900.0

In [14]:
# Score predictions_1 against the first 200 rows of test.
# NOTE(review): the visible Parallel cell builds predictions_1 from 20 rows, and this cell's execution count (14) predates it — this result comes from a different run; re-run top to bottom to confirm.
x = utils.SMAPE_score(test[:200], predictions_1)

SMAPE:  50.363083
