In [15]:
%%capture
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import math
import timeit
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from sklearn.ensemble import RandomForestRegressor
from bmc_helper_functions_7 import prep_data, train_test, sarima, convert, model_performance, lstm_function, split_series, train_test_lstm

In [2]:
# SES
def run_ses(DATA_TYPE, p):
    """Walk-forward evaluation of simple exponential smoothing.

    The series from ``prep_data(DATA_TYPE)`` is cut at 80% of its length.
    For each of ``int(remaining / p)`` windows, ``train_test(series, cutoff, p)``
    supplies a train/test pair (presumably the observations before the cutoff
    and the next ``p`` after it -- confirm against the helper), a
    ``SimpleExpSmoothing`` model with fixed ``smoothing_level=0.5`` is fit on
    the training part and asked for ``p`` forecasts, and the cutoff then
    advances by ``p``.

    Parameters
    ----------
    DATA_TYPE : str
        Dataset key forwarded to ``prep_data``.
    p : int
        Number of steps forecast per window and the amount by which the
        cutoff moves between windows.

    Returns
    -------
    list
        ``['SES', DATA_TYPE, p, rmse, mae, mape, TIME]`` where ``TIME`` is
        ``convert(elapsed)`` for the walk-forward loop only.
    """
    series = prep_data(DATA_TYPE)
    cutoff = int(len(series) * 0.80)
    holdout_size = len(series) - cutoff
    n_windows = int(holdout_size / p)

    actuals = []
    forecasts = []
    stamps = []

    started = timeit.default_timer()

    for _ in range(n_windows):
        train, test = train_test(series, cutoff, p)

        # Smoothing level is fixed rather than optimised on each refit.
        fitted = SimpleExpSmoothing(train).fit(smoothing_level=0.5, optimized=False)
        y_hat = fitted.forecast(p)

        stamps.extend(test.index)
        forecasts.extend(y_hat.values)
        actuals.extend(test.values)
        cutoff += p

    elapsed = timeit.default_timer() - started
    TIME = convert(elapsed)

    y_real = pd.DataFrame(actuals, index=stamps)
    y_pred = pd.DataFrame(forecasts, index=stamps)
    mape, rmse, mae = model_performance(y_real, y_pred)
    return ['SES', DATA_TYPE, p, rmse, mae, mape, TIME]

In [3]:
# SARIMA
def run_sarima(DATA_TYPE, p):
    """Walk-forward evaluation of a SARIMA model.

    The series from ``prep_data(DATA_TYPE)`` is cut at 80% of its length.
    For each of ``int(remaining / p)`` windows, ``train_test(x, train_s, p)``
    supplies a train/test pair, the ``sarima`` helper is fit on the training
    part, ``p`` steps are forecast, and the cutoff advances by ``p``.

    Model orders are fixed per dataset frequency:

    * ``'modeling_monthly'`` / ``'testing_monthly'``: (1,1,0) x (0,1,1,12)
    * anything else (daily): (2,0,1) x (0,1,1,7)

    Parameters
    ----------
    DATA_TYPE : str
        Dataset key forwarded to ``prep_data``; also selects the orders above.
    p : int
        Number of steps forecast per window and the amount by which the
        cutoff moves between windows.

    Returns
    -------
    list
        ``['SARIMA', DATA_TYPE, p, rmse, mae, mape, TIME]`` where ``TIME`` is
        ``convert(elapsed)`` for the walk-forward loop only.
    """
    x = prep_data(DATA_TYPE)

    if DATA_TYPE in ('modeling_monthly', 'testing_monthly'):
        order, seasonal_order = (1, 1, 0), (0, 1, 1, 12)  # monthly parameters
    else:
        order, seasonal_order = (2, 0, 1), (0, 1, 1, 7)  # daily parameters

    train_s = int(len(x) * 0.80)
    test_s = len(x) - train_s

    t = []
    t_ = []
    idx = []

    START_TIME = timeit.default_timer()

    for _ in range(int(test_s / p)):
        train, test = train_test(x, train_s, p)

        fitted = sarima(*order, *seasonal_order, train)
        # Use the forecast mean directly. The previous code rebuilt it as the
        # midpoint of the confidence interval via the hard-coded columns
        # 'lower y' / 'upper y', which only works when the series is named 'y'.
        y_hat = fitted.get_forecast(steps=p).predicted_mean

        idx.extend(test.index)
        t_.extend(np.asarray(y_hat))
        t.extend(test.values)
        train_s += p

    END_TIME = timeit.default_timer()
    TIME = convert(END_TIME - START_TIME)

    y_real = pd.DataFrame(t, index=idx)
    y_pred = pd.DataFrame(t_, index=idx)
    mape, rmse, mae = model_performance(y_real, y_pred)

    return ['SARIMA', DATA_TYPE, p, rmse, mae, mape, TIME]

In [4]:
def run_rf(DATA_TYPE, p):
    """Walk-forward evaluation of a random-forest regressor.

    ``prep_data(data_type=DATA_TYPE, model='random_forest')`` supplies a frame
    with a target column ``'y'``; every other column is used as a feature.
    The frame is cut at 80% of its length. For each of ``int(remaining / p)``
    windows, ``train_test(x, train_s, p)`` supplies a train/test pair, a fresh
    forest is fit on the training part, the test rows are predicted, and the
    cutoff advances by ``p``.

    Parameters
    ----------
    DATA_TYPE : str
        Dataset key forwarded to ``prep_data``.
    p : int
        Number of rows predicted per window and the amount by which the
        cutoff moves between windows.

    Returns
    -------
    list
        ``['RF', DATA_TYPE, p, rmse, mae, mape, TIME]`` where ``TIME`` is
        ``convert(elapsed)`` for the walk-forward loop only.
    """
    # The second value returned by prep_data is not needed here.
    x, _ = prep_data(data_type=DATA_TYPE, model='random_forest')

    # Fixed hyperparameters. The earlier 70% split that preceded this line was
    # never used and has been removed.
    n_estimators, max_features, max_depth = 20, 10, 30

    # augmented out of sample training
    train_s = int(len(x) * 0.80)
    test_s = len(x) - train_s
    t = []
    t_ = []
    idx = []

    START_TIME = timeit.default_timer()

    for _ in range(int(test_s / p)):
        train, test = train_test(x, train_s, p)

        train_input = train.drop(['y'], axis=1)
        test_input = test.drop(['y'], axis=1)

        # random_state is pinned so repeated runs build the same forest.
        model = RandomForestRegressor(n_estimators=n_estimators,
                                      max_features=max_features,
                                      max_depth=max_depth,
                                      random_state=42).fit(train_input, train['y'])
        y_hat = model.predict(test_input)

        idx.extend(test.index)
        t_.extend(y_hat)
        t.extend(test['y'].values)
        train_s += p

    END_TIME = timeit.default_timer()
    TIME = convert(END_TIME - START_TIME)

    y_real = pd.DataFrame(t, index=idx)
    y_pred = pd.DataFrame(t_, index=idx)
    mape, rmse, mae = model_performance(y_real, y_pred)

    return ['RF', DATA_TYPE, p, rmse, mae, mape, TIME]

In [5]:
def run_lstm(DATA_TYPE, p):
    """Walk-forward evaluation of an LSTM forecaster.

    The series from ``prep_data(data_type=DATA_TYPE)`` is converted by
    ``split_series`` into samples of ``n_input`` (14) values, targets and
    matching index labels. The samples are cut at 80% of their count. For each
    of ``int(remaining / p)`` windows a new network is built with
    ``lstm_function``, fit on the training samples for 100 epochs, and used to
    predict the next ``p`` samples one at a time; the cutoff then advances by
    ``p``.

    Parameters
    ----------
    DATA_TYPE : str
        Dataset key forwarded to ``prep_data``.
    p : int
        Number of samples predicted per window and the amount by which the
        cutoff moves between windows.

    Returns
    -------
    list
        ``['LSTM', DATA_TYPE, p, rmse, mae, mape, TIME]`` where ``TIME`` is
        ``convert(elapsed)`` for the walk-forward loop only.

    Notes
    -----
    No random seed is set in this function, so results may vary between runs
    unless the helper or the caller seeds the backend.
    """
    x = prep_data(data_type=DATA_TYPE)

    n_input, n_nodes, n_epochs, n_activation, n_optimize = 14, 80, 100, 'relu', 'Adam'

    lstm_x, lstm_y, lstm_dt = split_series(x.values, x.index, n_input)
    # The split is taken over the windowed samples, not the raw series.
    n_samples = len(lstm_x)
    train_s = int(n_samples * 0.80)
    test_s = n_samples - train_s
    t = []
    t_ = []
    idx = []

    START_TIME = timeit.default_timer()

    for _ in range(int(test_s / p)):
        # The third return value is unused; the original bound it to a local
        # called `datetime`.
        train, test, _window_dt = train_test_lstm(lstm_x, lstm_y, lstm_dt, train_s, p)

        # train model using training sample
        train = train.reshape((train.shape[0], train.shape[1], 1))
        model = lstm_function(n_input, n_nodes, n_activation, n_optimize)
        model.fit(train, lstm_y[0:train_s], epochs=n_epochs, verbose=0)

        # predict until completion of prediction horizon
        for j in range(p):
            position = train_s + j
            # Guard the index that is actually used. The previous check
            # compared only train_s and ignored the offset j.
            if position >= n_samples:
                break

            x_test_instance = lstm_x[position].reshape((1, n_input, 1))
            prediction = model.predict(x_test_instance, verbose=0)

            t.append(test[j])
            # .item() pulls out the single predicted value before int() truncates
            # it; calling int() on the multi-dimensional array is deprecated in
            # recent NumPy.
            t_.append(int(np.asarray(prediction).item()))

            idx.append(lstm_dt[position])

        train_s += p

    END_TIME = timeit.default_timer()
    TIME = convert(END_TIME - START_TIME)

    y_real = pd.DataFrame(t, index=idx)
    y_pred = pd.DataFrame(t_, index=idx)

    mape, rmse, mae = model_performance(y_real, y_pred)

    return ['LSTM', DATA_TYPE, p, rmse, mae, mape, TIME]

In [16]:
ls = list()

# (dataset key, prediction window) pairs evaluated for every model.
scenarios = [
    ('modeling_daily', 7),
    ('modeling_daily', 30),
    ('modeling_monthly', 1),
    ('testing_daily', 7),
    ('testing_daily', 30),
    ('testing_monthly', 1),
]

# Run all scenarios for one model before moving to the next, so the rows of
# the results table are grouped by model.
for runner in (run_ses, run_sarima, run_rf, run_lstm):
    for data_type, window in scenarios:
        ls.append(runner(data_type, window))

df = pd.DataFrame(
    ls,
    columns=['MODEL', 'DATA_TYPE', 'PREDICTION_WINDOW', 'RMSE', 'MAE', 'MAPE', 'RUN_TIME'],
)

df.to_csv('Regression_Performance_BMC_Data.csv', index = False)