In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Relative locations of the inputs and the dataset variant to use.
data_path_rel = "../../cut/"
arima_path_rel = "./data/"
dataset_size = "10000"

# Training CSVs are read from <data_path>train/<category>.csv;
# ARIMA CSVs are read from <arima_path><category>.csv (note the "-" separator).
data_path = "".join((data_path_rel, dataset_size, "/"))
arima_path = "".join((arima_path_rel, dataset_size, "-"))

# Temporal categories, processed in this order.
time_categories = [
    "Hourly",
    "Daily",
    "Weekly",
    "Monthly",
    "Quarterly",
    "Yearly",
]

# Number of steps to forecast for each temporal category.
prediction_horizons = {
    "Hourly": 48,
    "Daily": 14,
    "Weekly": 13,
    "Monthly": 18,
    "Quarterly": 8,
    "Yearly": 6,
}

In [3]:
from sklearn.preprocessing import MinMaxScaler

def create_residuals(temporal_category, train_data, arima_data, scale=True):
    """Compute per-series residuals (observed minus ARIMA) and split off the ARIMA tail.

    For every row of ``train_data`` the matching ``arima_data`` row (looked up
    by index label) is stripped of NaNs and split into its last ``horizon``
    values and everything before them. The residual is the trailing observed
    values minus that leading ARIMA part.
    NOTE(review): presumably the leading part holds in-sample fitted values and
    the tail holds the ARIMA forecast -- confirm against the file that
    produces ``arima_data``.

    Parameters
    ----------
    temporal_category : str
        Key into ``prediction_horizons``.
    train_data : pandas.DataFrame
        One series per row; NaNs are dropped before use.
    arima_data : pandas.DataFrame
        Same number of rows as ``train_data`` and sharing its index labels.
    scale : bool, default True
        When true, each residual series is min-max scaled with its own
        ``MinMaxScaler``.

    Returns
    -------
    residuals : numpy.ndarray
        Shape ``(rows, arima columns - horizon)``. Each series is right-aligned;
        cells to its left stay 0.
    test_arima : numpy.ndarray
        Shape ``(rows, horizon)`` holding the last ``horizon`` ARIMA values.
    scalers : list
        Fitted scaler per row, or ``None`` entries when ``scale`` is false.
    """
    horizon = prediction_horizons[temporal_category]

    n_series = arima_data.shape[0]
    assert train_data.shape[0] == n_series, "Both of the data frames must have the same number of rows"

    residuals = np.zeros(shape=(n_series, arima_data.shape[1] - horizon))
    test_arima = np.zeros(shape=(n_series, horizon))
    scalers = [None] * train_data.shape[0]

    for position, (label, observed) in enumerate(train_data.iterrows()):
        arima_values = arima_data.loc[label].dropna()
        leading_part = arima_values[:-horizon]
        held_out = arima_values[-horizon:]
        n_leading = leading_part.shape[0]

        # Keep only as many trailing observations as there are leading ARIMA values.
        observed = observed.dropna()[-n_leading:]
        diff = observed.values - leading_part.values

        if scale:
            # The scaler expects a 2-D column, so reshape to (n, 1) and flatten back.
            column = diff.reshape((diff.shape[0], 1))
            scaler = MinMaxScaler().fit(column)
            diff = scaler.transform(column).flatten()
            scalers[position] = scaler

        # Right-align the series inside its zero-initialised row.
        residuals[position, -n_leading:] = diff
        test_arima[position, :] = held_out

    return residuals, test_arima, scalers

In [4]:
def create_sliding_window_from_data(data, horizon):
    """Turn each row of ``data`` into (window, next value) training pairs.

    A window of ``horizon`` consecutive values is slid along every row; the
    value right after the window is its target. Windows consisting only of
    zeros are skipped.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array, one series per row.
    horizon : int
        Window length.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The stacked windows and their targets, in row-major order of discovery.
    """
    windows_per_row = data.shape[1] - horizon

    inputs = []
    targets = []

    for series in data:
        for start in range(windows_per_row):
            stop = start + horizon
            candidate = series[start:stop]

            # Skip windows that contain nothing but zeros.
            if np.count_nonzero(candidate) == 0:
                continue

            inputs.append(candidate)
            targets.append(series[stop])

    return np.array(inputs), np.array(targets)

In [5]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

def create_model(input_len, neurons=6):
    """Build and compile the feed-forward network used on the residual windows.

    Architecture: two Dense layers of ``2 * input_len + 1`` units, each followed
    by Dropout(0.4), then a single-unit output layer. Only the first Dense
    layer is given an explicit activation (ReLU). The model is compiled with
    mean-squared-error loss and the Adam optimizer.

    Parameters
    ----------
    input_len : int
        Number of input features (the window length).
    neurons : int, default 6
        Currently unused; the layer width is derived from ``input_len``.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    hidden_units = 2 * input_len + 1

    network = Sequential([
        Dense(hidden_units, activation='relu', input_shape=(input_len,)),
        Dropout(0.4),
        Dense(hidden_units),
        Dropout(0.4),
        Dense(1),
    ])
    network.compile(loss='mean_squared_error', optimizer='adam')

    return network

Using TensorFlow backend.


In [6]:
def make_predictions(residuals, model, horizon):
    """Forecast ``horizon`` steps ahead by feeding predictions back into the model.

    At every step the last ``horizon`` columns are passed to ``model.predict``
    and the returned column is appended, so later steps see earlier
    predictions. The input array is not modified.

    Parameters
    ----------
    residuals : numpy.ndarray
        2-D array, one series per row.
    model : object
        Anything with a ``predict(window)`` method returning one column per call.
    horizon : int
        Number of steps to forecast; also the window width given to the model.

    Returns
    -------
    numpy.ndarray
        The last ``horizon`` columns of the extended array.
    """
    extended = residuals.copy()

    step = 0
    while step < horizon:
        latest_window = extended[:, -horizon:]
        next_column = model.predict(latest_window)
        extended = np.hstack((extended, next_column))
        step += 1

    return extended[:, -horizon:]

In [7]:
def inverse_scale(predictions, scalers):
    """Undo the per-series scaling of a block of predictions.

    Parameters
    ----------
    predictions : numpy.ndarray
        2-D array, one series per row.
    scalers : sequence
        One entry per row of ``predictions``. Each entry is either an object
        with an ``inverse_transform`` method that accepts an ``(n, 1)`` column,
        or ``None``. ``create_residuals`` leaves entries as ``None`` when called
        with ``scale=False``; such rows are returned unchanged instead of
        raising ``AttributeError``.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``predictions``, including when it has
        zero rows.
    """
    n_rows = predictions.shape[0]
    n_steps = predictions.shape[1]

    restored = [None] * n_rows

    for i in range(n_rows):
        scaler = scalers[i]
        row = predictions[i, :]

        if scaler is None:
            # The series was never scaled, so there is nothing to undo.
            restored[i] = row.copy()
            continue

        # The scaler works on 2-D columns, so reshape to (n, 1) and flatten back.
        restored[i] = scaler.inverse_transform(row.reshape(n_steps, 1)).flatten()

    # The reshape keeps the (rows, steps) layout even when there are no rows.
    return np.array(restored).reshape(n_rows, n_steps)

In [8]:
def get_results(temporal_category):
    """Run the whole ARIMA-plus-residual-network pipeline for one temporal category.

    Steps: read the training and ARIMA CSVs, build scaled residuals, train a
    fresh model on sliding windows of them, forecast the residuals
    recursively, undo the scaling, add the forecast to the held-out ARIMA
    values and write the sum to
    ``./res2_<temporal_category>_<dataset_size>.csv``.

    Parameters
    ----------
    temporal_category : str
        Key into ``prediction_horizons``; also part of the CSV file names.

    Returns
    -------
    None. The result is written to disk.
    """
    train_csv = data_path + "train/" + temporal_category + ".csv"
    # Column 1 becomes the index; the first remaining column is then dropped.
    train_data = pd.read_csv(train_csv, index_col=1).iloc[:, 1:]

    arima_csv = arima_path + temporal_category + ".csv"
    arima_data = pd.read_csv(arima_csv, index_col=0)

    residuals, test_arima, scalers = create_residuals(temporal_category, train_data, arima_data)

    horizon = prediction_horizons[temporal_category]
    train_X, train_y = create_sliding_window_from_data(residuals, horizon)

    model = create_model(train_X.shape[1])
    model.fit(
        train_X,
        train_y,
        epochs=20,
        batch_size=32,
        verbose=0,
        validation_split=0.1,
        shuffle=True,
    )

    scaled_forecast = make_predictions(residuals, model, horizon)
    residual_forecast = inverse_scale(scaled_forecast, scalers)

    # Final forecast = held-out ARIMA values + predicted residuals.
    combined = pd.DataFrame(test_arima + residual_forecast, index=train_data.index)
    combined.to_csv("./res2_" + temporal_category + "_" + dataset_size + ".csv")

In [9]:
# Process every temporal category in turn; each run writes its own result CSV.
for category in time_categories:
    print("Working on " + category)
    get_results(category)

print("DONE")

Working on Hourly
Working on Daily
Working on Weekly
Working on Monthly
Working on Quarterly
Working on Yearly
DONE
