In [18]:
# we'll use the electricity data for this, import:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


def import_energy_data(downsample=True):
    """
    Import the UCI ML data archive Energy dataset

    Downloads the CSV on every call, derives calendar features from the
    ``date`` column and returns the log of the ``Appliances`` column as the
    modelling target.

    Args:
        downsample: sample to one row per hour (else every 10 minutes).
            When True the rows are averaged within each hour *before* the
            log is taken, so the target is the log of the hourly mean.

    Returns:
        DataFrame indexed by ``date`` with columns ``date`` (a copy of the
        index), ``day_of_week`` (0 = Monday), ``hour_of_day`` and
        ``log_energy_consumption``. Note that ``date`` is therefore both the
        index name and a column name.
    """

    # download
    energy_df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv')

    # fix data types
    energy_df['date'] = pd.to_datetime(energy_df['date'])

    # day_of_week=0 corresponds to Monday
    energy_df['day_of_week'] = energy_df['date'].dt.dayofweek.astype(int)
    energy_df['hour_of_day'] = energy_df['date'].dt.hour.astype(int)

    # filter columns and index by timestamp
    selected_columns = ['date', 'day_of_week', 'hour_of_day', 'Appliances']
    energy_df = energy_df[selected_columns].set_index('date')

    if downsample:
        # downsample to one hour; a Timedelta rule avoids the deprecated
        # uppercase 'H' frequency alias while producing the same hourly bins
        energy_df = energy_df.resample(pd.Timedelta(hours=1)).mean()

    # keep the timestamp available as a regular column as well as the index
    energy_df['date'] = energy_df.index

    # model log outcome
    energy_df['log_energy_consumption'] = np.log(energy_df['Appliances'])

    datetime_columns = ['date', 'day_of_week', 'hour_of_day']
    feature_columns = datetime_columns + ['log_energy_consumption']
    energy_df = energy_df[feature_columns]

    return energy_df


def create_sliding_window(data, sequence_length, stride=1):
    """
    Build (window, next-value) pairs from a row-ordered table.

    For every start row ``i`` with ``i + sequence_length < len(data)`` the
    input window is rows ``i`` to ``i + sequence_length - 1`` (all columns)
    and the target is the value of the LAST column at row
    ``i + sequence_length``.

    Args:
        data: pandas DataFrame, or any 2-D array-like, with one row per step
            and the target in the last column.
        sequence_length: number of consecutive rows spanned by each window
            (must be >= 1).
        stride: step between rows taken INSIDE a window (must be >= 1).
            Windows themselves always advance one row at a time, so
            ``stride > 1`` thins each window rather than spacing windows out.

    Returns:
        Tuple ``(X, y)`` of numpy arrays. ``X`` stacks the windows along the
        first axis and ``y`` holds one target per window. Both are empty when
        ``data`` has ``sequence_length`` rows or fewer.

    Raises:
        ValueError: if ``sequence_length`` or ``stride`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be >= 1")
    if stride < 1:
        raise ValueError("stride must be >= 1")

    # Convert to numpy once instead of slicing the frame for every window.
    if hasattr(data, 'iloc'):
        values = data.values
        # taken from the column itself so the target keeps that column's dtype
        targets = data.iloc[:, -1].values
    else:
        values = np.asarray(data)
        targets = values[:, -1]

    n_windows = max(len(values) - sequence_length, 0)
    X_list = [values[start:start + sequence_length:stride] for start in range(n_windows)]
    y_list = [targets[start + sequence_length] for start in range(n_windows)]
    return np.array(X_list), np.array(y_list)



In [14]:
# Load the energy frame; import_energy_data() downloads the CSV on every call.
energy_df = import_energy_data()

In [15]:
# Preview the first rows to check the selected columns and the log target.
energy_df.head()

Unnamed: 0_level_0,date,day_of_week,hour_of_day,log_energy_consumption
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-11 17:00:00,2016-01-11 17:00:00,0,17,4.007333
2016-01-11 18:00:00,2016-01-11 18:00:00,0,18,5.174265
2016-01-11 19:00:00,2016-01-11 19:00:00,0,19,5.155217
2016-01-11 20:00:00,2016-01-11 20:00:00,0,20,4.828314
2016-01-11 21:00:00,2016-01-11 21:00:00,0,21,4.63796


In [23]:
# Reload the data so this cell can run on its own (downloads the CSV again).
energy_df = import_energy_data()

# Chronological split: the first `train_split` share of rows is the training period.
train_split = 0.7
n_train = int(train_split * len(energy_df))
n_test = len(energy_df) - n_train

# The target must stay last: create_sliding_window reads it from the final column.
features = ['day_of_week', 'hour_of_day', 'log_energy_consumption']
feature_array = energy_df[features].values

# Fit Scaler only on Training features
feature_scaler = MinMaxScaler()
feature_scaler.fit(feature_array[:n_train])
# Fit Scaler only on Training target values
# NOTE(review): not used in this cell; presumably kept to invert predictions later.
target_scaler = MinMaxScaler()
target_scaler.fit(feature_array[:n_train, -1].reshape(-1, 1))

# Transform on both Training and Test data
scaled_array = pd.DataFrame(feature_scaler.transform(feature_array),
                            columns=features)

sequence_length = 10
X, y = create_sliding_window(scaled_array, sequence_length)

# Window i predicts row i + sequence_length, so only the first
# n_train - sequence_length windows have their target inside the training
# rows the scalers were fit on. Splitting at n_train instead would place the
# first `sequence_length` test-period targets in the training set.
n_train_windows = max(n_train - sequence_length, 0)

X_train = X[:n_train_windows]
y_train = y[:n_train_windows]

# The remaining windows target rows n_train onward, one per test row.
X_test = X[n_train_windows:]
y_test = y[n_train_windows:]

(3290, 4)