# Time series to supervised
Before machine learning can be used, time series forecasting problems must be re-framed as supervised learning problems: the sequence of observations is transformed into pairs of input and output sequences, as described at https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/

In [10]:
import boto3
import io
from sagemaker import get_execution_role

# Execution role returned by the SageMaker SDK for this notebook session.
role = get_execution_role()
# Name of the S3 bucket that the dataframes are read from and written to.
bucket ='sagemaker-data-energy-demand'

In [11]:
import numpy as np 
import pandas as pd 
from datetime import datetime, timedelta

import warnings
# Suppress all Python warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

In [12]:
def series_to_supervised(data, col_names, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Each variable is shifted to build the lagged input columns
    (t-n_in, ... t-1), followed by the forecast columns
    (t, t+1, ... t+n_out-1); all shifted copies are joined side by side.

    Arguments:
     data: Sequence of observations as a list or NumPy array
      (rows are time steps, columns are variables).
     col_names: Names of the variables, one per column of data. They are
      used to label the generated columns, e.g. 'demand(t-1)', 'demand(t)'.
     n_in: Number of lag observations as input (X).
     n_out: Number of observations as output (y).
     dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
     Pandas DataFrame of series framed for supervised learning.
    """
    df = pd.DataFrame(data)
    # Take the variable count from the frame itself so flat lists, nested
    # lists and arrays are all handled the same way.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['%s(t-%d)' % (col_names[j], i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        # Names must be added once per forecast step, inside the loop;
        # otherwise the number of names no longer matches the number of
        # columns as soon as n_out differs from 1.
        if i == 0:
            names += ['%s(t)' % col_names[j] for j in range(n_vars)]
        else:
            names += ['%s(t+%d)' % (col_names[j], i) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values (the shifts leave NaN at the start/end rows)
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [13]:
# City whose dataset is processed; it is substituted into the input and output file names.
CITY = 'LosAngeles'
# Object key of the selected-features CSV inside the bucket.
df_file = 'dataframes/%s_selectedfeatures.csv' % CITY

# Build the full s3:// URL and read the CSV from it, using the 'datetime' column as the index.
df_location = 's3://{}/{}'.format(bucket, df_file)
df = pd.read_csv(df_location, index_col ='datetime')
# Preview the last rows of the loaded dataframe.
df.tail()

Unnamed: 0_level_0,dailycoolingdegreedays,dailyheatingdegreedays,hourlydewpointtemperature,hourlydrybulbtemperature,hourlyprecipitation,hourlyrelativehumidity,hourlysealevelpressure,hourlystationpressure,hourlyvisibility,hourlywindspeed,...,quarter,month,year,dayofyear,dayofmonth,weekofyear,isbusinessday,isholiday,daylight,demand
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-11 18:00:00+00:00,0.0,0.0,47.0,58.0,0.0,67.0,30.05,29.7,10.0,6.0,...,1,1,2020,11,11,2,0,0,1,2777.0
2020-01-11 19:00:00+00:00,0.0,0.0,47.0,56.0,0.0,72.0,30.06,29.71,10.0,3.0,...,1,1,2020,11,11,2,0,0,0,2771.0
2020-01-11 20:00:00+00:00,0.0,0.0,47.0,57.0,0.0,69.0,30.07,29.73,10.0,3.0,...,1,1,2020,11,11,2,0,0,0,2734.0
2020-01-11 21:00:00+00:00,0.0,0.0,47.0,57.0,0.0,69.0,30.07,29.72,10.0,5.0,...,1,1,2020,11,11,2,0,0,0,2695.0
2020-01-11 22:00:00+00:00,0.0,0.0,46.0,54.0,0.0,75.0,30.08,29.74,10.0,0.0,...,1,1,2020,11,11,2,0,0,0,2635.0


In [14]:
# Work on the raw value matrix; the column names are passed separately below.
values = df.values
# ensure all data is float
values = values.astype('float32')
# frame as supervised learning: one lag step (t-1) as input and one step (t) as output
reframed = series_to_supervised(values, list(df.columns), 1, 1)
# drop columns we don't want to predict (currently disabled)
#reframed.drop(reframed.columns[[15,16,17,18,19,20,21,22,23,24,25,26,27]], axis=1, inplace=True)

# move the column we want to predict, demand(t), to the first position for consistency
cols = list(reframed.columns)
cols.remove('demand(t)')
cols.insert(0,'demand(t)')
reframed = reframed[cols]
# Re-attach a datetime index, since series_to_supervised was given only the raw values.
# NOTE(review): with df.index[:-1] each row appears to be labelled with the timestamp of its
# (t-1) observations rather than of demand(t) - confirm this alignment is intended.
# NOTE(review): this assumes reframed has exactly len(df) - 1 rows, i.e. that dropna removed
# only the first row; verify the source data contains no other NaN values.
reframed.index = df.index[:-1]


In [15]:
# Show the dimensions (rows, columns) of the reframed dataset.
reframed.shape

(39492, 58)

In [16]:
# Preview the last rows of the reframed dataset.
reframed.tail()

Unnamed: 0_level_0,demand(t),dailycoolingdegreedays(t-1),dailyheatingdegreedays(t-1),hourlydewpointtemperature(t-1),hourlydrybulbtemperature(t-1),hourlyprecipitation(t-1),hourlyrelativehumidity(t-1),hourlysealevelpressure(t-1),hourlystationpressure(t-1),hourlyvisibility(t-1),...,dayofweek(t),quarter(t),month(t),year(t),dayofyear(t),dayofmonth(t),weekofyear(t),isbusinessday(t),isholiday(t),daylight(t)
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-11 17:00:00+00:00,2777.0,0.0,0.0,45.0,57.0,0.0,64.0,30.049999,29.700001,10.0,...,5.0,1.0,1.0,2020.0,11.0,11.0,2.0,0.0,0.0,1.0
2020-01-11 18:00:00+00:00,2771.0,0.0,0.0,47.0,58.0,0.0,67.0,30.049999,29.700001,10.0,...,5.0,1.0,1.0,2020.0,11.0,11.0,2.0,0.0,0.0,0.0
2020-01-11 19:00:00+00:00,2734.0,0.0,0.0,47.0,56.0,0.0,72.0,30.059999,29.709999,10.0,...,5.0,1.0,1.0,2020.0,11.0,11.0,2.0,0.0,0.0,0.0
2020-01-11 20:00:00+00:00,2695.0,0.0,0.0,47.0,57.0,0.0,69.0,30.07,29.73,10.0,...,5.0,1.0,1.0,2020.0,11.0,11.0,2.0,0.0,0.0,0.0
2020-01-11 21:00:00+00:00,2635.0,0.0,0.0,47.0,57.0,0.0,69.0,30.07,29.719999,10.0,...,5.0,1.0,1.0,2020.0,11.0,11.0,2.0,0.0,0.0,0.0


In [17]:
# save as csv file to continue in another notebook
s3_resource = boto3.resource('s3')

# Write the dataframe as CSV text into an in-memory buffer instead of a local file.
reframed_buffer = io.StringIO()
# Object key of the output file inside the bucket, named after the city.
reframed_key = 'dataframes/%s_reframed.csv' % CITY
reframed.to_csv(reframed_buffer, compression=None)

# Upload the buffered CSV text to the bucket under reframed_key.
s3_resource.Object(bucket, reframed_key).put(Body=reframed_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'D6551C7CC0DA1519',
  'HostId': 'fl9tFw+7UAAVGDI33r7H0l4mF4NvycKxOWJPSBzHWUlMYTow/hz56vxN8hmLN2mACO1rl6+Pvl8=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'fl9tFw+7UAAVGDI33r7H0l4mF4NvycKxOWJPSBzHWUlMYTow/hz56vxN8hmLN2mACO1rl6+Pvl8=',
   'x-amz-request-id': 'D6551C7CC0DA1519',
   'date': 'Tue, 14 Jan 2020 04:34:44 GMT',
   'etag': '"b4d927c832bb92ced49c02c30f7a3069"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"b4d927c832bb92ced49c02c30f7a3069"'}