In [46]:
import boto3
import io
from sagemaker import get_execution_role

role = get_execution_role()
bucket ='sagemaker-data-energy-demand'

In [47]:
import datetime
import pandas as pd
from scipy import stats
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import register_matplotlib_converters
%matplotlib inline
register_matplotlib_converters()


import warnings
warnings.filterwarnings('ignore')

In [48]:
CITY = 'LosAngeles'
df_file = 'dataframes/%s_dataset.csv' % CITY

df_location = 's3://{}/{}'.format(bucket, df_file)
df = pd.read_csv(df_location, index_col ='datetime', parse_dates=True)

In [49]:
len(df.columns)

14

In [51]:
# Add new features based on datetime

cal = calendar()
holidays = cal.holidays(start=df.index.min(), end=df.index.max())

def create_timeseries_features(df):
    """
    Creates time series features from datetime index
    """
    df = df.copy()
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.weekofyear
    df['isbusinessday']=np.where(df.date.dt.weekday_name.isin(['Saturday','Sunday']),0,1)
    df['isholiday'] = df['date'].isin(holidays).astype(int)
    df['daylight'] = [1 if (hour >= 6 and hour <= 18) else 0 for hour in df['date'].dt.hour]
    return df

df = create_timeseries_features(df.copy())
df.index.name = 'datetime'

## Feature selection

In [52]:
columns = df.columns.tolist()
columns = [c for c in df.columns if c not in ['date', 'demand']]
X = df[columns]  
y = df["demand"]          

# Number of features
nof_list = np.arange(1,df.shape[1])            
high_score = 0

#Variable to store the optimum features
nof=0           
score_list =[]

for n in range(len(nof_list)):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    model = LinearRegression()
    rfe = RFE(model,nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train,y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe,y_train)
    score = model.score(X_test_rfe,y_test)
    score_list.append(score)
    if(score>high_score):
        high_score = score
        nof = nof_list[n]
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

cols = list(X.columns)
model = LinearRegression()

#Initializing RFE model
rfe = RFE(model, nof)     

#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y)  

#Fitting the data to model
model.fit(X_rfe,y)              
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = temp[temp==True].index
print(selected_features_rfe)

Optimum number of features: 24
Score with 24 features: 0.740610
Index(['dailycoolingdegreedays', 'dailyheatingdegreedays',
       'hourlydewpointtemperature', 'hourlydrybulbtemperature',
       'hourlyprecipitation', 'hourlyrelativehumidity', 'hourlyvisibility',
       'hourlywindspeed', 'hourlycoolingdegrees', 'hourlyheatingdegrees',
       'hourlyskyconditions_BKN', 'hourlyskyconditions_FEW',
       'hourlyskyconditions_SCT', 'hour', 'dayofweek', 'quarter', 'month',
       'year', 'dayofyear', 'dayofmonth', 'weekofyear', 'isbusinessday',
       'isholiday', 'daylight'],
      dtype='object')


In [53]:
# Number of features
nof_list = np.arange(1,df.shape[1])            
high_score = 0

#Variable to store the optimum features
nof=0           
score_list =[]

for n in range(len(nof_list)):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    model = LinearRegression()
    rfe = RFE(model,nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train,y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe,y_train)
    score = model.score(X_test_rfe,y_test)
    score_list.append(score)
    if(score>high_score):
        high_score = score
        nof = nof_list[n]
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

Optimum number of features: 24
Score with 24 features: 0.740610


In [54]:
# save as csv file to continue in another notebook
selected = df[selected_features_rfe].copy()
selected['demand'] = df['demand']

csv_buffer = io.StringIO()
s3_resource = boto3.resource('s3')
key = 'dataframes/%s_selectedfeatures.csv' % CITY

selected.to_csv(csv_buffer, compression=None)
s3_resource.Object(bucket, key).put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '557C3E616ED6FFA5',
  'HostId': 'en7sASbJTZ3keoJrKJMfAx7z6e8Ak2PLKX47wP/RU0cPg0ebFL/pY8Q6oqRM9Lm0Z1b/jTxmmFo=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'en7sASbJTZ3keoJrKJMfAx7z6e8Ak2PLKX47wP/RU0cPg0ebFL/pY8Q6oqRM9Lm0Z1b/jTxmmFo=',
   'x-amz-request-id': '557C3E616ED6FFA5',
   'date': 'Fri, 17 Jan 2020 03:40:18 GMT',
   'etag': '"3ede3cfa19438f4b589311bf33889fa3"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"3ede3cfa19438f4b589311bf33889fa3"'}