In [24]:
import boto3
import io
from sagemaker import get_execution_role

# Name of the S3 bucket the data frames are read from and written back to.
bucket = 'sagemaker-data-energy-demand'

# Execution role as reported by the SageMaker SDK.
role = get_execution_role()

In [25]:
import datetime
import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import register_matplotlib_converters
from scipy import stats
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

%matplotlib inline
register_matplotlib_converters()

warnings.filterwarnings('ignore')

In [26]:
# City whose data set is processed; the name is part of the S3 object keys.
CITY = 'LosAngeles'

# Build the S3 URI of the city's CSV file, then load it with the 'datetime'
# column parsed as dates and used as the index.
df_file = 'dataframes/%s_dataset.csv' % CITY
df_location = 's3://{}/{}'.format(bucket, df_file)
df = pd.read_csv(
    df_location,
    parse_dates=True,
    index_col='datetime',
)

In [27]:
# Number of columns in the loaded frame (target column included).
df.shape[1]

29

## Feature selection

In [28]:
# Recursive feature elimination (RFE) with a linear regression:
#   1. try every possible number of features and score each choice on a
#      hold-out split,
#   2. refit RFE on the full data with the best count and keep the names of
#      the selected columns in `selected_features_rfe`.

# Candidate predictors: every column except the target. 'date' is excluded
# too, in case it is present as a plain column.
columns = [c for c in df.columns if c not in ['date', 'demand']]
X = df[columns]
y = df["demand"]

# Candidate feature counts: 1 .. number of predictors in X. The bound comes
# from X rather than df, because df also contains the excluded target column.
nof_list = np.arange(1, X.shape[1] + 1)

# Best hold-out score seen so far. Starting at -inf guarantees that a feature
# count is recorded even when no score is positive.
high_score = -np.inf

# Variable to store the optimum number of features
nof = 0
score_list = []

# The split is seeded (random_state=0), so it is identical for every candidate
# count: compute it once instead of once per iteration.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

for n_features in nof_list:
    n_features = int(n_features)
    model = LinearRegression()
    # n_features_to_select is passed by keyword: scikit-learn >= 1.0 no longer
    # accepts it as a positional argument.
    rfe = RFE(model, n_features_to_select=n_features)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    # Fit on the reduced training matrix and score on the hold-out rows.
    model.fit(X_train_rfe, y_train)
    score = model.score(X_test_rfe, y_test)
    score_list.append(score)
    if score > high_score:
        high_score = score
        nof = n_features
print("Optimum number of features: %d" % nof)
print("Score with %d features: %f" % (nof, high_score))

# Initializing RFE model with the best feature count found above
model = LinearRegression()
rfe = RFE(model, n_features_to_select=nof)

# Transforming data using RFE (this time on the full data set)
X_rfe = rfe.fit_transform(X, y)

# Fitting the data to model
model.fit(X_rfe, y)

# rfe.support_ is a boolean mask over X's columns; index the column labels
# with it directly to get the names of the kept features.
selected_features_rfe = X.columns[rfe.support_]
print(selected_features_rfe)

Optimum number of features: 24
Score with 24 features: 0.752086
Index(['dailycoolingdegreedays', 'dailyheatingdegreedays',
       'hourlydewpointtemperature', 'hourlydrybulbtemperature',
       'hourlyprecipitation', 'hourlyrelativehumidity',
       'hourlystationpressure', 'hourlyvisibility', 'hourlycoolingdegrees',
       'hourlyheatingdegrees', 'hourlyskyconditions_BKN',
       'hourlyskyconditions_CLR', 'hourlyskyconditions_FEW',
       'hourlyskyconditions_OVC', 'hourlyskyconditions_SCT', 'dayofweek',
       'quarter', 'month', 'year', 'dayofyear', 'dayofmonth', 'isbusinessday',
       'isholiday', 'daylight'],
      dtype='object')


In [29]:
# Repeats the search for the optimum number of features on X / y (defined in
# the feature-selection cell): for every candidate count, run RFE with a
# linear regression on the training split and score it on the hold-out split.

# Candidate feature counts: 1 .. number of predictors in X. The bound comes
# from X rather than df, because df also contains the target column.
nof_list = np.arange(1, X.shape[1] + 1)

# Best hold-out score seen so far. Starting at -inf guarantees that a feature
# count is recorded even when no score is positive.
high_score = -np.inf

# Variable to store the optimum number of features
nof = 0
score_list = []

# The split is seeded (random_state=0), so it is identical for every candidate
# count: compute it once instead of once per iteration.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

for n_features in nof_list:
    n_features = int(n_features)
    model = LinearRegression()
    # n_features_to_select is passed by keyword: scikit-learn >= 1.0 no longer
    # accepts it as a positional argument.
    rfe = RFE(model, n_features_to_select=n_features)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    # Fit on the reduced training matrix and score on the hold-out rows.
    model.fit(X_train_rfe, y_train)
    score = model.score(X_test_rfe, y_test)
    score_list.append(score)
    if score > high_score:
        high_score = score
        nof = n_features
print("Optimum number of features: %d" % nof)
print("Score with %d features: %f" % (nof, high_score))

Optimum number of features: 24
Score with 24 features: 0.752086


In [30]:
# Persist the selected features together with the target column as a CSV
# object in S3, so that another notebook can continue from here.
selected = df[selected_features_rfe].assign(demand=df['demand'])

# Serialise the frame into an in-memory text buffer.
key = 'dataframes/%s_selectedfeatures.csv' % CITY
csv_buffer = io.StringIO()
selected.to_csv(csv_buffer, compression=None)

# Upload the buffer's contents; the response of put() is the cell's output.
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'CDD03452C0CB2473',
  'HostId': 'BNAotPaXBysz4/4aHt14L6D8Z0lilzwCMxucbT6i4Ao/qLl0E2fUzge0B7zTpHuVaFbMGEZ4Dmw=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'BNAotPaXBysz4/4aHt14L6D8Z0lilzwCMxucbT6i4Ao/qLl0E2fUzge0B7zTpHuVaFbMGEZ4Dmw=',
   'x-amz-request-id': 'CDD03452C0CB2473',
   'date': 'Thu, 16 Jan 2020 17:12:07 GMT',
   'etag': '"e5d314d38888d9b65ec61403d7966230"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"e5d314d38888d9b65ec61403d7966230"'}