In [None]:
#loading dependencies

import sys
import numpy as np
import pandas as pd                                               #for data processing
import matplotlib.pyplot as plt
import seaborn as sns                                             #statistical plotting
from sklearn.model_selection import train_test_split
# KFold is imported from sklearn.model_selection: the older
# sklearn.cross_validation module was removed in scikit-learn 0.20, so the
# previous import path raises ImportError on current releases.
# NOTE(review): model_selection.KFold takes n_splits instead of (n, n_folds);
# adjust any call site that still uses the old constructor.
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler                  #normalization
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score

#keras deep learning
import itertools
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Activation
# NOTE(review): np_utils is presumably unavailable in recent Keras releases;
# confirm against the installed version and drop it if nothing uses it.
from keras.utils import np_utils, to_categorical
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
# Conv1D / MaxPooling1D are exported directly from keras.layers; the
# keras.layers.convolutional submodule is not present in newer Keras.
from keras.layers import Conv1D, MaxPooling1D


In [None]:
# Load the semicolon-separated power-consumption log with pandas.

# "Date" and "Time" are parsed together into one datetime column named 'dt',
# which is then used as the index. Both 'nan' and '?' are read as missing.
# NOTE(review): infer_datetime_format and the dict form of parse_dates are
# deprecated in pandas 2.x; revisit this call when upgrading pandas.
read_options = dict(
    sep = ';',
    parse_dates = {'dt' : ["Date", "Time"]},
    infer_datetime_format = True,
    low_memory = False,
    na_values = ['nan', '?'],
    index_col = "dt",
)
data = pd.read_csv('../input/household_power_consumption.txt', **read_options)

In [None]:
# Preview the first rows to check that parsing and the 'dt' index look right.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# List the column labels of the loaded frame.

data.columns

**Dealing with columns that have NaN**

In [None]:
# Positional index of every column that contains at least one NaN.
# The column count is read from the frame itself instead of being hard-coded,
# so the check keeps working if the set of columns changes upstream.
columns_with_nan = [
    position
    for position in range(data.shape[1])
    if data.iloc[:, position].isnull().any()
]

print(columns_with_nan)

In [None]:
# Fill every NaN with the mean of its own column.
# One vectorised call covers all columns, however many there are, and
# rebinding `data` (rather than assigning column by column) keeps the cell
# safe to re-run.
# NOTE(review): the means are computed over the whole series, including the
# rows that are later held out for testing.
data = data.fillna(data.mean(numeric_only=True))

# Check that no NaN values are left: every per-column count should be 0.
data.isnull().sum()

**Data visualization techniques and an introduction to resampling**

In [None]:
# Group Global_active_power into calendar days once, then plot two aggregates.
daily_power = data.Global_active_power.resample('D')

# Daily total.
daily_power.sum().plot(title = "Global active power resampled over day (sum)")
plt.tight_layout()
plt.show()  # show now so the next plot is drawn as a separate graph

# Daily average.
daily_power.mean().plot(title = "Global active power resampled over day (mean)", color = "red")
plt.tight_layout()
plt.show()

Comparison of the daily mean of each feature

In [None]:
# One subplot per column, each showing that column's daily mean.
daily_means = data.resample("D").mean()
values = daily_means.values

# Column positions are derived from the data instead of a hard-coded list,
# so the figure adapts to however many columns the frame has.
features = list(range(values.shape[1]))
groups = features

#plotting each feature column
plt.figure(figsize = (15, 10))
for row, group in enumerate(groups, start = 1):
    plt.subplot(len(groups), 1, row)
    plt.plot(values[:, group])
    # Titles come from the resampled frame so they always match the plotted column.
    plt.title(daily_means.columns[group], y = 0.75, loc = "right")

plt.show()

In the plots above, notice how similar the Global Active Power and Global Intensity curves are.

In [None]:
# frame a sequence as a supervised learning problem
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a time series as a table of lagged and forecast columns.

    Every row of the result holds the observations at steps t-n_in .. t-1
    followed by the observations at steps t .. t+n_out-1. Columns are named
    ``var<k>(t-<i>)``, ``var<k>(t)`` and ``var<k>(t+<i>)``, where ``k`` is the
    1-based position of the variable in ``data``.

    Parameters
    ----------
    data : anything accepted by ``pd.DataFrame``
        A flat list or 1-D array is treated as a single variable; a 2-D
        array or list of rows has one variable per column.
    n_in : int
        Number of lagged steps to include as inputs.
    n_out : int
        Number of steps (starting at t) to include as outputs.
    dropnan : bool
        If True, drop every row that contains a NaN. This removes the edge
        rows produced by shifting, and also any row that was already
        incomplete in ``data``.

    Returns
    -------
    pd.DataFrame
        One column per (variable, step) pair, lagged columns first.
    """
    dff = pd.DataFrame(data)
    # Count variables from the frame itself: checking type(data)/data.shape
    # mislabels a list of rows as one variable and fails on 1-D arrays.
    n_vars = dff.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(dff.shift(lag))
        names.extend('var%d(t-%d)' % (j + 1, lag) for j in range(n_vars))
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(dff.shift(-step))
        if step == 0:
            names.extend('var%d(t)' % (j + 1) for j in range(n_vars))
        else:
            names.extend('var%d(t+%d)' % (j + 1, step) for j in range(n_vars))
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [None]:
# Downsample to one row per hour; each value is the mean of that hour's readings.
resampled = data.resample("h").mean()
# (rows, columns) after resampling.
resampled.shape

In [None]:
# NOTE(review): DataFrame and concat are not referenced by name in the
# visible cells; kept in case other cells rely on them.
from pandas import DataFrame
from pandas import concat

values = resampled.values
n_features = values.shape[1]

#normalization
# Scale every column to the [0, 1] range.
# NOTE(review): the scaler is fit on the full series, i.e. before the
# train/test split, so test-period minima/maxima influence the scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

# One lag step in, one step out: columns are all variables at t-1 followed
# by all variables at t.
reframed = series_to_supervised(scaled, n_in=1, n_out=1, dropnan=True)

# Keep the n_features lagged inputs plus var1(t) as the single target and
# drop the remaining time-t columns. The cut-off is derived from the feature
# count instead of hard-coded positions.
reframed = reframed.drop(reframed.columns[n_features + 1:], axis=1)
reframed.head()


In [None]:
#train and test split

def _add_timestep_axis(matrix):
    """Turn a 2D [samples, features] array into 3D [samples, 1, features]."""
    n_samples, n_inputs = matrix.shape
    return matrix.reshape((n_samples, 1, n_inputs))

values = reframed.values

# The split is positional: the first 365*24 rows train the model and all
# remaining rows are held out for testing.
n_train_time = 365*24
train, test = values[:n_train_time, :], values[n_train_time:, :]

# The last column is the target; every column before it is a model input.
train_y = train[:, -1]
test_y = test[:, -1]

# reshape input to be 3D [samples, timesteps, features]
train_X = _add_timestep_axis(train[:, :-1])
test_X = _add_timestep_axis(test[:, :-1])
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

In [None]:
#model

# Layer sizes. layers[1] and layers[2] are the unit counts of the two LSTM
# layers and layers[3] is the size of the Dense output; layers[0] is not
# used by any layer below.
layers = [1, 50, 100, 1]

model = Sequential()

# First recurrent layer. It previously had a hard-coded single unit, which
# left layers[1] unused and squeezed the whole input through one value before
# the 100-unit layer; it now takes its size from the list.
# return_sequences=True hands the full sequence to the next LSTM.
model.add(LSTM(layers[1], input_shape=(train_X.shape[1], train_X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))

# Second recurrent layer returns only its final output.
model.add(LSTM(layers[2], return_sequences=False))
model.add(Dropout(0.2))

# Single linear output for the regression target.
model.add(Dense(layers[3]))
model.add(Activation("linear"))

model.compile(loss='mean_squared_error', optimizer='adam')
 

In [None]:
# Train for 20 epochs in batches of 70 samples without shuffling.
# The test split is passed as validation data, so the per-epoch validation
# loss is measured on the same rows that are later used for the RMSE.
history = model.fit(
    train_X,
    train_y,
    epochs=20,
    batch_size=70,
    validation_data=(test_X, test_y),
    verbose=2,
    shuffle=False,
)

In [None]:
# make a prediction
yhat = model.predict(test_X)

# Flatten the inputs from [samples, 1, features] to [samples, features] under
# a new name. test_X and test_y are left untouched, so this cell can be
# re-run without handing a 2D array to model.predict.
test_X_2d = test_X.reshape((test_X.shape[0], test_X.shape[2]))

# The scaler was fit on all feature columns with the target in column 0, so
# inverse_transform needs a matrix of that width: the value to invert goes in
# column 0 and the remaining input columns fill the rest.
other_features = test_X_2d[:, 1:]

# invert scaling for forecast
inv_yhat = np.concatenate((yhat, other_features), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)[:, 0]

# invert scaling for actual
actual = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((actual, other_features), axis=1)
inv_y = scaler.inverse_transform(inv_y)[:, 0]

# calculate RMSE
rmse = np.sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)