In [113]:
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
from sklearn import datasets, linear_model
from sklearn.metrics import mean_absolute_error

base = datetime.strptime('20141201', '%Y%m%d') #30-11-2015
date_list = [base + timedelta(minutes=x) for x in range(0, 525600, 15)]
len(date_list)

timeDf = pd.DataFrame({
    'date': date_list
});
# 1. Create a new column day from date
def getDay(x):
    return x.day
timeDf['day'] = timeDf['date'].map(getDay)

# 2. Create a new column month from date
def getMonth(x):
    return x.month
timeDf['month'] = timeDf['date'].map(getMonth)

# 3. Create a new column year from date
def getYear(x):
    return x.year
timeDf['year'] = timeDf['date'].map(getYear)

# 4. Create a new column hour from date
def getHour(x):
    return x.hour
timeDf['hour'] = timeDf['date'].map(getHour)

# 4. Create a new column hour from date
def getWeekDay(x):
    return x.weekday()
timeDf['weekday'] = timeDf['date'].map(getWeekDay)

#timeDf = timeDf[(timeDf['year'] == 2015) & (timeDf['month'] == 11) & (timeDf['day'] == 7)]
timeDf.tail(672)

Unnamed: 0,date,day,month,year,hour,weekday
34368,2015-11-24 00:00:00,24,11,2015,0,1
34369,2015-11-24 00:15:00,24,11,2015,0,1
34370,2015-11-24 00:30:00,24,11,2015,0,1
34371,2015-11-24 00:45:00,24,11,2015,0,1
34372,2015-11-24 01:00:00,24,11,2015,1,1
34373,2015-11-24 01:15:00,24,11,2015,1,1
34374,2015-11-24 01:30:00,24,11,2015,1,1
34375,2015-11-24 01:45:00,24,11,2015,1,1
34376,2015-11-24 02:00:00,24,11,2015,2,1
34377,2015-11-24 02:15:00,24,11,2015,2,1


In [114]:
def predictDemandFor24Hours(houseNo):
    
    fileName = "Home"+str(houseNo)+"_yr1.csv"
    df = pd.read_csv(fileName,sep=",", names = ["demand"])
    data = pd.concat([df, timeDf], axis=1)

    #One Hot Encoding for categorical features
    dataX = pd.get_dummies(data, columns=["day", "month", "year", "hour",  "weekday"])
    dataX = dataX.drop(columns=['demand', 'date'])
    dataY = pd.DataFrame({
        'demand': data.demand
    })

    trainX = dataX[0:32159]  #34944
    trainY = dataY[0:32159]

    testX = dataX[32160:32840]
    testY = dataY[32160:32840] 

    #print("******** Train Data X Shape:", trainX.shape)
    #print("******** Train Data Y Shape:", trainY.shape)
    #print("******** Test Data X Shape:", testX.shape)
    #print("******** Test Data Y Shape:", testY.shape)
    
    return data, trainX, trainY, testX, testY



# DATA ANALYSIS

In [115]:
# Data Analysis on the original data

def dataAnalysis(data):
    
    groupby_month = data.groupby(['month']).mean()
    plt.plot(groupby_month.demand, color='blue')
    plt.xlabel('Month')
    plt.ylabel('Average Electricity Demand')
    plt.show()

    groupby_weekday = data.groupby(['weekday']).mean()
    plt.plot(groupby_weekday.demand, color='gray')
    plt.xlabel('Weekday')
    plt.ylabel('Average Electricity Demand')
    plt.show()

    groupby_hour = data.groupby(['hour']).mean()
    plt.plot(groupby_hour.demand, color='red')
    plt.xlabel('Hour')
    plt.ylabel('Average Electricity Demand')
    plt.show()

    groupby_day = data.groupby(['day']).mean()
    plt.plot(groupby_day.demand, color='green')
    plt.xlabel('Day')
    plt.ylabel('Average Electricity Demand')
    plt.show()



# NAIVE APPROACH

In [116]:
# Naive Method

def naiveMethod(modelList, maeList, house):
    demandArray = np.asarray(trainY.demand)
    y_predicted = testY.copy()
    y_predicted['baseline'] = demandArray[len(demandArray)-1]

    testingError = mean_absolute_error(testY.demand, y_predicted.baseline)
    y_predicted.baseline.to_csv("House_"+str(house)+"_Baseline.csv", index=False)
    #print(testingError)
    modelList.append("Naive");
    maeList.append(testingError);


# MEAN APPROACH

In [117]:
# Average Approach
def meanApproach(modelList, maeList, house):
    y_predicted = testY.copy()
    y_predicted['average'] = trainY.demand.mean()
    testingError = mean_absolute_error(testY.demand, y_predicted.average)
    y_predicted.average.to_csv("House_"+str(house)+"_MeanApproach.csv", index=False)
    #print(testingError)
    modelList.append("Average");
    maeList.append(testingError);
    

# ROLLING AVERAGE APPROACH

In [145]:
# Rolling Average Approach
def rollingAverage(modelList, maeList, trainY, testY, house):
    
    temp1 = trainY[-60:-1]
    temp1.reset_index()
    

    print(temp.demand)
    
    
    y_predicted = pd.DataFrame({})
    temp['movingAverage'] = temp.demand.rolling(59).mean()
    print(temp.movingAverage)
    testingError = mean_absolute_error(testY.demand, temp.movingAverage)
    y_predicted.movingAverage.to_csv("House_"+str(house)+"_RollingAverage.csv", index=False)
    print("Testing Error", testingError)
    modelList.append("Moving Average");
    maeList.append(testingError);

In [156]:
from sklearn.svm import SVR
def SV(modelList, maeList, trainX, trainY, testX, testY, house):
    
    scaler = preprocessing.StandardScaler().fit(trainX)
    trainX = scaler.transform(trainX)
    
    scaler = preprocessing.StandardScaler().fit(testX)
    testX = scaler.transform(testX)
    
    clf = SVR(gamma='scale', C=1.0, epsilon=0.2)
    model = clf.fit(trainX, trainY.values.ravel()) 
    trainMae = mean_absolute_error(trainY, clf.predict(trainX))
    print("Train MAE: ", trainMae)
    
    # Testing
    y_predicted = clf.predict(testX)
    
    df = pd.DataFrame({
        "prediction": y_predicted
    })
    #df.prediction.to_csv("House_"+str(house)+"_LinearRegression.csv", index=False)
    testMae =  mean_absolute_error(testY, y_predicted)
    print("Test MAE: ", testMae)

    modelList.append("SV Regression");
    maeList.append(testMae);
    

# SIMPLE LINEAR REGRESSION

In [164]:
# Simple Linear Regression

from sklearn import linear_model
from sklearn import preprocessing

def linearRegression(modelList, maeList, trainX, trainY, testX, testY, house):
    # Preprocessing
    scaler = preprocessing.StandardScaler().fit(trainX)
    trainX = scaler.transform(trainX)

    scaler = preprocessing.StandardScaler().fit(testX)
    testX = scaler.transform(testX)

    # Training
    regr = linear_model.Ridge()
    model = regr.fit(trainX, trainY.values.ravel())
    trainMae = mean_absolute_error(trainY, regr.predict(trainX))
    print("Train MAE: ", trainMae)

    # Testing
    y_predicted = regr.predict(testX)
    
    df = pd.DataFrame({
        "prediction": y_predicted
    })
    df.prediction.to_csv("House_"+str(house)+"_RidgeRegression.csv", index=False)
    testMae =  mean_absolute_error(testY, y_predicted)
    print("Test MAE: ", testMae)

    modelList.append("Ridge Regression");
    maeList.append(testMae);


# RANDOM FOREST

In [120]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor

def randomForest(modelList, maeList, trainX, trainY, testX, testY, house):
    
    print("Random Forest Regressor");
    # Training
    regr = RandomForestRegressor()
    model = regr.fit(trainX, trainY.values.ravel())
    trainMae = mean_absolute_error(trainY, regr.predict(trainX))

    print("Train MAE: ", trainMae)

    # Testing
    y_predicted = regr.predict(testX)
    df = pd.DataFrame({
        "prediction": y_predicted
    })
    
    print(y_predicted);
    df.prediction.to_csv("House_"+str(house)+"_RandomForest.csv", index=False)
    testMae =  mean_absolute_error(testY, y_predicted)
    #print("Test MAE: ", testMae)

    modelList.append("Random Forest");
    maeList.append(testMae);

# SARIMA

In [121]:
# SARIMA
import statsmodels.api as sm

def sarima(modelList, maeList, trainY, testY, house):
    
    fit1 = sm.tsa.statespace.SARIMAX(trainY.demand).fit()
    y_predicted = fit1.predict(start=34945, end=35040, dynamic=True)
    y_predicted.to_csv("House_"+str(house)+"_Sarima.csv", index=False)
    testingError = mean_absolute_error(testY.demand, y_predicted)
    #print(testingError)
    modelList.append("Arima");
    maeList.append(testingError);

# FINAL RESULT

In [158]:
modelList = []
maeList = []
house = 4
data, trainX, trainY, testX, testY = predictDemandFor24Hours(house)
SV(modelList, maeList, trainX, trainY, testX, testY, house)

TypeError: must be real number, not str

In [165]:
results = pd.DataFrame({
        'Approach': ["Naive", "Mean", "Rolling Average", "Linear Regression", "Random Forest"]
        })
for house in range(1,11):
    modelList = []
    maeList = []
    
    data, trainX, trainY, testX, testY = predictDemandFor24Hours(house)
    #dataAnalysis(data)
    #naiveMethod(modelList, maeList, house)
    #meanApproach(modelList, maeList, house)
    #rollingAverage(modelList, maeList,trainY, testY, house)
    linearRegression(modelList, maeList, trainX, trainY, testX, testY, house)
    #randomForest(modelList, maeList, trainX, trainY, testX, testY, house)
    #sarima(modelList, maeList, trainY, testY, house)
    
    houseNo = "house"+str(house)
    #results[houseNo] = maeList
    
#results

Train MAE:  0.7429666488323223
Test MAE:  0.7628830187665839
Train MAE:  0.8524480989469203
Test MAE:  1.0171628078336223
Train MAE:  0.7025870915109155
Test MAE:  0.8466682431932963
Train MAE:  0.6163162653814932
Test MAE:  0.8212716396003978
Train MAE:  0.8164479863679118
Test MAE:  0.9780769525476551
Train MAE:  0.7476662773379293
Test MAE:  0.9596273533911004
Train MAE:  0.8199955571989531
Test MAE:  0.964489417238299
Train MAE:  0.9409587369663412
Test MAE:  0.9970264005599163
Train MAE:  0.5535376174647516
Test MAE:  0.629615603882038
Train MAE:  0.6417403123469927
Test MAE:  0.7711067669437983


In [96]:
modelList = []
maeList = []
house = 4
data, trainX, trainY, testX, testY = predictDemandFor24Hours(house)
randomForest(modelList, maeList, trainX, trainY, testX, testY, house)

Random Forest Regressor
Train MAE:  0.39067176356083455
[0.43776218 0.43776218 0.43776218 0.43776218 0.3661119  0.3661119
 0.3661119  0.3661119  0.34679199 0.34679199 0.34679199 0.34679199
 0.57305011 0.57305011 0.57305011 0.57305011 0.29146077 0.29146077
 0.29146077 0.29146077 0.37572835 0.37572835 0.37572835 0.37572835
 0.3573926  0.3573926  0.3573926  0.3573926  0.82761545 0.82761545
 0.82761545 0.82761545 1.200245   1.200245   1.200245   1.200245
 0.88199504 0.88199504 0.88199504 0.88199504 1.00022287 1.00022287
 1.00022287 1.00022287 0.24672082 0.24672082 0.24672082 0.24672082
 0.34421619 0.34421619 0.34421619 0.34421619 0.73009928 0.73009928
 0.73009928 0.73009928 1.06723482 1.06723482 1.06723482 1.06723482
 0.64353544 0.64353544 0.64353544 0.64353544 0.98343326 0.98343326
 0.98343326 0.98343326 0.91884253 0.91884253 0.91884253 0.91884253
 1.00995325 1.00995325 1.00995325 1.00995325 0.82278925 0.82278925
 0.82278925 0.82278925 1.0283356  1.0283356  1.0283356  1.0283356
 0.4438155