In [210]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display, HTML, display_html
import seaborn as sns
import datetime as dt
import tensorflow as tf
import IPython
import holidays
from datetime import date

from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold

from sklearn.metrics import mean_squared_error,mean_absolute_error

# utilities for randomized hyperparameter search with repeated k-fold cross-validation
from scipy.stats import loguniform
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV


In [184]:
# Read the three raw tables in one pass: the NSW demand/price records and the
# maximum- and minimum-temperature files.
df, dfmx, dfmn = (
    pd.read_csv(csv_path)
    for csv_path in (
        'main_data/NSW.csv',
        'main_data/NSW_MAX_TEMP/NSW_Data.csv',
        'main_data/NSW_MIN_TEMP/NSW_Data.csv',
    )
)
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,b'REGION,SETTLEMENTDATE,TOTALDEMAND,RRP,PERIODTYPE
0,0,1,nNSW1,2016/12/01 00:30:00,7100.66,52.97,TRADE
1,1,2,nNSW1,2016/12/01 01:00:00,6818.0,42.08,TRADE
2,2,3,nNSW1,2016/12/01 01:30:00,6538.58,50.16,TRADE
3,3,4,nNSW1,2016/12/01 02:00:00,6367.53,47.64,TRADE
4,4,5,nNSW1,2016/12/01 02:30:00,6213.0,45.5,TRADE


In [185]:
# Keep only the timestamp, demand and price columns. The explicit copy makes
# `df` an independent frame, so the column assignment below no longer writes
# into a slice of the raw table (the pattern pandas reports with
# SettingWithCopyWarning).
df = df[['SETTLEMENTDATE','TOTALDEMAND','RRP']].copy()
# Parse the timestamps and assign the resulting Series directly to the column
# so that it ends up with a datetime dtype.
# NOTE(review): the raw preview shows slash-separated dates ("2016/12/01 ...")
# while this format string uses dashes; a strict parser would reject that
# mismatch -- confirm the format against the file and the installed pandas.
df['SETTLEMENTDATE'] = pd.to_datetime(df['SETTLEMENTDATE'], exact=True, cache=True, format='%Y-%m-%d %H:%M:%S')
# Order the records chronologically.
df = df.sort_values(by='SETTLEMENTDATE')
df.head()

Unnamed: 0,SETTLEMENTDATE,TOTALDEMAND,RRP
201543,1999-01-01 00:30:00,6820.56,27.64
201544,1999-01-01 01:00:00,6421.34833,26.64
201545,1999-01-01 01:30:00,5991.45,25.81
201546,1999-01-01 02:00:00,5663.07333,19.94
201547,1999-01-01 02:30:00,5384.04667,19.48


In [186]:
# Split every settlement timestamp into its calendar date and time of day,
# carry the demand along as `consumption`, and discard rows with missing values.
df = pd.DataFrame(
    {
        "date": df.SETTLEMENTDATE.dt.date,
        "time": df.SETTLEMENTDATE.dt.time,
        "consumption": df.TOTALDEMAND,
    }
).dropna()

In [187]:
# Rename the date column to `ds`; `consumption` already has its final name.
df = df.rename(columns = {'date': 'ds'})

# Average the demand readings of each calendar day. Only the numeric
# `consumption` column is aggregated: the `time` column holds datetime.time
# objects, which cannot be averaged (older pandas dropped such columns
# silently, newer releases raise instead).
df_example = df.groupby(by = 'ds')[['consumption']].mean()

# Change index to datetime
df_example.index = pd.to_datetime(df_example.index)

# Sort the values
df_example = df_example.sort_index(ascending = True)

# Fill values with 0
df_example = df_example.fillna(value = 0)

# Drop the last row.
# NOTE(review): presumably because the final day is only partially recorded -- confirm.
df_example = df_example.head(-1)

df_example

Unnamed: 0_level_0,consumption
ds,Unnamed: 1_level_1
1999-01-01,6118.070460
1999-01-02,6356.287778
1999-01-03,6556.256215
1999-01-04,7777.891632
1999-01-05,8085.142431
...,...
2020-09-12,6913.052500
2020-09-13,6817.360833
2020-09-14,7132.939792
2020-09-15,7313.942083


In [188]:
# --- Daily maximum temperature, indexed by calendar date ---
dfmx['date'] = pd.to_datetime(dfmx[['Year', 'Month', 'Day']])
dfmx = dfmx.rename(columns = {'Maximum temperature (Degree C)': 'max_temp'})
# NOTE(review): 'sum' equals the daily reading only when there is exactly one
# row per date, and an all-missing day sums to 0 rather than NaN -- confirm
# this is the intended aggregation.
df_max = dfmx.groupby(by = 'date').agg({'max_temp': 'sum'})
# Compare the DatetimeIndex with the Timestamp bounds directly. Converting the
# index to datetime.date objects first (as before) relies on date-vs-Timestamp
# comparisons, which newer pandas releases no longer support.
max_mask = (df_max.index >= df_example.index[0]) & (df_max.index <= df_example.index[-1])

# --- Daily minimum temperature, same treatment ---
dfmn['date'] = pd.to_datetime(dfmn[['Year', 'Month', 'Day']])
dfmn = dfmn.rename(columns = {'Minimum temperature (Degree C)': 'min_temp'})
df_min = dfmn.groupby(by = 'date').agg({'min_temp': 'sum'})
min_mask = (df_min.index >= df_example.index[0]) & (df_min.index <= df_example.index[-1])

# Assign the temperature Series (not a one-column DataFrame); the values are
# aligned with df_example on the date index.
df_example['max_temp'] = df_max.loc[max_mask, 'max_temp']
df_example['min_temp'] = df_min.loc[min_mask, 'min_temp']
# Mid-point of the daily maximum and minimum.
df_example['avg'] = df_example[['max_temp', 'min_temp']].mean(axis=1)

# Holiday added
# NOTE(review): CountryHoliday(..., prov=...) is the older `holidays` API --
# confirm it is still available in the installed version of the package.
aus_holidays = holidays.CountryHoliday('AUS', prov='NSW')
# Build the 0/1 flag in one pass. The previous per-row `df['isHoliday'][i] = ...`
# was chained assignment, which pandas warns about and which is not guaranteed
# to write back into the frame.
df_example['isHoliday'] = [int(day in aus_holidays) for day in df_example.index]

df_example.head()

Unnamed: 0_level_0,consumption,max_temp,min_temp,avg,isHoliday
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1999-01-01,6118.07046,25.7,20.9,23.3,1
1999-01-02,6356.287778,28.3,21.0,24.65,0
1999-01-03,6556.256215,29.8,19.8,24.8,0
1999-01-04,7777.891632,30.9,21.5,26.2,0
1999-01-05,8085.142431,31.1,23.5,27.3,0


In [189]:
# Calendar features derived from the daily DatetimeIndex.
df_example['day'] = df_example.index.day
df_example['dayofweek'] = df_example.index.dayofweek
df_example['dayofyear'] = df_example.index.dayofyear
df_example['month'] = df_example.index.month
# ISO week number. `DatetimeIndex.weekofyear` is deprecated and missing from
# newer pandas releases; `isocalendar().week` is its documented replacement.
# The explicit cast turns pandas' nullable UInt32 into plain int64, matching
# the other calendar columns.
df_example['weekofyear'] = df_example.index.isocalendar().week.astype('int64').to_numpy()

In [190]:
# df_example1=df_example[0:7670]
# df_example2=df_example[7670:7930]

In [191]:
# Preview the daily feature table.
# The two previews below stay disabled: the cell that would create
# df_example1 / df_example2 is itself commented out.
# df_example1.head()
# df_example2.head()
df_example.head()

Unnamed: 0_level_0,consumption,max_temp,min_temp,avg,isHoliday,day,dayofweek,dayofyear,month,weekofyear
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1999-01-01,6118.07046,25.7,20.9,23.3,1,1,4,1,1,53
1999-01-02,6356.287778,28.3,21.0,24.65,0,2,5,2,1,53
1999-01-03,6556.256215,29.8,19.8,24.8,0,3,6,3,1,53
1999-01-04,7777.891632,30.9,21.5,26.2,0,4,0,4,1,1
1999-01-05,8085.142431,31.1,23.5,27.3,0,5,1,5,1,1


In [192]:
# fig = plt.figure(figsize=(18, 14))
# corr = df_example.corr()
# c = plt.pcolor(corr)
# plt.yticks(np.arange(0.5, len(corr.index), 1), corr.index)
# plt.xticks(np.arange(0.5, len(corr.columns), 1), corr.columns)
# fig.colorbar(c)

In [193]:
# sns.pairplot(df_example, vars=['consumption','max_temp','min_temp','avg', 'isHoliday','day'	,'dayofweek','dayofyear','month','weekofyear'])

In [194]:
# scaler = MinMaxScaler()
# scaler.fit(df_example)
# df_example = df_example.astype(int)
# matrixTransform=scaler.transform(df_example)
# matrixTransform[0:3]

In [195]:
# X = [item[1:10] for item in matrixTransform]
# Y = [item[0] for item in matrixTransform]

In [196]:
# Cast every column to integers before building the model arrays.
# NOTE(review): this drops the fractional part of the temperatures and of the
# consumption target, and it fails if any value is missing -- confirm the
# rounding is intended.
df_example = df_example.astype(int)

# Predictor columns, in the order they appear in X.
feature_cols = [
    'max_temp', 'min_temp', 'avg',
    'isHoliday',
    'day', 'dayofweek', 'dayofyear', 'month', 'weekofyear',
]
X = np.array(df_example[feature_cols])
# The double brackets keep Y two-dimensional (one column).
Y = np.array(df_example[['consumption']])

In [197]:
# Shuffled hold-out split: 80% of the rows for training, the rest for testing,
# with a fixed seed so the partition is repeatable.
split_options = dict(train_size=0.8, random_state=111, shuffle=True)
trnX, tesX, trnY, tesY = train_test_split(X, Y, **split_options)
# trnX,valX,trnY,valY = train_test_split(trnX,trnY, train_size=0.8, random_state=111, shuffle=True)

In [198]:
def evalutionMatrix(y_test,y_pred):
    """Print MSE, RMSE and MAE of `y_pred` measured against `y_test`.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values for the same samples.

    Returns
    -------
    None
        The three metrics are only printed.
    """
    # All three metrics use the `y_test` argument. The MSE and RMSE lines used
    # to read the notebook-global `tesY`, so they ignored the argument and were
    # wrong for any other evaluation set.
    mse = mean_squared_error(y_test, y_pred)
    print('Mean Squared Error  : ' + str(mse))
    print('RMean Squared Error : ' + str(np.sqrt(mse)))
    print('Mean Absolute Error : ' + str(mean_absolute_error(y_test,y_pred)))

In [199]:
from sklearn.dummy import DummyRegressor

# Baseline: always predict the mean of the training target, then report the
# error metrics on the hold-out split.
clf = DummyRegressor(strategy='mean')
clf.fit(trnX, trnY)
y_pred = clf.predict(tesX)
evalutionMatrix(tesY, y_pred)

Mean Squared Error  : 689950.2471036618
RMean Squared Error : 830.6324380275922
Mean Absolute Error : 674.4346830479178


# Randomized Search Implementation

In [214]:
# define evaluation
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

#SVR
# SVR is sensitive to feature scale, so the features are standardised inside a
# pipeline. The scaler is then re-fitted on the training part of every CV
# fold, and the solver no longer has to cope with raw, differently scaled
# inputs.
model = make_pipeline(StandardScaler(), svm.SVR())

# define search space (SVR parameter name -> candidate values)
svr_space = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [3, 5],
    'gamma': ['scale', 'auto'],
    'coef0': [0.0],
    'tol': [0.001],
    'C': [1, 50, 100, 500, 1000],
    'epsilon': [0.1],
    'shrinking': [True],
    'cache_size': [200],
    'verbose': [False],
    'max_iter': [-1],
}
# make_pipeline names the SVR step 'svr'; pipeline parameters are addressed as
# '<step>__<parameter>'.
space = {'svr__' + name: values for name, values in svr_space.items()}

# define search
# The space above contains 4 * 2 * 2 * 5 = 80 combinations, fewer than n_iter,
# so every combination is evaluated once.
search = RandomizedSearchCV(model, space, n_iter=500, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv, random_state=1)

# Tune on the training split only, so the hold-out rows stay unseen, and pass
# the target as a 1-D array as the estimator expects.
result = search.fit(trnX, np.ravel(trnY))
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import RandomForestRegressor

#RandomForestRegressor
model = RandomForestRegressor()

# define search space
# Only parameters that RandomForestRegressor accepts are listed. The SVR-only
# entries copied from the previous cell (coef0, tol, C, epsilon, shrinking,
# cache_size, max_iter) made the search fail with an invalid-parameter error.
space = dict()
space['n_estimators'] = [10,50,100,200]
# NOTE(review): newer scikit-learn releases name these criteria
# 'squared_error' / 'absolute_error' -- confirm against the installed version.
space['criterion'] = ['mse','mae']
space['random_state'] = [173]
space['verbose'] = [False]

# define search
# The space holds 4 * 2 = 8 combinations, fewer than n_iter, so every
# combination is evaluated once. `cv` is the RepeatedKFold splitter defined in
# the SVR search cell.
search = RandomizedSearchCV(model, space, n_iter=500, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv, random_state=1)

# Tune on the training split only, so the hold-out rows stay unseen, and pass
# the target as a 1-D array as the estimator expects.
result = search.fit(trnX, np.ravel(trnY))
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)