Comparing Gradient Boosting Regression against KNN Regression for forecasting smart grid load time series

Imports

In [10]:
#import time
import time
#import models
from models import *
#import utilities
from utility import *
#import formatter
from formatter import *
import os, pickle
from pathlib import Path

Load dataset

In [11]:
#number of decimal places used when rounding reported metrics
precision = 6

#the data directory sits one level above the current working directory
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, "..", "data")
temps_path = os.path.join(data_dir, "Temp_history_final.csv")
loads_path = os.path.join(data_dir, "Load_history_final.csv")

#read the temperature history first, then the load history
temps, loads = (pd.read_csv(csv_path) for csv_path in (temps_path, loads_path))

Set mappings between temperature measurement stations and load zones

In [12]:
#best mapping determined by mapping search:
#load zone -> top 3 stations as determined by KNN regression
zone_station_ranking = {
    1: [1, 3, 7],
    2: [7, 6, 1],
    3: [9, 8, 11],
    4: [11, 9, 7],
    5: [2, 10, 11],
    6: [9, 8, 11],
    7: [9, 11, 8],
    8: [10, 2, 9],
    9: [4, 8, 9],
    10: [11, 9, 10],
    11: [8, 4, 7],
    12: [6, 7, 1],
    13: [1, 3, 6],
    14: [8, 4, 9],
    15: [5, 3, 10],
    16: [6, 7, 4],
    17: [9, 11, 8],
    18: [4, 6, 7],
    19: [1, 2, 7],
    20: [9, 11, 8],
}
#expand into the [zone, [stations]] pair list, keeping zone order
maps = [[zone, stations] for zone, stations in zone_station_ranking.items()]

#features to add
add_features = [
    'day_of_year',
    'year',
    'month',
    'indexes',
    'day_of_week',
]

#validation percent and test percent
val_percent, test_percent = .1, .1

#save the non-hour columns of the load table so they can be recovered later
loads_matrix = loads.to_numpy()
load_zone_metadata = backup(loads_matrix)

Format dataset and obtain train-validate-test split

In [13]:
#apply the station-to-zone mapping to build the formatted dataset
mappings = format_dataset(
    temps,
    loads,
    maps,
    summary='mean',
    features=add_features,
    preprocess=0,
)

#obtain the train-validate-test split
partitions = train_val_test(mappings, val_percent, test_percent, mode=1)

#unpack the six partitions: features (X) first, then targets (y)
(X_train, X_validate, X_test,
 y_train, y_validate, y_test) = partitions

Train, validate, and test Gradient Boosting Regressor

In [14]:
#Gradient Boosting

#hyperparameter settings
n_estimators = 125
learning_rate = .1
subsample = .8
#NOTE(review): no random seed is passed here; with subsample < 1 the fit is
#presumably stochastic -- confirm whether fit_gradient_boosting fixes a seed

#time.perf_counter() is the monotonic, high-resolution clock meant for measuring
#elapsed intervals; time.time() is wall-clock time and can jump if the system
#clock is adjusted, which would corrupt the reported durations
gb_training_start_time = time.perf_counter()
models = fit_gradient_boosting(X_train,y_train,n_estimators,learning_rate,subsample,mode=1)
#stop timer
gb_training_time = time.perf_counter()-gb_training_start_time

#calculate scores on the training and validation sets (outside the timed regions)
train_score = score_gradient_boosting(models, X_train, y_train)
validation_score = score_gradient_boosting(models, X_validate, y_validate)

#the testing time covers the whole scoring call on the test set
gb_testing_start_time = time.perf_counter()
test_score = score_gradient_boosting(models, X_test, y_test)
gb_testing_time = time.perf_counter()-gb_testing_start_time

Display metrics from Gradient Boosting Regressor training, validation, and testing

In [15]:
#report rows as (label, value, suffix); every value is rounded to `precision`
gb_training_rows = (
    ("Training time: ", gb_training_time, " seconds"),
    ("Training average R^2: ", train_score[0], ""),
    ("Training average RPE: ", train_score[1], ""),
    ("Validation average R^2: ", validation_score[0], ""),
    ("Validation average RPE: ", validation_score[1], ""),
)
gb_inference_rows = (
    ("Testing time: ", gb_testing_time, " seconds"),
    ("Testing average R^2: ", test_score[0], ""),
    ("Testing average RPE: ", test_score[1], ""),
)

#training and validation scores, and training time
print("Gradient Boosting Regressor training results:")
print("\n".join(label+str(round(value,precision))+suffix
                for label, value, suffix in gb_training_rows))
print()
#test scores, and testing time
print("Gradient Boosting Regressor inference results:")
print("\n".join(label+str(round(value,precision))+suffix
                for label, value, suffix in gb_inference_rows))

Gradient Boosting Regressor training results:
Training time: 87.395263 seconds
Training average R^2: 0.926761
Training average RPE: 5.779117
Validation average R^2: 0.883252
Validation average RPE: 7.435713

Gradient Boosting Regressor inference results:
Testing time: 0.253297 seconds
Testing average R^2: 0.877621
Testing average RPE: 7.334323


Display the ten largest prediction errors by relative percentage error (RPE)

In [16]:
#header followed by one blank line
print("Top prediction errors sorted in descending order by absolute RPE:", end="\n\n")

#NOTE: `models` must still hold the gradient boosting models here; the KNN
#training cell rebinds the same name, so run this cell before that one
predictions = predict_gradient_boosting(models, X_test)
#rank the test-set predictions by error and print out the top 10
bad_predictions = worst_predictions(predictions, y_test, load_zone_metadata)
print(bad_predictions)

Top prediction errors sorted in descending order by absolute RPE:

  Zone  Year Month Day Hour Predicted load True load          RPE
0   11  2006    10  28   23           3060        18     -16900.0
1   11  2006    10  28   24           2788        37 -7435.135135
2   19  2006     9   6   10         488979     26977 -1712.577381
3   19  2006     9   6   13         434808     25601 -1598.402406
4   19  2006     9   6   15         482674     29179 -1554.182803
5   19  2006     9   6   11         483812     30268 -1498.427382
6   19  2006     9   6   12         429321     26944 -1493.382571
7   19  2006     9   6   16         484691     31122 -1457.390271
8   19  2006     9   6   14         426278     29299 -1354.923376
9   19  2006     6  23    5         686510     57564 -1092.603016


Train, validate, and test KNN Regressor

In [17]:
#KNN

#hyperparameter settings
n_neighbors = 5
weights = 'distance'
p = 1

#time.perf_counter() is the monotonic, high-resolution clock meant for measuring
#elapsed intervals; time.time() is wall-clock time and can jump if the system
#clock is adjusted, which matters for the sub-second durations measured here
knn_training_start_time = time.perf_counter()
models = fit_knn(X_train,y_train,n_neighbors,weights,p,mode=1)
#stop timer
knn_training_time = time.perf_counter()-knn_training_start_time

#calculate scores on the training and validation sets (outside the timed regions)
train_score = score_knn(models, X_train, y_train)
validation_score = score_knn(models, X_validate, y_validate)

#the testing time covers the whole scoring call on the test set
knn_testing_start_time = time.perf_counter()
test_score = score_knn(models, X_test, y_test)
knn_testing_time = time.perf_counter()-knn_testing_start_time

Display metrics from KNN Regressor training, validation, and testing

In [18]:
#report rows as (label, value, suffix); every value is rounded to `precision`
knn_training_rows = (
    ("Training time: ", knn_training_time, " seconds"),
    ("Training average R^2: ", train_score[0], ""),
    ("Training average RPE: ", train_score[1], ""),
    ("Validation average R^2: ", validation_score[0], ""),
    ("Validation average RPE: ", validation_score[1], ""),
)
knn_inference_rows = (
    ("Testing time: ", knn_testing_time, " seconds"),
    ("Testing average R^2: ", test_score[0], ""),
    ("Testing average RPE: ", test_score[1], ""),
)

#training and validation scores, and training time
print("KNN Regressor training results:")
print("\n".join(label+str(round(value,precision))+suffix
                for label, value, suffix in knn_training_rows))
print()
#test scores, and testing time, followed by a trailing blank line
print("KNN Regressor inference results:")
print("\n".join(label+str(round(value,precision))+suffix
                for label, value, suffix in knn_inference_rows))
print()

KNN Regressor training results:
Training time: 0.491 seconds
Training average R^2: 1.0
Training average RPE: 0.000485
Validation average R^2: 0.804616
Validation average RPE: 9.658034

KNN Regressor inference results:
Testing time: 0.35971 seconds
Testing average R^2: 0.818024
Testing average RPE: 8.865103



Display the ten largest prediction errors by relative percentage error (RPE)

In [19]:
#header followed by one blank line
print("Top prediction errors sorted in descending order by absolute RPE:", end="\n\n")

#NOTE: `models` is expected to hold the KNN models fitted in the KNN training cell
predictions = predict_knn(models, X_test)
#rank the test-set predictions by error and print out the top 10,
#followed by a trailing blank line
bad_predictions = worst_predictions(predictions, y_test, load_zone_metadata)
print(bad_predictions, end="\n\n")

Top prediction errors sorted in descending order by absolute RPE:

  Zone  Year Month Day Hour Predicted load True load          RPE
0   11  2006    10  28   23           2988        18     -16500.0
1   11  2006    10  28   24           2759        37 -7356.756757
2   19  2006     9   6   10         483339     26977 -1691.670682
3   19  2006     9   6   13         429372     25601 -1577.168861
4   19  2006     9   6   15         465295     29179 -1494.622845
5   19  2006     9   6   16         482193     31122 -1449.363794
6   19  2006     9   6   11         446242     30268 -1374.302894
7   19  2006     9   6   14         430956     29299 -1370.889791
8   19  2006     9   6   12         382158     26944 -1318.341746
9   19  2006     3  15   17         636777     47886  -1229.77697

