In [151]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

### Load dataset (generation mix and weather data joined by datetime)

In [152]:
# Path (relative to the notebook's working directory) of the hourly California CSV;
# per the file name it spans 2020-01-01 to 2021-12-31.
data_file = "scraping/california/california_full_data_hourly_20200101_20211231.csv"

In [153]:
# The first CSV column holds the hourly timestamps: parse it as datetimes and
# use it as the frame's index so later cells can derive calendar features from it.
data = pd.read_csv(
    data_file,
    index_col=[0],
    parse_dates=[0],
)
data

Unnamed: 0_level_0,Solar,Wind,Geothermal,Biomass,Biogas,Small hydro,Coal,Nuclear,Batteries,Imports,Other,Natural Gas,Large Hydro,tempC,uvIndex,WindGustKmph,cloudcover,humidity,precipMM
date_time_hourly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2020-01-01 00:00:00,-33.000000,797.833333,650.083333,299.750000,229.500000,210.416667,9.916667,2273.333333,1.666667,8321.750000,0.0,8247.666667,1345.583333,6,3,4,7,59,0.0
2020-01-01 01:00:00,-33.666667,986.750000,648.083333,302.416667,231.000000,209.750000,8.250000,2272.666667,-2.500000,7684.833333,0.0,8072.500000,1230.416667,6,3,4,8,59,0.0
2020-01-01 02:00:00,-34.000000,1036.416667,645.916667,304.500000,230.416667,210.750000,8.666667,2273.333333,-0.750000,7835.666667,0.0,7311.500000,1176.583333,5,3,4,9,59,0.0
2020-01-01 03:00:00,-35.083333,920.250000,644.166667,301.166667,231.083333,209.750000,8.750000,2273.333333,-0.916667,7857.333333,0.0,7037.833333,1192.916667,5,3,4,10,59,0.0
2020-01-01 04:00:00,-35.000000,1288.750000,643.916667,302.083333,232.000000,209.416667,9.583333,2271.750000,6.083333,7791.416667,0.0,6526.166667,1241.000000,5,3,6,11,58,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-31 19:00:00,-31.083333,3440.416667,878.500000,289.000000,207.000000,247.416667,19.250000,2264.666667,1012.750000,7971.166667,0.0,9230.750000,2224.500000,6,2,13,7,74,0.0
2021-12-31 20:00:00,-32.166667,3448.916667,877.583333,288.166667,207.083333,209.416667,18.416667,2264.416667,638.000000,8281.000000,0.0,9126.416667,1769.666667,6,2,15,6,75,0.0
2021-12-31 21:00:00,-31.666667,3673.500000,878.333333,290.083333,206.833333,196.583333,18.333333,2266.333333,390.916667,8105.666667,0.0,8820.916667,1570.166667,5,2,16,5,76,0.0
2021-12-31 22:00:00,-31.083333,3826.000000,880.166667,290.166667,202.916667,196.333333,19.666667,2265.666667,68.416667,8557.166667,0.0,7789.166667,1358.833333,4,2,17,3,75,0.0


### Add features

Inspiration: CCAI's tutorial on load forecasting https://colab.research.google.com/drive/1o86HKTerVnEi0xjlKOj7uoYkS9fmnrRC#scrollTo=avqrYmzYWPIj

In [154]:
# Add day-of-week and month-of-year indicators derived from the datetime index,
# then one-hot encode them (columns day_of_week_<n> and month_of_year_<n>).
data["day_of_week"] = data.index.weekday
data["month_of_year"] = data.index.month
# dtype is pinned explicitly: pandas >= 2.0 defaults get_dummies to bool, which
# would write True/False instead of 0/1 into the exported feature CSVs.
data = pd.get_dummies(
    data,
    columns=["day_of_week", "month_of_year"],
    dtype="uint8",
)

In [155]:
# Generation-mix columns: one per energy source in the supply mix.
genmix_vars = [
    'Solar',
    'Wind',
    'Geothermal',
    'Biomass',
    'Biogas',
    'Small hydro',
    'Coal',
    'Nuclear',
    'Batteries',
    'Imports',
    'Other',
    'Natural Gas',
    'Large Hydro',
]

# Weather columns joined to the generation data.
weather_vars = [
    'tempC',
    'uvIndex',
    'WindGustKmph',
    'cloudcover',
    'humidity',
    'precipMM',
]

# Name prefixes of the one-hot encoded calendar columns.
time_vars = [
    'day_of_week',
    'month_of_year',
]

### Generate X and Y data

In [156]:
# Build the input matrix X:
#   * numerical columns (generation mix + weather) standardized to mean 0, std 1
#   * one-hot calendar columns appended unchanged
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook, so test-period statistics
# leak into the training features. Consider fitting on the training rows only.

numeric_columns = genmix_vars + weather_vars
scaled_values = StandardScaler().fit_transform(data[numeric_columns])
X = pd.DataFrame(scaled_values, index=data.index, columns=numeric_columns)

# Select the dummy columns by name prefix: day-of-week first, then month-of-year.
day_columns = [c for c in data.columns if "day_of_week" in c]
month_columns = [c for c in data.columns if "month_of_year" in c]
categorical_columns = data[day_columns + month_columns]

X = pd.concat([X, categorical_columns], axis=1)

In [157]:
# Generate Y: for every timestamp, the generation of each source at offsets
# 0 .. H-1 hours ahead. Column "<source>_<h>" is data[source] shifted back by h
# rows, so the last h rows of that column are NaN (dropped in a later cell).
# NOTE(review): offset 0 is the current hour, whose (standardized) values are
# also part of X - confirm whether the targets should start at h = 1 instead.

H = 24 # forecast horizon in hours

# Collect all columns first and build the frame in one go; inserting several
# hundred columns one at a time fragments the DataFrame and makes pandas emit a
# PerformanceWarning. Insertion order (hour-major, then source) is preserved.
shifted_columns = {}
for h in range(H):
    for source in genmix_vars:
        shifted_columns[source + "_" + str(h)] = data[source].shift(-h)

Y = pd.DataFrame(shifted_columns, index=data.index)

In [158]:
# Dimension of the input: numerical columns plus the one-hot calendar columns.
n_day_of_week_dummies = 7     # day_of_week_0 .. day_of_week_6
n_month_of_year_dummies = 12  # month_of_year_1 .. month_of_year_12
num_features = (
    len(genmix_vars)
    + len(weather_vars)
    + n_day_of_week_dummies
    + n_month_of_year_dummies
)
num_features

38

In [159]:
# Dimension of the output: one measurement per hour of the forecast horizon per
# energy source in the supply mix. Uses H (the horizon used to build Y) rather
# than a literal 24 so the two cannot drift apart if the horizon is changed.
output_dim = len(genmix_vars) * H
output_dim

312

In [160]:
# Check that dimensions are correct: print the shapes for the reader and fail
# loudly if they disagree with the expected input/output dimensions.
print(X.shape)
print(Y.shape)

assert X.shape[1] == num_features, "X has an unexpected number of feature columns"
assert Y.shape[1] == output_dim, "Y has an unexpected number of target columns"
assert X.shape[0] == Y.shape[0], "X and Y must have the same number of rows"

(17544, 38)
(17544, 312)


In [161]:
# Show the column labels of the inputs, then of the targets.
for frame in (X, Y):
    print(frame.columns)

Index(['Solar', 'Wind', 'Geothermal', 'Biomass', 'Biogas', 'Small hydro',
       'Coal', 'Nuclear', 'Batteries', 'Imports', 'Other', 'Natural Gas',
       'Large Hydro', 'tempC', 'uvIndex', 'WindGustKmph', 'cloudcover',
       'humidity', 'precipMM', 'day_of_week_0', 'day_of_week_1',
       'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',
       'day_of_week_6', 'month_of_year_1', 'month_of_year_2',
       'month_of_year_3', 'month_of_year_4', 'month_of_year_5',
       'month_of_year_6', 'month_of_year_7', 'month_of_year_8',
       'month_of_year_9', 'month_of_year_10', 'month_of_year_11',
       'month_of_year_12'],
      dtype='object')
Index(['Solar_0', 'Wind_0', 'Geothermal_0', 'Biomass_0', 'Biogas_0',
       'Small hydro_0', 'Coal_0', 'Nuclear_0', 'Batteries_0', 'Imports_0',
       ...
       'Biomass_23', 'Biogas_23', 'Small hydro_23', 'Coal_23', 'Nuclear_23',
       'Batteries_23', 'Imports_23', 'Other_23', 'Natural Gas_23',
       'Large Hydro_23'],
      dtype='object', length=312)

In [162]:
# Remove every row that has a NaN in either X or Y, keeping the two frames aligned.
# `axis=1` is passed by keyword: the positional form `.any(1)` was deprecated in
# pandas 1.5 and raises a TypeError in pandas 2.0.
# X and Y share the same index, so a single combined boolean mask is equivalent
# to the two successive label-based drops (assuming unique timestamps).
rows_with_nan = Y.isna().any(axis=1) | X.isna().any(axis=1)
Y = Y.loc[~rows_with_nan]
X = X.loc[~rows_with_nan]

### Split X and Y and save data

In [166]:
# Chronological hold-out: with shuffle=False the first 90% of the rows form the
# training set and the final 10% form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    shuffle=False,
)

In [167]:
# Print the shape of each split: inputs first (train, test), then targets.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(15725, 38)
(1748, 38)
(15725, 312)
(1748, 312)


In [168]:
# Write each split to its own CSV file (the datetime index is written as the
# first column). The 'Data/' directory must already exist.
csv_outputs = {
    'Data/X_train_california_2020-2021.csv': X_train,
    'Data/X_test_california_2020-2021.csv': X_test,
    'Data/y_train_california_2020-2021.csv': y_train,
    'Data/y_test_california_2020-2021.csv': y_test,
}
for csv_path, split in csv_outputs.items():
    split.to_csv(csv_path)