In [47]:
import pandas as pd
import numpy as np

# Load the two raw CSV exports: metered load and temperature readings
metered = pd.read_csv(filepath_or_buffer='hrl_load_metered - 20170201-20200131.csv')
temp = pd.read_csv(filepath_or_buffer='hr_temp_20170201-20200131_subset.csv')

In [48]:
# Parse the 'datetime_beginning_ept' strings (month/day/year hour:minute) into
# datetime64 values, then order the load records chronologically.
metered = (
    metered
    .assign(datetime_beginning_ept=lambda df: pd.to_datetime(
        df['datetime_beginning_ept'], format='%m/%d/%Y %H:%M'))
    .sort_values(by='datetime_beginning_ept')
)
metered.head(5)

Unnamed: 0,datetime_beginning_utc,datetime_beginning_ept,nerc_region,mkt_region,zone,load_area,mw,is_verified
0,2/1/2017 5:00,2017-02-01 00:00:00,RFC,WEST,DUQ,DUQ,1419.881,True
1,2/1/2017 6:00,2017-02-01 01:00:00,RFC,WEST,DUQ,DUQ,1379.505,True
2,2/1/2017 7:00,2017-02-01 02:00:00,RFC,WEST,DUQ,DUQ,1366.106,True
3,2/1/2017 8:00,2017-02-01 03:00:00,RFC,WEST,DUQ,DUQ,1364.453,True
4,2/1/2017 9:00,2017-02-01 04:00:00,RFC,WEST,DUQ,DUQ,1391.265,True


In [49]:
# Parse the ISO-style 'DATE' strings into datetime64, truncate each reading to
# the start of its hour, and order the temperature records chronologically.
temp = (
    temp
    .assign(DATE=lambda df: pd.to_datetime(df['DATE'], format='%Y-%m-%dT%H:%M:%S').dt.floor('h'))
    .sort_values(by='DATE')
)
temp.head(5)

Unnamed: 0,STATION,DATE,REPORT_TYPE,SOURCE,HourlyDryBulbTemperature
0,72520514762,2017-02-01 00:00:00,FM-15,7,37.0
1,72520514762,2017-02-01 01:00:00,FM-15,7,37.0
2,72520514762,2017-02-01 02:00:00,FM-15,7,36.0
3,72520514762,2017-02-01 03:00:00,FM-15,7,36.0
4,72520514762,2017-02-01 04:00:00,FM-15,7,36.0


In [63]:
# Expose the load timestamp under the name 'DATE' so both frames share a join key
metered = metered.assign(DATE=metered['datetime_beginning_ept'])

# Left-join the load records onto the temperature rows (every temperature row is
# kept), then narrow the result to the timestamp, temperature, and load columns.
columns_to_keep = ['DATE', 'HourlyDryBulbTemperature', 'mw']
energy_data = temp.merge(
    right=metered,
    on='DATE',
    how='left',
)[columns_to_keep]
energy_data.head(5)

Unnamed: 0,DATE,HourlyDryBulbTemperature,mw
0,2017-02-01 00:00:00,37.0,1419.881
1,2017-02-01 01:00:00,37.0,1379.505
2,2017-02-01 02:00:00,36.0,1366.106
3,2017-02-01 03:00:00,36.0,1364.453
4,2017-02-01 04:00:00,36.0,1391.265


In [64]:
# Derive calendar features (year, month, hour of day, weekday name) from the
# DATE timestamp so the energy data can be sliced along those dimensions.
energy_data = energy_data.assign(
    year=lambda df: df['DATE'].dt.year,
    month=lambda df: df['DATE'].dt.month,
    hour=lambda df: df['DATE'].dt.hour,
    day_of_week=lambda df: df['DATE'].dt.day_name(),
)
energy_data.head(5)

Unnamed: 0,DATE,HourlyDryBulbTemperature,mw,year,month,hour,day_of_week
0,2017-02-01 00:00:00,37.0,1419.881,2017,2,0,Wednesday
1,2017-02-01 01:00:00,37.0,1379.505,2017,2,1,Wednesday
2,2017-02-01 02:00:00,36.0,1366.106,2017,2,2,Wednesday
3,2017-02-01 03:00:00,36.0,1364.453,2017,2,3,Wednesday
4,2017-02-01 04:00:00,36.0,1391.265,2017,2,4,Wednesday


In [65]:
# Promote the DATE column to the row index (it is removed from the columns)
energy_data = energy_data.set_index(keys='DATE', drop=True)
energy_data.head(5)

Unnamed: 0_level_0,HourlyDryBulbTemperature,mw,year,month,hour,day_of_week
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-02-01 00:00:00,37.0,1419.881,2017,2,0,Wednesday
2017-02-01 01:00:00,37.0,1379.505,2017,2,1,Wednesday
2017-02-01 02:00:00,36.0,1366.106,2017,2,2,Wednesday
2017-02-01 03:00:00,36.0,1364.453,2017,2,3,Wednesday
2017-02-01 04:00:00,36.0,1391.265,2017,2,4,Wednesday


In [76]:
# Count rows with a missing temperature reading.
# NaN never compares equal to anything (including itself), so a test written as
# `column == np.nan` is always False and reports 0 regardless of the data;
# `.isna()` is the correct way to detect missing values.
missing_temp = energy_data[energy_data['HourlyDryBulbTemperature'].isna()]

# The instructions call for interpolating missing temperature data. Interpolation
# (linear by default) leaves the column unchanged when nothing is missing, so it
# is applied unconditionally rather than relying on the count above.
energy_data['HourlyDryBulbTemperature'] = energy_data['HourlyDryBulbTemperature'].interpolate()

# Number of rows that were missing a temperature before interpolation
len(missing_temp)

0

In [79]:
# Hold out every row timestamped on or after 2020-01-01 for testing and train
# on everything before that cutoff.
t = pd.to_datetime("2020-01-01")
training_data = energy_data.loc[energy_data.index < t]
testing_data = energy_data.loc[energy_data.index >= t]
# Row counts: training, testing, and the full dataset
[len(frame) for frame in (training_data, testing_data, energy_data)]

[25557, 744, 26301]