In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
from datetime import datetime


## Prepare the data.
## The following code was adapted from the Milestone 1 notebook.

def prepare_data(dataset_dir=os.path.join('.', 'dataset', 'Milestone 1 Dataset'),
                 test_start='2020-01-01'):
    """Load, join and split the hourly temperature and energy-usage data.

    Reads ``hr_temp_20170201-20200131_subset.csv`` and
    ``hrl_load_metered - 20170201-20200131.csv`` from *dataset_dir*,
    truncates both timestamp columns to the hour, inner-joins the two tables
    on that hour, derives calendar features, fills missing temperatures by
    linear interpolation, and splits the result chronologically.

    Args:
        dataset_dir: Directory containing the two CSV files. Defaults to
            ``./dataset/Milestone 1 Dataset``.
        test_start: First timestamp (anything ``pd.Timestamp`` accepts) that
            belongs to the test set. Rows strictly before it form the
            training set.

    Returns:
        A ``(train_set, test_set)`` tuple of DataFrames indexed by ``DATE``
        with the columns ``mw``, ``HourlyDryBulbTemperature``,
        ``hour_of_day``, ``day_of_week`` (ISO numbering, Monday=1 ..
        Sunday=7), ``month`` and ``year``. The two frames never share a row.
    """
    temp_df = pd.read_csv(
        os.path.join(dataset_dir, 'hr_temp_20170201-20200131_subset.csv'))
    # Temperature timestamps look like 2017-02-01T00:53:00; drop the
    # minutes/seconds so they can be matched against the hourly load data.
    temp_df['DATE'] = pd.to_datetime(
        temp_df['DATE'], format='%Y-%m-%dT%H:%M:%S').dt.floor('h')

    energy_usage_df = pd.read_csv(
        os.path.join(dataset_dir, 'hrl_load_metered - 20170201-20200131.csv'))
    # Load timestamps look like 2/1/2017 0:00.
    energy_usage_df['datetime_beginning_ept'] = pd.to_datetime(
        energy_usage_df['datetime_beginning_ept'],
        format='%m/%d/%Y %H:%M').dt.floor('h')

    # Join both datasets on the hour and keep only the columns we model with.
    data = temp_df.merge(
        energy_usage_df,
        left_on='DATE',
        right_on='datetime_beginning_ept',
    )[['DATE', 'mw', 'HourlyDryBulbTemperature']]

    # Calendar features derived from the timestamp.
    stamps = data['DATE'].dt
    data = data.assign(
        hour_of_day=stamps.hour,
        day_of_week=stamps.dayofweek + 1,  # dayofweek is Monday=0; ISO is Monday=1
        month=stamps.month,
        year=stamps.year,
    )

    # Use the timestamp as the index of the DataFrame.
    data = data.set_index(['DATE'])

    # Fill gaps in the temperature readings by linear interpolation.
    data['HourlyDryBulbTemperature'] = (
        data['HourlyDryBulbTemperature'].interpolate(method='linear'))

    # Split on an explicit cutoff. Label slices such as data[:'2019-12-31']
    # and data['2019-12-31':] both include the whole of that day, which put
    # the same rows into the training and the test set.
    cutoff = pd.Timestamp(test_start)
    in_test = data.index >= cutoff
    train_set = data[~in_test]
    test_set = data[in_test]

    return train_set, test_set


# Load the data once and keep both splits as notebook-level variables.
train, test = prepare_data()

Unnamed: 0_level_0,mw,HourlyDryBulbTemperature,hour_of_day,day_of_week,month,year
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-02-01 00:00:00,1419.881,37.0,0,3,2,2017
2017-02-01 01:00:00,1379.505,37.0,1,3,2,2017
2017-02-01 02:00:00,1366.106,36.0,2,3,2,2017
2017-02-01 03:00:00,1364.453,36.0,3,3,2,2017
2017-02-01 04:00:00,1391.265,36.0,4,3,2,2017
...,...,...,...,...,...,...
2019-12-31 19:00:00,1606.134,32.0,19,2,12,2019
2019-12-31 20:00:00,1565.014,32.0,20,2,12,2019
2019-12-31 21:00:00,1515.905,32.0,21,2,12,2019
2019-12-31 22:00:00,1474.519,30.0,22,2,12,2019
