In [1]:
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
load_df = pd.read_csv('../data/load_raw.csv')

# Recreate Date index
load_df['Date'] = pd.to_datetime(load_df[['year','month','day']])
load_df.set_index('Date', inplace=True)

# Hour columns
hour_cols = [f'h{i}' for i in range(1,25)]
load_df[hour_cols] = load_df[hour_cols].apply(pd.to_numeric, errors='coerce')

# Daily load (target)
load_df['Daily_Load'] = load_df[hour_cols].sum(axis=1)

load_df.head()


Unnamed: 0_level_0,zone_id,year,month,day,h1,h2,h3,h4,h5,h6,...,h16,h17,h18,h19,h20,h21,h22,h23,h24,Daily_Load
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-01,1,2004,1,1,,,,,,,...,,,,,,,,,,0.0
2004-01-02,1,2004,1,2,,,,,,,...,,,,,,,,,,0.0
2004-01-03,1,2004,1,3,,,,,,,...,,,,,,,,,,0.0
2004-01-04,1,2004,1,4,,,,,,,...,,,,,,,,,,0.0
2004-01-05,1,2004,1,5,,,,,,,...,,,,,,,,,,0.0


In [3]:
load_df['DayOfWeek'] = load_df.index.dayofweek
load_df['Month'] = load_df.index.month
load_df['Day'] = load_df.index.day
load_df['Is_Weekend'] = load_df['DayOfWeek'] >= 5


In [4]:
load_df['Lag_1'] = load_df['Daily_Load'].shift(1)
load_df['Lag_7'] = load_df['Daily_Load'].shift(7)
load_df['Lag_14'] = load_df['Daily_Load'].shift(14)


In [5]:
load_df['Rolling_7'] = load_df['Daily_Load'].rolling(window=7).mean()
load_df['Rolling_14'] = load_df['Daily_Load'].rolling(window=14).mean()


In [6]:
load_df.dropna(inplace=True)
load_df.head()


Unnamed: 0_level_0,zone_id,year,month,day,h1,h2,h3,h4,h5,h6,...,Daily_Load,DayOfWeek,Month,Day,Is_Weekend,Lag_1,Lag_7,Lag_14,Rolling_7,Rolling_14
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-01,4,2004,1,1,484.0,457.0,450.0,448.0,444.0,490.0,...,13341.0,3,1,1,False,0.0,0.0,0.0,1905.857143,952.928571
2004-01-02,4,2004,1,2,461.0,438.0,447.0,439.0,448.0,485.0,...,12876.0,4,1,2,False,13341.0,0.0,0.0,3745.285714,1872.642857
2004-01-03,4,2004,1,3,423.0,406.0,396.0,379.0,378.0,403.0,...,11035.0,5,1,3,True,12876.0,0.0,0.0,5321.714286,2660.857143
2004-01-04,4,2004,1,4,331.0,316.0,309.0,312.0,307.0,327.0,...,10032.0,6,1,4,True,11035.0,0.0,0.0,6754.857143,3377.428571
2004-01-05,4,2004,1,5,310.0,304.0,300.0,294.0,301.0,343.0,...,10186.0,0,1,5,False,10032.0,0.0,0.0,8210.0,4105.0


In [7]:
features = [
    'DayOfWeek', 'Month', 'Is_Weekend',
    'Lag_1', 'Lag_7', 'Lag_14',
    'Rolling_7', 'Rolling_14'
]

X = load_df[features]
y = load_df['Daily_Load']

X.head(), y.head()


(            DayOfWeek  Month  Is_Weekend    Lag_1  Lag_7  Lag_14    Rolling_7  \
 Date                                                                            
 2004-01-01          3      1       False      0.0    0.0     0.0  1905.857143   
 2004-01-02          4      1       False  13341.0    0.0     0.0  3745.285714   
 2004-01-03          5      1        True  12876.0    0.0     0.0  5321.714286   
 2004-01-04          6      1        True  11035.0    0.0     0.0  6754.857143   
 2004-01-05          0      1       False  10032.0    0.0     0.0  8210.000000   
 
              Rolling_14  
 Date                     
 2004-01-01   952.928571  
 2004-01-02  1872.642857  
 2004-01-03  2660.857143  
 2004-01-04  3377.428571  
 2004-01-05  4105.000000  ,
 Date
 2004-01-01    13341.0
 2004-01-02    12876.0
 2004-01-03    11035.0
 2004-01-04    10032.0
 2004-01-05    10186.0
 Name: Daily_Load, dtype: float64)

In [8]:
split_date = '2010-01-01'

X_train = X[X.index < split_date]
X_test  = X[X.index >= split_date]

y_train = y[y.index < split_date]
y_test  = y[y.index >= split_date]

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (1581, 8)
Test size: (0, 8)
