In [1]:
# import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Conv1D, MaxPooling1D, Flatten, LSTM, Dense, Dropout
)
from tensorflow.keras.layers import Lambda
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import loaders

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

2024-12-05 17:34:43.426308: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Loading & merging data functions

In [2]:
def load_load_data(mode='grouped'):
    load_history_raw = pd.read_csv("Load_history.csv")

    if mode == 'raw':
        return load_history_raw

    # step 1: prep the data for unpivoting

    # init empty df to hold prepped data
    load_data_wide = pd.DataFrame()
    
    # copy over the zone_id column (it doesn't need any prepping)
    load_data_wide['zone_id'] = load_history_raw['zone_id']
    
    # convert year, month, day columns to one date column
    load_data_wide['date'] = pd.to_datetime(load_history_raw[['year', 'month', 'day']])
    
    # bring in the hour columns
    hour_columns = [f'h{i}' for i in range(1, 25)]
    for col in hour_columns:
        load_data_wide[col] = load_history_raw[col]

    if mode == 'wide':
        return load_data_wide
    
    # step 2: unpivoting
    load_data_long = load_data_wide.melt(
        id_vars = ['zone_id', 'date'], # cols to preserve
        value_vars=hour_columns,       # cols to unpivot
        var_name='hour',               # colname for new index col (hour)
        value_name='load'              # colname for new value col (load)
    )
    
    # step 3: clean up after unpivot
    
    # convert 'hour' from string (i.e. 'h1') to numeric (i.e. 1)
    load_data_long['hour'] = load_data_long['hour'].str.extract(r'(\d+)').astype(int)
    
    # create a full datetime column by augmenting 'date' with 'hour'
    load_data_long['datetime'] = load_data_long['date'] + pd.to_timedelta(load_data_long['hour'] - 1, unit='h')

    # change dtype on 'load'
    load_data_long['load'] = (
        load_data_long['load']
        .replace(',', '', regex=True)                             # remove commas
        .apply(lambda x: x.strip() if isinstance(x, str) else x)  # strip any whitespace
        .apply(pd.to_numeric, errors='coerce')                    # convert to numeric (keeping NaNs)
    )
    
    # drop extra columns
    load_data_long = load_data_long[['zone_id', 'datetime', 'load']]

    # set datetime as index
    load_data_long.set_index('datetime', inplace=True)

    if mode == 'long':
        return load_data_long

    # step 4: repivot

    # pivot s.t. there is one column per zone
    load_data_grouped = load_data_long.pivot(columns='zone_id', values='load')

    # rename columns for clarity
    load_data_grouped = load_data_grouped.rename(columns=lambda col: f"zone_{col}" if col != 'datetime' else col)
    
    if mode == 'grouped':
        return load_data_grouped

    raise 'InvalidModeError'
    return None

def load_temp_data(mode='grouped'):
    temp_history_raw = pd.read_csv("temperature_history.csv")

    if mode == 'raw':
        return temp_history_raw
    
    # step 1: prep the data for unpivoting

    # init empty df to hold prepped data
    temp_data_wide = pd.DataFrame()
    
    # copy over the zone_id column (it doesn't need any prepping)
    temp_data_wide['station_id'] = temp_history_raw['station_id']
    
    # convert year, month, day columns to one date column
    temp_data_wide['date'] = pd.to_datetime(temp_history_raw[['year', 'month', 'day']])
    
    # bring in the hour columns
    hour_columns = [f'h{i}' for i in range(1, 25)]
    for col in hour_columns:
        temp_data_wide[col] = temp_history_raw[col]

    if mode == 'wide':
        return temp_data_wide
    
    # step 2: unpivoting
    temp_data_long = temp_data_wide.melt(
        id_vars = ['station_id', 'date'], # cols to preserve
        value_vars=hour_columns,          # cols to unpivot
        var_name='hour',                  # colname for new index col (hour)
        value_name='temp'                 # colname for new value col (temp)
    )
    
    # step 3: clean up after unpivot
    
    # convert 'hour' from string (i.e. 'h1') to numeric (i.e. 1)
    temp_data_long['hour'] = temp_data_long['hour'].str.extract(r'(\d+)').astype(int)
    
    # create a full datetime column by augmenting 'date' with 'hour'
    temp_data_long['datetime'] = temp_data_long['date'] + pd.to_timedelta(temp_data_long['hour'] - 1, unit='h')
    
    # drop extra columns
    temp_data_long = temp_data_long[['station_id', 'datetime', 'temp']]

    # set datetime as index
    temp_data_long.set_index('datetime', inplace=True)

    if mode == 'long':
        return temp_data_long
    
    # step 4: repivot

    # pivot s.t. there is one column per zone
    temp_data_grouped = temp_data_long.pivot(columns='station_id', values='temp')

    # rename columns for clarity
    temp_data_grouped = temp_data_grouped.rename(columns=lambda col: f"station_{col}" if col != 'datetime' else col)
    
    if mode == 'grouped':
        return temp_data_grouped

    raise 'InvalidModeError'
    return None

def load_all_data(dropna=True):
    # load datasets
    load_data = load_load_data()
    temp_data = load_temp_data()

    # merge datasets
    merged_data = pd.merge(load_data, temp_data, left_index=True, right_index=True, how='inner')
    if dropna:
        merged_data = merged_data.dropna()

    return merged_data

def load_zone_data(zone, dropna=True, data=None):
    if zone < 1 or zone > 20:
        raise 'DomainError'
        return None

    # make colname list
    station_cols = [f'station_{z}' for z in range(1, 12)]
    cols = [f'zone_{zone}'] + station_cols

    # case 1: no preloaded data
    if data is None:
        # load data
        load_data = load_load_data()
        load_data = load_data[f'zone_{zone}']

        # temp data
        temp_data = load_temp_data()

        # merge
        merged_data = pd.merge(load_data, temp_data, left_index=True, right_index=True, how='inner')

        # make empty df
        final_data = pd.DataFrame()

        # rename & load
        final_data['load'] = merged_data[f'zone_{zone}']

        # load the rest
        for col in station_cols:
            final_data[col] = merged_data[col]

        # dropna
        if dropna:
            final_data = final_data.dropna()

        return final_data
    # case 2: preloaded data
    else:
        # make empty df
        final_data = pd.DataFrame()

        # rename & load
        final_data['load'] = data[f'zone_{zone}']

        # load the rest
        for col in station_cols:
            final_data[col] = data[col]

        # dropna
        if dropna:
            final_data = final_data.dropna()

        return final_data

# Load data

In [3]:
data = load_all_data()

In [4]:
data

Unnamed: 0_level_0,zone_1,zone_2,zone_3,zone_4,zone_5,zone_6,zone_7,zone_8,zone_9,zone_10,zone_11,zone_12,zone_13,zone_14,zone_15,zone_16,zone_17,zone_18,zone_19,zone_20,station_1,station_2,station_3,station_4,station_5,station_6,station_7,station_8,station_9,station_10,station_11
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2004-01-01 00:00:00,16853.0,126259.0,136233.0,484.0,6829.0,133088.0,136233.0,3124.0,75243.0,23339.0,90700.0,118378.0,20673.0,21791.0,65970.0,28752.0,30645.0,200946.0,82298.0,79830.0,46.0,38.0,44.0,45.0,42.0,44.0,45.0,43.0,41.0,42.0,36.0
2004-01-01 01:00:00,16450.0,123313.0,133055.0,457.0,6596.0,129909.0,133055.0,2956.0,67368.0,22100.0,86699.0,112480.0,19666.0,21400.0,64600.0,27851.0,30461.0,195835.0,79827.0,77429.0,46.0,36.0,42.0,43.0,42.0,43.0,44.0,44.0,39.0,43.0,32.0
2004-01-01 02:00:00,16517.0,119192.0,128608.0,450.0,6525.0,125717.0,128608.0,2953.0,64050.0,21376.0,84243.0,108435.0,19020.0,20998.0,63843.0,27631.0,30197.0,194093.0,77728.0,75558.0,45.0,35.0,40.0,41.0,40.0,42.0,41.0,42.0,36.0,43.0,31.0
2004-01-01 03:00:00,16873.0,117507.0,126791.0,448.0,6654.0,124162.0,126791.0,2914.0,63861.0,21335.0,84285.0,107224.0,18841.0,21214.0,64023.0,27986.0,30264.0,194708.0,76433.0,75709.0,41.0,30.0,36.0,37.0,39.0,38.0,40.0,34.0,35.0,39.0,30.0
2004-01-01 04:00:00,17064.0,118343.0,127692.0,444.0,6977.0,125320.0,127692.0,3221.0,75852.0,21564.0,86087.0,108870.0,19310.0,21830.0,65679.0,29160.0,30907.0,202458.0,78172.0,77475.0,39.0,30.0,34.0,33.0,40.0,38.0,35.0,30.0,33.0,35.0,34.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2008-06-30 01:00:00,11559.0,141159.0,152311.0,324.0,5391.0,146550.0,152311.0,2393.0,63357.0,71309.0,111029.0,139035.0,13902.0,18225.0,49935.0,25591.0,28803.0,178118.0,59838.0,70839.0,74.0,65.0,76.0,72.0,74.0,70.0,73.0,71.0,72.0,70.0,65.0
2008-06-30 02:00:00,11081.0,135947.0,146687.0,313.0,4933.0,140880.0,146687.0,2291.0,63315.0,69140.0,105139.0,129338.0,13185.0,17049.0,47885.0,23733.0,27561.0,165406.0,55466.0,67356.0,74.0,64.0,76.0,72.0,74.0,70.0,72.0,71.0,72.0,70.0,64.0
2008-06-30 03:00:00,10798.0,133739.0,144304.0,299.0,4716.0,138454.0,144304.0,2234.0,63777.0,67560.0,100889.0,122493.0,13061.0,16415.0,47092.0,22653.0,27188.0,159075.0,53076.0,66952.0,74.0,64.0,75.0,72.0,73.0,70.0,71.0,71.0,69.0,70.0,64.0
2008-06-30 04:00:00,10876.0,135928.0,146666.0,300.0,4719.0,140646.0,146666.0,2392.0,70098.0,67830.0,98820.0,120205.0,13001.0,16483.0,47509.0,22211.0,27447.0,159934.0,53952.0,68682.0,74.0,64.0,75.0,72.0,72.0,70.0,71.0,70.0,69.0,70.0,64.0


# LSTM CNN

## Model Constructor Def

In [5]:
# sequence_length: number of timesteps in the sequence
#     feature_dim: number of features in each timestep
def lstm_cnn(sequence_length, feature_dim, show=False):
    # define shape of input
    input_shape = (sequence_length, feature_dim)
    
    # construct input layer
    inputs = Input(shape=input_shape)

    # construct CNN layers
    cnn = Conv1D(filters=64, kernel_size=3, activation='relu')(inputs)
    cnn = MaxPooling1D(pool_size=2)(cnn)
    cnn = Flatten()(cnn)

    # Define a Lambda layer to reshape the CNN output for LSTM
    lstm_input = Lambda(lambda x: tf.expand_dims(x, axis=1))(cnn)
    
    # construct LSTM layers
    lstm = LSTM(64, return_sequences=False)(lstm_input)

    # Fully connected layers
    dense = Dense(128, activation='relu')(lstm)
    dense = Dropout(0.5)(dense)
    outputs = Dense(1, activation='sigmoid')(dense)  # For binary classification

    # Build and compile the model
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Summary of the model
    if show:
        print(model.summary())

    # Return
    return model

## Cross-Validation

### Functions

In [6]:
def create_sequences(data, sequence_length):
    X, y = [], []
    for i in range(len(data) - sequence_length):
        end = i + sequence_length
        X.append(data.iloc[i:end, 1:])  # Predictors
        y.append(data.iloc[i + sequence_length, 0])    # Response
    return np.array(X), np.array(y)

In [12]:
def cross_validate_sequence_length(data, sequence_lengths, model_fn, n_splits=5, verbose=False):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    results = {}
    full_results = pd.DataFrame(columns=['seq_len', 'zone_id', 'MSE'])

    seq_counter = 0
    tot_counter = 0
    for seq_len in sequence_lengths:
        if verbose:
            seq_counter += 1
            print(f'Testing sequence length: {seq_len} ({seq_counter}/{len(sequence_lengths)})...')
        
        zone_results = []
        zone_rows = []
        
        for zone in range(1,21):
            if verbose:
                tot_counter += 1
                print(f'Testing zone: {zone} ({zone}/20) ({tot_counter}/{20*len(sequence_lengths)} [{(tot_counter/(20*len(sequence_lengths)))*100}%])...')

            # construct dataset
            zone_data = load_zone_data(zone=zone, data=data)
            X, y = create_sequences(zone_data, seq_len)
            
            errors = []

            split_counter = 0
            for train_idx, val_idx in tscv.split(X):
                X_train, X_val = X[train_idx], X[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]
    
                # Build and train the model
                model = model_fn(seq_len, X.shape[2])
                model.fit(X_train, y_train, epochs=5, verbose=0)  # Adjust epochs as needed
    
                # Evaluate the model
                y_pred = model.predict(X_val)
                errors.append(mean_squared_error(y_val, y_pred))
    
            # Average validation error for this sequence length on this zone
            zone_results.append(np.mean(errors))
            zone_rows.append({'seq_len': seq_len, 'zone_id': zone, 'MSE': zone_results[-1]})
            print(f'mean error for seq_len {seq_len} on zone {zone}: {zone_results[-1]}')

        # Average validation error for this sequence length
        results[seq_len] = np.mean(zone_results)
        new_df = pd.DataFrame(zone_rows)
        full_results = pd.concat([full_results, new_df], ignore_index=True)
        print(f'mean error for seq_len {seq_len}: {results[seq_len]}')

    return results

### Execution

In [8]:
sequence_lengths = [7, 14, 30, 90, 180, 365]
results = cross_validate_sequence_length(data, sequence_lengths, lstm_cnn, verbose=True)
results

Testing sequence length: 7 (1/6)...
Testing zone: 1 (1/20) (1/120)...
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
mean error for seq_len 7 on zone 1: 385403092.95834774
Testing zone: 2 (2/20) (2/120)...
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
mean error for seq_len 7 on zone 2: 31837340786.928364
Testing zone: 3 (3/20) (3/120)...
[

{7: 12683474719.592735,
 14: 12684031588.056734,
 30: 12686099109.588818,
 90: 12691640973.167942,
 180: 12703425464.958973,
 365: 12728299830.723425}