In [19]:
from scipy.interpolate import griddata
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.utils.class_weight import compute_sample_weight

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Input, BatchNormalization

tf.random.set_seed(42)
from sklearn.model_selection import train_test_split

In [None]:
import pandas as pd
import numpy as np
import plotly.offline as pyo
from plotly import subplots
import plotly.graph_objects as go

np.random.seed(42)

In [None]:
# Raw input tables; all four CSV files are read from the same relative data folder.
dataDir = "../../data/"
client = pd.read_csv(dataDir + "client.csv")
ePrices = pd.read_csv(dataDir + "electricity_prices.csv")
gasPrices = pd.read_csv(dataDir + "gas_prices.csv")
train = pd.read_csv(dataDir + "train.csv")

In [None]:
# Forecast and historical weather tables (the "int" suffix presumably marks
# values already interpolated by an earlier preprocessing step - TODO confirm).
weatherPredInt, weatherHistInt = (
    pd.read_csv(csvPath)
    for csvPath in ("interpolPredWeather.csv", "../histWeatherSnowCover.csv")
)

# process data

### merge client & train, add business feat & interpolate daylight savings

In [None]:
# Split the long-format table into production / consumption rows and join them
# side by side, so that every row carries both targets (target_prod, target_cons).
mergeKeys = ['data_block_id','prediction_unit_id','datetime','county','is_business','product_type']
producing = train.loc[train.is_consumption == 0].drop('is_consumption', axis=1)
consuming = train.loc[train.is_consumption == 1].drop('is_consumption', axis=1)
train = producing.merge(consuming, on=mergeKeys, how='outer', suffixes=('_prod', '_cons'))
del producing, consuming
print(train.shape)

# attach the client information valid for the same data block
clientsTime = train.merge(client, on=['county','is_business','product_type','data_block_id'], how='inner')

# calendar features derived from the timestamp
clientsTime['datetime'] = pd.to_datetime(clientsTime['datetime'])
for featName, dtAttr in [('yearday', 'day_of_year'), ('weekday', 'day_of_week'),
                         ('month', 'month'), ('monthday', 'day'), ('year', 'year')]:
    clientsTime[featName] = getattr(clientsTime['datetime'].dt, dtAttr)

# integer code for each distinct (is_business, product_type) combination
businessProdKeys = list(zip(clientsTime['is_business'], clientsTime['product_type']))
unique_pairs = list(set(businessProdKeys))
pair_index_dict = {pair: index for index, pair in enumerate(unique_pairs)}
clientsTime['business_prodType'] = [pair_index_dict.get(key) for key in businessProdKeys]

# integer code for each distinct (business, product, county, eic_count, capacity) tuple;
# eic_count / installed_capacity are part of the key, so a change in either yields a new id
customerKeys = list(zip(clientsTime['is_business'], clientsTime['product_type'], clientsTime['county'],
                        clientsTime['eic_count'], clientsTime['installed_capacity']))
unique_pairs_cust = list(set(customerKeys))
pair_index_dict = {pair: index for index, pair in enumerate(unique_pairs_cust)}
clientsTime['ind_customer_id'] = [pair_index_dict.get(key) for key in customerKeys]

# fill gaps (e.g. the hours affected by daylight-saving changes) by interpolation
clientsTime = clientsTime.interpolate()

#### get holidays

In [None]:
import holidays
from datetime import date

# Holiday calendar for country code EE; despite the variable name this is not a
# US calendar. The object behaves like a dict keyed by date.
us_holidays = holidays.EE()

# membership test per timestamp -> boolean holiday flag
clientsTime['holiday'] = clientsTime['datetime'].map(lambda stamp: stamp in us_holidays)

In [None]:
# non-working day = public holiday or weekday index above 4
isWeekend = clientsTime['weekday'].gt(4)
clientsTime['no_workday'] = clientsTime['holiday'] | isWeekend

### prepare electric prices

In [None]:
# quick visual check of the raw price series
ePrices['euros_per_mwh'].plot()

In [None]:
# Index the price table by its forecast timestamp so it can be placed on a
# regular hourly grid.
ePrices['forecast_date'] = pd.to_datetime(ePrices['forecast_date'])
ePrices = ePrices.set_index('forecast_date')

# One row per hour (hours absent from the table become NaN rows) ...
df_resampled = ePrices.resample('1H').asfreq()

# ... which are then filled by linear interpolation between their neighbours.
ePrices = df_resampled.interpolate(method='linear')

In [None]:
# Blank out prices equal to 4000.0 (presumably a cap / outlier value - TODO
# confirm) so that the interpolation below replaces them.
priceCol = ePrices['euros_per_mwh']
ePrices['euros_per_mwh'] = priceCol.mask(priceCol == 4000.0)

# same hourly re-grid + linear fill as in the previous cell
df_resampled = ePrices.resample('1H').asfreq()
ePrices = df_resampled.interpolate(method='linear')

# write data into python arrays

In [None]:
# Inputs used from here on: clientsTime, weatherPredInt, weatherHistInt, ePrices, gasPrices

# Columns taken from the forecast-weather table. data_block_id is only needed
# to look rows up later and is dropped again before the values are stored.
# Not used: latitude, longitude, forecast_datetime, origin_datetime,
# sunrise, sunset, daylight, minDaylight.
featPredWeather = [
    'County',
    'hours_ahead',
    'data_block_id',
    'temperatureint',
    'dewpointint',
    'cloudcover_highint',
    'cloudcover_lowint',
    'cloudcover_midint',
    'cloudcover_totalint',
    '10_metre_u_wind_componentint',
    '10_metre_v_wind_componentint',
    'direct_solar_radiationint',
    'surface_solar_radiation_downwardsint',
    'snowfallint',
    'total_precipitationint',
]
predWeatherComplete = weatherPredInt.loc[:, featPredWeather]
del weatherPredInt

# Columns taken from the historical-weather table (latitude, longitude and
# datetime are not used).
featHistWeather = [
    'County',
    'data_block_id',
    'temperatureint',
    'dewpointint',
    'rainint',
    'snowfallint',
    'surface_pressureint',
    'cloudcover_totalint',
    'cloudcover_lowint',
    'cloudcover_midint',
    'cloudcover_highint',
    'windspeed_10mint',
    'winddirection_10mint',
    'shortwave_radiationint',
    'direct_solar_radiationint',
    'diffuse_radiationint',
    'meltingInCM',
    'snowHeightFlux',
    'snowcover',
]
histWeatherComplete = weatherHistInt.loc[:, featHistWeather]
del weatherHistInt

# can't use client id as an axis in data because it's not constant!
# NOTE(review): y, y_indexes, index_y and customers are not used in this cell;
# kept in case other cells rely on them.
y = np.zeros((clientsTime.shape[0],2))
y_indexes = np.zeros((clientsTime.shape[0],2)) # dataframe indexes in the end, not important for now
index_y = 0

customers = []

# number of stored weather features (everything except data_block_id)
nFeatHistWeather = histWeatherComplete.shape[1]-1
nFeatPredWeather = predWeatherComplete.shape[1]-1

# Rows are collected in Python lists and joined ONCE after the loop; growing a
# numpy array with concatenate inside the loop copies the whole array on every
# iteration. Each list starts with the same all-zero placeholder row the arrays
# used to be initialised with, so the resulting arrays are unchanged (the
# "remove first rows" cell still strips index 0).
constRows        = [np.zeros((1,17))]
targetRows       = [np.zeros((1,24,2))]
oldTargetRows    = [np.zeros((1,24,2))]   # targets from t-1
ePriceRows       = [np.zeros((1,24,1))]
histWeatherRows  = [np.zeros((1,24,nFeatHistWeather))]
predWeatherRows  = [np.zeros((1,24,nFeatPredWeather))]
dataBlockIdParts = [np.zeros((1))]

# loop over customer, append all data cycles for each customer
for customerId in clientsTime.ind_customer_id.unique():
    customerSlice = clientsTime.loc[clientsTime.ind_customer_id == customerId]

    # per-customer constants: taken from the customer slice, hence identical
    # for every data block of this customer -> looked up once per customer
    county       = customerSlice.county.unique()[0]
    is_business  = customerSlice.is_business.unique()[0]
    product_type = customerSlice.product_type.unique()[0]
    prediction_unit_id = customerSlice.prediction_unit_id.unique()[0] #should be redundant
    eic_count          = customerSlice.eic_count.unique()[0]
    installed_capacity = customerSlice.installed_capacity.unique()[0]
    business_prodType  = customerSlice.business_prodType.unique()[0]
    ind_customer_id    = customerSlice.ind_customer_id.unique()[0]

    # "previous targets" of the first block of a customer are all zero
    y_cons = 0
    y_prod = 0
    for dataBlockId in customerSlice.data_block_id.unique():
        timeSlice = customerSlice.loc[customerSlice.data_block_id == dataBlockId]
        gasSlice = gasPrices.loc[gasPrices.data_block_id == dataBlockId]
        eSlice   = ePrices.loc[ePrices.data_block_id == dataBlockId]

        # diagnostic: a data block is expected to cover a single calendar day
        if len(timeSlice.yearday.unique()) > 1:
            print(timeSlice.yearday.unique())

        # calendar values of this block (first row of the slice)
        yearday  = timeSlice.yearday.iloc[0]
        weekday  = timeSlice.weekday.iloc[0]
        month    = timeSlice.month.iloc[0]
        monthday = timeSlice.monthday.iloc[0]
        year     = timeSlice.year.iloc[0]
        holiday   = timeSlice.holiday.iloc[0]
        noWorkDay = timeSlice.no_workday.iloc[0]

        lowest_price_per_mwh = gasSlice.lowest_price_per_mwh.iloc[0]
        highest_price_per_mwh = gasSlice.highest_price_per_mwh.iloc[0]
        euros_per_mwh = eSlice['euros_per_mwh'].to_numpy()
        # diagnostic: 23-hour day in the price table
        if euros_per_mwh.shape[0] == 23:
            print(dataBlockId, customerId)

        # feed the old targets per customer (values of the previously processed block)
        new_row = np.zeros((1,24,2))
        new_row[:,:,0] = y_cons
        new_row[:,:,1] = y_prod
        oldTargetRows.append(new_row)

        y_cons = timeSlice['target_cons']
        y_prod = timeSlice['target_prod']

        histWeather = histWeatherComplete.loc[(histWeatherComplete.data_block_id == dataBlockId) & (histWeatherComplete.County == county)]
        predWeather = predWeatherComplete.loc[(predWeatherComplete.data_block_id == dataBlockId) & (predWeatherComplete.County == county)]

        histWeather = histWeather.drop('data_block_id', axis = 1)
        predWeather = predWeather.drop('data_block_id', axis = 1)

        # diagnostic: 23-hour day in the targets
        if y_cons.shape[0] == 23:
            print(dataBlockId, customerId)

        # 17 constant values per sample; the order must match featConst used later
        constRows.append(np.array([county, is_business, product_type, prediction_unit_id, eic_count, installed_capacity,
        business_prodType,ind_customer_id,lowest_price_per_mwh,highest_price_per_mwh,
        yearday,weekday,month,monthday,year,holiday,noWorkDay
        ]))

        # The assignments into fixed-size zero arrays are kept on purpose: they
        # raise if a slice does not have exactly 24 rows.
        new_row = np.zeros((1,24,2))
        new_row[:,:,0] = y_cons
        new_row[:,:,1] = y_prod
        targetRows.append(new_row)

        new_row = np.zeros((1,24,1))
        new_row[0,:,0] = euros_per_mwh
        ePriceRows.append(new_row)

        new_row = np.zeros((1,24,nFeatHistWeather)) #without datablock id
        new_row[:,:,:] = histWeather
        histWeatherRows.append(new_row)

        new_row = np.zeros((1,24,nFeatPredWeather)) #without datablock id
        new_row[:,:,:] = predWeather
        predWeatherRows.append(new_row)

        dataBlockIdParts.append(np.array([dataBlockId]))

# single join per array (index 0 of every array is the zero placeholder)
constValsArray   = np.vstack(constRows)
targetsArray     = np.concatenate(targetRows, axis=0)
oldTargetsArray  = np.concatenate(oldTargetRows, axis=0)
ePricesArray     = np.concatenate(ePriceRows, axis=0)
histWeatherArray = np.concatenate(histWeatherRows, axis=0)
predWeatherArray = np.concatenate(predWeatherRows, axis=0)
dataBlockIdArray = np.concatenate(dataBlockIdParts)
del constRows, targetRows, oldTargetRows, ePriceRows, histWeatherRows, predWeatherRows, dataBlockIdParts
    


In [None]:
# Every array starts with an all-zero placeholder entry at index 0; drop it.
(targetsArray, oldTargetsArray, constValsArray, ePricesArray,
 histWeatherArray, predWeatherArray, dataBlockIdArray) = (
    arr[1:] for arr in (targetsArray, oldTargetsArray, constValsArray, ePricesArray,
                        histWeatherArray, predWeatherArray, dataBlockIdArray)
)
# shapes of all arrays, in the same order as above
print(*(arr.shape for arr in (targetsArray, oldTargetsArray, constValsArray, ePricesArray,
                              histWeatherArray, predWeatherArray, dataBlockIdArray)))

In [None]:
# Persist all arrays in one .npz archive; the arrN keys are what the loading
# cell reads back.
arraysToStore = {
    'arr1': targetsArray,
    'arr2': constValsArray,
    'arr3': ePricesArray,
    'arr4': histWeatherArray,
    'arr5': predWeatherArray,
    'arr6': dataBlockIdArray,
    'arr7': oldTargetsArray,
}
np.savez('data_arrays_absTarget_oldTarget.npz', **arraysToStore)

# load data & preselect data

In [8]:
loaded_data = np.load('data_arrays_absTarget_oldTarget.npz')

# Unpack the archive (key -> meaning as written by the np.savez cell)
targetsAbs       = loaded_data['arr1']
constValsArray   = loaded_data['arr2']
ePricesArray     = loaded_data['arr3']
histWeatherArray = loaded_data['arr4']
predWeatherArray = loaded_data['arr5']
dataBlockIdArray = loaded_data['arr6']
oldTargetsAbs    = loaded_data['arr7']

# Column names of the last axis of predWeatherArray (data_block_id was dropped
# before the values were stored).
featPredWeather = [
    'County',
    'hours_ahead',
    'temperatureint',
    'dewpointint',
    'cloudcover_highint',
    'cloudcover_lowint',
    'cloudcover_midint',
    'cloudcover_totalint',
    '10_metre_u_wind_componentint',
    '10_metre_v_wind_componentint',
    'direct_solar_radiationint',
    'surface_solar_radiation_downwardsint',
    'snowfallint',
    'total_precipitationint',
]

# shift hours_ahead (column 1) down by 24
predWeatherArray[:,:,1] -= 24

# cyclic encoding of the hour, appended as two extra feature columns
hourAngle = (2*predWeatherArray[:,:,1] - 24)*np.pi/24
sinDay = np.sin(hourAngle)[:, :, np.newaxis]
cosDay = np.cos(hourAngle)[:, :, np.newaxis]
predWeatherArray = np.concatenate((predWeatherArray, sinDay, cosDay), axis=2)
featPredWeather.extend(['sinDay', 'cosDay'])
nPredFeat = predWeatherArray.shape[2]

# Column names of constValsArray, in the order the values were stored
featConst = [
    'county', 'is_business', 'product_type', 'prediction_unit_id',
    'eic_count', 'installed_capacity', 'business_prodType', 'ind_customer_id',
    'lowest_price_per_mwh', 'highest_price_per_mwh',
    'yearday', 'weekday', 'month', 'monthday', 'year',
    'holiday', 'no_workday',
]

# cyclic encoding of the day of the year, appended as two extra columns
yeardayIdx = 10   # position of 'yearday' in featConst
CapIdx = 5        # position of 'installed_capacity' in featConst
yearAngle = 2*constValsArray[:,yeardayIdx]*np.pi/365
sinYear = np.sin(yearAngle).reshape(-1, 1)
cosYear = np.cos(yearAngle).reshape(-1, 1)
constValsArray = np.concatenate((constValsArray, sinYear, cosYear), axis=1)
featConst.extend(['sinYearDay', 'cosYearDay'])
nConst = constValsArray.shape[1]

# Column names of the last axis of histWeatherArray (data_block_id dropped)
featHistWeather = [
    'County',
    'temperatureint', 'dewpointint', 'rainint', 'snowfallint',
    'surface_pressureint',
    'cloudcover_totalint', 'cloudcover_lowint', 'cloudcover_midint', 'cloudcover_highint',
    'windspeed_10mint', 'winddirection_10mint',
    'shortwave_radiationint', 'direct_solar_radiationint', 'diffuse_radiationint',
    'meltingInCM', 'snowHeightFlux', 'snowcover',
]
# (optional sub-selection of historical features, currently disabled)
#featHistWeatherKeepIdx = [1,10]
#histWeatherArray=histWeatherArray[:,:,featHistWeatherKeepIdx]
#featHistWeather = [featHistWeather[i] for i in featHistWeatherKeepIdx]
nHistFeat = histWeatherArray.shape[2]

print(nPredFeat,nHistFeat,nConst)

16 18 19


In [16]:
# Forecast-weather block: retain only the cyclic hour encodings (sinDay, cosDay).
# NOTE: this cell overwrites its own inputs, so it must be run exactly once
# after the loading cell.
featPredWeatherKeepIdx = [14,15]
predWeatherArray = np.take(predWeatherArray, featPredWeatherKeepIdx, axis=2)
featPredWeather = [featPredWeather[pos] for pos in featPredWeatherKeepIdx]
nPredFeat = predWeatherArray.shape[2]

# Constant block. Kept: county, is_business, product_type, installed_capacity,
# business_prodType, lowest/highest gas price, yearday, weekday, month,
# no_workday, sinYearDay, cosYearDay.
# Dropped: 3 prediction_unit_id, 4 eic_count, 7 ind_customer_id, 13 monthday,
# 14 year, 15 holiday.
featConstKeepIdx = [0,1,2,5,6,8,9,10,11,12,16,17,18]
constValsArray = np.take(constValsArray, featConstKeepIdx, axis=1)
featConst = [featConst[pos] for pos in featConstKeepIdx]
nConst = constValsArray.shape[1]

# historical weather is left untouched (sub-selection disabled)
#featHistWeatherKeepIdx = [1,10]
#histWeatherArray=histWeatherArray[:,:,featHistWeatherKeepIdx]
#featHistWeather = [featHistWeather[i] for i in featHistWeatherKeepIdx]
nHistFeat = histWeatherArray.shape[2]
print(nPredFeat,nHistFeat,nConst)

2 18 13


# build a model & train it

#### split data based on time

In [17]:
# Chronological split: samples whose data_block_id lies below 70 % of the
# largest id form the training set, all later blocks the test set.
trainSplit = int(0.7 * max(dataBlockIdArray))
mask = np.less(dataBlockIdArray, trainSplit)

In [18]:
# Master list of all per-sample arrays (still in their original order - nothing
# is shuffled here despite the name). Position 5 holds only channel 0 of the
# previous-day targets, i.e. the consumption values.
shuffledArr = [constValsArray,ePricesArray,histWeatherArray,predWeatherArray,targetsAbs,oldTargetsAbs[:,:,0],dataBlockIdArray]

# positions fed to the network: constants, prices, forecast weather, old consumption
# (historical weather, position 2, is currently not used as an input)
modelInputIdx = (0, 1, 3, 5)
testMask = ~mask

X_train = [shuffledArr[pos][mask] for pos in modelInputIdx]
y_trainAbs = shuffledArr[4][mask]

X_test = [shuffledArr[pos][testMask] for pos in modelInputIdx]
y_testAbs = shuffledArr[4][testMask]

#### shuffle training data

In [None]:

# Draw one random permutation of the training rows and apply it to every
# training array so that inputs and targets stay aligned.
index_array = np.arange(len(X_train[0]))
if True:  # shuffling is ENABLED; set to False to keep the chronological order
    np.random.shuffle(index_array)

for slot in range(4):
    X_train[slot] = X_train[slot][index_array]
y_trainAbs = y_trainAbs[index_array]

#### calculate class weights

In [None]:


# Calculate balanced sample weights on the consumption target (channel 0).
# The training targets are stored in `y_trainAbs`; the previously referenced
# name `y_train` is not defined anywhere in this notebook and raised a NameError.
# NOTE(review): the next cell recomputes class_weights / class_weight_dict from
# a constant feature, so the result of this cell is overwritten when both run.
class_weights = compute_sample_weight(class_weight='balanced', y=y_trainAbs[:,:,0]) #, X=[X_train[0][:,[yeardayIdx,5,6]]])

# one entry per training sample: sample position -> weight
class_weight_dict = dict(enumerate(class_weights))

In [20]:
# Balanced per-sample weights derived from column 4 of the constant-feature
# block. With the feature selection made earlier in this notebook column 4 is
# business_prodType and column 3 would be installed_capacity.
groupLabels = X_train[0][:, 4]
class_weights = compute_sample_weight(class_weight='balanced', y=groupLabels)

# sample position -> weight (one entry per training sample)
class_weight_dict = {position: weight for position, weight in enumerate(class_weights)}

#### use standard scaler for train data

In [None]:
# Standardise the constant-feature block (NOT the target): the statistics are
# fitted on the training rows only and re-used for the test rows.
from sklearn.preprocessing import StandardScaler

scaler0 = StandardScaler().fit(X_train[0])
X_train[0] = scaler0.transform(X_train[0])
X_test[0]  = scaler0.transform(X_test[0])

#scaler1 = StandardScaler()
#X_train[1] = scaler1.fit_transform(X_train[1].reshape(-1,1)).reshape(-1,24,1)
#X_test[1]  = scaler1.transform(X_test[1].reshape(-1,1)).reshape(-1,24,1)

#scaler2 = StandardScaler()
#X_train[2] = scaler2.fit_transform(X_train[2])
#X_test[2]  = scaler2.transform(X_test[2])

#### use standard scaler for target

In [None]:
# Standardised copy of the consumption target (channel 0); a single mean/std
# pair is fitted over all training hours and re-used for the test targets.
from sklearn.preprocessing import StandardScaler

scalerT = StandardScaler()
consTrainFlat = y_trainAbs[:,:,0].reshape(-1, 1)
consTestFlat  = y_testAbs[:,:,0].reshape(-1, 1)
scalerT.fit(consTrainFlat)
y_trainAbsN_cons = scalerT.transform(consTrainFlat).reshape(-1,24,1)
y_testAbsN_cons  = scalerT.transform(consTestFlat).reshape(-1,24,1)

#### use a label encoder for the eic count

In [None]:
from sklearn.preprocessing import LabelEncoder

# Locate the eic_count column by NAME in the currently selected constant
# features. The previous lookup, featConstKeepIdx.index(4), raised a ValueError
# whenever eic_count (original index 4) was not part of the selection.
if 'eic_count' in featConst:
    eicCountIdx = featConst.index('eic_count')
    # replace each distinct eic_count value by an integer label
    encoder = LabelEncoder()
    constValsArray[:,eicCountIdx] = encoder.fit_transform(constValsArray[:,eicCountIdx])
else:
    eicCountIdx = None
    print("eic_count is not among the selected constant features - nothing to encode")
# NOTE(review): X_train / X_test were built by boolean-mask indexing (copies),
# so encoding constValsArray here presumably does not reach them - confirm
# whether this step should run before the train/test split.

### define model & train it

### transformer net
maximum around 140 test error

In [22]:
# Builds, compiles and trains the attention-based network `transformer_model`.
# Inputs of the final model (in this order): constant features, electricity
# prices, forecast-weather features, previous consumption targets.
# Output: 24 values per sample, trained against y_trainAbs[:,:,0] with MAE loss.
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, MultiHeadAttention, LayerNormalization, Flatten
from tensorflow.keras.models import Model

# Symbolic inputs. Note that the layer names carry the suffix "2" while the
# Python variables carry the suffix "3".
weatherPred_inputs3 = tf.keras.Input(shape=(24, nPredFeat), name='weatherPred_inputs2')
ePrices_inputs3     = tf.keras.Input(shape=(24, 1), name='ePrices_inputs2')
# historical-weather input: created and normalised below, but not wired into
# the model that is built at the end of this cell (see the commented alternatives)
weatherHist_input3  = tf.keras.Input(shape=(24, nHistFeat), name='weatherHist_input2')
constant_inputs3    = tf.keras.Input(shape=(nConst,), name='constant_inputs2')

# previous targets, one value per hour
oldTargets_input = tf.keras.Input(shape=(24,1), name='oldTargets_input')
#oldTargets_input = oldTargets_input[:,:,0] #choose only the cons input
#oldTargets_input = tf.expand_dims(oldTargets_input, axis=-1)

# learned input normalisation, one BatchNormalization layer per input
weatherPred_inputs3_normalized = BatchNormalization()(weatherPred_inputs3)
ePrices_inputs3_normalized = BatchNormalization()(ePrices_inputs3)
weatherHist_input3_normalized = BatchNormalization()(weatherHist_input3)
constant_inputs3_normalized = BatchNormalization()(constant_inputs3)

# encoder sequence = forecast-weather features and price, joined on the feature axis
encoder_inputs = tf.concat([weatherPred_inputs3_normalized, ePrices_inputs3_normalized], axis=-1)
#encoder_inputs = tf.concat([weatherPred_inputs3_normalized, ePrices_inputs3_normalized, weatherHist_input3_normalized], axis=-1)
input_shape = encoder_inputs.shape

# Encoder: 8 stacked blocks of self-attention -> layer norm -> Dense with a
# residual connection -> Dense; the feature width stays input_shape[-1].
x = encoder_inputs
print(input_shape)
for _ in range(8):
    x = MultiHeadAttention(num_heads=8, key_dim=16, dropout=0.1)(x, x)
    x = LayerNormalization(epsilon=1e-6)(x)
    y = x  # kept for the residual connection (rebinds the notebook-level name `y`)
    x = Dense(input_shape[-1], activation="relu")(x)
    #x = Dense(24*, activation="relu")(x)
    x = tf.keras.layers.Add()([x, y])  # Residual connection
    x = Dense(input_shape[-1], activation="relu")(x)

# Flatten the encoder output
encoder_outputs = Flatten()(x)

# Decoder: same block structure, applied to the previous targets only
y = oldTargets_input # = decoder input

decoder_shape = y.shape
print(decoder_shape)
for _ in range(8):
    y = MultiHeadAttention(num_heads=8, key_dim=16, dropout=0.1)(y, y)
    y = LayerNormalization(epsilon=1e-6)(y)
    z = y  # kept for the residual connection
    y = Dense(decoder_shape[-1], activation="relu")(y)
    y = tf.keras.layers.Add()([y, z])  # Residual connection
    y = Dense(decoder_shape[-1], activation="relu")(y)

# Flatten the decoder output
decoder_outputs = Flatten()(y)

# Concatenate encoder and decoder outputs with the normalised constant features
concatenated_outputs = tf.keras.layers.Concatenate(axis=-1)([encoder_outputs, decoder_outputs, constant_inputs3_normalized])

# head: 20 Dense layers that keep the concatenated width (the width is printed per layer)
for i in range(20):
    concatenated_outputs = Dense(int(concatenated_outputs.shape[-1]), activation="relu")(concatenated_outputs)
    print(concatenated_outputs.shape)

# one more Dense layer; with range(1) the divisor (i+1) is 1, so the width is unchanged
for i in range(1):
    concatenated_outputs = Dense(int(concatenated_outputs.shape[-1]/(i+1)), activation="relu")(concatenated_outputs)
    print(concatenated_outputs.shape)
# 24 output units; the ReLU keeps every prediction non-negative
final_output = Dense(24, activation="relu")(concatenated_outputs)

# Build the model; the input order must match the order of the arrays in X_train / X_test
transformer_model = Model(inputs=[constant_inputs3,ePrices_inputs3,weatherPred_inputs3,oldTargets_input], outputs=final_output)
#transformer_model = Model(inputs=[constant_inputs3, ePrices_inputs3, weatherHist_input3,weatherPred_inputs3,oldTargets_input], outputs=final_output)


# Compile the model (Adam optimiser, mean absolute error loss)
transformer_model.compile(optimizer='adam', loss='mae')

# train for 20 epochs on the consumption target; the test split serves as validation data
history3 = transformer_model.fit(X_train, y_trainAbs[:,:,0], validation_data=(X_test, y_testAbs[:,:,0]), epochs=20, batch_size=400)#, class_weight=class_weight_dict)

(None, 24, 3)
(None, 24, 1)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
(None, 109)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Weighted training run. The weights come from compute_sample_weight and hold
# one value PER TRAINING SAMPLE, so they are passed as `sample_weight`.
# `class_weight` expects a {class index: weight} mapping for classification
# targets; class_weight_dict is keyed by sample position and the target here is
# a 24-value regression vector, so it does not apply.
history3 = transformer_model.fit(X_train, y_trainAbs[:,:,0], validation_data=(X_test, y_testAbs[:,:,0]), epochs=50, batch_size=5000, sample_weight=class_weights)

In [None]:
transformer_model.save('transformer2.h5')

In [23]:
# Training vs. validation loss per epoch of the most recent fit (history3)
fig = go.Figure()
length = len(history3.history['loss'])
epochAxis = np.arange(1, length+1)
for historyKey, traceLabel in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    fig.add_trace(go.Scatter(x=epochAxis, y=history3.history[historyKey], mode='lines', name=traceLabel))
fig.update_layout(
    title='Training and Validation Loss Over Epochs',
    xaxis={'title': 'Epoch'},
    yaxis={'title': 'Loss'},
    legend={'x': 0, 'y': 1, 'traceorder': 'normal'},
)
fig.show()

# reconstruct dataframe and analyse results

In [15]:
# Rebuild one long dataframe (24 rows per sample) from the per-sample arrays.
# The row order is that of shuffledArr, so it only matches `mask` as long as
# neither the mask nor the array order is changed.
storedArr = shuffledArr
#storedArr[0]=storedArr[0][:,featConstKeepIdx]

appendedList = []
shuffledDataBlockId = storedArr[6]
for i,data_block_id in enumerate(shuffledDataBlockId):
    isTrain = mask[i]

    dataBlock = np.hstack((
        # targets (consumption, production)
        storedArr[4][i,:,:].reshape(24,2),
        # train/test flag & data block id, repeated for every hour
        np.ones((24, 2)) * [isTrain, data_block_id],
        # constant values, repeated for every hour
        np.ones((24, nConst)) * storedArr[0][i,:].reshape(1,storedArr[0].shape[1]),
        # eprices (of yesterday)
        storedArr[1][i,:,:].reshape(24,1),
        # weather prediction
        storedArr[3][i,:,:].reshape(24,storedArr[3].shape[-1]),
        # hist weather
        storedArr[2][i,:,:].reshape(24,storedArr[2].shape[-1]),
        # old targets, BOTH channels (y_consOld, y_prodOld). storedArr[5] only
        # holds the consumption channel with shape (n, 24); reshaping one row
        # of it to (24, 24) failed, so the full array oldTargetsAbs is used.
        oldTargetsAbs[i,:,:].reshape(24,2),
    ))

    appendedList.append(dataBlock)

feat = ['y_cons','y_prod','isTrain','data_block_id'] + featConst + ['ePrices'] + featPredWeather + featHistWeather + ['y_consOld','y_prodOld']

appendedDf = pd.DataFrame(np.vstack(appendedList), columns=feat)

# unique_time is derived from year, yearday and hours_ahead; these columns only
# exist when the corresponding features were kept in the feature selection.
missingTimeCols = [col for col in ('year', 'yearday', 'hours_ahead') if col not in appendedDf.columns]
if missingTimeCols:
    raise KeyError(f"cannot build 'unique_time': columns {missingTimeCols} are not among the selected features")

from datetime import datetime, timedelta
# NOTE(review): hours_ahead was already reduced by 24 when the arrays were
# loaded and is reduced by 24 again here - confirm that this offset is intended.
appendedDf['unique_time'] = appendedDf.apply(lambda row: datetime(row['year'].astype(int), 1, 1) + timedelta(days=row['yearday'] - 1, hours=row['hours_ahead']-24), axis=1)
#appendedDf['unique_time'] = appendedDf.apply(lambda row: datetime(2022, 1, 1) + timedelta(days=row['yearday'] - 1, hours=row['hours_ahead']-24), axis=1)
    
            

In [None]:
# Overwrites featConstKeepIdx and reduces the constant-feature array stored in
# storedArr[0] to the listed columns.
# NOTE(review): the indices appear to refer to the full (unfiltered)
# constant-feature layout; if storedArr[0] was already reduced by an earlier
# selection, indices such as 16-18 may be out of bounds - confirm before running.
# NOTE(review): this list differs from the selection used to size the model's
# constant input, and re-running the cell slices the array again - verify the
# intended feature set against the trained model.
featConstKeepIdx = [0,1,2,5,6,8,9,13,16,17,18]
storedArr[0]=storedArr[0][:,featConstKeepIdx]

In [None]:
# Predict the consumption for ALL samples (train and test).
# The inputs must mirror X_train, i.e. the four arrays the model was built with
# (constants, prices, forecast weather, previous consumption), and the constant
# block must go through the same scaler (scaler0) as the training data.
# Passing five unscaled arrays, as before, does not match the model's inputs.
X2 = [scaler0.transform(storedArr[0]), storedArr[1], storedArr[3], storedArr[5]]
pred_cons = transformer_model.predict(X2)

#pred_cons = pred_cons*X2[0][:,CapIdx].reshape(X2[0].shape[0],1)
#pred_cons = scalerT.inverse_transform(pred_cons.reshape(-1,1)).reshape(-1,24,1)

# Flatten sample by sample (24 consecutive hours each), which is the row order
# of appendedDf.
appendedDf['pred_cons'] = pred_cons.reshape(-1)

# signed error (truth - prediction); the column name is kept for the analysis cells
appendedDf['absErr_cons'] = (appendedDf['y_cons']-appendedDf['pred_cons'])
print(X2[0].shape)



# analyze dataframe
good generalization, test and train are pretty much always the same

production:
- overweighting big producers -> lots of errors for smaller capacities

consumption:
- bigger error in summer


In [None]:
# split the result frame by the stored train/test flag
isTrainRow = appendedDf['isTrain'].eq(True)
isTestRow = appendedDf['isTrain'].eq(False)
trainDf = appendedDf.loc[isTrainRow]
testDf = appendedDf.loc[isTestRow]

### investigate consumption
- some heavy overestimation (when no consumption but we predict one)
- some heavy underestimation, when there is a lot of consumption but almost none is predicted

we can't fit yearly trend!!
things that don't help:
- relu activation function doesn't help (but makes targets all positive)
- input normalization
- target normalization
- bigger network
- using less features to predict
- using weights on day doesn't help (but is definitely needed)


it seems that the consumption data is very sparse & hence really hard to fit 
-> try to find a normalization criterion


In [None]:
# train: mean and standard deviation of truth, then of prediction
for statCol in ('y_cons', 'pred_cons'):
    print(trainDf[statCol].mean(), trainDf[statCol].std())

In [None]:
# test: mean and standard deviation of truth, then of prediction
for statCol in ('y_cons', 'pred_cons'):
    print(testDf[statCol].mean(), testDf[statCol].std())

#### train

In [None]:
# mean error / prediction / truth per hour and per weekday (training data)
profileCols = ['absErr_cons','pred_cons','y_cons']
trainDf.groupby('hours_ahead')[profileCols].agg('mean').plot()
trainDf.groupby('weekday')[profileCols].agg('mean').plot()

In [None]:
# mean and spread over all customers for every timestamp (training data)
trainByTime = trainDf.groupby('unique_time')[['y_cons','absErr_cons','pred_cons']]
trainByTime.mean().plot()
trainByTime.std().plot()

### test

In [None]:
# mean error / prediction / truth per hour and per weekday (test data)
profileCols = ['absErr_cons','pred_cons','y_cons']
testDf.groupby('hours_ahead')[profileCols].agg('mean').plot()
testDf.groupby('weekday')[profileCols].agg('mean').plot()

In [None]:
# mean and spread over all customers for every timestamp (test data);
# note that the predictions for Feb-May 2023 look very different
testByTime = testDf.groupby('unique_time')[['y_cons','absErr_cons','pred_cons']]
testByTime.mean().plot()
testByTime.std().plot()

In [None]:
# all test rows of data block 351 at hour offset 0
inBlock351 = testDf['data_block_id'] == 351
atHourZero = testDf['hours_ahead'] == 0
testDf.loc[inBlock351 & atHourZero]

In [None]:
# timestamps at which the electricity price exceeds 1000, with their row counts
testDf.loc[testDf['ePrices'].gt(1000), 'unique_time'].value_counts()

In [None]:
# Scatter plot of truth vs. prediction (upper panel) and error (lower panel)
# for all training rows of March, ordered by time.
#testDf['normCons'] = testDf['y_cons'] / testDf['windspeed_10mint']
#testDf['normConsProd'] = testDf['pred_cons'] / testDf['installed_capacity']
trainDf = trainDf.sort_values(by='unique_time')
a = trainDf.loc[(trainDf.month == 3)]

fig = subplots.make_subplots(rows=2, cols=1,shared_xaxes=True)

# (column, legend label, subplot row)
for column, label, panelRow in (('y_cons', 'y_cons', 1), ('pred_cons', 'pred_cons', 1), ('absErr_cons', 'err', 2)):
    fig.add_trace(go.Scatter(x=a['unique_time'], y=a[column], mode='markers', name=label), row=panelRow, col=1)
#fig.add_trace(go.Scatter(x = a['unique_time'], y = a['normConsProd'],mode ='markers', name='normConsProd'),row=1, col=1)
fig.show()

### check individual day difference
maybe some features corrupt result, seems that predictions are sometimes far off
features that are iffy: eic_count, ind_customer_id, prediction_unit_id, year

In [None]:

# columns compared between the two days (price and calendar columns left out)
f = [
    'y_cons', 'pred_cons', 'absErr_cons',
    'county', 'is_business', 'product_type',
    'installed_capacity', 'business_prodType',
]
# test rows of 3 August at hours_ahead == 25
pickAugust = (testDf.month == 8) & (testDf.monthday == 3) & (testDf.hours_ahead == 25)
a = testDf.loc[pickAugust, f]

In [None]:
# test rows of 3 March at hours_ahead == 25 (same columns as the frame `a`)
pickMarch = (testDf.month == 3) & (testDf.monthday == 3) & (testDf.hours_ahead == 25)
b = testDf.loc[pickMarch, f]

In [None]:
# show the complete comparison table (no truncation of rows / columns)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# pair the two days per customer group; columns sorted so that *_a / *_b sit next to each other
c = a.merge(b, on=['county','is_business','product_type','business_prodType'], suffixes=('_a','_b'), how='outer')
sorted_columns = sorted(c.columns)
c = c[sorted_columns]
c
#features that are iffy: eic_count, ind_customer_id, prediction_unit_id, year

In [None]:
# back to a compact display
for optionName in ('display.max_columns', 'display.max_rows'):
    pd.set_option(optionName, 10)

### check error distribution, why we can't generalize

In [None]:
# distribution of the signed error on the training data
trainDf['absErr_cons'].hist(bins=100)

In [None]:
# distribution of the signed error on the test data
testDf['absErr_cons'].hist(bins=100)

In [None]:
testDf.columns

In [None]:
def visualizeDFErrorPDE(df, name, errorBinWidth=500, auto_open=True):
    """Write one HTML bar chart per column showing how the error is distributed over that column.

    The values of ``absErr_cons`` are grouped into bins of roughly
    ``errorBinWidth``. For every column of ``df`` (except ``unique_time``) a
    figure is written in which each error bin is one bar trace: for every value
    (or value bin) of the column it shows the share of that value's samples
    falling into the error bin. A scatter trace ``SampleCount`` adds the share
    of all samples having that value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``absErr_cons``. Columns with more than 32
        distinct values are binned into 32 equal-width bins and therefore have
        to be numeric.
    name : str
        Prefix of the output files ``<name>pde<column><position>.html``.
    errorBinWidth : float, default 500
        Approximate width of one error bin.
    auto_open : bool, default True
        Forwarded to ``fig.write_html``; ``True`` opens every written figure.

    Returns
    -------
    None
        An empty ``df`` produces no output.
    """
    if df.empty:
        # nothing to bin; min()/max() would be NaN and int(NaN) raises
        return

    b = df.copy()
    overallSamples = df.shape[0]
    features = b.columns  # snapshot taken before the helper columns are added

    errMin = b['absErr_cons'].min()
    errMax = b['absErr_cons'].max()
    if errMax > errMin:
        # at least one bin, also when the error range is below errorBinWidth
        num_bins = max(1, int((errMax - errMin) / errorBinWidth))
        error_bins = np.linspace(errMin, errMax, num_bins + 1)
    else:
        # all errors identical: a single bin centred on that value
        error_bins = np.array([errMin - errorBinWidth / 2, errMin + errorBinWidth / 2])

    # include_lowest=True: without it the left edge is open and every sample
    # equal to the minimum would end up without a bin
    b['bin'] = pd.cut(b['absErr_cons'], bins=error_bins, include_lowest=True)
    b['bin'] = b['bin'].apply(lambda x: x.mid)  # label each bin by its midpoint

    for i, feature_name in enumerate(features):
        if feature_name == 'unique_time':
            continue
        fig = go.Figure()
        sub = b.iloc[:, i]

        uniqueVals = sub.unique()
        b['binsFeat'] = sub  # used as is for columns with few distinct values

        featBinNr = 32
        if len(uniqueVals) > featBinNr:
            # too many distinct values: group them into equal-width bins
            uniqueVals = np.linspace(sub.min(), sub.max(), featBinNr+1)
            b['binsFeat'] = pd.cut(sub, bins=uniqueVals, include_lowest=True)
            b['binsFeat'] = b['binsFeat'].apply(lambda x: x.mid)

        binsErr = np.sort(b['bin'].unique())

        # number of samples per feature value / bin
        sampleCountFeat = b['binsFeat'].value_counts().to_dict()
        for errBin in binsErr:
            featCount = b.loc[b['bin'] == errBin]['binsFeat'].value_counts().to_dict()
            for key in featCount.keys():
                if sampleCountFeat[key] <= 0:
                    continue  # empty feature bin: avoid dividing by zero
                # share of this feature value's samples that fall into errBin
                featCount[key] = featCount[key] / sampleCountFeat[key]
            fig.add_trace(go.Bar(x=list(featCount.keys()), y=list(featCount.values()),name=f'Bin_{errBin}'))

        # share of all samples per feature value
        for key in sampleCountFeat.keys():
            sampleCountFeat[key] = sampleCountFeat[key] / overallSamples
        fig.add_trace(go.Scatter(x=list(sampleCountFeat.keys()), y=list(sampleCountFeat.values()),mode='markers',name=f'SampleCount'))

        fig.update_layout(title=str(feature_name))
        # one HTML file per column
        fig.write_html(name+"pde"+str(feature_name)+str(i)+".html",auto_open=auto_open)

In [None]:
visualizeDFErrorPDE(trainDf, 'trainDf')