In [1]:
from scipy.interpolate import griddata
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Input, BatchNormalization

tf.random.set_seed(42)
from sklearn.model_selection import train_test_split



2024-01-01 14:10:42.984054: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pandas as pd
import numpy as np
import plotly.offline as pyo
from plotly import subplots
import plotly.graph_objects as go

np.random.seed(42)

In [None]:
client = pd.read_csv("../../data/client.csv")
ePrices = pd.read_csv("../../data/electricity_prices.csv")
gasPrices = pd.read_csv("../../data/gas_prices.csv")
train = pd.read_csv("../../data/train.csv")

In [None]:
weatherPredInt = pd.read_csv("interpolPredWeather.csv")
weatherHistInt = pd.read_csv("../histWeatherSnowCover.csv")

# process data

### merge client & train, add business feat & interpolate daylight savings

In [None]:
producing = train.loc[train.is_consumption == 0]
consuming = train.loc[train.is_consumption == 1]
train = pd.merge(producing.drop('is_consumption',axis = 1), consuming.drop('is_consumption',axis = 1),on=['data_block_id','prediction_unit_id','datetime','county','is_business','product_type'], how='outer',suffixes=('_prod', '_cons'))
del producing, consuming
print(train.shape)

clientsTime = pd.merge(train, client, on=['county','is_business','product_type','data_block_id'], how='inner')

clientsTime['datetime'] = pd.to_datetime(clientsTime['datetime'])
clientsTime['yearday'] = clientsTime['datetime'].dt.day_of_year
clientsTime['weekday'] = clientsTime['datetime'].dt.day_of_week
clientsTime['month'] = clientsTime['datetime'].dt.month
clientsTime['monthday'] = clientsTime['datetime'].dt.day
clientsTime['year'] = clientsTime['datetime'].dt.year

unique_pairs = list(set(zip(clientsTime['is_business'], clientsTime[ 'product_type'])))
pair_index_dict = {pair: index for index, pair in enumerate(unique_pairs)}
clientsTime['business_prodType'] = list(map(pair_index_dict.get, zip(clientsTime['is_business'], clientsTime['product_type'])))

unique_pairs_cust = list(set(zip(clientsTime['is_business'], clientsTime[ 'product_type'], clientsTime['county'], clientsTime['eic_count'],clientsTime['installed_capacity'])))
pair_index_dict = {pair: index for index, pair in enumerate(unique_pairs_cust)}
clientsTime['ind_customer_id'] = list(map(pair_index_dict.get, zip(clientsTime['is_business'], clientsTime['product_type'], clientsTime['county'], clientsTime['eic_count'],clientsTime['installed_capacity'])))

# interpolate daylight savings
clientsTime = clientsTime.interpolate()

#### get holidays

In [None]:
import holidays
from datetime import date

us_holidays = holidays.EE()  # this is a dict

clientsTime['holiday'] = clientsTime['datetime'].apply(lambda s : s in us_holidays)

In [None]:
clientsTime['no_workday'] = ((clientsTime['holiday']) | (clientsTime['weekday'] > 4))

### prepare electric prices

In [None]:
ePrices.euros_per_mwh.plot()

In [None]:
ePrices['forecast_date'] = pd.to_datetime(ePrices['forecast_date'])

# Set 'timestamp' as the index
ePrices.set_index('forecast_date', inplace=True)

# Resample to fill missing hours
df_resampled = ePrices.resample('1H').asfreq()

# Linearly interpolate missing values
ePrices = df_resampled.interpolate(method='linear')

In [None]:
ePrices['euros_per_mwh'] = ePrices.euros_per_mwh.replace(4000.0, np.nan)

# Resample to fill missing hours
df_resampled = ePrices.resample('1H').asfreq()

# Linearly interpolate missing values
ePrices = df_resampled.interpolate(method='linear')

# write data into python arrays

In [None]:
#clientsTime, weatherPredInt, weatherHistInt, ePrices, gasPrices

featPredWeather = [
        #'latitude', 'longitude', 
        'County', #'forecast_datetime',
       'hours_ahead',
        'data_block_id', #'origin_datetime', 
       'temperatureint',
       'dewpointint', 'cloudcover_highint', 'cloudcover_lowint',
       'cloudcover_midint', 'cloudcover_totalint',
       '10_metre_u_wind_componentint', '10_metre_v_wind_componentint',
       'direct_solar_radiationint', 'surface_solar_radiation_downwardsint',
       'snowfallint', 'total_precipitationint', 
       #'sunrise', 'sunset',
       #'daylight', 'minDaylight'
       ]

predWeatherComplete = weatherPredInt[featPredWeather]
del weatherPredInt

featHistWeather = [
       #'latitude', 'longitude', 
       'County',
       #'datetime', 
       'data_block_id', 
       'temperatureint', 'dewpointint', 'rainint',
       'snowfallint', 'surface_pressureint', 'cloudcover_totalint',
       'cloudcover_lowint', 'cloudcover_midint', 'cloudcover_highint',
       'windspeed_10mint', 'winddirection_10mint', 'shortwave_radiationint',
       'direct_solar_radiationint', 'diffuse_radiationint', 'meltingInCM',
       'snowHeightFlux', 'snowcover'
]
histWeatherComplete = weatherHistInt[featHistWeather]
del weatherHistInt

# can't use client id as an axis in data because it's not constant!
y = np.zeros((clientsTime.shape[0],2))
y_indexes = np.zeros((clientsTime.shape[0],2)) # dataframe indexes in the end, not important for now
index_y = 0

customers = []

constValsArray = np.zeros((1,17))
targetsArray = np.zeros((1,24,2))
ePricesArray = np.zeros((1,24,1))

nFeatHistWeather = histWeatherComplete.shape[1]-1
histWeatherArray = np.zeros((1,24,nFeatHistWeather))
nFeatPredWeather = predWeatherComplete.shape[1]-1
predWeatherArray = np.zeros((1,24,nFeatPredWeather))
dataBlockIdArray = np.zeros((1))

# loop over customer, append all data cycles for each customer
for customerId in clientsTime.ind_customer_id.unique():
    customerSlice = clientsTime.loc[clientsTime.ind_customer_id == customerId]

    for dataBlockId in customerSlice.data_block_id.unique():
        timeSlice = customerSlice.loc[customerSlice.data_block_id == dataBlockId]
        gasSlice = gasPrices.loc[gasPrices.data_block_id == dataBlockId]
        eSlice   = ePrices.loc[ePrices.data_block_id == dataBlockId]

        if len(timeSlice.yearday.unique()) > 1:
            print(timeSlice.yearday.unique())

        # const values
        yearday = timeSlice.yearday.unique()[0]
        weekday = timeSlice.weekday.unique()[0]
        month   = timeSlice.month.unique()[0]
        monthday = timeSlice.monthday.unique()[0]
        year = timeSlice.year.unique()[0]

        holiday = timeSlice.holiday.unique()[0]
        noWorkDay = timeSlice.no_workday.unique()[0]

        county       = customerSlice.county.unique()[0]
        is_business  = customerSlice.is_business.unique()[0]
        product_type = customerSlice.product_type.unique()[0]
        prediction_unit_id = customerSlice.prediction_unit_id.unique()[0] #should be redundant
        eic_count          = customerSlice.eic_count.unique()[0]
        installed_capacity = customerSlice.installed_capacity.unique()[0]
    
        business_prodType = customerSlice.business_prodType.unique()[0]
        ind_customer_id   = customerSlice.ind_customer_id.unique()[0]

        

        lowest_price_per_mwh = gasSlice.lowest_price_per_mwh.iloc[0]
        highest_price_per_mwh = gasSlice.highest_price_per_mwh.iloc[0]
        euros_per_mwh = eSlice['euros_per_mwh'].to_numpy()
        if euros_per_mwh.shape[0] == 23:
            print(dataBlockId, customerId)

        y_cons = timeSlice['target_cons'] / installed_capacity
        y_prod = timeSlice['target_prod'] / installed_capacity

        histWeather = histWeatherComplete.loc[(histWeatherComplete.data_block_id == dataBlockId) & (histWeatherComplete.County == county)]
        predWeather = predWeatherComplete.loc[(predWeatherComplete.data_block_id == dataBlockId) & (predWeatherComplete.County == county)]

        histWeather = histWeather.drop('data_block_id', axis = 1)
        predWeather = predWeather.drop('data_block_id', axis = 1)


        if y_cons.shape[0] == 23:
            print(dataBlockId, customerId)

        new_row = np.array([county, is_business, product_type, prediction_unit_id, eic_count, installed_capacity, 
        business_prodType,ind_customer_id,lowest_price_per_mwh,highest_price_per_mwh,
        yearday,weekday,month,monthday,year,holiday,noWorkDay
        ])
        constValsArray = np.vstack((constValsArray, new_row))

        new_row = np.zeros((1,24,2))
        new_row[:,:,0] = y_cons
        new_row[:,:,1] = y_prod
        targetsArray = np.concatenate((targetsArray, new_row), axis=0)

        new_row = np.zeros((1,24,1))
        new_row[0,:,0] = euros_per_mwh
        ePricesArray = np.concatenate((ePricesArray, new_row), axis=0)

        new_row = np.zeros((1,24,nFeatHistWeather)) #without datablock id
        new_row[:,:,:] = histWeather
        histWeatherArray = np.concatenate((histWeatherArray, new_row), axis=0)

        new_row = np.zeros((1,24,nFeatPredWeather)) # withotu datablock id
        new_row[:,:,:] = predWeather
        predWeatherArray = np.concatenate((predWeatherArray, new_row), axis=0)

        dataBlockIdArray = np.concatenate((dataBlockIdArray, np.array([dataBlockId])))
    


In [None]:
#remove first rows
targetsArray     = targetsArray[1:,:,:]
constValsArray   = constValsArray[1:,:]
ePricesArray     = ePricesArray[1:,:,:]
histWeatherArray = histWeatherArray[1:,:,:]
predWeatherArray = predWeatherArray[1:,:,:]
dataBlockIdArray = dataBlockIdArray[1:]
print(targetsArray.shape, constValsArray.shape, ePricesArray.shape,histWeatherArray.shape, predWeatherArray.shape, dataBlockIdArray.shape)

In [None]:
#calc absolute targets, not normalized by capacity
targetsAbs = targetsArray.copy()
targetsAbs = targetsAbs * constValsArray[:,5].reshape(constValsArray.shape[0],1,1)
print(targetsAbs.shape)

In [None]:
print(targetsArray.shape, constValsArray.shape, ePricesArray.shape,histWeatherArray.shape, predWeatherArray.shape, dataBlockIdArray.shape)

In [None]:
np.savez('data_arrays_snowcover_holidays.npz', arr1=targetsArray, arr2=constValsArray, arr3=ePricesArray, arr4 = histWeatherArray, arr5=predWeatherArray, arr6=dataBlockIdArray, arr7=targetsAbs)

# load data & preselect data

In [34]:
loaded_data = np.load('data_arrays_absTarget_oldTarget.npz')

# Access individual arrays by their keys
targetsAbs = loaded_data['arr1']
constValsArray = loaded_data['arr2']
ePricesArray = loaded_data['arr3']
histWeatherArray = loaded_data['arr4']
predWeatherArray = loaded_data['arr5']
dataBlockIdArray = loaded_data['arr6']
oldTargetsAbs = loaded_data['arr7']


featPredWeather = [
        'County',
       'hours_ahead',
       # 'data_block_id',
       'temperatureint',
       'dewpointint', 'cloudcover_highint', 'cloudcover_lowint',
       'cloudcover_midint', 
       'cloudcover_totalint',
       '10_metre_u_wind_componentint', '10_metre_v_wind_componentint',
       'direct_solar_radiationint', 'surface_solar_radiation_downwardsint',
       'snowfallint', 'total_precipitationint', 
       ]

#correct hours
predWeatherArray[:,:,1] = predWeatherArray[:,:,1] - 24

# add sin of hours
sinDay = np.reshape(np.sin((2*predWeatherArray[:,:,1] - 24)*np.pi/24),(predWeatherArray.shape[0],predWeatherArray.shape[1],1))
predWeatherArray = np.concatenate((predWeatherArray, sinDay), axis=2)
featPredWeather = featPredWeather + ['sinDay']

cosDay = np.reshape(np.cos((2*predWeatherArray[:,:,1] - 24)*np.pi/24),(predWeatherArray.shape[0],predWeatherArray.shape[1],1))
predWeatherArray = np.concatenate((predWeatherArray, cosDay), axis=2)
featPredWeather = featPredWeather + ['cosDay']
nPredFeat = predWeatherArray.shape[2]


featConst =['county', 'is_business', 'product_type', 'prediction_unit_id', 'eic_count', 'installed_capacity', 'business_prodType', 'ind_customer_id',
'lowest_price_per_mwh','highest_price_per_mwh','yearday','weekday','month','monthday','year','holiday','no_workday'
]



# add sin of year day
yeardayIdx = 10
CapIdx = 5
sinYear = np.reshape(np.sin(2*constValsArray[:,yeardayIdx]*np.pi/365),(constValsArray.shape[0],1))
constValsArray = np.concatenate((constValsArray, sinYear), axis=1)
featConst = featConst + ['sinYearDay']

cosYear = np.reshape(np.cos(2*constValsArray[:,yeardayIdx]*np.pi/365),(constValsArray.shape[0],1))
constValsArray = np.concatenate((constValsArray, cosYear), axis=1)
featConst = featConst + ['cosYearDay']

nConst = constValsArray.shape[1]



featHistWeather = [
       'County', 
       #'data_block_id', 
       'temperatureint', 'dewpointint', 'rainint',
       'snowfallint', 'surface_pressureint', 'cloudcover_totalint',
       'cloudcover_lowint', 'cloudcover_midint', 'cloudcover_highint',
       'windspeed_10mint', 'winddirection_10mint', 'shortwave_radiationint',
       'direct_solar_radiationint', 'diffuse_radiationint', 'meltingInCM',
       'snowHeightFlux', 'snowcover'
]
#featHistWeatherKeepIdx = [1,10]
#histWeatherArray=histWeatherArray[:,:,featHistWeatherKeepIdx]

#featHistWeather = [featHistWeather[i] for i in featHistWeatherKeepIdx]
nHistFeat = histWeatherArray.shape[2]

print(nPredFeat,nHistFeat,nConst)

16 18 19


In [33]:
#featPredWeatherKeepIdx = [14,15] #only keep cos/sin day features
#predWeatherArray=predWeatherArray[:,:,featPredWeatherKeepIdx]
#featPredWeather = [featPredWeather[i] for i in featPredWeatherKeepIdx]
nPredFeat = predWeatherArray.shape[2]

#3,4 (prediction unit id, eic count)
#7 ind cust id
# 10 yearday
# 13 monthday
# 14 year
# 15 holiday
featConstKeepIdx = [0,1,2,5,6,8,9,10,11,12,16,17,18]
constValsArray=constValsArray[:,featConstKeepIdx]
featConst = [featConst[i] for i in featConstKeepIdx]
nConst = constValsArray.shape[1]
#yeardayIdx = 10
CapIdx = 3

#featHistWeatherKeepIdx = [1,10]
#histWeatherArray=histWeatherArray[:,:,featHistWeatherKeepIdx]
#featHistWeather = [featHistWeather[i] for i in featHistWeatherKeepIdx]
nHistFeat = histWeatherArray.shape[2]
print(nPredFeat,nHistFeat,nConst)

16 18 13


In [19]:
CapIdx = 3

# build a model & train it

#### split data based on time

In [23]:
trainSplit = int(max(dataBlockIdArray)*0.7)
mask = dataBlockIdArray < trainSplit

In [37]:
targetsArray = targetsAbs.copy()
targetsArray[:,:,0] = targetsArray[:,:,0] / constValsArray[:,3].reshape(constValsArray.shape[0],1)
targetsArray[:,:,1] = targetsArray[:,:,1] / constValsArray[:,3].reshape(constValsArray.shape[0],1)
shuffledArr = [constValsArray,ePricesArray,histWeatherArray,predWeatherArray,targetsAbs,targetsArray,dataBlockIdArray]

X_train = [shuffledArr[0][mask],shuffledArr[1][mask]]#,shuffledArr[2][mask],shuffledArr[3][mask]]
# only const and e prices
#X_train = [shuffledArr[0][mask],shuffledArr[1][mask],shuffledArr[3][mask]]
y_trainAbs = shuffledArr[4][mask]
y_train = shuffledArr[5][mask]

X_test = [shuffledArr[0][~mask],shuffledArr[1][~mask]]#,shuffledArr[2][~mask],shuffledArr[3][~mask]]
y_testAbs = shuffledArr[4][~mask]
y_test = shuffledArr[5][~mask]


divide by zero encountered in divide


divide by zero encountered in divide


invalid value encountered in divide



#### shuffle training data

In [None]:

index_array = np.arange(X_train[0].shape[0])
if 1: #not shuffle!!
    np.random.shuffle(index_array)

arrays = [X_train[0],X_train[1],X_train[2],X_train[3],y_trainAbs,y_train]
[X_train[0],X_train[1],X_train[2],X_train[3],y_trainAbs,y_train] = [arr[index_array] for arr in arrays]
del arrays

In [None]:
# reduce training data again
X_train[0]=X_train[0][:,featConstKeepIdx]
X_test[0]=X_test[0][:,featConstKeepIdx]
featConst = [featConst[i] for i in featConstKeepIdx]
nConst = X_test[0].shape[1]

#### calculate class weights

In [10]:
from sklearn.utils.class_weight import compute_sample_weight

# Calculate class weights
class_weights = compute_sample_weight(class_weight='balanced', y=X_train[0][:,4]) #y_train[:,:,0]) #, X=[X_train[0][:,[yeardayIdx,5,6]]])

class_weight_dict = dict(enumerate(class_weights))

In [None]:
# calculate sample weight based on target bins
targets = []
for i in range(0,y_train.shape[0]):
    targets.append(np.sum(y_train[i,:,0]))

classWeightDf = pd.DataFrame()
classWeightDf['y_consSum'] = targets

bins = np.linspace(0, classWeightDf['y_consSum'].max(), 26)
classWeightDf['bin'] = pd.cut(classWeightDf['y_consSum'], bins=bins)

frequencies = classWeightDf['bin'].value_counts().to_dict()
classWeightDf['sample_weights'] = classWeightDf['bin'].apply(lambda val: 1 / frequencies[val] if frequencies[val] >0 else 0)
classWeightDf['sample_weights'] = classWeightDf['sample_weights'] / classWeightDf['sample_weights'].mean()
print(classWeightDf.loc[classWeightDf['sample_weights'].isna()],
    classWeightDf.loc[classWeightDf['sample_weights']<= 0.0])

class_weight_dict = dict(enumerate(classWeightDf['sample_weights']))

#### use standard scaler for train data

In [None]:
# normalize the target
from sklearn.preprocessing import StandardScaler

scaler0 = StandardScaler()
X_train[0] = scaler0.fit_transform(X_train[0])
X_test[0]  = scaler0.transform(X_test[0])

#scaler1 = StandardScaler()
#X_train[1] = scaler1.fit_transform(X_train[1].reshape(-1,1)).reshape(-1,24,1)
#X_test[1]  = scaler1.transform(X_test[1].reshape(-1,1)).reshape(-1,24,1)

#scaler2 = StandardScaler()
#X_train[2] = scaler2.fit_transform(X_train[2])
#X_test[2]  = scaler2.transform(X_test[2])

#### use standard scaler for target

In [None]:
# normalize the target
from sklearn.preprocessing import StandardScaler

scalerT = StandardScaler()
y_trainAbsN_cons = scalerT.fit_transform(y_trainAbs[:,:,0].reshape(-1, 1)).reshape(-1,24,1)
y_testAbsN_cons  = scalerT.transform(y_testAbs[:,:,0].reshape(-1, 1)).reshape(-1,24,1)

#### use a label encoder for the eic count

In [None]:
from sklearn.preprocessing import LabelEncoder

eicCountIdx = featConstKeepIdx.index(4)
# Assuming cat_feature is a categorical feature
encoder = LabelEncoder()
constValsArray[:,eicCountIdx] = encoder.fit_transform(constValsArray[:,eicCountIdx])

### define model & train it

### random forrest

In [None]:
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

length=X_train[0].shape[0]
X_trainArray = X_train[0]
X_trainArray = np.concatenate([X_trainArray, X_train[1].reshape(length,24)], axis=1)


# Define Base Models
model1 = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))

# Train the Stacking Regressor on the training set
model1.fit(X_trainArray, y_trainAbs[:,:,0])


In [None]:
# Make predictions on the test set
final_predictions = model1.predict(X_trainArray)

# Evaluate the final stacked model
mse_stacked = mean_squared_error(y_trainAbs[:,:,0], final_predictions)
print(f"Mean Squared Error for Stacked Model train: {mse_stacked}")

length=X_test[0].shape[0]
X_testArray = X_test[0]
X_testArray = np.concatenate([X_testArray, X_test[1].reshape(length,24)], axis=1)
final_predictions = model1.predict(X_testArray)
mse_stacked = mean_squared_error(y_testAbs[:,:,0], final_predictions)
print(f"Mean Squared Error for Stacked Model test: {mse_stacked}")

### use a stacked model

In [None]:
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor


# Define Base Models
model1 = RandomForestRegressor(n_estimators=100, random_state=42)
model2 = GradientBoostingRegressor(n_estimators=100, random_state=42)
model_xgb = XGBRegressor(n_estimators=100, random_state=42)

# Define Meta Model
meta_model = MultiOutputRegressor(LinearRegression())

# Create a Stacking Regressor
stacked_model = StackingRegressor(
    estimators=[('rf', model1), ('gb', model2),('xgb', model_xgb),],
    final_estimator=meta_model
)

# Train the Stacking Regressor on the training set
stacked_model.fit(X_train, y_trainAbs[:,:,0])


In [None]:
# Make predictions on the test set
final_predictions = stacked_model.predict(X_test)

# Evaluate the final stacked model
mse_stacked = mean_squared_error(y_testAbs[:,:,0], final_predictions)
print(f"Mean Squared Error for Stacked Model: {mse_stacked}")

### use a feed forward model for production and consumption separately

In [None]:
# production
# Define the inputs
weatherPred_inputs1 = tf.keras.Input(shape=(24, 10), name='weatherPred_inputs1')
ePrices_inputs1 = tf.keras.Input(shape=(24, 1), name='ePrices_inputs1')

weatherHist_input1 = tf.keras.Input(shape=(24, 5), name='weatherHist_input1')
constant_inputs1 = tf.keras.Input(shape=(14,), name='constant_inputs1')

# Process the time-based inputs
weatherPred_flattened1 = layers.Flatten()(weatherPred_inputs1)
for i in range(0,3):
    weatherPred_flattened1 = layers.Dense(240, activation='relu')(weatherPred_flattened1)

weatherHist_flattened1 = layers.Flatten()(weatherHist_input1)
for i in range(0,3):
    weatherHist_flattened1 = layers.Dense(120, activation='relu')(weatherHist_flattened1)

ePrices_layer1 = layers.Flatten()(ePrices_inputs1)
ePrices_layer1 = layers.Dense(24, activation='relu')(ePrices_layer1)

# Concatenate all inputs
x1 = layers.Concatenate()([weatherPred_flattened1, weatherHist_flattened1, ePrices_layer1, constant_inputs1])

# Main dense block
for i in range(0,5):
    x1 = layers.Dense(540, activation='relu')(x1)
    #x = layers.Dropout(0.1)(x)

# Output layer for 24*2 targets
output_layer1 = layers.Dense(24 * 1, activation='linear', name='output1')(x1)
output_layer1 = layers.Reshape((24, 1))(output_layer1)

model2 = models.Model(inputs=[constant_inputs1,ePrices_inputs1, weatherHist_input1, weatherPred_inputs1], outputs=output_layer1)

# Compile the model with an appropriate loss function and optimizer
model2.compile(optimizer='adam', loss='mae')#, metrics=['mae'])
# 0 = cons, 1 = prod
history2 = model2.fit(X_train, y_trainAbs[:,:,1], validation_data=(X_test, y_testAbs[:,:,1]), epochs=100, batch_size=400) #batch size = 110 weather station

In [15]:
#consumption

#weatherPred_inputs2 = tf.keras.Input(shape=(24, nPredFeat), name='weatherPred_inputs2')
ePrices_inputs2 = tf.keras.Input(shape=(24, 1), name='ePrices_inputs2')
#weatherHist_input2 = tf.keras.Input(shape=(24, nHistFeat), name='weatherHist_input2')
constant_inputs2 = tf.keras.Input(shape=(nConst,), name='constant_inputs2')

#weatherPred_inputs2_normalized = BatchNormalization()(weatherPred_inputs2)
ePrices_inputs2_normalized = BatchNormalization()(ePrices_inputs2)
#weatherHist_input2_normalized = BatchNormalization()(weatherHist_input2)
constant_inputs2_normalized = BatchNormalization()(constant_inputs2)


if 1:
    #weatherPred_flattened2 = layers.Flatten()(weatherPred_inputs2_normalized)
    #for i in range(0,1):
    #    weatherPred_flattened2 = layers.Dense(24*nPredFeat, activation='relu')(weatherPred_flattened2)
    #    #weatherPred_flattened2 = layers.Dropout(0.1)(weatherPred_flattened2)
    #weatherHist_flattened2 = layers.Flatten()(weatherHist_input2_normalized)
    #for i in range(0,1):
    #    weatherHist_flattened2 = layers.Dense(24*nHistFeat, activation='relu')(weatherHist_flattened2)
    #    #weatherHist_flattened2 = layers.Dropout(0.1)(weatherHist_flattened2)
    ePrices_layer2 = layers.Flatten()(ePrices_inputs2_normalized)
    for i in range(0,1):
        ePrices_layer2 = layers.Dense(24, activation='relu')(ePrices_layer2)
        #ePrices_layer2 = layers.Dropout(0.1)(ePrices_layer2)

    constInp_layer2 = layers.Dense(nConst, activation='relu')(constant_inputs2_normalized)
    for i in range(0,0):
        constInp_layer2 = layers.Dense(16, activation='relu')(constInp_layer2)
    #constInp_layer2 = layers.Dense(1, activation='relu')(constInp_layer2)

    # Concatenate all inputs
    #x2 = layers.Concatenate()([weatherPred_flattened2, ePrices_layer2,weatherHist_flattened2])
    x2 = ePrices_layer2
    for i in range(0,10):
        x2 = layers.Dense(x2.shape[-1], activation='relu')(x2)

    #for i in range(0,3):
    #    x2 = layers.Dense(int(x2.shape[-1]/(i+1)), activation='relu')(x2)
    #    print(x2.shape)
    
    x2 = layers.Concatenate()([x2,constInp_layer2])
    for i in range(0,4):
        x2 = layers.Dense(int(x2.shape[-1]*(i+1)), activation='relu')(x2)
        print(x2.shape)
    for i in range(0,2):
        x2 = layers.Dense(int(x2.shape[-1]/(i+2)), activation='relu')(x2)
        print(x2.shape)
    y = x2

# somehow logical model
if 0:
    #x2 = layers.Concatenate()([constInp_layer2]) #ePrices_layer2
    constInp_layer2 = layers.Dense(nConst, activation='relu')(constant_inputs2_normalized)
    constInp_layer2 = layers.Dense(nConst, activation='relu')(constInp_layer2)
    #constInp_layer2 = layers.Dense(1, activation='relu')(constInp_layer2)
    #constInp_layer2 = layers.Dense(16, activation='relu')(constInp_layer2)


    ePrices_layer2 = ePrices_inputs2_normalized[:,i,:]
    ePrices_layer2 = layers.Dense(24, activation='relu')(ePrices_layer2)
    ePrices_layer2 = BatchNormalization()(ePrices_layer2)
    #trend = tf.keras.layers.Add()([ePrices_layer2,  ePrices_inputs2_normalized[:,i,:]])
    trend = [ePrices_layer2]#, ePrices_inputs2_normalized[:,i,:]]
    #trend = ePrices_layer2
    for i in range(0,nPredFeat):
        weatherPred_flattened2 = weatherPred_inputs2_normalized[:,:,i]
        predLayer = layers.Dense(24, activation='relu')(weatherPred_flattened2)
        predLayer = BatchNormalization()(predLayer)
        #predLayer = tf.keras.layers.Add()([weatherPred_flattened2, predLayer])
        #trend.append(weatherPred_flattened2)
        trend.append(predLayer)

    for i in range(0,nHistFeat):
        weatherHist_flattened2 = weatherHist_input2_normalized[:,:,i]
        histLayer = layers.Dense(24, activation='relu')(weatherHist_flattened2)
        histLayer = BatchNormalization()(histLayer)
        #histLayer = tf.keras.layers.Add()([histLayer, weatherHist_flattened2])
        #trend = tf.keras.layers.Add()([trend, histLayer])
        #trend.append(weatherHist_flattened2)
        trend.append(histLayer)

    #trend = BatchNormalization()(trend)
    #for i in range(0,1):
    #    trend = layers.Dense(24, activation='relu')(trend)



    y = []
    for i in range(0,24):
        trendVals = []
        for layer in trend:
            trendVals.append(tf.expand_dims(layer[:,i], axis=-1))
        trendi = tf.concat(trendVals, axis = -1)
        weatherPred_flattened2 = weatherPred_inputs2_normalized[:,i,:]
        weatherHist_flattened2 = weatherHist_input2_normalized[:,i,:]
        ePrices_layer2         = ePrices_inputs2_normalized[:,i,:]
        #print(weatherPred_flattened2.shape,weatherHist_flattened2.shape, ePrices_layer2.shape)
        x1 = layers.Concatenate()([weatherPred_flattened2, ePrices_layer2,weatherHist_flattened2, trendi])
        x1 = layers.Dense(x1.shape[-1], activation='relu')(x1)
        x2 = layers.Concatenate()([x1, constInp_layer2])

        for i in range(0,10):
            x2 = layers.Dense(x2.shape[-1], activation='relu')(x2) #making it bigger helps a bit, but not super significant
        #x2 = layers.Dense(int(x2.shape[-1]/2), activation='relu')(x2)

        x2 = layers.Dense(1, activation='relu')(x2)
        y.append(x2)
        #x = layers.Dropout(0.1)(x)
    y = layers.Concatenate()(y)
    print(y.shape)
#for i in range(0,3):
#    y = layers.Dense(24, activation='relu')(y)


# Output layer for 24*2 targets
output_layer2 = layers.Dense(24 * 1, activation='relu', name='output')(y)
#output_layer2 = layers.Reshape((24, 1))(output_layer2)

#model3 = tf.keras.Model(inputs=[constant_inputs2,ePrices_inputs2, weatherHist_input2, weatherPred_inputs2], outputs=output_layer2)
model3 = tf.keras.Model(inputs=[constant_inputs2,ePrices_inputs2], outputs=output_layer2)


def absolute_loss(y_true, y_pred):
    return tf.reduce_sum(tf.abs(y_true - y_pred))

class CustomMetric(tf.keras.metrics.Metric):
    def __init__(self, x_feature, name="custom_metric", **kwargs):
        super().__init__(name=name, **kwargs)
        self.x_feature = x_feature
        self.abs_diff = self.add_weight(name="abs_diff", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        # Expand dimensions of x_feature for broadcasting
        x_feature_expanded = tf.expand_dims(self.x_feature, axis=-1)

        # Multiply each element in y_true and y_pred by the corresponding element in x_feature
        multiplied_y_true = y_true * x_feature_expanded
        multiplied_y_pred = y_pred * x_feature_expanded

        # Update the abs_diff state with the mean absolute difference
        self.abs_diff.assign_add(tf.reduced_mean(tf.abs(multiplied_y_true - multiplied_y_pred)))

    def result(self):
        # Return the final result (mean absolute difference)
        return self.abs_diff

    def reset_states(self):
        # Reset the accumulated states at the end of each epoch or batch
        self.abs_diff.assign(0.0)

# Create an instance of the CustomMetric with x_feature


x_feature = constant_inputs2[:, 3] # ATTENTION, changes!!
custom_metric = CustomMetric(x_feature)

model3.compile(optimizer='adam', loss='mae')#, metrics=custom_metric)#absolute_loss, metrics=['mae'])
# 0 = cons, 1 = prod
history3 = model3.fit(X_train, y_train[:,:,0], validation_data=(X_test, y_test[:,:,0]), epochs=1000, batch_size=5000, class_weight=class_weight_dict)

(None, 37)
(None, 74)
(None, 222)
(None, 888)
(None, 444)
(None, 148)
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
E

In [21]:
#model3.optimizer.lr.assign(0.005)  # Set a new learning rate for subsequent calls to fit
history3 = model3.fit(X_train, y_train[:,:,0], validation_data=(X_test, y_test[:,:,0]), epochs=1000, batch_size=100000, class_weight=class_weight_dict)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [None]:
fig = go.Figure()
length = len(history3.history['loss'])
fig.add_trace(go.Scatter(x=np.arange(1, length+1), y=history3.history['loss'], mode='lines', name='Train Loss'))
fig.add_trace(go.Scatter(x=np.arange(1, length+1), y=history3.history['val_loss'], mode='lines', name='Validation Loss'))
fig.update_layout(
    title='Training and Validation Loss Over Epochs',
    xaxis=dict(title='Epoch'),
    yaxis=dict(title='Loss'),
    legend=dict(x=0, y=1, traceorder='normal'),
)
fig.show()

In [20]:
a = model3.predict(X_test)
predVals = a*X_test[0][:,CapIdx].reshape(X_test[0].shape[0],1)
print(np.mean(np.abs(y_testAbs[:,:,0] - predVals)))
a = model3.predict(X_train)
predVals = a*X_train[0][:,CapIdx].reshape(X_train[0].shape[0],1)
trueVals = targetsAbs[0:trainSplit,:]
print(np.mean(np.abs(y_trainAbs[:,:,0]  - predVals)))
del a,predVals,trueVals

#only const feat
# 124/212 
# 91/181 -> no yearday
# 66/180 -> no yearday, holiday feat
# no normalization but scaler 60/200 -> bad
# same architecture, layer normalization  187/112 optimum,way better

# way better results with mae, rather than absolute loss
# 104/175 -> no yearday, weekday, monthday, holiday feat

# with e prices
# 98/178 (before overfitting)
# 72/177 (before overfitting) smaller network
# 91/175 (before overfitting) smaller network + relu activation function in last layer
# 82/173 (smaller network, relu)

# multiplicative
# 110/180
# concat
# 110/175

# with pred temp
# 95/186

# 34/67

# mean cons error single traines =35-37 (train) 53-57 (test) -> 34/63 -> 32/55
# mean prod error single trainer = 9-20 -> 10/18

682.7754489150414
412.0196809557261


In [None]:
# reverse target scaling
a = model3.predict(X_test)
predVals = scalerT.inverse_transform(a.reshape(-1,1))
print(np.mean(np.abs(y_testAbs[:,:,0] - predVals.reshape(-1,24))))
a = model3.predict(X_train)
predVals = scalerT.inverse_transform(a.reshape(-1,1))
trueVals = targetsAbs[0:trainSplit,:]
print(np.mean(np.abs(y_trainAbs[:,:,0]  - predVals.reshape(-1,24))))
del a,predVals,trueVals

In [None]:
#model2.save('prod_model_.h5')
model3.save('cons_model_big_model3_prodBusWeighted.h5')

### transformer net
maximum around 140 test error

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, MultiHeadAttention, LayerNormalization, Flatten,Reshape
from tensorflow.keras.models import Model


# Encoder
weatherPred_inputs3 = tf.keras.Input(shape=(24, nPredFeat), name='weatherPred_inputs2')
ePrices_inputs3 = tf.keras.Input(shape=(24, 1), name='ePrices_inputs2')
#weatherHist_input2 = tf.keras.Input(shape=(24, 2), name='weatherHist_input2')
constant_inputs3 = tf.keras.Input(shape=(nConst,), name='constant_inputs2')
weatherPred_inputs3_normalized = BatchNormalization()(weatherPred_inputs3)
ePrices_inputs3_normalized = BatchNormalization()(ePrices_inputs3)
#weatherHist_input2_normalized = BatchNormalization()(weatherHist_input2)
constant_inputs3_normalized = BatchNormalization()(constant_inputs3)

encoder_inputs = tf.concat([weatherPred_inputs3_normalized, ePrices_inputs3_normalized], axis=-1)
#Input(shape=(24,num_transformer_blocks), name="encoder_inputs")

attention_outputs = []
for i in range(encoder_inputs.shape[-1]):
    x = encoder_inputs[:, :, i]
    x = Reshape((-1, 1))(x)
    y = x
    x = MultiHeadAttention(num_heads=4, key_dim=8, value_dim=24)(x, x)
    #print(x.shape)
    #x = Dense(24, activation="relu")(x)
    #x = Dense(32, activation="softmax")(x)
    #x = LayerNormalization(epsilon=1e-6)(x)
    x = tf.keras.layers.Add()([x, y])
    #x = LayerNormalization(epsilon=1e-6)(x)
    x = Dense(24, activation="relu")(x)
    #x = Dense(1, activation="relu")(x)
    attention_outputs.append(x)

y = tf.concat(attention_outputs, axis=-1)
#x = LayerNormalization(epsilon=1e-6)(x)
y = Dense(144, activation="relu")(y)
#y = LayerNormalization(epsilon=1e-6)(y)
y = Dense(128, activation="relu")(y)
y = Dense(128, activation="relu")(y)
y = Dense(128, activation="relu")(y)
#y = Dense(24, activation="relu")(y)
#y = Dense(24, activation="relu")(y)
#y = Dense(24, activation="relu")(y)

#y = tf.keras.layers.Add()([x, y])  # Residual connection

# Flatten the encoder output
encoder_outputs = Flatten()(y)

print(encoder_outputs)
# Concatenate encoder and decoder outputs
concatenated_outputs = tf.concat([encoder_outputs, constant_inputs3_normalized],axis=-1)

# Multi-layer perceptron (MLP) for final prediction
#concatenated_outputs = Dense(256, activation='relu')(concatenated_outputs)
concatenated_outputs = Dense(128, activation='relu')(concatenated_outputs)
for _ in range(10):
    concatenated_outputs = Dense(128, activation='relu')(concatenated_outputs)
final_output = Dense(24, activation="linear")(concatenated_outputs)

# Build the model
transformer_model = Model(inputs=[constant_inputs3, ePrices_inputs3, weatherPred_inputs3], outputs=final_output)

# Compile the model
transformer_model.compile(optimizer='adam', loss='mae')

history3 = transformer_model.fit(X_train, y_trainAbs[:,:,0], validation_data=(X_test, y_testAbs[:,:,0]), epochs=20, batch_size=250, class_weight=class_weight_dict)


In [None]:
transformer_model.save('transformer1.h5')

In [None]:
fig = go.Figure()
length = len(history3.history['loss'])
fig.add_trace(go.Scatter(x=np.arange(1, length+1), y=history3.history['loss'], mode='lines', name='Train Loss'))
fig.add_trace(go.Scatter(x=np.arange(1, length+1), y=history3.history['val_loss'], mode='lines', name='Validation Loss'))
fig.update_layout(
    title='Training and Validation Loss Over Epochs',
    xaxis=dict(title='Epoch'),
    yaxis=dict(title='Loss'),
    legend=dict(x=0, y=1, traceorder='normal'),
)
fig.show()

# reconstruct dataframe and analyse results

In [132]:
# restructure dataframes, SHOULD BE CONST IF WE DON'T CHANGE MASK & NOT SHUFFLE
storedArr = shuffledArr
#storedArr[0]=storedArr[0][:,featConstKeepIdx]

appendedList = []
shuffledDataBlockId = storedArr[6]
for i,data_block_id in enumerate(shuffledDataBlockId):
    isTrain = mask[i]
    
    new_columns = np.zeros((24, 2))  # Adjust the size according to your requirements

    #use targets as first block
    dataBlock = storedArr[4][i,:,:].reshape(24,2)

    #add train/test info & data block id
    new_columns = np.ones((24, 2)) * [isTrain, data_block_id]
    dataBlock = np.hstack((dataBlock, new_columns))
    #add const values
    new_columns = np.ones((24, storedArr[0].shape[1])) * storedArr[0][i,:].reshape(1,storedArr[0].shape[1])
    dataBlock = np.hstack((dataBlock, new_columns))
    #eprices (of yesterday)
    dataBlock = np.hstack((dataBlock, storedArr[1][i,:,:].reshape(24,1)))
    # add weather prediction
    dataBlock = np.hstack((dataBlock, storedArr[3][i,:,:].reshape(24,storedArr[3].shape[2])))
    # hist weather
    dataBlock = np.hstack((dataBlock, storedArr[2][i,:,:].reshape(24,storedArr[2].shape[2])))

    #hist weather, flatten the array and add all values as columns
    #new_columns = np.ones((24, 100)) * shuffledArr[2][i,:].reshape(1,100)
    #dataBlock = np.hstack((dataBlock, new_columns))

    appendedList.append(dataBlock)
featPredWeather2 = []
for f in featPredWeather:
    featPredWeather2.append(f+str('_pred'))


feat = ['y_cons','y_prod','isTrain','data_block_id'] + featConst + ['ePrices'] + featPredWeather2 + featHistWeather
#for i in range(0,10):
#    feat = feat + [f'{value}_'+str(i) for value in featHistWeather]

appendedDf = pd.DataFrame(np.vstack(appendedList), columns=feat)

from datetime import datetime, timedelta
appendedDf['unique_time'] = appendedDf.apply(lambda row: datetime(row['year'].astype(int), 1, 1) + timedelta(days=row['yearday'] - 1, hours=row['hours_ahead_pred']-24), axis=1)
#appendedDf['unique_time'] = appendedDf.apply(lambda row: datetime(2022, 1, 1) + timedelta(days=row['yearday'] - 1, hours=row['hours_ahead']-24), axis=1)
    
            

In [None]:
#featConstKeepIdx = [0,1,2,5,6,8,9,13,16,17,18]
#storedArr[0]=storedArr[0][:,featConstKeepIdx]

In [26]:
X2 = [storedArr[0],storedArr[1],storedArr[2],storedArr[3]]
#X2 = [storedArr[0],storedArr[1],storedArr[3]]
#X2[0]  = scaler0.transform(X2[0])
pred_cons = model3.predict(X2)

#pred_cons = pred_cons*X2[0][:,CapIdx].reshape(X2[0].shape[0],1)
#pred_cons = scalerT.inverse_transform(pred_cons.reshape(-1,1)).reshape(-1,24,1)

appendedListProd = []
for i in range(0,pred_cons.shape[0]):
    appendedListProd.append(pred_cons[i,:].reshape(24,1))
    
appendedDf['pred_cons'] = np.vstack(appendedListProd)

appendedDf['absErr_cons'] = (appendedDf['y_cons']-appendedDf['pred_cons'])
print(X2[0].shape)



ValueError: in user code:

    File "/Users/matthiaskargl/opt/anaconda3/envs/predictenegycons/lib/python3.10/site-packages/keras/engine/training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "/Users/matthiaskargl/opt/anaconda3/envs/predictenegycons/lib/python3.10/site-packages/keras/engine/training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/matthiaskargl/opt/anaconda3/envs/predictenegycons/lib/python3.10/site-packages/keras/engine/training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "/Users/matthiaskargl/opt/anaconda3/envs/predictenegycons/lib/python3.10/site-packages/keras/engine/training.py", line 1983, in predict_step
        return self(x, training=False)
    File "/Users/matthiaskargl/opt/anaconda3/envs/predictenegycons/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/matthiaskargl/opt/anaconda3/envs/predictenegycons/lib/python3.10/site-packages/keras/engine/input_spec.py", line 216, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model_3" expects 2 input(s), but it received 4 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 19) dtype=float32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 24, 1) dtype=float32>, <tf.Tensor 'IteratorGetNext:2' shape=(None, 24, 18) dtype=float32>, <tf.Tensor 'IteratorGetNext:3' shape=(None, 24, 16) dtype=float32>]


# analyze dataframe
good generalization, test and train are pretty much always the same

production:
- overweighting big producers -> lots of errors for smaller capacities

consumption:
- bigger error in summer


In [27]:
trainDf = appendedDf.loc[appendedDf.isTrain == True]
testDf = appendedDf.loc[appendedDf.isTrain == False]

# investigate consumption target

In [133]:
appendedDf.columns

Index(['y_cons', 'y_prod', 'isTrain', 'data_block_id', 'county', 'is_business',
       'product_type', 'prediction_unit_id', 'eic_count', 'installed_capacity',
       'business_prodType', 'ind_customer_id', 'lowest_price_per_mwh',
       'highest_price_per_mwh', 'yearday', 'weekday', 'month', 'monthday',
       'year', 'holiday', 'no_workday', 'sinYearDay', 'cosYearDay', 'ePrices',
       'County_pred', 'hours_ahead_pred', 'temperatureint_pred',
       'dewpointint_pred', 'cloudcover_highint_pred', 'cloudcover_lowint_pred',
       'cloudcover_midint_pred', 'cloudcover_totalint_pred',
       '10_metre_u_wind_componentint_pred',
       '10_metre_v_wind_componentint_pred', 'direct_solar_radiationint_pred',
       'surface_solar_radiation_downwardsint_pred', 'snowfallint_pred',
       'total_precipitationint_pred', 'sinDay_pred', 'cosDay_pred', 'County',
       'temperatureint', 'dewpointint', 'rainint', 'snowfallint',
       'surface_pressureint', 'cloudcover_totalint', 'cloudcover_lowint

In [157]:
appendedDf['ind_customer_id2'] = appendedDf.groupby(['county', 'is_business','product_type','business_prodType']).ngroup() + 1 #,'eic_count'
appendedDf['combinedTarg'] = appendedDf['y_cons'] + appendedDf['y_prod']

# maybe the consumption at night == base & during day it's lower when we have solar power!!
# consumption 
appendedDf['value_changed_eic'] = appendedDf['eic_count'] != appendedDf['eic_count'].shift(1)
appendedDf['value_changed_cap'] = appendedDf['installed_capacity'] != appendedDf['installed_capacity'].shift(1)

In [189]:
appendedDf.loc[appendedDf.product_type == 2]['y_prod'].mean()

5.519197481776011

consumption depends on:
- production
- hour
- year
- temperature!! long term trend very much connected to temperature
    - for business it might be that different customers are more significant -> split into customers needed

production:
- installed capacity
- solar radiation

General:
- combined consumption & production is important
- if production higher -> consumption lower
- "Grundumsatz" more or less similar between nights, but influenced by temperatures over the year!!

USUALLY peaks around 8:00 & 19:00, bottom around 15:00 & 1:00 (night is lower)
with solar power, bottom is at 12! (solar peak at 12)

eic count & capacity change at the same time! (only 7 excemptions) -> 90% false positives from calculation

In [193]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

pd.set_option('Display.max_columns',None)
a = appendedDf.loc[(appendedDf.ind_customer_id2 == 7)]# & (appendedDf.hours_ahead_pred == 0)]#.groupby('yearday').min()#['y_cons']#
#a = appendedDf.loc[appendedDf.data_block_id == 124]
print('business',a.is_business.unique(),'prodType', a.product_type.unique(),'county',a.county.unique())
fig = make_subplots(rows=3,cols=1)
#for data_block_id in a.data_block_id.unique():
#    a_sub = a.loc[a.data_block_id == data_block_id]
#    fig.add_trace(go.Scatter(x=a_sub.hours_ahead_pred, y=a_sub.y_cons, mode='lines', name='y_cons', legendgroup=data_block_id),row=1,col=1)
#    fig.add_trace(go.Scatter(x=a_sub.hours_ahead_pred, y=a_sub.y_prod, mode='lines', name='y_prod', legendgroup=data_block_id),row=2,col=1)
#    fig.add_trace(go.Scatter(x=a_sub.hours_ahead_pred, y=a_sub.combinedTarg, mode='lines', name='y_prod', legendgroup=data_block_id),row=3,col=1)#

#fig.show()

fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=a.unique_time, y=a.y_cons, mode='lines', name='y_cons'))
fig2.add_trace(go.Scatter(x=a.unique_time, y=a.y_prod, mode='lines', name='y_prod'))
fig2.add_trace(go.Scatter(x=a.unique_time, y=a.combinedTarg, mode='lines', name='comb'))
fig2.add_trace(go.Scatter(x=a.unique_time, y=a.temperatureint*100, mode='lines', name='temp'))
fig2.add_trace(go.Scatter(x=a.unique_time, y=a.installed_capacity, mode='lines', name='cap'))
#fig2.add_trace(go.Scatter(x=a.unique_time, y=a.lowest_price_per_mwh, mode='lines', name='lowest_price_per_mwh'))
#fig2.add_trace(go.Scatter(x=a.unique_time, y=a.highest_price_per_mwh, mode='lines', name='highest_price_per_mwh'))
#fig2.add_trace(go.Scatter(x=a.unique_time, y=a.ePrices, mode='lines', name='ePrices'))
fig2.show()

business [1.] prodType [3.] county [0.]


In [115]:
fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=a.unique_time, y=a.y_cons, mode='lines', name='y_cons'))
fig2.add_trace(go.Scatter(x=a.unique_time, y=a.y_prod, mode='lines', name='y_prod'))
fig2.add_trace(go.Scatter(x=a.unique_time, y=a.combinedTarg, mode='lines', name='comb'))
fig2.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [98]:
a.loc[a.hours_ahead == 8]

Unnamed: 0,y_cons,y_prod,isTrain,data_block_id,county,is_business,product_type,prediction_unit_id,eic_count,installed_capacity,business_prodType,ind_customer_id,lowest_price_per_mwh,highest_price_per_mwh,yearday,weekday,month,monthday,year,holiday,no_workday,sinYearDay,cosYearDay,ePrices,County,hours_ahead,temperatureint,dewpointint,cloudcover_highint,cloudcover_lowint,cloudcover_midint,cloudcover_totalint,10_metre_u_wind_componentint,10_metre_v_wind_componentint,direct_solar_radiationint,surface_solar_radiation_downwardsint,snowfallint,total_precipitationint,sinDay,cosDay,County.1,temperatureint.1,dewpointint.1,rainint,snowfallint.1,surface_pressureint,cloudcover_totalint.1,cloudcover_lowint.1,cloudcover_midint.1,cloudcover_highint.1,windspeed_10mint,winddirection_10mint,shortwave_radiationint,direct_solar_radiationint.1,diffuse_radiationint,meltingInCM,snowHeightFlux,snowcover,unique_time,ind_customer_id2,combinedTarg
239120,5688.884,0.002,1.0,124.0,11.0,1.0,3.0,48.0,118.0,5803.9,6.0,100.0,82.74,89.99,3.0,0.0,1.0,3.0,2022.0,0.0,0.0,0.05162,0.998667,48.95,11.0,8.0,-3.813776,-5.072595,1.0,0.99999,0.563822,1.000001,-2.440021,1.149926,0.0,6.981019,1e-06,1e-06,-0.866025,0.5,11.0,-6.849833,-9.50182,0.0,0.0,1009.468668,15.819536,17.177134,0.0,0.0,7.358036,176.528058,0.0,0.0,0.0,-0.0,0.0,20.67087,2022-01-02 08:00:00,1094,5688.886
239144,6485.259,0.0,1.0,125.0,11.0,1.0,3.0,48.0,118.0,5803.9,6.0,100.0,75.0,90.0,4.0,1.0,1.0,4.0,2022.0,0.0,0.0,0.068802,0.99763,131.47,11.0,8.0,1.101377,0.520333,0.011075,1.0,0.757791,1.0,0.322856,3.944953,0.0,3.53128,0.000113,0.00032,-0.866025,0.5,11.0,-5.260934,-7.658447,0.0,0.0,998.080589,100.0,98.0,79.574467,98.097665,5.272765,294.634211,0.0,0.0,0.0,-0.0,0.0,21.316284,2022-01-03 08:00:00,1094,6485.259
239168,6588.234,0.005,1.0,126.0,11.0,1.0,3.0,48.0,118.0,5803.9,6.0,100.0,80.0,97.79,5.0,2.0,1.0,5.0,2022.0,0.0,0.0,0.085965,0.996298,174.61,11.0,8.0,0.712437,0.358603,0.998824,1.0,1.0,1.0,1.202228,3.952595,0.0,3.806074,0.000516,0.000732,-0.866025,0.5,11.0,-2.538914,-3.390901,0.0,0.261805,987.231908,100.0,87.572937,99.642402,47.941149,2.919152,286.178948,0.0,0.0,0.0,-0.0,0.261805,22.158024,2022-01-04 08:00:00,1094,6588.239
239192,6673.206,0.01,1.0,127.0,11.0,1.0,3.0,48.0,118.0,5803.9,6.0,100.0,89.0,112.6,6.0,3.0,1.0,6.0,2022.0,0.0,0.0,0.103102,0.994671,147.74,11.0,8.0,-1.511758,-2.179047,0.0,1.0,0.289713,1.000006,4.107317,-0.977171,0.049358,8.080908,2.9e-05,3e-05,-0.866025,0.5,11.0,1.94222,1.790234,0.23576,0.0,983.348954,99.357598,99.357598,24.954824,6.698518,4.186491,237.892331,0.0,0.0,0.0,-0.1748,-0.1748,20.639336,2022-01-05 08:00:00,1094,6673.216
239216,6571.488,0.001,1.0,128.0,11.0,1.0,3.0,48.0,118.0,5803.9,6.0,100.0,92.74,108.0,7.0,4.0,1.0,7.0,2022.0,0.0,0.0,0.120208,0.992749,136.88,11.0,8.0,-8.021141,-11.624007,0.0,0.208692,0.429005,0.559816,5.225091,-1.411591,128.160798,18.860921,0.0,0.0,-0.866025,0.5,11.0,-1.082954,-2.108947,0.0,0.051805,981.522961,100.0,73.961782,91.675462,9.384282,4.471439,232.505002,0.0,0.0,0.0,-0.0,0.051805,19.812301,2022-01-06 08:00:00,1094,6571.489
239240,3758.523,0.0,1.0,129.0,11.0,1.0,3.0,48.0,118.0,5803.9,6.0,100.0,97.0,102.94,8.0,5.0,1.0,8.0,2022.0,0.0,1.0,0.137279,0.990532,175.66,11.0,8.0,-6.366041,-8.109536,0.999639,1.0,0.906303,1.0,-2.856708,0.877213,0.00527,6.145826,3.5e-05,3.6e-05,-0.866025,0.5,11.0,-7.006793,-9.620533,0.0,0.0,992.788201,27.562635,30.757964,0.0,0.0,4.951707,138.359412,0.0,0.0,0.0,-0.0,0.0,22.519583,2022-01-07 08:00:00,1094,3758.523
239264,3145.923,0.0,1.0,130.0,11.0,1.0,3.0,48.0,118.0,5803.9,6.0,100.0,94.0,100.0,9.0,6.0,1.0,9.0,2022.0,0.0,1.0,0.154309,0.988023,182.03,11.0,8.0,-2.053788,-3.555338,0.996974,0.582212,0.739533,0.999486,1.567011,3.760533,0.116729,9.99264,1.1e-05,1.2e-05,-0.866025,0.5,11.0,-6.742553,-9.334273,0.0,0.0,1001.627935,64.467095,71.791633,0.0,0.0,4.88466,161.602667,0.0,0.0,0.0,-0.0,0.0,22.795061,2022-01-08 08:00:00,1094,3145.923
239288,6565.505,0.0,1.0,131.0,11.0,1.0,3.0,48.0,118.0,5803.9,6.0,100.0,94.0,103.99,10.0,0.0,1.0,10.0,2022.0,0.0,0.0,0.171293,0.98522,97.26,11.0,8.0,-2.886459,-3.796314,0.0,0.999995,0.0,0.999995,-1.621059,-3.446116,0.0,9.238112,1e-05,7e-06,-0.866025,0.5,11.0,-2.912253,-5.257598,0.0,0.121805,1001.090355,97.920531,30.626006,100.0,99.0,4.538174,280.154078,0.0,0.0,0.0,-0.0,0.121805,23.275443,2022-01-09 08:00:00,1094,6565.505
239312,6766.2,0.0,1.0,132.0,11.0,1.0,3.0,48.0,118.0,5803.9,6.0,100.0,94.0,103.98,11.0,1.0,1.0,11.0,2022.0,0.0,0.0,0.188227,0.982126,313.92,11.0,8.0,-14.358712,-16.612324,0.0,0.999093,0.0,0.999093,-1.109429,0.137628,32.219593,18.398978,4e-06,7e-06,-0.866025,0.5,11.0,-2.308947,-3.64222,0.0,0.108132,1003.022142,100.0,97.804671,22.367319,47.733689,2.497102,251.803154,0.0,0.0,0.0,-0.0,0.108132,25.651175,2022-01-10 08:00:00,1094,6766.2


In [99]:
appendedDf.loc[(appendedDf.data_block_id == 124)]

Unnamed: 0,y_cons,y_prod,isTrain,data_block_id,county,is_business,product_type,prediction_unit_id,eic_count,installed_capacity,business_prodType,ind_customer_id,lowest_price_per_mwh,highest_price_per_mwh,yearday,weekday,month,monthday,year,holiday,no_workday,sinYearDay,cosYearDay,ePrices,County,hours_ahead,temperatureint,dewpointint,cloudcover_highint,cloudcover_lowint,cloudcover_midint,cloudcover_totalint,10_metre_u_wind_componentint,10_metre_v_wind_componentint,direct_solar_radiationint,surface_solar_radiation_downwardsint,snowfallint,total_precipitationint,sinDay,cosDay,County.1,temperatureint.1,dewpointint.1,rainint,snowfallint.1,surface_pressureint,cloudcover_totalint.1,cloudcover_lowint.1,cloudcover_midint.1,cloudcover_highint.1,windspeed_10mint,winddirection_10mint,shortwave_radiationint,direct_solar_radiationint.1,diffuse_radiationint,meltingInCM,snowHeightFlux,snowcover,unique_time,ind_customer_id2,combinedTarg
31560,118.786,0.0,1.0,124.0,6.0,1.0,3.0,24.0,6.0,324.2,6.0,2088.0,82.74,89.99,3.0,0.0,1.0,3.0,2022.0,0.0,0.0,0.05162,0.998667,57.08,6.0,0.0,-0.992759,-1.542436,0.944265,1.000000,0.502558,1.000000,-1.843494,2.946663,0.0,0.0,1.256026e-06,4.943668e-05,-1.224647e-16,-1.000000,6.0,-1.873922,-3.670360,0.0,0.000000,1007.727083,89.120697,99.120697,0.000000,0.000000,7.399113,167.711042,4.120697,0.000000,4.120697,-0.0,0.000000,10.159517,2022-01-02 00:00:00,625,118.786
31561,123.240,0.0,1.0,124.0,6.0,1.0,3.0,24.0,6.0,324.2,6.0,2088.0,82.74,89.99,3.0,0.0,1.0,3.0,2022.0,0.0,0.0,0.05162,0.998667,55.09,6.0,1.0,-0.843348,-1.355157,0.999933,1.000000,0.234849,1.000000,-1.993470,2.113021,0.0,0.0,6.752089e-09,1.983581e-05,-2.588190e-01,-0.965926,6.0,-2.671294,-4.737713,0.0,0.000000,1009.615013,89.662287,99.662287,0.000000,0.000000,7.518337,163.590345,26.699803,0.120697,26.579107,-0.0,0.000000,10.159517,2022-01-02 01:00:00,625,123.240
31562,120.891,0.0,1.0,124.0,6.0,1.0,3.0,24.0,6.0,324.2,6.0,2088.0,82.74,89.99,3.0,0.0,1.0,3.0,2022.0,0.0,0.0,0.05162,0.998667,52.82,6.0,2.0,-0.853958,-1.350137,0.999363,1.000001,0.148208,1.000000,-2.101652,1.960313,0.0,0.0,-1.117587e-08,4.871103e-07,-5.000000e-01,-0.866025,6.0,-2.906189,-5.536399,0.0,0.000000,1011.347470,86.466557,96.008148,0.000000,0.000000,7.734445,155.024378,47.184491,0.337713,46.846778,-0.0,0.000000,10.159517,2022-01-02 02:00:00,625,120.891
31563,116.271,0.0,1.0,124.0,6.0,1.0,3.0,24.0,6.0,324.2,6.0,2088.0,82.74,89.99,3.0,0.0,1.0,3.0,2022.0,0.0,0.0,0.05162,0.998667,34.94,6.0,3.0,-0.835093,-1.325540,0.999904,1.000001,0.107640,1.000000,-1.919806,2.293209,0.0,0.0,8.446646e-08,1.159790e-06,-7.071068e-01,-0.707107,6.0,-2.952030,-6.244717,0.0,0.000000,1012.847470,71.660452,79.743632,0.000000,0.000000,7.306399,158.662287,61.923711,14.427785,47.495926,-0.0,0.000000,10.159517,2022-01-02 03:00:00,625,116.271
31564,119.374,0.0,1.0,124.0,6.0,1.0,3.0,24.0,6.0,324.2,6.0,2088.0,82.74,89.99,3.0,0.0,1.0,3.0,2022.0,0.0,0.0,0.05162,0.998667,29.56,6.0,4.0,-0.757274,-1.226643,0.999879,0.999997,0.317394,1.000000,-1.768804,2.577231,0.0,0.0,1.474010e-06,4.176169e-05,-8.660254e-01,-0.500000,6.0,-3.155782,-6.810946,0.0,0.000000,1014.125769,34.221363,38.087528,0.000000,0.000000,8.595470,161.903681,63.814253,17.680417,46.133836,-0.0,0.000000,10.159517,2022-01-02 04:00:00,625,119.374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252283,357.089,0.0,1.0,124.0,15.0,1.0,3.0,60.0,42.0,1533.7,6.0,758.0,82.74,89.99,3.0,0.0,1.0,3.0,2022.0,0.0,0.0,0.05162,0.998667,71.72,15.0,19.0,-2.103747,-2.972164,0.593469,0.999984,0.999983,1.000002,-1.143496,2.332662,0.0,0.0,3.890231e-05,5.095562e-05,9.659258e-01,-0.258819,15.0,-14.067134,-15.545871,0.0,0.000000,1006.883572,81.586343,63.409864,22.046579,42.253042,2.785468,191.725379,0.000000,0.000000,0.000000,-0.0,0.000000,24.537444,2022-01-02 19:00:00,1323,357.089
252284,359.246,0.0,1.0,124.0,15.0,1.0,3.0,60.0,42.0,1533.7,6.0,758.0,82.74,89.99,3.0,0.0,1.0,3.0,2022.0,0.0,0.0,0.05162,0.998667,56.73,15.0,20.0,-1.644708,-2.486733,0.493467,0.999998,0.951938,0.999999,-0.772590,2.445778,0.0,0.0,5.625024e-05,5.783954e-05,8.660254e-01,-0.500000,15.0,-13.914497,-15.400321,0.0,0.000000,1005.890660,100.000000,79.164281,93.310777,55.313253,2.480463,212.414847,0.000000,0.000000,0.000000,-0.0,0.000000,24.537444,2022-01-02 20:00:00,1323,359.246
252285,336.553,0.0,1.0,124.0,15.0,1.0,3.0,60.0,42.0,1533.7,6.0,758.0,82.74,89.99,3.0,0.0,1.0,3.0,2022.0,0.0,0.0,0.05162,0.998667,46.16,15.0,21.0,-1.059791,-1.829904,0.422324,1.000004,0.585003,1.000002,-0.137335,2.346284,0.0,0.0,6.900923e-05,6.743974e-05,7.071068e-01,-0.707107,15.0,-12.935167,-14.470310,0.0,0.000000,1004.838615,100.000000,67.365516,96.381655,99.577694,2.613445,227.845344,0.000000,0.000000,0.000000,-0.0,0.000000,24.537444,2022-01-02 21:00:00,1323,336.553
252286,328.797,0.0,1.0,124.0,15.0,1.0,3.0,60.0,42.0,1533.7,6.0,758.0,82.74,89.99,3.0,0.0,1.0,3.0,2022.0,0.0,0.0,0.05162,0.998667,46.29,15.0,22.0,-0.327149,-0.794744,0.280994,1.000002,0.423383,0.999997,0.711429,2.188232,0.0,0.0,1.049446e-04,1.081447e-04,5.000000e-01,-0.866025,15.0,-11.680396,-13.222627,0.0,0.000000,1004.896384,100.000000,60.708295,100.000000,100.000000,2.457001,227.156121,0.000000,0.000000,0.000000,-0.0,0.000000,24.537444,2022-01-02 22:00:00,1323,328.797


In [196]:
corrMatNight = appendedDf.loc[appendedDf.hours_ahead_pred == 0].corr()

In [199]:
corrMatNight[['y_cons','y_prod']].T

Unnamed: 0,y_cons,y_prod,isTrain,data_block_id,county,is_business,product_type,prediction_unit_id,eic_count,installed_capacity,business_prodType,ind_customer_id,lowest_price_per_mwh,highest_price_per_mwh,yearday,weekday,month,monthday,year,holiday,no_workday,sinYearDay,cosYearDay,ePrices,County_pred,hours_ahead_pred,temperatureint_pred,dewpointint_pred,cloudcover_highint_pred,cloudcover_lowint_pred,cloudcover_midint_pred,cloudcover_totalint_pred,10_metre_u_wind_componentint_pred,10_metre_v_wind_componentint_pred,direct_solar_radiationint_pred,surface_solar_radiation_downwardsint_pred,snowfallint_pred,total_precipitationint_pred,sinDay_pred,cosDay_pred,County,temperatureint,dewpointint,rainint,snowfallint,surface_pressureint,cloudcover_totalint,cloudcover_lowint,cloudcover_midint,cloudcover_highint,windspeed_10mint,winddirection_10mint,shortwave_radiationint,direct_solar_radiationint,diffuse_radiationint,meltingInCM,snowHeightFlux,snowcover,unique_time,ind_customer_id2,combinedTarg,value_changed_eic,value_changed_cap
y_cons,1.0,0.191362,-0.054191,0.040562,-0.165159,0.242654,0.237967,-0.173092,0.408594,0.870402,0.3614,0.012529,-0.025813,-0.023554,-0.029808,-0.006039,-0.029469,-0.007137,0.045654,-0.003588,-0.011875,0.034723,0.039452,-0.008135,-0.165159,,-0.057608,-0.053674,0.005897,0.021938,0.01214,0.017543,-0.015753,0.007088,-0.004298,-0.003992,0.017555,-0.001187,,,-0.165159,-0.050423,-0.050832,-0.009411,0.014986,0.003494,0.012483,0.011923,0.012653,0.006035,0.010285,0.001393,-0.030084,-0.024628,-0.033055,0.049273,0.049334,0.008633,0.040562,-0.130401,0.999989,0.095174,0.094859
y_prod,0.191362,1.0,-0.056882,0.074043,0.074391,0.077125,0.155308,0.051524,0.121363,0.211844,0.158525,0.007538,-0.013038,-0.012885,-0.022897,0.007691,-0.023511,0.009295,0.068303,0.002521,0.0079,0.036032,-0.038653,-0.009785,0.074391,,0.029139,0.025201,-0.000814,-0.023993,-0.012079,-0.01848,0.008092,-0.001697,0.005821,-0.002864,-0.001052,-0.00096,,,0.074391,0.034625,0.028402,0.003787,-0.006731,0.021235,-0.019352,-0.026743,-0.001981,0.002587,0.026187,-0.002922,0.037427,0.03252,0.036079,-0.030616,-0.030388,-0.029707,0.074043,0.084282,0.195957,0.036229,0.036118


### investigate consumption
- some heavy overestimation (when no consumption but we predict one)
- some heave underestimation, when lots of consumption but no prediciton

we can't fit yearly trend!!
things that don't help:
- relu activation function doesn't help (but makes targets all positive)
- input normalization
- target normalization
- bigger network
- using less features to predict
- using weights on day doesn't help (but is definitely needed)


it seems that the consumption data is very sparse & hence really hard to fit 
-> try to find a normalization criteria


In [None]:
#train
print(trainDf.y_cons.mean(),trainDf.y_cons.std())
print(trainDf.pred_cons.mean(),trainDf.pred_cons.std())

In [None]:
#test
print(testDf.y_cons.mean(),testDf.y_cons.std())
print(testDf.pred_cons.mean(),testDf.pred_cons.std())

In [None]:
testDf.absErr_cons.hist(bins=100)

#### train

In [None]:
trainDf.groupby('hours_ahead')[['absErr_cons','pred_cons','y_cons']].mean().plot()
trainDf.groupby('weekday')[['absErr_cons','pred_cons','y_cons']].mean().plot()

In [None]:
trainDf.groupby('unique_time')[['y_cons','absErr_cons','pred_cons']].mean().plot()
trainDf.groupby('unique_time')[['y_cons','absErr_cons','pred_cons']].std().plot()

In [None]:
# check correlations to y_cons
for f in ['county', 'is_business',
       'product_type', 
       'business_prodType'
]:
       print('mean', trainDf.groupby(f)[['y_cons']].mean(),'std', trainDf.groupby(f)[['y_cons']].std())
       #print()

for f in ['eic_count', 'installed_capacity','ind_customer_id', 'prediction_unit_id', 
       'lowest_price_per_mwh','highest_price_per_mwh', 
       'yearday', 'weekday', 'month', 'monthday',
]:
       trainDf.groupby(f)[['y_cons']].mean().plot()

### test

In [None]:
testDf.groupby('hours_ahead')[['absErr_cons','pred_cons','y_cons']].mean().plot()
testDf.groupby('weekday')[['absErr_cons','pred_cons','y_cons']].mean().plot()

In [None]:
# notice that feb-may in 2023 are way different predictions
testDf.groupby('unique_time')[['y_cons','absErr_cons','pred_cons']].mean().plot()
testDf.groupby('unique_time')[['y_cons','absErr_cons','pred_cons']].std().plot()

In [None]:
testDf.loc[testDf.absErr_cons > 1300].data_block_id.value_counts()

In [None]:
pd.set_option('Display.max_columns',None)
testDf.loc[(testDf.data_block_id == 463) & (testDf.prediction_unit_id == 5)]#.sort_values(by='absErr_cons')

In [None]:
testDf.loc[(testDf.data_block_id == 463) & (testDf.hours_ahead == 1)].sort_values(by='absErr_cons')

In [None]:
testDf.loc[testDf.ePrices > 1000].unique_time.value_counts()

In [None]:
testDf['normCons'] = testDf['y_cons'] / testDf['windspeed_10mint']
testDf['normConsProd'] = testDf['pred_cons'] / testDf['installed_capacity']
testDf = testDf.sort_values(by='unique_time')
a = testDf.loc[(testDf.month == 8)]

fig = subplots.make_subplots(rows=1, cols=1,shared_xaxes=True)

fig.add_trace(go.Scatter(x = a['unique_time'], y = a['y_cons'],mode ='markers', name='y_cons'),row=1, col=1)
fig.add_trace(go.Scatter(x = a['unique_time'], y = a['pred_cons'],mode ='markers', name='pred_cons'),row=1, col=1)
#fig.add_trace(go.Scatter(x = a['unique_time'], y = a['normCons'],mode ='markers', name='normCons'),row=1, col=1)
#fig.add_trace(go.Scatter(x = a['unique_time'], y = a['normConsProd'],mode ='markers', name='normConsProd'),row=1, col=1)
fig.show()

### check individual day difference
maybe some features corrupt result, seems that predictions are sometimes far off
features that are iffy: eic_count, ind_customer_id, prediction_unit_id, year

In [None]:

f = ['y_cons','pred_cons','absErr_cons','county',
 'is_business',
 'product_type',

 'installed_capacity',
 'business_prodType',
 #'lowest_price_per_mwh',
 #'highest_price_per_mwh',
 #'yearday',
 #'weekday',
 #'month',
 #'monthday'
 ]
a = testDf.loc[(testDf.month == 8) & (testDf.monthday == 3) & (testDf.hours_ahead == 25)][f]

In [None]:
b = testDf.loc[(testDf.month == 3) & (testDf.monthday == 3) & (testDf.hours_ahead == 25)][f]

In [None]:
pd.set_option('Display.max_columns', None)
pd.set_option('Display.max_rows', None)
c = pd.merge(a,b,on=['county','is_business','product_type','business_prodType'], suffixes=('_a','_b'),how='outer')
sorted_columns = sorted(c.columns)
c = c[sorted_columns]
c
#features that are iffy: eic_count, ind_customer_id, prediction_unit_id, year

In [None]:
pd.set_option('Display.max_columns', 10)
pd.set_option('Display.max_rows', 10)