### Imports

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.regularizers import l2
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.initializers import GlorotNormal
import random
import optuna
import tensorflow as tf

In [19]:
# Plot function

def plot_feature(dataset:pd.DataFrame, featureName:str):
    
    fig, axs = plt.subplots(1, 1, figsize=(20, 10))

    dataset[['date_forecast', featureName]].set_index("date_forecast").plot(ax=axs, title=featureName, color='red')

# Data preperation

In [20]:
# Get data, combine and sort

data_test_A = pd.read_csv("current_csv_files/data_test_A.csv", index_col='Unnamed: 0')
data_test_B = pd.read_csv("current_csv_files/data_test_B.csv", index_col='Unnamed: 0')
data_test_C = pd.read_csv("current_csv_files/data_test_C.csv", index_col='Unnamed: 0')

data_test_ALL = pd.concat([data_test_A, data_test_B, data_test_C], ignore_index=True)
data_test_ALL = data_test_ALL.sort_values(['date_forecast', 'A', 'B', 'C'], ascending=[True, False, False, False])
#data_test_ALL = data_test_ALL.set_index('date_forecast')
data_test_ALL.rename(columns={'pv_measurement': 'target'}, inplace=True)

In [21]:
# Split data

data_ALL = data_test_ALL[data_test_ALL['train'] == 1]
test_ALL = data_test_ALL[data_test_ALL['train'] == 0].drop('target', axis='columns')

In [22]:
# Separate features and target variable
X = data_ALL.drop('target', axis='columns')
y = data_ALL[['date_forecast', 'target']]

In [23]:
# Split data
"""
"""
def split_data(df, percent):
    split_index = int( np.floor( len(df)*percent ) )
    df_first = df[:split_index]
    df_last = df[split_index:]
    return df_first, df_last

train_percent = 0.94 # Of all
val_percent = 0.5 # Of non-train

X_train, X_non_train = split_data(X, train_percent)
X_val, X_test = split_data(X_non_train, val_percent)

y_train, y_non_train = split_data(y, train_percent)
y_val, y_test = split_data(y_non_train, val_percent)

X_kaggle = test_ALL


In [24]:
# Sample DataFrame creation.
# Assuming df is your original DataFrame and 'date' is the column with dates.
# df = pd.DataFrame({'date': pd.date_range(start='2022-01-01', periods=100, freq='D'), 'data': range(100)})

# Make sure 'date' column is datetime type
"""
"""
X['date_forecast'] = pd.to_datetime(X['date_forecast'])
y['date_forecast'] = pd.to_datetime(y['date_forecast'])

# Define your date range
start_date = "2022-04-01 00:00:00"
end_date = "2022-08-03 23:00:00"

# Convert strings to datetime
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)

# Filter rows within the date range
mask_X = (X['date_forecast'] >= start_date) & (X['date_forecast'] <= end_date)
mask_y = (y['date_forecast'] >= start_date) & (y['date_forecast'] <= end_date)
X_val = X.loc[mask_X]
y_val = y.loc[mask_y]

# Pop out rows within the date range if you want to remove them from the original df
"""
X_train = X.loc[~mask_X]
y_train = y.loc[~mask_y]
"""


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['date_forecast'] = pd.to_datetime(y['date_forecast'])


'\nX_train = X.loc[~mask_X]\ny_train = y.loc[~mask_y]\n'

In [25]:
# Load X_test_kaggle, best sub on kaggle, sort properly (hell)

test_ALL_to_sub = pd.read_csv("current_csv_files/test_ALL.csv")
test_ALL_to_sub_ABC = test_ALL_to_sub.sort_values(['A', 'B', 'C', 'date_forecast'], ascending=[False, False, False, True])
y_hat_kaggle = pd.read_csv("teo_subs/kaggle_149.csv", index_col='id')
test_ALL_to_sub_ABC['new_index'] = range(2160)
test_ALL_to_sub_ABC = test_ALL_to_sub_ABC.set_index('new_index')
test_ALL_to_sub_ABC['y_hat_kaggle'] = y_hat_kaggle
test_ALL_to_sub_sorted = test_ALL_to_sub_ABC.sort_values(['date_forecast', 'A', 'B', 'C'], ascending=[True, False, False, False])
test_ALL_to_sub_sorted = test_ALL_to_sub_sorted.set_index('date_forecast')
y_hat = test_ALL_to_sub_sorted['y_hat_kaggle']

In [26]:
# Filter by summer months

y = y[ X['date_forecast'].dt.month.between(4, 7) ]
X = X[ X['date_forecast'].dt.month.between(4, 7) ]


In [27]:
X.drop('date_forecast', axis='columns', inplace=True)
y.drop('date_forecast', axis='columns', inplace=True)

X_train.drop('date_forecast', axis='columns', inplace=True)
y_train.drop('date_forecast', axis='columns', inplace=True)

X_non_train.drop('date_forecast', axis='columns', inplace=True)
y_non_train.drop('date_forecast', axis='columns', inplace=True)

X_val.drop('date_forecast', axis='columns', inplace=True)
y_val.drop('date_forecast', axis='columns', inplace=True)

X_test.drop('date_forecast', axis='columns', inplace=True)
y_test.drop('date_forecast', axis='columns', inplace=True)

X_kaggle.drop('date_forecast', axis='columns', inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop('date_forecast', axis='columns', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test.drop('date_forecast', axis='columns', inplace=True)


In [28]:
# Prepare features for scaling

feature_dont_touch = [
    'date_forecast',
    'is_day:idx',
    'is_in_shadow:idx',
    'pv_measurement',
    'est',
    'train',
    'A',
    'B',
    'C',
    'precip_type_0',
    'precip_type_1',
    'daily_sinus',
    'annual_sinus',
    'bad_cloud_data',
    'open_sky'
]

feature_to_standardize = [
    'absolute_humidity_2m:gm3',
    'air_density_2m:kgm3',
    'dew_point_2m:K',
    'pressure_100m:hPa',
    'relative_humidity_1000hPa:p',
    't_1000hPa:K',
    'wind_speed_u_10m:ms',
    'wind_speed_v_10m:ms',
    #'clear_sky_rad_CD', # Central difference

    # Kinda useless
    #'pressure_50m:hPa',
    #'msl_pressure:hPa',
    #'sfc_pressure:hPa',
]

feature_to_normalize = [
    'cloud_base_agl:m_y',
    'clear_sky_energy_1h:J',
    'diffuse_rad_1h:J',
    'direct_rad_1h:J',
    'precip_5min:mm',
    'rain_water:kgm2',
    'snow_water:kgm2',
    'super_cooled_liquid_water:kgm2',
    'clear_sky_rad:W',
    'diffuse_rad:W',
    'direct_rad:W',
    'direct_rad:W_lag_avg', # Lag
    'direct_rad:W_lead_avg', # Lead
    'effective_cloud_cover:p',
    'sun_azimuth:d',
    'total_cloud_cover:p',
    'visibility:m',
    'wind_speed_10m:ms',
    'sun_elevation:d', # Clipped version
    'year',
    'month',
    'day',

    # Kinda useless
    #'fresh_snow_12h:cm',
    #'fresh_snow_24h:cm'
]


In [29]:
# Scale features

standard_scaler = StandardScaler()
min_max_scaler = MinMaxScaler()

for feature in feature_to_standardize:
    #X_train[feature] = standard_scaler.fit_transform(X_train[[feature]])
    X[feature] = standard_scaler.fit_transform(X[[feature]])
    
    X_non_train[feature] = standard_scaler.transform(X_non_train[[feature]])
    X_val[feature] = standard_scaler.transform(X_val[[feature]])
    X_test[feature] = standard_scaler.transform(X_test[[feature]])
    X_kaggle[feature] = standard_scaler.transform(X_kaggle[[feature]])
    

for feature in feature_to_normalize:
    #X_train[feature] = standard_scaler.fit_transform(X_train[[feature]])
    X[feature] = min_max_scaler.fit_transform(X[[feature]])
    
    X_non_train[feature] = min_max_scaler.transform(X_non_train[[feature]])
    X_val[feature] = min_max_scaler.transform(X_val[[feature]])
    X_test[feature] = min_max_scaler.transform(X_test[[feature]])
    X_kaggle[feature] = min_max_scaler.transform(X_kaggle[[feature]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = standard_scaler.transform(X_test[[feature]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = standard_scaler.transform(X_test[[feature]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[feature] = standard_scaler.transform(X_test[[feature]])
A value is tryin

# Model training

In [30]:
trial_params = {'seed': 94, 'n_neurons_1': 169, 'n_neurons_2': 158, 'n_neurons_3': 100, 'n_neurons_4': 61, 'kernel_regularizer': 0.06010848923666125, 'learning_rate': 0.0008571647307496605, 'beta_1': 0.8091963165438008, 'min_delta': 470.3578972108609, 'batch_size': 32, 'patience': 15}

s = trial_params['seed']
np.random.seed(s)
random.seed(s)
tf.random.set_seed(s)

init = 'HeNormal'


# Define the Keras model
model = Sequential([
    Dense(trial_params["n_neurons_1"], input_dim=X_train.shape[1], activation='tanh', kernel_initializer=init),
    Dropout(0.1, (trial_params["n_neurons_1"],)),
    Dense(trial_params["n_neurons_2"], activation='relu', kernel_initializer=init, kernel_regularizer=l2(trial_params["kernel_regularizer"])),
    Dense(trial_params["n_neurons_3"], activation='relu', kernel_initializer=init),
    Dense(trial_params["n_neurons_4"], activation='relu', kernel_initializer=init),
    Dense(1, activation='relu', kernel_initializer=init)
])

opt = Adam(learning_rate=trial_params["learning_rate"], beta_1=trial_params["beta_1"])
model.compile(loss='mean_absolute_error', optimizer=opt)

# Fit the model
history = model.fit(
    X, y,
    validation_data=(X_kaggle, y_hat),
    #validation_split=0.2,
    epochs=13,
    batch_size=trial_params['batch_size'],
    verbose=1,
    use_multiprocessing=True, workers=4,
)


Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


# Making prediction

In [31]:
# Preds

kaggle_pred = model.predict(X_kaggle, verbose=1).ravel()

# Merge with kaggle_test data
test_ALL_merge = pd.read_csv("current_csv_files/test_ALL.csv")
test_ALL_merge['prediction'] = kaggle_pred

# Correctly sort test data for submission
test_ALL_merge_sorted = test_ALL_merge.sort_values(['A', 'B', 'C', 'date_forecast'], ascending=[False, False, False, True])
test_ALL_merge_sorted['id'] = range(2160)
test_ALL_merge_sorted = test_ALL_merge_sorted.set_index('id')
test_ALL_merge_sorted['id'] = range(2160)

# Comparison to best sub on kaggle
print("MAE: ", mean_absolute_error(kaggle_pred, y_hat))

MAE:  61.83069806086658


In [32]:
# Average best pred with new best kaggle sub
df_merge = pd.DataFrame()
df_merge['DNN'] = pd.read_csv("dnn_kaggle3/optuna_sub_0.csv", index_col='id')
df_merge['cat'] = pd.read_csv("teo_subs/kaggle_149.csv", index_col='id')

df_merge

Unnamed: 0_level_0,DNN,cat
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.000000,1.141681
1,0.000000,1.438430
2,0.000000,1.138146
3,35.690308,45.634729
4,631.002260,335.761563
...,...,...
2155,16.857908,29.307958
2156,1.460704,4.800279
2157,0.000000,0.000000
2158,0.000000,0.000000


In [33]:
df_merge['prediction'] = df_merge.mean(axis='columns')
df_merge

Unnamed: 0_level_0,DNN,cat,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.000000,1.141681,0.570841
1,0.000000,1.438430,0.719215
2,0.000000,1.138146,0.569073
3,35.690308,45.634729,40.662519
4,631.002260,335.761563,483.381911
...,...,...,...
2155,16.857908,29.307958,23.082933
2156,1.460704,4.800279,3.130492
2157,0.000000,0.000000,0.000000
2158,0.000000,0.000000,0.000000


In [34]:
df_merge['DNN'].to_csv("teo_subs/teo_sub_11.csv", index=True)

In [17]:
# Comparison to best sub on kaggle
this = df_merge['DNN']
that = df_merge['cat']

best_sub = pd.read_csv("teo_subs/best_sub.csv", index_col='id')
print("MAE: ", mean_absolute_error(this, that))

slide = 0
start = 0 + slide
stop = 1000 + slide

plt.figure(figsize=(30, 8))
plt.plot(range(start, stop), that.iloc[start:stop], alpha=0.5)
plt.plot(range(start, stop), this.iloc[start:stop], alpha=0.5)
plt.title('Predicted Values')
plt.show()

NameError: name 'df_merge' is not defined

# Prediction log