# In [None]:
import os
import polars as pl

# Columns retained from the joined dataset; every other column is discarded.
keep_cols = [
    "Időpont",
    "Negatív Mérlegköri kiegyenlítő energia egységára (HUF/kWh)",
    "Pozitív Mérlegköri kiegyenlítő energia egységára (HUF/kWh)",
    "Rendszer-irány (kWh)",
    "Naperőművek becsült termelése (aktuális)",
    "Naperőművek becsült termelése (intraday)",
    "Naperőművek becsült termelése (dayahead)",
    "HU-UK",
    "HU-SK",
    # "HU-SI" is excluded: too many missing values
    "HU-RS",
    "HU-AT",
    "HU-HR",
    "HU-RO",
    "HU-AT menetrend",
    "HU-HR menetrend",
    # "HU-SI menetrend (RIR NT)" is excluded: too many missing values
    "HU-SK menetrend",
    "HU-RS menetrend",
    "HU-UK menetrend",
    "HU-RO menetrend",
    "Bruttó terv erőművi termelés",
    "Bruttó tény erőművi termelés",
    "Bruttó hitelesített rendszerterhelés tény",
    "Bruttó rendszerterhelés becslés (dayahead)",
    "Szélerőművek becsült termelése (aktuális)",
    "Szélerőművek becsült termelése (dayahead)",
    "Szélerőművek becsült termelése (intraday)",
]

# Load the joined data, project it onto keep_cols and forward-fill the gaps
# in one pipeline. The original author's note puts the gaps at 100-200 per
# column out of ~210k rows, which is why they are simply filled.
# NOTE(review): fill_null replaces nulls only; floating-point NaN values, if
# the parquet file contains any, are left untouched - confirm.
# NOTE(review): a forward fill presumes the rows are already in chronological
# order - nothing here sorts them; verify against the file.
final_df = (
    pl.read_parquet("data/joined_df.parquet")
    .select(keep_cols)
    .fill_null(strategy="forward")
)
# Columns for which the values of the following rows are attached to each row
# as additional "<column>_t+<minutes>min" columns.
cols_10_h = [
    "Szélerőművek becsült termelése (aktuális)",
    "Szélerőművek becsült termelése (dayahead)",
    "Szélerőművek becsült termelése (intraday)",
    "Naperőművek becsült termelése (aktuális)",
    "Naperőművek becsült termelése (intraday)",
    "Naperőművek becsült termelése (dayahead)",
]

# Number of rows to look ahead: 40 steps of 15 minutes = 10 hours.
# NOTE(review): the minute labels assume consecutive rows are exactly
# 15 minutes apart - not checked here.
max_ahead = 40

# shift(-k) pulls the value found k rows further down onto the current row,
# so every generated column holds a value from the future of that row.
# NOTE(review): the "(aktuális)" columns are shifted forward as well; whether
# those values would be known at prediction time should be confirmed.
new_cols = [
    pl.col(name).shift(-k).alias(f"{name}_t+{k * 15}min")
    for name in cols_10_h
    for k in range(1, max_ahead + 1)
]

# Attach all shifted columns in a single call.
final_df = final_df.with_columns(new_cols)
# Columns that get forward-shifted copies over a 12-hour window.
cols_12_h = [
    "Bruttó terv erőművi termelés",
    "Bruttó rendszerterhelés becslés (dayahead)",
    "HU-AT menetrend",
    "HU-HR menetrend",
    # "HU-SI menetrend (RIR NT)" was dropped from the dataset
    "HU-SK menetrend",
    "HU-RS menetrend",
    "HU-UK menetrend",
    "HU-RO menetrend",
]

# Number of rows to look ahead: 48 steps of 15 minutes = 12 hours.
max_ahead = 48

# One expression per (column, step) pair. A negative shift moves later rows
# up, so "<column>_t+<minutes>min" carries the value found <step> rows below.
new_cols = [
    pl.col(name).shift(-k).alias(f"{name}_t+{k * 15}min")
    for name in cols_12_h
    for k in range(1, max_ahead + 1)
]

# Attach all shifted columns in a single call.
final_df = final_df.with_columns(new_cols)
# Quantities to forecast.
target_cols = [
    "Negatív Mérlegköri kiegyenlítő energia egységára (HUF/kWh)",
    "Pozitív Mérlegköri kiegyenlítő energia egységára (HUF/kWh)",
    "Rendszer-irány (kWh)",
]

# Horizons of 16..20 steps of 15 minutes, i.e. 4h to 5h ahead.
ahead_steps = range(16, 21)

# Future values of every target, named "<target>_t+<minutes>min".
target_future_cols = [
    pl.col(name).shift(-k).alias(f"{name}_t+{k * 15}min")
    for name in target_cols
    for k in ahead_steps
]

# Attach the future-target columns, then cut off the final 48 rows (12 hours):
# the negative shifts leave nulls at the tail, and 48 is the largest shift
# applied to this frame. with_columns does not change the row count, so the
# height can be read from the frame before the new columns are added.
final_df = final_df.with_columns(target_future_cols).slice(0, final_df.height - 48)
import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import statsmodels.api as sm
import numpy as np
import polars as pl
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error

# Switch to pandas for modelling, indexed by the timestamp column.
df = final_df.to_pandas().set_index("Időpont")

# Unshifted predictor columns.
base_predictors = [
    "Naperőművek becsült termelése (aktuális)",
    "Naperőművek becsült termelése (intraday)",
    "Naperőművek becsült termelése (dayahead)",
    "HU-UK",
    "HU-SK",
    "HU-RS",
    "HU-AT",
    "HU-HR",
    "HU-RO",
    "HU-AT menetrend",
    "HU-HR menetrend",
    "HU-SK menetrend",
    "HU-RS menetrend",
    "HU-UK menetrend",
    "HU-RO menetrend",
    "Bruttó terv erőművi termelés",
    "Bruttó tény erőművi termelés",
    "Bruttó hitelesített rendszerterhelés tény",
    "Bruttó rendszerterhelés becslés (dayahead)",
    "Szélerőművek becsült termelése (aktuális)",
    "Szélerőművek becsült termelése (dayahead)",
    "Szélerőművek becsült termelése (intraday)",
]

# Forecast horizons in minutes; they select the "<target>_t+<minutes>min" columns.
forecast_steps = [240, 255, 270, 285, 300]

# Base names of the quantities to forecast.
target_bases = [
    "Negatív Mérlegköri kiegyenlítő energia egységára (HUF/kWh)",
    "Pozitív Mérlegköri kiegyenlítő energia egységára (HUF/kWh)",
    "Rendszer-irány (kWh)",
]

# 1. Full predictor list: each base column (when present in df) immediately
#    followed by its forward-shifted copies, in df column order.
all_predictors = []
for base in base_predictors:
    shifted_prefix = base + "_t+"
    present = [base] if base in df.columns else []
    shifted = [name for name in df.columns if name.startswith(shifted_prefix)]
    all_predictors += present + shifted

print(all_predictors)

# 2. Full target list: only the forward-shifted target columns that exist in df.
target_columns = [
    f"{target_base}_t+{step}min"
    for target_base in target_bases
    for step in forecast_steps
    if f"{target_base}_t+{step}min" in df.columns
]
print(target_columns)
# 3. Train one XGBoost regressor per forward-shifted target column and score
#    it on a hold-out set.
# Fixed column layout so the summary frame keeps its columns (and can be
# sorted) even when every target is skipped and `results` stays empty.
result_columns = ["Target", "Step (min)", "RMSE", "MAE", "R²"]
results = []

for target_col in target_columns:
    # Keep only rows where every predictor and the target are present.
    data = df[all_predictors + [target_col]].dropna()
    if data.shape[0] < 100:
        # Too few complete rows to fit and evaluate; report it rather than
        # dropping the target without a trace.
        print(f"Skipping {target_col}: only {data.shape[0]} complete rows")
        continue

    X = data[all_predictors]
    y = data[target_col]

    # shuffle=False keeps row order: the first 80 % of rows train the model,
    # the last 20 % test it.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Standardise the features with statistics from the training part only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit XGBoost
    model = XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Predict
    y_pred = model.predict(X_test_scaled)

    # Evaluate
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Column names look like "<target>_t+<minutes>min": split once into the
    # target name and the horizon in minutes.
    target_name, _, horizon = target_col.partition("_t+")
    step = int(horizon.replace("min", ""))

    results.append({"Target": target_name, "Step (min)": step, "RMSE": rmse, "MAE": mae, "R²": r2})

# Display the metrics ordered by target and horizon.
results_df = pd.DataFrame(results, columns=result_columns)
display(results_df.sort_values(["Target", "Step (min)"]))
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Target bases and steps
target_bases = [
    "Negatív Mérlegköri kiegyenlítő energia egységára (HUF/kWh)",
    "Pozitív Mérlegköri kiegyenlítő energia egységára (HUF/kWh)",
    "Rendszer-irány (kWh)",
]
forecast_steps = [240, 255, 270, 285, 300]  # minutes

# Build the full feature list: each base predictor present in df, followed by
# its forward-shifted "<base>_t+..." columns.
all_predictors = []
for base in base_predictors:
    if base in df.columns:
        all_predictors.append(base)
    all_predictors += [col for col in df.columns if col.startswith(base + "_t+")]

# Fixed column layout so the summary frame keeps its columns (and can be
# sorted) even when no model was trained and `results` stays empty.
result_columns = ["Target", "Step (min)", "RMSE", "MAE", "R²"]
results = []

for target_base in target_bases:
    for step in forecast_steps:
        target_col = f"{target_base}_t+{step}min"
        if target_col not in df.columns:
            continue

        # Keep only rows where every predictor and the target are present.
        data = df[all_predictors + [target_col]].dropna()
        if data.shape[0] < 100:
            # Too few complete rows to fit and evaluate; report the skip.
            print(f"Skipping {target_col}: only {data.shape[0]} complete rows")
            continue

        X = data[all_predictors].values
        y = data[target_col].values

        # shuffle=False keeps row order: first 80 % train, last 20 % test.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

        # Standardise the features with statistics from the training part only.
        # NOTE(review): the target is fed to the MSE loss unscaled - confirm
        # that this is intended.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # A new model is built on every iteration: drop the global Keras state
        # left by the previous one so memory does not grow across the loop,
        # and reseed so each run of a given (target, step) starts from the
        # same initial weights and shuffling.
        tf.keras.backend.clear_session()
        tf.keras.utils.set_random_seed(42)

        # Two hidden layers with dropout and a single linear output. The input
        # width is declared with an explicit Input object rather than the
        # deprecated `input_shape=` layer argument.
        model = Sequential(
            [
                tf.keras.Input(shape=(X_train_scaled.shape[1],)),
                Dense(256, activation="relu"),
                Dropout(0.3),
                Dense(128, activation="relu"),
                Dropout(0.2),
                Dense(1),
            ]
        )
        model.compile(optimizer="adam", loss="mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])

        # Train
        model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, verbose=0)

        # Predict silently (matching the silent fit) and evaluate; predict
        # returns shape (n, 1), hence the flatten.
        y_pred = model.predict(X_test_scaled, verbose=0).flatten()
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        results.append({"Target": target_base, "Step (min)": step, "RMSE": rmse, "MAE": mae, "R²": r2})

# Show the metrics ordered by target and horizon.
results_df = pd.DataFrame(results, columns=result_columns)
display(results_df.sort_values(["Target", "Step (min)"]))