In [9]:
# Essential imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [10]:
# Read the preprocessed time series CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="preprocess_data_timeseries.csv")

In [None]:
# Parse 'Time' into timestamps and put the rows in chronological order
df = df.assign(Time=pd.to_datetime(df['Time'])).sort_values(by='Time')

print(f"Loaded dataset shape: {df.shape}")
print(f"Date range: {df['Time'].min()} to {df['Time'].max()}")

# Feature groups, declared by column name
basic_features = [
    'Season', 'Day_of_the_week',
    'DHI', 'DNI', 'GHI',
    'Wind_speed', 'Humidity', 'Temperature',
]
time_features = [
    'hour', 'day_of_week', 'day_of_month', 'month', 'weekend',
    'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
    'is_night', 'is_morning', 'is_afternoon', 'is_evening',
]
explicit_cols = ['Time', 'PV_production', 'Wind_production', 'Electric_demand']
# Every column whose name contains 'lag' or 'rolling' goes into the lag group
lag_features = [
    name for name in df.columns
    if any(marker in name for marker in ('lag', 'rolling'))
]

# Full candidate list, then the subset that actually exists in df
all_features = [*basic_features, *time_features, *lag_features, *explicit_cols]
available_features = [name for name in all_features if name in df.columns]

# Report how many columns of each group are present in df
print(f"\nAvailable features: {len(available_features)}")
print(f"Basic features: {sum(1 for name in basic_features if name in df.columns)}")
print(f"Time features: {sum(1 for name in time_features if name in df.columns)}")
print(f"Lag features: {sum(1 for name in lag_features if name in df.columns)}")
print(f"Explicit columns: {sum(1 for name in explicit_cols if name in df.columns)}")

# Prediction targets
targets = ['PV_production', 'Wind_production']

# OPTION 1: TIME SERIES MODELS (LSTM, CNN-LSTM) - Chronological Split
print("\n" + "~" * 50, "TIME SERIES DATA (Chronological Split)", "~" * 50, sep="\n")

def time_series_split(df, test_size=0.3):
    """Cut a frame positionally into leading (train) and trailing (test) rows.

    No shuffling or sorting is performed here: the first
    ``int(len(df) * (1 - test_size))`` rows become the training partition and
    the remaining rows become the test partition. For a chronological split
    the caller must therefore pass ``df`` already ordered by time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; only ``len(df)`` and ``df.iloc`` slicing are used.
    test_size : float, default 0.3
        Fraction of rows assigned to the test partition. Must lie in [0, 1].

    Returns
    -------
    tuple
        ``(train, test)`` slices of ``df``.

    Raises
    ------
    ValueError
        If ``test_size`` is outside [0, 1] (including NaN).
    """
    # Without this guard a value > 1 produces a negative split index, which
    # iloc interprets as "count from the end" and silently returns a
    # nonsensical split; a negative value silently yields an empty test set.
    if not 0 <= test_size <= 1:
        raise ValueError(
            f"test_size must be between 0 and 1 (inclusive), got {test_size!r}"
        )

    split_idx = int(len(df) * (1 - test_size))
    train = df.iloc[:split_idx]
    test = df.iloc[split_idx:]
    return train, test

# Positional split of the time-ordered frame: leading rows train, trailing rows test
train_ts, test_ts = time_series_split(df)

print(f"Training period: {train_ts['Time'].min()} to {train_ts['Time'].max()}")
print(f"Testing period: {test_ts['Time'].min()} to {test_ts['Time'].max()}")

# Write both time series partitions to CSV without the index column
train_ts.to_csv("train_timeseries.csv", index=False)
test_ts.to_csv("test_timeseries.csv", index=False)

print(
    "Time series train/test data saved.",
    f"Train shape: {train_ts.shape}, Test shape: {test_ts.shape}",
    sep="\n",
)

# OPTION 2: BACKWARD COMPATIBILITY (Original Features Only)
print("\n" + "~" * 50, "BACKWARD COMPATIBLE DATA (Original Features)", "~" * 50, sep="\n")

# Inputs are the basic feature columns only; outputs are the two target columns
x_basic_features = df[basic_features]
y_basic_targets = df[targets]

# Shuffled 70/30 split with a fixed seed
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    x_basic_features,
    y_basic_targets,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Put features and targets side by side again so each CSV holds both
train_data_orig = pd.concat([X_train_orig, y_train_orig], axis="columns")
test_data_orig = pd.concat([X_test_orig, y_test_orig], axis="columns")

train_data_orig.to_csv("train_multi_output.csv", index=False)
test_data_orig.to_csv("test_multi_output.csv", index=False)

print(
    "Backward compatible train/test data saved.",
    f"Train shape: {X_train_orig.shape}, Test shape: {X_test_orig.shape}",
    sep="\n",
)

# Closing summary of the files produced by this cell
print("\n" + "~" * 60, "DATA SPLITTING DONE!!", "~" * 60, sep="\n")
print(
    "2 datasets created:",
    "1. train_timeseries.csv / test_timeseries.csv - Chronological split for time series models",
    "2. train_multi_output.csv / test_multi_output.csv - Original features (backward compatible)",
    sep="\n",
)

Loaded dataset shape: (38879, 70)
Date range: 2019-01-01 00:00:00 to 2019-05-15 23:50:00

Available features: 70
Basic features: 8
Time features: 13
Lag features: 45
Explicit columns: 4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
TIME SERIES DATA (Chronological Split)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training period: 2019-01-01 00:00:00 to 2019-04-05 11:50:00
Testing period: 2019-04-05 11:55:00 to 2019-05-15 23:50:00
Time series train/test data saved.
Train shape: (27215, 70), Test shape: (11664, 70)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BACKWARD COMPATIBLE DATA (Original Features)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Backward compatible train/test data saved.
Train shape: (27215, 8), Test shape: (11664, 8)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
DATA SPLITTING DONE!!
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2 datasets created:
1. train_timeseries.csv / test_timeseries.csv - Chronological sp