In [1]:
# Essential imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the preprocessed time-series table (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="preprocess_data_timeseries.csv")

In [3]:
# Parse the Time column into datetimes, then order the rows chronologically.
# Built as one chain so re-running the cell starts from the same input frame.
df = df.assign(Time=pd.to_datetime(df['Time'])).sort_values(by='Time')

# Quick summary of what was loaded
earliest_time, latest_time = df['Time'].min(), df['Time'].max()
print(f"Loaded dataset shape: {df.shape}")
print(f"Date range: {earliest_time} to {latest_time}")

# Candidate predictor columns, grouped by where they come from
basic_features = [
    'Season', 'Day_of_the_week',
    'DHI', 'DNI', 'GHI',
    'Wind_speed', 'Humidity', 'Temperature',
]
time_based_features = [
    'hour', 'day_of_week', 'month', 'weekend',
    'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
    'is_night', 'is_morning', 'is_afternoon', 'is_evening',
]
# Lag/rolling columns are discovered by name rather than listed by hand
lag_features = [
    col for col in df.columns
    if any(marker in col for marker in ('lag', 'rolling'))
]

# Full candidate list, then only the candidates that exist in this dataset
all_features = [*basic_features, *time_based_features, *lag_features]
available_features = [name for name in all_features if name in df.columns]

# Report how many columns of each group are actually present
print(f"\nAvailable features: {len(available_features)}")
print(f"Base features: {sum(name in df.columns for name in basic_features)}")
print(f"Time features: {sum(name in df.columns for name in time_based_features)}")
print(f"Lag features: {sum(name in df.columns for name in lag_features)}")

# Columns the models predict (two outputs)
targets = ['PV_production', 'Wind_production']

# OPTION 1: Traditional Models (Random Forest, XGBoost, Linear Regression)
section_rule = "~"*50
print("\n" + section_rule)
print("OPTION 1: TRADITIONAL MODELS like Random Forest, XGBoost, Linear Regression")
print(section_rule)

# Every available feature column as input, both targets as output
X_traditional, y_traditional = df[available_features], df[targets]

# Shuffled 70/30 split; the fixed random_state keeps the row assignment repeatable.
# NOTE(review): rows are shuffled even though lag/rolling columns are among the
# features -- confirm this is acceptable for how these models will be evaluated.
(
    X_train_all_features,
    X_test_all_features,
    y_train_all_features,
    y_test_all_features,
) = train_test_split(
    X_traditional,
    y_traditional,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

# Put each partition's targets back beside its features and write it to disk
train_data_trad = pd.concat(
    objs=[X_train_all_features, y_train_all_features], axis="columns"
)
test_data_trad = pd.concat(
    objs=[X_test_all_features, y_test_all_features], axis="columns"
)

train_data_trad.to_csv(path_or_buf="train_traditional.csv", index=False)
test_data_trad.to_csv(path_or_buf="test_traditional.csv", index=False)

print("Traditional Train/Test data saved.")
print(f"Train shape: {X_train_all_features.shape}, Test shape: {X_test_all_features.shape}")

# OPTION 2: Time Series Models (LSTM, CNN-LSTM) - Chronological Split
# One call, three lines: leading blank line + rule, title, rule
print("\n" + "~"*50, "OPTION 2: TIME SERIES MODELS", "~"*50, sep="\n")

def time_series_split(df, test_size=0.3):
    """Split a frame into its leading rows (train) and trailing rows (test).

    Rows are never shuffled or reordered: the split is purely positional, so
    ``df`` must already be in chronological order for the result to be a
    chronological train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; only ``len(df)`` and ``df.iloc`` are used.
    test_size : float, default 0.3
        Fraction of rows, between 0 and 1 inclusive, placed in the test part.

    Returns
    -------
    tuple
        ``(train, test)`` where ``train`` is the first part of ``df`` and
        ``test`` is the remainder.

    Raises
    ------
    ValueError
        If ``test_size`` is not within ``[0, 1]`` (NaN included).
    """
    import math  # local import so the helper does not depend on another cell

    # Out-of-range fractions would otherwise yield a negative or oversized
    # split index and silently produce overlapping / nonsensical slices.
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    n_rows = len(df)
    # Count test rows as ceil(n * test_size); mathematically this equals
    # n - floor(n * (1 - test_size)), but it avoids truncating products such as
    # 10 * (1 - 0.9) == 0.9999999999999998 down to 0. The round() guard strips
    # float noise like 100 * 0.07 == 7.000000000000001 before taking the ceiling.
    n_test = math.ceil(round(n_rows * test_size, 9))
    split_idx = n_rows - n_test

    train = df.iloc[:split_idx]
    test = df.iloc[split_idx:]
    return train, test

# Chronological split: earlier rows train, later rows test (NO SHUFFLE)
train_ts, test_ts = time_series_split(df)

# Show the time span covered by each part
for period_label, period_frame in (("Training", train_ts), ("Testing", test_ts)):
    print(f"{period_label} period: {period_frame['Time'].min()} to {period_frame['Time'].max()}")

# Persist both parts with every column kept, without the index
for period_frame, csv_name in (
    (train_ts, "train_timeseries.csv"),
    (test_ts, "test_timeseries.csv"),
):
    period_frame.to_csv(csv_name, index=False)

print("Time Series Train/Test data saved chronologically.")
print(f"Train shape: {train_ts.shape}, Test shape: {test_ts.shape}")

# OPTION 3: Backward Compatibility (Your original format)
compat_rule = "~"*50
print("\n" + compat_rule, "OPTION 3: BACKWARD COMPATIBILITY", compat_rule, sep="\n")

# Only the basic feature columns are used here; the lookup is strict, so a
# missing basic column raises instead of being skipped.
x_basic_features = df[basic_features]
y_basic_targets = df[targets]

# Same shuffled 70/30 split settings as the other shuffled option
split_parts = train_test_split(
    x_basic_features,
    y_basic_targets,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)
X_train_orig, X_test_orig, y_train_orig, y_test_orig = split_parts

# Features and targets side by side, one file per partition
train_data_orig = pd.concat(objs=[X_train_orig, y_train_orig], axis="columns")
test_data_orig = pd.concat(objs=[X_test_orig, y_test_orig], axis="columns")

train_data_orig.to_csv(path_or_buf="train_multi_output.csv", index=False)
test_data_orig.to_csv(path_or_buf="test_multi_output.csv", index=False)

print("Backward compatible data saved.")
print(f"Train shape: {X_train_orig.shape}, Test shape: {X_test_orig.shape}")

# Closing recap of the files written by this cell
closing_rule = "~"*60
summary_lines = [
    "\n" + closing_rule,
    "DATA SPLITTING DONE!!",
    closing_rule,
    "3 datasets created:",
    "1. train_traditional.csv/test_traditional.csv - All features + shuffle",
    "2. train_timeseries.csv/test_timeseries.csv - Time series chronological",
    "3. train_multi_output.csv/test_multi_output.csv - Original features (backward compatible)",
]
print(*summary_lines, sep="\n")

Loaded dataset shape: (38879, 70)
Date range: 2019-01-01 00:00:00 to 2019-05-15 23:50:00

Available features: 65
Base features: 8
Time features: 12
Lag features: 45

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
OPTION 1: TRADITIONAL MODELS like Random Forest, XGBoost, Linear Regression
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Traditional Train/Test data saved.
Train shape: (27215, 65), Test shape: (11664, 65)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
OPTION 2: TIME SERIES MODELS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training period: 2019-01-01 00:00:00 to 2019-04-05 11:50:00
Testing period: 2019-04-05 11:55:00 to 2019-05-15 23:50:00
Time Series Train/Test data saved chronologically.
Train shape: (27215, 70), Test shape: (11664, 70)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
OPTION 3: BACKWARD COMPATIBILITY
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Backward compatible data saved.
Train shape: (27215, 8), Test shape: (11664, 8)

~~