In [12]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

In [13]:
def preprocess_data(filename, N, points_prior):
    """
    Load a time series CSV and replace its raw input columns with lagged features.

    For each of the six input columns (HUFL, HULL, MUFL, MULL, LUFL, LULL) the
    function derives a rolling mean, a rolling standard deviation and N lagged
    copies. Every derived column is shifted back by at least ``points_prior``
    rows, and the raw columns themselves are dropped, so a row only carries
    input values observed ``points_prior`` or more rows earlier. All other CSV
    columns are kept as they are. The ``date`` column is expanded into integer
    calendar fields and then removed.

    Args:
        filename (str): Path to the CSV file. It must contain a ``date`` column
            and the six input columns listed above.
        N (int): Window size of the rolling statistics and number of lag
            columns created per input column.
        points_prior (int): Number of rows by which the rolling statistics are
            shifted; lag columns use shifts ``points_prior + 1`` through
            ``points_prior + N``.

    Returns:
        pd.DataFrame: Rows that have no missing value in any column, re-indexed
        from 0. Column order is: remaining original columns, then per input
        column ``rolling_mean_*``, ``rolling_std_*`` and ``*_lag{k}``, then
        ``year``, ``month``, ``day``, ``dow``, ``hour``, ``minute``.
        float64 columns are downcast to float32 and int64 columns to int32.
    """

    df = pd.read_csv(filename)
    feature_columns = ['HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL']

    # Rolling and lag features.
    # Collect them in a dict and attach them with a single concat: inserting
    # up to 6 * (N + 2) columns one at a time fragments the frame and makes
    # pandas emit a PerformanceWarning.
    derived = {}
    for col in feature_columns:
        series = df[col]
        window = series.rolling(window=N)
        derived[f"rolling_mean_{col}"] = window.mean().shift(points_prior)
        derived[f"rolling_std_{col}"] = window.std().shift(points_prior)
        for i in range(1, N + 1):
            derived[f'{col}_lag{i + points_prior}'] = series.shift(i + points_prior)

    # Drop current columns and missing values (the leading rows whose
    # shifted features are not yet defined).
    df = pd.concat(
        [df.drop(columns=feature_columns), pd.DataFrame(derived, index=df.index)],
        axis=1,
    )
    df = df.dropna().reset_index(drop=True)

    # Time feature extraction
    timestamps = pd.to_datetime(df["date"])
    df["year"] = timestamps.dt.year
    df["month"] = timestamps.dt.month
    df["day"] = timestamps.dt.day
    df["dow"] = timestamps.dt.day_of_week
    df["hour"] = timestamps.dt.hour
    df["minute"] = timestamps.dt.minute
    df = df.drop(columns="date")

    # Memory optimization: halve the width of 64-bit numeric columns.
    downcast = {col: 'float32' for col in df.select_dtypes(include=['float64']).columns}
    downcast.update({col: 'int32' for col in df.select_dtypes(include=['int64']).columns})
    df = df.astype(downcast)

    return df

def time_series_split(df, target_col, n_splits=3, max_train_size=None, test_ratio = 0.20):
    """
    Hold out the tail of ``df`` as a test set and create a time series CV splitter.

    The rows are split by position, without shuffling: the final
    ``int(len(df) * test_ratio)`` rows form the test set and everything before
    them forms the train/validation set. The sizes of both parts are printed.

    Args:
        df (pd.DataFrame): Data containing the features and the target column.
        target_col (str): Name of the column to predict.
        n_splits (int): Passed to ``TimeSeriesSplit``.
        max_train_size (int | None): Passed to ``TimeSeriesSplit``.
        test_ratio (float): Share of rows reserved for the test set.

    Returns:
        tuple: ``(X_trainval, y_trainval, X_test, y_test, tscv)`` where ``tscv``
        is an unfitted ``TimeSeriesSplit`` meant to be applied to the
        train/validation part.
    """
    features = df.drop(columns=[target_col])
    target = df[target_col]

    # Everything before `cutoff` is train/val, everything from it on is test.
    cutoff = len(df) - int(len(df) * test_ratio)

    X_trainval, y_trainval = features[:cutoff], target[:cutoff]
    X_test, y_test = features[cutoff:], target[cutoff:]

    print(f"Train/Val samples: {len(X_trainval)}, Test samples: {len(X_test)}")

    # ---- Time Series Cross-Validation for train/val ----
    splitter = TimeSeriesSplit(n_splits=n_splits, max_train_size=max_train_size)

    return X_trainval, y_trainval, X_test, y_test, splitter

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor

# Experiment grid: for every (model, file, points_prior) combination, tune the
# model for each window size N, pick the best N, pickle the winning estimator
# and record a summary in `overall_best_results`.
#
# Each entry of `models` is [display name, pipeline, GridSearchCV param grid].
models = [
    # 1. Random Forest Regressor
    ["Random Forest Regressor",
     Pipeline([
         ('scaler', StandardScaler()),
         ('model', RandomForestRegressor(random_state=42, n_jobs=-1))
     ]),
     {
         "model__n_estimators": [100, 300],
         "model__max_depth": [5, 15],
         "model__min_samples_split": [2, 8],
         "model__min_samples_leaf": [1, 3],
     }],

    # 2. Support Vector Regressor (SVR)
    ["Support Vector Regressor (SVR)",
     Pipeline([
         ('scaler', StandardScaler()),
         ('model', SVR())
     ]),
     {
         "model__kernel": ['linear', 'rbf', 'poly'],
         "model__C": [0.1, 1, 10],
         "model__gamma": ['scale', 'auto', 0.1],
         "model__epsilon": [0.1, 0.4],
     }]
]

N_list = [12, 24, 48] # can add 96 if you want (and have enough time)
points_priors = [4, 96, 672]
filenames = ["trans_1.csv", "trans_2.csv"]

# Store the overall best N for each configuration
overall_best_results = {}

# The RFE step below uses a fixed XGBoost estimator, so the features it selects
# depend only on (filename, points_prior, N) and not on the model being tuned.
# Cache them so the elimination runs once per dataset configuration instead of
# once per model.
selected_features_cache = {}

for (model_name, model, model_params) in models:
    for filename in filenames:
        for points_prior in points_priors:

            print("\n========================================================")
            print(f"Processing: File={filename}, PointsPrior={points_prior}, Model={model_name}")
            print("========================================================\n")

            results_per_n = {}
            for N in N_list:
                df = preprocess_data(filename, N = N, points_prior= points_prior)
                X_trainval, y_trainval, X_test, y_test, tscv = time_series_split(df, "OT")

                print("Training Configurations")
                print(f"file: {filename}")
                print(f"N: {N}")
                print(f"points_prior: {points_prior}")
                print(f"model: {model_name}")

                # ---- FEATURE SELECTION using RFE ----
                # NOTE(review): RFE is fit on all of X_trainval before
                # GridSearchCV splits it, so every CV validation fold has
                # already influenced which features were kept. The CV RMSE is
                # therefore somewhat optimistic; moving RFE into the Pipeline
                # would remove that at a large compute cost.
                cache_key = (filename, points_prior, N)
                if cache_key not in selected_features_cache:
                    base_model = XGBRegressor(
                        objective='reg:squarederror',
                        random_state=42,
                        n_estimators=200,
                        learning_rate=0.05,
                        max_depth=5,
                        subsample=0.8
                    )

                    # Select top 20 features
                    print("Selecting top 20 features...")
                    rfe = RFE(estimator=base_model, n_features_to_select=20, step=0.1)
                    rfe.fit(X_trainval, y_trainval)

                    # Mask of selected features
                    selected_features_cache[cache_key] = list(X_trainval.columns[rfe.support_])
                else:
                    print("Reusing cached top 20 features...")

                selected_features = selected_features_cache[cache_key]
                print("Top 20 selected features:\n", selected_features)

                # Filter columns
                X_trainval_sel = X_trainval[selected_features]
                X_test_sel = X_test[selected_features]

                # GridSearchCV with TimeSeriesSplit
                grid_search = GridSearchCV(
                    estimator=model,
                    param_grid=model_params,
                    cv=tscv,
                    scoring="neg_mean_squared_error",
                    verbose=1,
                    n_jobs=-1
                )

                # Fit using only train/val (no test!)
                print("Training data using train and val...")
                grid_search.fit(X_trainval_sel, y_trainval)

                # Show CV results (best_score_ is a negated MSE)
                cv_rmse = np.sqrt(-grid_search.best_score_)
                print("Best parameters:", grid_search.best_params_)
                print("Best CV RMSE:", cv_rmse)

                # ---- Final evaluation on the hold-out test set ----
                best_model = grid_search.best_estimator_
                y_pred = best_model.predict(X_test_sel)
                test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

                print(f"Final Test RMSE: {test_rmse:.4f}")

                results_per_n[N] = {
                    "cv_rmse": cv_rmse,
                    "rmse": test_rmse,
                    "best_params": grid_search.best_params_,
                    "selected_features": selected_features,
                    "best_model" : best_model
                }

            # Pick N on the cross-validation score. Choosing it by test RMSE
            # would tune a hyperparameter on the hold-out set and make the
            # reported test RMSE optimistically biased.
            best_n_value = min(results_per_n, key=lambda n: results_per_n[n]["cv_rmse"])
            best_result = results_per_n[best_n_value]
            best_model = best_result["best_model"]

            # saving best model
            model_filename = f"Model_{model_name} N_{best_n_value} points_prior_{points_prior} dataset_{filename}.pkl"
            with open(model_filename, 'wb') as file:
                pickle.dump(best_model, file)

            print(f"Model saved to {model_filename}")

            print("\n--------------------------------------------------------")
            print(f"*** BEST N for {filename}, points_prior: {points_prior}, Model {model_name} ***")
            print(f"Best N: \t\t{best_n_value}")
            print(f"Best CV RMSE: \t\t{best_result['cv_rmse']:.4f}")
            print(f"Best Test RMSE: \t{best_result['rmse']:.4f}")
            print(f"Best Hyperparameters: \t{best_result['best_params']}")
            print("--------------------------------------------------------\n")

            # Store this in the overall results for a final summary.
            # "rmse" remains the hold-out test RMSE of the chosen N.
            config_key = (filename, points_prior, model_name)
            overall_best_results[config_key] = {
                "best_n": best_n_value,
                "cv_rmse": best_result['cv_rmse'],
                "rmse": best_result['rmse'],
                "best_params": best_result['best_params'],
                "selected_features": best_result['selected_features']
            }

# 4. Print a final summary of all configurations
print("\n\n===================================")
print("           FINAL SUMMARY           ")
print("===================================")
for config, result in overall_best_results.items():
    print(f"Config (File, Prior, Model): {config}")
    print(f"  -> Best N: {result['best_n']} (CV RMSE: {result['cv_rmse']:.4f}, Test RMSE: {result['rmse']:.4f})\n")


Processing: File=trans_1.csv, PointsPrior=4, Model=Random Forest Regressor

Train/Val samples: 55732, Test samples: 13932
Training Configurations
file: trans_1.csv
N: 12
points_prior: 4
model: Random Forest Regressor
Selecting top 20 features...
Top 20 selected features:
 ['rolling_mean_HUFL', 'HUFL_lag14', 'HUFL_lag16', 'rolling_mean_MUFL', 'MUFL_lag14', 'MUFL_lag15', 'MUFL_lag16', 'rolling_mean_MULL', 'MULL_lag5', 'MULL_lag6', 'MULL_lag7', 'MULL_lag9', 'rolling_mean_LUFL', 'LUFL_lag12', 'rolling_mean_LULL', 'LULL_lag5', 'LULL_lag16', 'year', 'month', 'day']
Training data using train and val...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters: {'model__max_depth': 15, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Best CV RMSE: 13.521101461648648
Final Test RMSE: 7.1539
Train/Val samples: 55722, Test samples: 13930
Training Configurations
file: trans_1.csv
N: 24
points_prior: 4
model: Random Forest Regressor
Select

KeyboardInterrupt: 

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor

# SVR-only experiment grid: for every (file, points_prior) combination, tune
# the model for each window size N, pick the best N, pickle the winning
# estimator and record a summary in `overall_best_results`.
#
# Each entry of `models` is [display name, pipeline, GridSearchCV param grid].
models = [
    # 2. Support Vector Regressor (SVR)
    ["Support Vector Regressor (SVR)",
     Pipeline([
         ('scaler', StandardScaler()),
         ('model', SVR())
     ]),
     {
         "model__kernel": ['rbf', 'poly'],
         "model__C": [0.1, 1],
         "model__gamma": ['scale', 'auto'],
         "model__epsilon": [0.1, 0.4],
     }]
]

N_list = [12, 24, 48] # can add 96 if you want (and have enough time)
points_priors = [4, 96, 672]
filenames = ["trans_1.csv", "trans_2.csv"]

# Store the overall best N for each configuration
overall_best_results = {}

for (model_name, model, model_params) in models:
    for filename in filenames:
        for points_prior in points_priors:

            print("\n========================================================")
            print(f"Processing: File={filename}, PointsPrior={points_prior}, Model={model_name}")
            print("========================================================\n")

            results_per_n = {}
            for N in N_list:
                df = preprocess_data(filename, N = N, points_prior= points_prior)
                X_trainval, y_trainval, X_test, y_test, tscv = time_series_split(df, "OT")

                print("Training Configurations")
                print(f"file: {filename}")
                print(f"N: {N}")
                print(f"points_prior: {points_prior}")
                print(f"model: {model_name}")

                # ---- FEATURE SELECTION using RFE ----
                # NOTE(review): RFE is fit on all of X_trainval before
                # GridSearchCV splits it, so every CV validation fold has
                # already influenced which features were kept. The CV RMSE is
                # therefore somewhat optimistic; moving RFE into the Pipeline
                # would remove that at a large compute cost.
                base_model = XGBRegressor(
                    objective='reg:squarederror',
                    random_state=42,
                    n_estimators=200,
                    learning_rate=0.05,
                    max_depth=5,
                    subsample=0.8
                )

                # Select top 20 features
                print("Selecting top 20 features...")
                rfe = RFE(estimator=base_model, n_features_to_select=20, step=0.1)
                rfe.fit(X_trainval, y_trainval)

                # Mask of selected features
                selected_features = list(X_trainval.columns[rfe.support_])
                print("Top 20 selected features:\n", selected_features)

                # Filter columns
                X_trainval_sel = X_trainval[selected_features]
                X_test_sel = X_test[selected_features]

                # GridSearchCV with TimeSeriesSplit
                grid_search = GridSearchCV(
                    estimator=model,
                    param_grid=model_params,
                    cv=tscv,
                    scoring="neg_mean_squared_error",
                    verbose=1,
                    n_jobs=-1
                )

                # Fit using only train/val (no test!)
                print("Training data using train and val...")
                grid_search.fit(X_trainval_sel, y_trainval)

                # Show CV results (best_score_ is a negated MSE)
                cv_rmse = np.sqrt(-grid_search.best_score_)
                print("Best parameters:", grid_search.best_params_)
                print("Best CV RMSE:", cv_rmse)

                # ---- Final evaluation on the hold-out test set ----
                best_model = grid_search.best_estimator_
                y_pred = best_model.predict(X_test_sel)
                test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

                print(f"Final Test RMSE: {test_rmse:.4f}")

                results_per_n[N] = {
                    "cv_rmse": cv_rmse,
                    "rmse": test_rmse,
                    "best_params": grid_search.best_params_,
                    "selected_features": selected_features,
                    "best_model" : best_model
                }

            # Pick N on the cross-validation score. Choosing it by test RMSE
            # would tune a hyperparameter on the hold-out set and make the
            # reported test RMSE optimistically biased.
            best_n_value = min(results_per_n, key=lambda n: results_per_n[n]["cv_rmse"])
            best_result = results_per_n[best_n_value]
            best_model = best_result["best_model"]

            # saving best model
            model_filename = f"Model_{model_name} N_{best_n_value} points_prior_{points_prior} dataset_{filename}.pkl"
            with open(model_filename, 'wb') as file:
                pickle.dump(best_model, file)

            print(f"Model saved to {model_filename}")

            print("\n--------------------------------------------------------")
            print(f"*** BEST N for {filename}, points_prior: {points_prior}, Model {model_name} ***")
            print(f"Best N: \t\t{best_n_value}")
            print(f"Best CV RMSE: \t\t{best_result['cv_rmse']:.4f}")
            print(f"Best Test RMSE: \t{best_result['rmse']:.4f}")
            print(f"Best Hyperparameters: \t{best_result['best_params']}")
            print("--------------------------------------------------------\n")

            # Store this in the overall results for a final summary.
            # "rmse" remains the hold-out test RMSE of the chosen N.
            config_key = (filename, points_prior, model_name)
            overall_best_results[config_key] = {
                "best_n": best_n_value,
                "cv_rmse": best_result['cv_rmse'],
                "rmse": best_result['rmse'],
                "best_params": best_result['best_params'],
                "selected_features": best_result['selected_features']
            }

# 4. Print a final summary of all configurations
print("\n\n===================================")
print("           FINAL SUMMARY           ")
print("===================================")
for config, result in overall_best_results.items():
    print(f"Config (File, Prior, Model): {config}")
    print(f"  -> Best N: {result['best_n']} (CV RMSE: {result['cv_rmse']:.4f}, Test RMSE: {result['rmse']:.4f})\n")


Processing: File=trans_1.csv, PointsPrior=4, Model=Support Vector Regressor (SVR)

Train/Val samples: 55732, Test samples: 13932
Training Configurations
file: trans_1.csv
N: 12
points_prior: 4
model: Support Vector Regressor (SVR)
Selecting top 20 features...
Top 20 selected features:
 ['rolling_mean_HUFL', 'HUFL_lag14', 'HUFL_lag16', 'rolling_mean_MUFL', 'MUFL_lag14', 'MUFL_lag15', 'MUFL_lag16', 'rolling_mean_MULL', 'MULL_lag5', 'MULL_lag6', 'MULL_lag7', 'MULL_lag9', 'rolling_mean_LUFL', 'LUFL_lag12', 'rolling_mean_LULL', 'LULL_lag5', 'LULL_lag16', 'year', 'month', 'day']
Training data using train and val...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters: {'model__C': 0.1, 'model__epsilon': 0.1, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
Best CV RMSE: 11.547398500287073
Final Test RMSE: 4.1735
Train/Val samples: 55722, Test samples: 13930
Training Configurations
file: trans_1.csv
N: 24
points_prior: 4
model: Support Vector Regressor (SVR)
Selecting