Forecast ML  

In [None]:
# import libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Data locations on the mounted Google Drive.
# Every path is built by plain string concatenation, so `drive_folder`
# must keep its leading and trailing slash.

mount_folder = '/content/drive'
drive_folder = '/MyDrive/Projects/home-co2-forecast/data/'
data_folder = f"{mount_folder}{drive_folder}"
print(data_folder)

# raw CSV export
import_csv = 'values.csv'
import_path = f"{data_folder}{import_csv}"
print(import_path)

# pickled time series that the rest of the notebook loads
pickle_file = 'co2_ts.pkl'
pickle_path = f"{data_folder}{pickle_file}"

In [None]:
# mount data source
# Mounts Google Drive at `mount_folder` so that `data_folder` resolves to real files.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime,
# which is why this import lives here rather than in the top import cell — confirm.

from google.colab import drive
drive.mount(mount_folder)

In [None]:
# read dataframe from pickle
# assumed to have been preprocessed to infill gaps
# Later cells read a "value" column and a datetime-like "timestamp_local" column from it,
# and slice the index with UTC timestamps.
# NOTE(review): unpickling can execute arbitrary code — only load pickles produced by
# this project, never files from an untrusted source.

df = pd.read_pickle(pickle_path)
df.head(10)

In [None]:
# Lag offsets (in rows) used as predictors.
# old_lag_list = [1, 2, 3, 5, 10, 20, 60]
lag_list = [10, 20, 30, 60]
# number of leading rows dropped later, once all lag/rolling features are defined
window = 60

# each lagged copy of the observed value becomes one predicting feature
for lag in lag_list:
    df[f"lag_{lag}"] = df["value"].shift(lag)

df.head(10)

In [None]:
# Rolling mean / std of the value over 10-, 20- and 50-row windows.
# The series is shifted by 10 rows first, so each window ends 10 rows before
# the current row and never includes the current row or the 9 rows before it.
shifted_value = df["value"].shift(10)
for roll_window in (10, 20, 50):
    rolled = shifted_value.rolling(window=roll_window, min_periods=1)
    df[f"roll{roll_window}_mean"] = rolled.mean()
    df[f"roll{roll_window}_std"] = rolled.std()

# time-of-day features taken from the local timestamp
local_time = df["timestamp_local"].dt
df["minute"] = local_time.minute
df["hour"] = local_time.hour
df["dayofweek"] = local_time.dayofweek

In [None]:
# Ontario statutory holidays calendar

from pandas.tseries.holiday import Holiday, DateOffset, AbstractHolidayCalendar, MO, GoodFriday, next_monday, next_monday_or_tuesday

class OntarioCalendar(AbstractHolidayCalendar):
    """Holiday calendar with the Ontario holiday rules used to flag days off.

    Fixed-date holidays that fall on a weekend are moved to the following
    working day via the `observance` functions; floating holidays are
    expressed as an anchor date plus a weekday offset.
    """

    rules = [
        # New Year's Day (Jan 1, next Monday when it falls on a weekend)
        Holiday("New Year's Day", month=1, day=1, observance=next_monday),

        # Family Day (3rd Monday of February)
        Holiday("Family Day", month=2, day=1, offset=DateOffset(weekday=MO(3))),

        # Good Friday (built-in Easter-based holiday)
        GoodFriday,

        # Victoria Day (last Monday strictly before May 25).
        # MO(-1) leaves a date that is already a Monday unchanged, so the anchor
        # must be May 24: anchoring at May 25 returned May 25 itself (instead of
        # May 18) in years where May 25 is a Monday, e.g. 2020 and 2026.
        Holiday("Victoria Day", month=5, day=24, offset=DateOffset(weekday=MO(-1))),

        # Canada Day (Jul 1, next Monday when it falls on a weekend)
        Holiday("Canada Day", month=7, day=1, observance=next_monday),

        # Labour Day (1st Monday of September)
        Holiday("Labour Day", month=9, day=1, offset=DateOffset(weekday=MO(1))),

        # Thanksgiving (2nd Monday of October)
        Holiday("Thanksgiving", month=10, day=1, offset=DateOffset(weekday=MO(2))),

        # Christmas Day (Dec 25, next Monday when it falls on a weekend)
        Holiday("Christmas Day", month=12, day=25, observance=next_monday),

        # Boxing Day (Dec 26, moved to Monday or Tuesday so that it does not
        # collide with an observed Christmas Day)
        Holiday("Boxing Day", month=12, day=26, observance=next_monday_or_tuesday),
    ]
# Build the calendar and list every holiday between the first and last index value of df.
mycal= OntarioCalendar()
# NOTE(review): the range bounds come from df.index (sliced with UTC timestamps elsewhere
# in this notebook), while the holiday flag is later matched against *local* dates.
# A holiday on the first local day of the data could fall just outside this range — confirm.
holidays = mycal.holidays(start=df.index.min(), end=df.index.max())


In [None]:
# Day-off flags: Saturday/Sunday (dayofweek 5 and 6), calendar holidays, and their union.
df["is_weekend"] = df["dayofweek"].isin([5, 6])
df["is_holiday"] = df["timestamp_local"].dt.date.isin(holidays.date)
df["day_off"] = df["is_weekend"] | df["is_holiday"]

# Continuous hour of day in [0, 24), used only to derive the cyclical encoding.
# It is kept as a plain Series on purpose: the previous `df.hr_cont = ...` set an
# attribute on the DataFrame object (pandas warns that this does not create a
# column), which is silently lost whenever `df` is copied or sliced.
hr_cont = df["hour"] + df["minute"] / 60

# Sine/cosine pair so that 23:59 and 00:00 end up close together in feature space.
df["hr_sin"] = np.sin(2 * np.pi * hr_cont / 24)
df["hr_cos"] = np.cos(2 * np.pi * hr_cont / 24)

In [None]:
# Eyeball the feature values in a 20-minute UTC window around a day boundary.
start = pd.Timestamp("2025-10-13 03:50:00", tz="UTC")
end = pd.Timestamp("2025-10-13 04:10:00", tz="UTC")
df.loc[start:end]

In [None]:
# Drop the first `window` rows: their lag features (up to 60 rows back) and shifted
# rolling features (10-row shift + up to 50-row window) are not yet fully defined.
# NOTE: not idempotent — re-running this cell removes another `window` rows.
df = df.iloc[window:]

In [None]:
import seaborn as sns
# Pairwise correlation of all columns, drawn as an annotated heatmap.
# NOTE(review): df still contains "timestamp_local" here; on recent pandas versions
# DataFrame.corr() may need numeric_only=True to skip non-numeric columns — confirm.
sns.set(rc={'figure.figsize': (15, 8)})
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='Blues');

In [None]:
# The timestamp column is not a model input, so it is removed first.
# y is the current value (the target); X holds every remaining column as a predictor.
df = df.drop(columns=["timestamp_local"])
X, y = df.drop(columns=["value"]), df["value"]

Split into classical train / test with 0.1 test size

In [None]:
from sklearn.model_selection import train_test_split
# Chronological hold-out: the last 10% of rows form the test set.
# The default shuffled split scatters test rows between training rows; with lagged
# and rolling features of a strongly autocorrelated series, almost every test row then
# has near-duplicates in the training set, which inflates the test score.
# NOTE(review): assumes the rows of X/y are in time order — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

In [None]:
# Persist the four pieces of the train/test split next to the source data.

for split, filename in (
    (X_train, 'X_train.pkl'),
    (X_test, 'X_test.pkl'),
    (y_train, 'y_train.pkl'),
    (y_test, 'y_test.pkl'),
):
    split.to_pickle(data_folder + filename)


Split into 5-fold forward chaining

In [None]:
def create_sequential_ranges(N, n_ranges=10):
    """
    Break down N rows into sequential ranges of approximately equal length.

    Parameters:
    -----------
    N : int
        Total number of rows in the dataframe (must be >= 0)
    n_ranges : int, default 10
        Number of ranges to produce (must be >= 1)

    Returns:
    --------
    list of tuples
        List of `n_ranges` tuples (start_idx, end_idx), end-exclusive, that are
        contiguous and together cover 0..N. The first `N % n_ranges` ranges are
        one row longer than the rest; ranges are empty when N < n_ranges.

    Raises:
    -------
    ValueError
        If `n_ranges` is smaller than 1 or `N` is negative.
    """
    if n_ranges < 1:
        raise ValueError(f"n_ranges must be at least 1, got {n_ranges}")
    if N < 0:
        raise ValueError(f"N must be non-negative, got {N}")

    # Every range gets base_size rows; the leftover rows go one each to the
    # first `remainder` ranges so the sizes differ by at most one.
    base_size, remainder = divmod(N, n_ranges)

    ranges = []
    start = 0
    for i in range(n_ranges):
        end = start + base_size + (1 if i < remainder else 0)
        ranges.append((start, end))
        start = end

    return ranges

def create_forward_chaining_folds(X, y):
    """
    Build 5 expanding-window train/test folds for forward-chaining validation.

    The rows are cut into 10 sequential ranges. Fold i (i = 0..4) trains on
    ranges 0 through 4+i and tests on range 5+i, so every fold sees more
    history than the previous one and is evaluated on the chunk that follows.

    Parameters:
    -----------
    X : pandas.DataFrame
        Feature dataframe
    y : pandas.DataFrame or Series
        Target dataframe/series, positionally aligned with X

    Returns:
    --------
    dict
        Dictionary containing:
        - 'X_train_folds': list of 5 training feature sets
        - 'X_test_folds': list of 5 test feature sets
        - 'y_train_folds': list of 5 training target sets
        - 'y_test_folds': list of 5 test target sets
        - 'ranges': the 10 sequential (start, end) ranges used

    Raises:
    -------
    AssertionError
        If X and y differ in length.
    """
    N = len(X)
    assert len(X) == len(y), f"X and y must have same length. X: {len(X)}, y: {len(y)}"

    ranges = create_sequential_ranges(N)

    folds = {
        'X_train_folds': [],
        'X_test_folds': [],
        'y_train_folds': [],
        'y_test_folds': [],
        'ranges': ranges,
    }

    for fold_idx in range(5):
        # training rows run from the start of range 0 to the end of range 4+fold_idx
        train_rows = slice(ranges[0][0], ranges[4 + fold_idx][1])
        # test rows are exactly the next range, 5+fold_idx
        test_lo, test_hi = ranges[5 + fold_idx]
        test_rows = slice(test_lo, test_hi)

        folds['X_train_folds'].append(X.iloc[train_rows])
        folds['X_test_folds'].append(X.iloc[test_rows])
        folds['y_train_folds'].append(y.iloc[train_rows])
        folds['y_test_folds'].append(y.iloc[test_rows])

    return folds



In [None]:
folds = create_forward_chaining_folds(X, y)

# Print and pickle every part of every fold as '<part>_<fold index>.pkl'.
# Looping over the part names keeps the printed object, the pickled object and the
# file name in sync: the copy-pasted version wrote the X_test fold into
# 'y_train_folds_<i>.pkl', so the training targets were never saved.
fold_parts = ('X_train_folds', 'X_test_folds', 'y_train_folds', 'y_test_folds')

for i in range(len(folds['X_train_folds'])):
    for part in fold_parts:
        fold_data = folds[part][i]
        print(fold_data)
        fold_data.to_pickle(data_folder + part + '_' + str(i) + '.pkl')




Try linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Create the linear regression model and fit it on the training split in one step
# (fit() returns the fitted estimator itself)
linear_model = LinearRegression().fit(X_train, y_train)

# Show the fitted parameters: intercept first, then one coefficient per feature column
print(linear_model.intercept_)
print(linear_model.coef_)

In [None]:
# For each record in the test set, predict the target ("value") with the fitted linear model
# The predicted values are stored in the y_pred array
y_pred = linear_model.predict(X_test)

In [None]:
# Metrics

from sklearn import metrics

# Hand-computed evaluation metrics on the test split
n_test = len(y_test)
residuals = y_test - y_pred
squared_residuals = residuals ** 2

MSE = (1 / n_test) * sum(squared_residuals)
MAE = (1 / n_test) * sum(abs(residuals))
RSS = sum(squared_residuals)                 # residual sum of squares
TSS = sum((y_test - y_test.mean()) ** 2)     # total sum of squares around the mean
R_squared = 1 - (RSS / TSS)

print("\n")
print("Mean squared error (MSE) =", MSE)
print("Mean absolute error (MAE) =", MAE)
print("R^2 =", R_squared)

# Cross-check: the same three metrics as computed by sklearn.metrics
print("Mean squared error (MSE) =", metrics.mean_squared_error(y_test, y_pred))
print("Mean absolute error (MAE) =", metrics.mean_absolute_error(y_test, y_pred))
print("R^2 =", metrics.r2_score(y_test, y_pred))

The R² of about 0.99 looks very good, but only because the value 1 minute earlier is almost identical to the current value!
We should try predicting with a 10-minute gap instead.
With a 10-minute gap, the accuracy (R²) is 0.9262522995966536.

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

In [None]:
# scoring helper shared by the model-selection cells

def get_acc(model, X, t):
    """Return the R^2 score of `model` on features `X` against actual targets `t`.

    `model` only needs a `predict` method; the score is computed with
    `metrics.r2_score(t, predictions)`.
    """
    predictions = model.predict(X)
    return metrics.r2_score(t, predictions)

In [None]:
# define a function that trains one tree model per candidate depth and scores each of them
# the caller picks the best model from the returned scores

def select_model(depths, criterion, random_state=0):
    """Fit one DecisionTreeRegressor per depth and score it on both splits.

    Uses the notebook-level X_train / y_train for fitting and X_test / y_test
    for the held-out score.

    Parameters:
    -----------
    depths : iterable of int
        Values of `max_depth` to try
    criterion : str
        Split criterion passed to DecisionTreeRegressor (e.g. "squared_error")
    random_state : int, default 0
        Seed passed to every tree so that repeated runs give the same models

    Returns:
    --------
    dict
        Maps each depth to {'test': R^2 on the test split,
        'train': R^2 on the training split, 'model': the fitted tree}
    """
    out = {}
    for d in depths:
        print('Evaluating on depth {}'.format(d))
        # named `model`, not `tree`, so the imported sklearn `tree` module is not shadowed
        model = DecisionTreeRegressor(
            max_depth=d, criterion=criterion, random_state=random_state
        )
        model.fit(X_train, y_train)
        out[d] = {
            'test': get_acc(model, X_test, y_test),
            'train': get_acc(model, X_train, y_train),
            'model': model,
        }
    return out

In [None]:
depths = [2, 5, 8, 9, 10, 15, 20, 30, 50, 70]  # the depths we want to explore

# train one model per depth, using squared error as the split criterion
res_sq_error = select_model(depths, "squared_error")

# Pick the depth with the highest test score.
# The running best starts at -inf rather than 0: R^2 can be negative, and with a 0
# threshold a run where every model scores <= 0 left the best depth as None, which
# then fails as a dictionary key in the feature-importance cell.
best_d_sq_error = None
best_acc_sq_error = float("-inf")

for d, scores in res_sq_error.items():
    test_acc = scores['test']
    print("Depth: {}   Train: {}    Test: {}".format(d, scores['train'], test_acc))
    if test_acc > best_acc_sq_error:
        best_d_sq_error = d
        best_acc_sq_error = test_acc

print('Best d (sq_error): ' + str(best_d_sq_error))
print('Best accuracy (sq_error): ' + str(best_acc_sq_error))

In [None]:
# Feature importances of the best tree, labelled with the feature column names
tree_model = res_sq_error[best_d_sq_error]['model']
importances = tree_model.feature_importances_

tree_importances = pd.Series(data=importances, index=X.columns)
print(tree_importances)

I removed the lags shorter than 10 minutes.
The 15-row rolling features (roll15) became more promising, so let's add a 30-row rolling window too.
This is the best accuracy (R²) so far: 0.9744115244242749.

When I added rolling over 30, it became worse.
0.9737708943769401.

By the way, this is not correct: if you are predicting 10 minutes ahead, you do not have access to the average of the 15 rows immediately before the target!

After shifting the series back by 10 rows and taking only the 10- and 20-row rolling averages before that point, the accuracy dropped to 0.938017820045307.

With lagged values at -10, -20, -30 and -60, and rolling means measured between -10 and -20, -10 and -30, and -10 and -60, the accuracy went up slightly to 0.9421513157028315.

It's more than the linear regression results, so this set of features is promising.

After adding cyclically encoded time features, the accuracy went up to 0.9508182107855963.

Time to roll out the MLflow to track the results!

