# Machine Learning Forecasting Notebook

This notebook is generated from machine_learning.py and is split into multiple cells.

In [3]:
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [4]:
# Load the trimmed time-series file and keep every ninth row starting at
# position 8. Per the original note these are the Canada rows; the selection
# is purely positional, so it relies on the row order of the CSV.
file_path = 'Trimmed_Time_Series_Data.csv'
df = pd.read_csv(file_path)
is_canada_row = (df.index % 9) == 8
df_subset = df.loc[is_canada_row].reset_index(drop=True)
print(df_subset.head())
# Persist the filtered subset next to the input file.
df_subset.to_csv('Trimmed_Time_Series_Data_Canada.csv', index=False)

     REF_DATE     GEO  Number_of_Households  Housing completions  \
0  1986-01-01  Canada               9238000         10938.333333   
1  1986-02-01  Canada               9238000         10938.333333   
2  1986-03-01  Canada               9238000         10938.333333   
3  1986-04-01  Canada               9238000         14070.000000   
4  1986-05-01  Canada               9238000         14070.000000   

   Housing starts  Housing under construction  House only NHPI  \
0     9757.333333                27632.666667             39.4   
1     9757.333333                27632.666667             39.9   
2     9757.333333                27632.666667             40.2   
3    20430.666667                33986.000000             40.6   
4    20430.666667                33986.000000             40.9   

   Land only NHPI  Total (house and land) NHPI  
0            38.8                         39.8  
1            39.2                         40.3  
2            39.3                         40.6 

In [5]:
# Build the Prophet input frame: the date column must be named "ds" and hold
# real datetimes, so rename "REF_DATE" and parse it in a single chained step.
df_prophet = (
    df_subset
    .rename(columns={"REF_DATE": "ds"})
    .assign(ds=lambda frame: pd.to_datetime(frame["ds"]))
)

In [6]:
# Forecast a single target: the column at position 2 of df_prophet.
# The one-element slice keeps this a list so the evaluation loop can iterate it.
target_columns = df_prophet.columns[2:3].tolist()

In [7]:
def create_features(df, n_lags=3):
    """
    Turn a ('ds', 'y') time series into a frame with lag and calendar features.

    Parameters
    ----------
    df : DataFrame with a datetime column 'ds' and a target column 'y'.
    n_lags : number of lagged copies of 'y' to add, as 'lag_1' .. 'lag_<n_lags>'.

    Returns
    -------
    A new DataFrame (the input is not modified) holding the original columns,
    the lag columns, and a 'month' column taken from 'ds'. Rows containing any
    missing value are dropped, which removes the first n_lags rows where the
    lags are undefined.
    """
    # lag_k is 'y' shifted down by k rows, i.e. the value k steps earlier.
    lagged = {f'lag_{k}': df['y'].shift(k) for k in range(1, n_lags + 1)}
    enriched = df.assign(**lagged, month=df['ds'].dt.month)
    return enriched.dropna()

In [None]:
# Hold-out evaluation of Prophet, Random Forest and XGBoost on each target.
# For every target the last `holdout_period` rows are held out, each model is
# fitted on the rows before them, and the mean absolute error (MAE) on the
# held-out rows is printed.
holdout_period = 30  # number of trailing rows (months) held out for evaluation
n_lags = 3           # number of lag features for ML models
random_seed = 42     # fixed seed so the tree-ensemble MAEs are reproducible

for target in target_columns:
    print(f"\nEvaluating target: {target}")

    df_target = df_prophet[["ds", target]].rename(columns={target: "y"})

    # Fail early with a clear message instead of fitting on an empty frame:
    # the lag features consume the first n_lags rows and the hold-out the last
    # holdout_period rows, so at least one more row is needed for training.
    if len(df_target) <= holdout_period + n_lags:
        raise ValueError(
            f"Target {target!r} has {len(df_target)} rows; need more than "
            f"{holdout_period + n_lags} (holdout_period + n_lags)."
        )

    # --- Prophet Forecast with MAE ---
    train_prophet = df_target.iloc[:-holdout_period]
    test_prophet = df_target.iloc[-holdout_period:]
    prophet_model = Prophet(daily_seasonality=False, weekly_seasonality=False)
    prophet_model.fit(train_prophet)
    # Predict on the held-out dates themselves rather than on a generated
    # month-start calendar, so every yhat is guaranteed to correspond to the
    # 'y' it is scored against even if the series has gaps or irregular dates.
    forecast = prophet_model.predict(test_prophet[["ds"]])
    prophet_mae = mean_absolute_error(test_prophet['y'], forecast['yhat'])

    print("Prophet Forecast (first few rows):")
    print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())
    print("Prophet MAE:", prophet_mae)

    # --- Prepare Data for Random Forest / XGBoost ---
    # NOTE(review): the lag features are built from the full series before the
    # split, so each held-out row sees the true values of the preceding months
    # (a one-step-ahead setup), while Prophet forecasts the whole hold-out from
    # the training rows only. The MAEs are therefore not strictly like-for-like.
    df_ml = create_features(df_target, n_lags=n_lags)
    train_ml = df_ml.iloc[:-holdout_period]
    test_ml = df_ml.iloc[-holdout_period:]

    feature_drop = ['ds', 'y']
    X_train = train_ml.drop(columns=feature_drop)
    y_train = train_ml['y']
    X_test = test_ml.drop(columns=feature_drop)
    y_test = test_ml['y']

    # NOTE(review): tree ensembles cannot predict outside the target range seen
    # in training; for a steadily trending target this presumably explains a
    # much larger MAE than Prophet -- consider modelling differences instead.

    # --- Random Forest ---
    rf_model = RandomForestRegressor(random_state=random_seed)
    rf_model.fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    rf_mae = mean_absolute_error(y_test, rf_pred)
    print("Random Forest MAE:", rf_mae)

    # --- XGBoost ---
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=random_seed)
    xgb_model.fit(X_train, y_train)
    xgb_pred = xgb_model.predict(X_test)
    xgb_mae = mean_absolute_error(y_test, xgb_pred)
    print("XGBoost MAE:", xgb_mae)


Evaluating target: Number_of_Households


18:18:20 - cmdstanpy - INFO - Chain [1] start processing
18:18:21 - cmdstanpy - INFO - Chain [1] done processing


Prophet MAE: 5915.2655222881585
Prophet Forecast (first few rows):
            ds          yhat    yhat_lower    yhat_upper
436 2022-05-01  1.581929e+07  1.581284e+07  1.582560e+07
437 2022-06-01  1.582098e+07  1.581440e+07  1.582783e+07
438 2022-07-01  1.582251e+07  1.581621e+07  1.582875e+07
439 2022-08-01  1.582409e+07  1.581796e+07  1.583074e+07
440 2022-09-01  1.582565e+07  1.581939e+07  1.583228e+07
Random Forest MAE: 260977.33333333334
XGBoost MAE: 255595.66666666666
