# Machine Learning Forecasting Notebook

This notebook is generated from machine_learning.py and is split into multiple cells.

In [119]:
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA

In [None]:
file_path = 'Trimmed_Time_Series_Data.csv'
df = pd.read_csv(file_path)

# Read CSV and filter data for Canada (index % 9 == 8)
# df_subset = df[df.index % 9 == 8].iloc[:, :]

df_subset = df.copy()
df_subset.reset_index(drop=True, inplace=True)
print(df_subset.head())
# df_subset.to_csv('Trimmed_Time_Series_Data_Canada.csv', index=False)

     REF_DATE                        GEO  Number_of_Households  \
0  1986-01-01                    Alberta                859000   
1  1986-01-01           British Columbia               1132000   
2  1986-01-01                   Manitoba                392000   
3  1986-01-01              New Brunswick                237000   
4  1986-01-01  Newfoundland and Labrador                161000   

   Housing completions  Housing starts  Housing under construction  \
0           662.000000      603.000000                 1125.000000   
1          1304.333333     1515.666667                 3114.666667   
2           426.333333      536.000000                 1382.000000   
3           329.666667      105.666667                  488.333333   
4           181.000000       74.333333                 1009.333333   

   House only NHPI  Land only NHPI  Total (house and land) NHPI  
0             28.0            22.5                         26.4  
1             79.5            49.2                

In [121]:
# Prepare DataFrame for Prophet by renaming "REF_DATE" to "ds" and converting to datetime
df_prophet = df_subset.rename(columns={"REF_DATE": "ds"})
df_prophet["ds"] = pd.to_datetime(df_prophet["ds"])

In [122]:
# Use only one target variable from column index 2
target_columns = list(df_prophet.columns[2:3])

In [123]:
def create_features(df, n_lags=3):
    """
    Create simple lag features and a month feature.
    Assumes df has columns 'ds' and 'y'.
    """
    df_features = df.copy()
    for lag in range(1, n_lags+1):
        df_features[f'lag_{lag}'] = df_features['y'].shift(lag)
    df_features['month'] = df_features['ds'].dt.month
    df_features.dropna(inplace=True)
    return df_features

In [124]:
holdout_period = 30  # e.g., predicting 30 months ahead
n_lags = 3         # number of lag features for ML models

results = {}

for target in target_columns:
    results[target] = {}
    df_target = df_prophet[["ds", target]].rename(columns={target: "y"})
    
    # Prophet
    train_prophet = df_target.iloc[:-holdout_period]
    test_prophet  = df_target.iloc[-holdout_period:]
    prophet_model = Prophet(daily_seasonality=False, weekly_seasonality=False)
    prophet_model.fit(train_prophet)
    future = prophet_model.make_future_dataframe(periods=holdout_period, freq='MS')
    forecast = prophet_model.predict(future).tail(holdout_period)
    prophet_mae = mean_absolute_error(test_prophet['y'], forecast['yhat'])
    results[target]['Prophet'] = prophet_mae

    # Random Forest / XGBoost
    df_ml = create_features(df_target, n_lags=n_lags)
    train_ml = df_ml.iloc[:-holdout_period]
    test_ml = df_ml.iloc[-holdout_period:]
    X_train = train_ml.drop(['ds', 'y'], axis=1)
    y_train = train_ml['y']
    X_test  = test_ml.drop(['ds', 'y'], axis=1)
    y_test  = test_ml['y']
    
    rf_model = RandomForestRegressor()
    rf_model.fit(X_train, y_train)
    rf_pred = rf_model.predict(X_test)
    results[target]['Random Forest'] = mean_absolute_error(y_test, rf_pred)
    
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror')
    xgb_model.fit(X_train, y_train)
    xgb_pred = xgb_model.predict(X_test)
    results[target]['XGBoost'] = mean_absolute_error(y_test, xgb_pred)
    
    # ARIMA
    train_arima = df_target.iloc[:-holdout_period]
    test_arima  = df_target.iloc[-holdout_period:]
    arima_model = ARIMA(train_arima['y'], order=(1,1,1))
    arima_fit = arima_model.fit()
    forecast_arima = arima_fit.forecast(steps=holdout_period)
    forecast_df = test_arima.copy()
    forecast_df['yhat'] = forecast_arima.values
    results[target]['ARIMA'] = mean_absolute_error(test_arima['y'], forecast_df['yhat'])

# Print consolidated results
results_df = pd.DataFrame(results).T
print("MAE Summary for each model:")
print(results_df)


00:35:31 - cmdstanpy - INFO - Chain [1] start processing
00:35:31 - cmdstanpy - INFO - Chain [1] done processing


MAE Summary for each model:
                           Prophet  Random Forest       XGBoost         ARIMA
Number_of_Households  3.673162e+06          876.0  12084.226042  3.627858e+06


In [125]:
import pickle

# Assuming rf_model has been trained on your dataset
with open("rf_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)
    
print("Random Forest model saved as rf_model.pkl")

Random Forest model saved as rf_model.pkl
