## Data Preparation

After loading the data, the datatype of REF_DATE was changed to datetime format and the data is trimmed to range from "1986-01-01" till "2024-10-01".

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset
file_path = "Final_Merged_Data.csv"  # Update with your actual file path
df = pd.read_csv(file_path)

# Convert REF_DATE to datetime format
df['REF_DATE'] = pd.to_datetime(df['REF_DATE'])

# Define the date range
start_date = "1986-01-01"
end_date = "2024-10-01"

# Filter the dataset
df_trimmed = df[(df['REF_DATE'] >= start_date) & (df['REF_DATE'] <= end_date)]


# Display the first few rows of the trimmed dataset
print(df_trimmed.head())

# Save the trimmed dataset if needed
df_trimmed.to_csv("Trimmed_Time_Series_Data.csv", index=False)

       REF_DATE                        GEO  Number_of_Households  \
1071 1986-01-01                    Alberta                859000   
1072 1986-01-01           British Columbia               1132000   
1073 1986-01-01                   Manitoba                392000   
1074 1986-01-01              New Brunswick                237000   
1075 1986-01-01  Newfoundland and Labrador                161000   

      Housing completions  Housing starts  Housing under construction  \
1071           662.000000      603.000000                 1125.000000   
1072          1304.333333     1515.666667                 3114.666667   
1073           426.333333      536.000000                 1382.000000   
1074           329.666667      105.666667                  488.333333   
1075           181.000000       74.333333                 1009.333333   

      House only NHPI  Land only NHPI  Total (house and land) NHPI  
1071             28.0            22.5                         26.4  
1072          

Checking data structure and quality

In [2]:
# Check data structure and quality
print("Dataset Information:")
df_trimmed.info()
print("\nMissing Values:")
print(df_trimmed.isnull().sum())
print("\nBasic Statistics:")
print(df_trimmed.describe())
print("\nUnique Values:")
print(df_trimmed.nunique())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
Index: 4194 entries, 1071 to 5264
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   REF_DATE                     4194 non-null   datetime64[ns]
 1   GEO                          4194 non-null   object        
 2   Number_of_Households         4194 non-null   int64         
 3   Housing completions          4194 non-null   float64       
 4   Housing starts               4194 non-null   float64       
 5   Housing under construction   4194 non-null   float64       
 6   House only NHPI              4086 non-null   float64       
 7   Land only NHPI               4086 non-null   float64       
 8   Total (house and land) NHPI  4086 non-null   float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(1)
memory usage: 327.7+ KB

Missing Values:
REF_DATE                         0
GEO                      

Handle missing values


In [3]:
# Exclude non-numeric columns before performing numerical operations
numeric_cols = df_trimmed.select_dtypes(include=[np.number]).columns

# Interpolation to fill in missing values with a smooth trend, avoiding sudden jumps
df_trimmed[numeric_cols] = df_trimmed[numeric_cols].interpolate(method='linear')

# Fill remaining missing values with the median of each numeric column to prevent extreme values from skewing data
# Handle columns that still have missing values explicitly
for col in numeric_cols:
    if df_trimmed[col].isnull().sum() > 0:
        df_trimmed[col].fillna(df_trimmed[col].median(), inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trimmed[numeric_cols] = df_trimmed[numeric_cols].interpolate(method='linear')


In [4]:
# Check data structure after handling missing values
print("Dataset Information: After Handling Missing Values")
df_trimmed.info()
print("\nMissing Values After Handling:")
print(df_trimmed.isnull().sum())

Dataset Information: After Handling Missing Values
<class 'pandas.core.frame.DataFrame'>
Index: 4194 entries, 1071 to 5264
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   REF_DATE                     4194 non-null   datetime64[ns]
 1   GEO                          4194 non-null   object        
 2   Number_of_Households         4194 non-null   int64         
 3   Housing completions          4194 non-null   float64       
 4   Housing starts               4194 non-null   float64       
 5   Housing under construction   4194 non-null   float64       
 6   House only NHPI              4194 non-null   float64       
 7   Land only NHPI               4194 non-null   float64       
 8   Total (house and land) NHPI  4194 non-null   float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(1)
memory usage: 327.7+ KB

Missing Values After Handling:
REF_DATE       

## Feature Engineering


In [5]:
# Extract year, month, quarter, and day to enable seasonal and trend analysis
df_trimmed['Year'] = df_trimmed['REF_DATE'].dt.year
df_trimmed['Month'] = df_trimmed['REF_DATE'].dt.month
df_trimmed['Quarter'] = df_trimmed['REF_DATE'].dt.quarter
df_trimmed['Day'] = df_trimmed['REF_DATE'].dt.day

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trimmed['Year'] = df_trimmed['REF_DATE'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trimmed['Month'] = df_trimmed['REF_DATE'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trimmed['Quarter'] = df_trimmed['REF_DATE'].dt.quarter
A value is trying to be set on a copy of 

In [6]:
# Check structure after feature engineering
print("Dataset Information: After Feature Engineering")
df_trimmed.info()
df_trimmed.head()


Dataset Information: After Feature Engineering
<class 'pandas.core.frame.DataFrame'>
Index: 4194 entries, 1071 to 5264
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   REF_DATE                     4194 non-null   datetime64[ns]
 1   GEO                          4194 non-null   object        
 2   Number_of_Households         4194 non-null   int64         
 3   Housing completions          4194 non-null   float64       
 4   Housing starts               4194 non-null   float64       
 5   Housing under construction   4194 non-null   float64       
 6   House only NHPI              4194 non-null   float64       
 7   Land only NHPI               4194 non-null   float64       
 8   Total (house and land) NHPI  4194 non-null   float64       
 9   Year                         4194 non-null   int32         
 10  Month                        4194 non-null   int32         
 11

Unnamed: 0,REF_DATE,GEO,Number_of_Households,Housing completions,Housing starts,Housing under construction,House only NHPI,Land only NHPI,Total (house and land) NHPI,Year,Month,Quarter,Day
1071,1986-01-01,Alberta,859000,662.0,603.0,1125.0,28.0,22.5,26.4,1986,1,1,1
1072,1986-01-01,British Columbia,1132000,1304.333333,1515.666667,3114.666667,79.5,49.2,66.3,1986,1,1,1
1073,1986-01-01,Manitoba,392000,426.333333,536.0,1382.0,37.7,26.9,34.7,1986,1,1,1
1074,1986-01-01,New Brunswick,237000,329.666667,105.666667,488.333333,75.1,56.1,70.5,1986,1,1,1
1075,1986-01-01,Newfoundland and Labrador,161000,181.0,74.333333,1009.333333,39.4,35.3,38.7,1986,1,1,1


Creating Lag Features (Using 1, 3, and 6 months lag).
These help the model recognize past patterns and predict future trends

In [7]:
lag_columns = ['Number_of_Households', 'Housing completions', 'Housing starts',
               'Housing under construction', 'House only NHPI', 'Land only NHPI',
               'Total (house and land) NHPI']

for col in lag_columns:
    for lag in [1, 3, 6]:  # Using past values from 1, 3, and 6 months ago to capture short- and mid-term trends
        df_trimmed[f'{col}_lag_{lag}'] = df_trimmed[col].shift(lag)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trimmed[f'{col}_lag_{lag}'] = df_trimmed[col].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trimmed[f'{col}_lag_{lag}'] = df_trimmed[col].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trimmed[f'{col}_lag_{lag}'] = df_trimmed[col].shift(lag)
A value is trying to be s

In [8]:
# Check structure after creating lag features
print("Dataset Information: After Lag Features")
df_trimmed.info()
df_trimmed.head()

Dataset Information: After Lag Features
<class 'pandas.core.frame.DataFrame'>
Index: 4194 entries, 1071 to 5264
Data columns (total 34 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   REF_DATE                           4194 non-null   datetime64[ns]
 1   GEO                                4194 non-null   object        
 2   Number_of_Households               4194 non-null   int64         
 3   Housing completions                4194 non-null   float64       
 4   Housing starts                     4194 non-null   float64       
 5   Housing under construction         4194 non-null   float64       
 6   House only NHPI                    4194 non-null   float64       
 7   Land only NHPI                     4194 non-null   float64       
 8   Total (house and land) NHPI        4194 non-null   float64       
 9   Year                               4194 non-null   int32         
 10

Unnamed: 0,REF_DATE,GEO,Number_of_Households,Housing completions,Housing starts,Housing under construction,House only NHPI,Land only NHPI,Total (house and land) NHPI,Year,...,Housing under construction_lag_6,House only NHPI_lag_1,House only NHPI_lag_3,House only NHPI_lag_6,Land only NHPI_lag_1,Land only NHPI_lag_3,Land only NHPI_lag_6,Total (house and land) NHPI_lag_1,Total (house and land) NHPI_lag_3,Total (house and land) NHPI_lag_6
1071,1986-01-01,Alberta,859000,662.0,603.0,1125.0,28.0,22.5,26.4,1986,...,,,,,,,,,,
1072,1986-01-01,British Columbia,1132000,1304.333333,1515.666667,3114.666667,79.5,49.2,66.3,1986,...,,28.0,,,22.5,,,26.4,,
1073,1986-01-01,Manitoba,392000,426.333333,536.0,1382.0,37.7,26.9,34.7,1986,...,,79.5,,,49.2,,,66.3,,
1074,1986-01-01,New Brunswick,237000,329.666667,105.666667,488.333333,75.1,56.1,70.5,1986,...,,37.7,28.0,,26.9,22.5,,34.7,26.4,
1075,1986-01-01,Newfoundland and Labrador,161000,181.0,74.333333,1009.333333,39.4,35.3,38.7,1986,...,,75.1,79.5,,56.1,49.2,,70.5,66.3,


Creating Rolling Mean Features (3-month moving average) which smooths out fluctuations to highlight long-term trends.

In [9]:
for col in lag_columns:
    df_trimmed[f'{col}_rolling_mean_3'] = df_trimmed[col].rolling(window=3).mean()

# Drop rows with NaN values introduced by shifting (since lag features create missing values at the start)
df_trimmed.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trimmed[f'{col}_rolling_mean_3'] = df_trimmed[col].rolling(window=3).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trimmed[f'{col}_rolling_mean_3'] = df_trimmed[col].rolling(window=3).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trimmed[f'{col}_rolling_mean_3'] = df_trimme

In [10]:
print("Dataset Information: After Final Cleaning")
df_trimmed.info()
df_trimmed.head()


Dataset Information: After Final Cleaning
<class 'pandas.core.frame.DataFrame'>
Index: 4188 entries, 1077 to 5264
Data columns (total 41 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   REF_DATE                                    4188 non-null   datetime64[ns]
 1   GEO                                         4188 non-null   object        
 2   Number_of_Households                        4188 non-null   int64         
 3   Housing completions                         4188 non-null   float64       
 4   Housing starts                              4188 non-null   float64       
 5   Housing under construction                  4188 non-null   float64       
 6   House only NHPI                             4188 non-null   float64       
 7   Land only NHPI                              4188 non-null   float64       
 8   Total (house and land) NHPI                 4188

Unnamed: 0,REF_DATE,GEO,Number_of_Households,Housing completions,Housing starts,Housing under construction,House only NHPI,Land only NHPI,Total (house and land) NHPI,Year,...,Total (house and land) NHPI_lag_1,Total (house and land) NHPI_lag_3,Total (house and land) NHPI_lag_6,Number_of_Households_rolling_mean_3,Housing completions_rolling_mean_3,Housing starts_rolling_mean_3,Housing under construction_rolling_mean_3,House only NHPI_rolling_mean_3,Land only NHPI_rolling_mean_3,Total (house and land) NHPI_rolling_mean_3
1077,1986-01-01,Prince Edward Island,41000,82.0,40.666667,95.0,36.65,38.6,37.6,1986,...,36.7,70.5,26.4,1177000.0,1522.111111,1183.333333,4178.222222,36.35,39.4,37.666667
1078,1986-01-01,Quebec,2421000,2503.666667,2803.0,7382.0,40.3,32.9,38.5,1986,...,37.6,38.7,66.3,1930333.0,2296.333333,2092.888889,6302.444444,36.65,38.6,37.6
1079,1986-01-01,Canada,9238000,10938.333333,9757.333333,27632.666667,39.4,38.8,39.8,1986,...,38.5,36.7,34.7,3900000.0,4508.0,4200.333333,11703.222222,38.783333,36.766667,38.633333
1080,1986-02-01,Alberta,859000,662.0,603.0,1125.0,28.5,22.5,26.8,1986,...,39.8,37.6,70.5,4172667.0,4701.333333,4387.777778,12046.555556,36.066667,31.4,35.033333
1081,1986-02-01,British Columbia,1132000,1304.333333,1515.666667,3114.666667,79.6,49.4,66.4,1986,...,26.8,38.5,38.7,3743000.0,4301.555556,3958.666667,10624.111111,49.166667,36.9,44.333333


Plotting time series trends for key variables after missing value handling


In [11]:
# plt.figure(figsize=(12, 6))
# for col in lag_columns:
#     plt.plot(df_trimmed['REF_DATE'], df_trimmed[col], label=col)
# plt.xlabel("Date")
# plt.ylabel("Values")
# plt.title("Time Series Trends After Handling Missing Values")
# plt.legend()
# plt.show()

In [12]:

# fig, axes = plt.subplots(len(lag_columns), 1, figsize=(12, 3 * len(lag_columns)), sharex=True)
# fig.suptitle("Time Series Trends After Handling Missing Values", fontsize=16)

# for i, col in enumerate(lag_columns):
#     axes[i].plot(df_trimmed['REF_DATE'], df_trimmed[col], label=col, color='tab:blue')
#     axes[i].set_ylabel(col)
#     axes[i].legend()
#     axes[i].grid()

# plt.xlabel("Date")
# plt.show()


In [13]:
df_trimmed.tail()

Unnamed: 0,REF_DATE,GEO,Number_of_Households,Housing completions,Housing starts,Housing under construction,House only NHPI,Land only NHPI,Total (house and land) NHPI,Year,...,Total (house and land) NHPI_lag_1,Total (house and land) NHPI_lag_3,Total (house and land) NHPI_lag_6,Number_of_Households_rolling_mean_3,Housing completions_rolling_mean_3,Housing starts_rolling_mean_3,Housing under construction_rolling_mean_3,House only NHPI_rolling_mean_3,Land only NHPI_rolling_mean_3,Total (house and land) NHPI_rolling_mean_3
5260,2024-10-01,Newfoundland and Labrador,220000,0.0,157.666667,0.0,108.8,105.7,107.1,2024,...,120.0,126.6,143.3,373333.3,0.0,452.555556,0.0,128.133333,106.033333,122.433333
5261,2024-10-01,Ontario,5984000,0.0,5429.0,0.0,125.9,118.6,123.6,2024,...,107.1,140.2,125.0,2180333.0,0.0,2045.888889,0.0,118.733333,109.066667,116.9
5262,2024-10-01,Prince Edward Island,68000,0.0,119.0,0.0,127.0,104.5,122.6,2024,...,123.6,120.0,122.2,2090667.0,0.0,1901.888889,0.0,120.566667,109.6,117.766667
5263,2024-10-01,Quebec,3908000,0.0,5271.333333,0.0,150.2,124.6,143.3,2024,...,122.6,107.1,126.6,3320000.0,0.0,3606.444444,0.0,134.366667,115.9,129.833333
5264,2024-10-01,Canada,16233000,0.0,21462.333333,0.0,127.3,117.2,124.5,2024,...,143.3,123.6,140.2,6736333.0,0.0,8950.888889,0.0,134.833333,115.433333,130.133333


 Saving the cleaned dataset to avoid reprocessing later

In [14]:
df_trimmed.to_csv("Trimmed_Time_Series_Data.csv", index=False)