In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.tseries.offsets import MonthEnd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, f1_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

In [2]:
tsx = yf.download("^GSPTSE", start="2010-01-01", end="2023-01-01")
tsx.sort_values(by=['Date'], inplace=True, ascending=False)
tsx.reset_index(inplace=True)

START_DATE = "2010/01/01"
END_DATE = "2023/01/01"

file_path_cpi_inflation = "src/CPI_and_inflation.csv"
cpi_inflation = pd.read_csv(file_path_cpi_inflation)

file_path_gdp = "src/GDP_growthRate.csv"
gdp = pd.read_csv(file_path_gdp)

file_path_unemployment = "src/Unemployment rates.csv"
unemployment = pd.read_csv(file_path_unemployment)

[*********************100%***********************]  1 of 1 completed
  gdp = pd.read_csv(file_path_gdp)


In [3]:
tsx['Date'] = pd.to_datetime(tsx['Date']).dt.strftime('%Y/%m/%d')

cpi_inflation['Date'] = pd.to_datetime(cpi_inflation['REF_DATE'], format='%Y-%m').dt.strftime('%Y/%m/01')

gdp['Date'] = pd.to_datetime(gdp['REF_DATE'], format='%Y-%m').dt.strftime('%Y/%m/01')

unemployment['Date'] = pd.to_datetime(unemployment['REF_DATE'].astype(str) + '-01-01', format='%Y-%m-%d').dt.strftime('%Y/%m/%d')

In [4]:
cpi_filtered_data = cpi_inflation[(cpi_inflation['Date'] >= START_DATE) & (cpi_inflation['Date'] <= END_DATE)]
gdp_filtered_data = gdp[(gdp['Date'] >= START_DATE) & (gdp['Date'] <= END_DATE)]
unemployment_filtered_data = unemployment[(unemployment['Date'] >= START_DATE) & (unemployment['Date'] <= END_DATE)]

In [5]:
print(f"{cpi_filtered_data.columns} '\n', {gdp_filtered_data.columns}'\n', {unemployment_filtered_data.columns}'\n'")

Index(['REF_DATE', 'GEO', 'DGUID', 'Alternative measures', 'UOM', 'UOM_ID',
       'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS',
       'SYMBOL', 'TERMINATED', 'DECIMALS', 'Date'],
      dtype='object') '
', Index(['REF_DATE', 'GEO', 'DGUID', 'Seasonal adjustment', 'Prices',
       'North American Industry Classification System (NAICS)', 'UOM',
       'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE',
       'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS', 'Date'],
      dtype='object')'
', Index(['REF_DATE', 'GEO', 'DGUID',
       'Characteristics of the population aged 15 and over',
       'Educational attainment', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID',
       'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED',
       'DECIMALS', 'Date'],
      dtype='object')'
'


In [6]:
cpi_filtered_data = cpi_inflation[(cpi_inflation['Date'] >= START_DATE) & (cpi_inflation['Date'] <= END_DATE)]
gdp_filtered_data = gdp[(gdp['Date'] >= START_DATE) & (gdp['Date'] <= END_DATE)]
unemployment_filtered_data = unemployment[(unemployment['Date'] >= START_DATE) & (unemployment['Date'] <= END_DATE)]

In [7]:
print(cpi_filtered_data['Alternative measures'].unique())

['Measure of core inflation based on a factor model, CPI-common (year-over-year percent change)'
 'Measure of core inflation based on a weighted median approach, CPI-median (year-over-year percent change)'
 'Measure of core inflation based on a trimmed mean approach, CPI-trim (year-over-year percent change)'
 'Measure of core inflation based on a weighted median approach, CPI-median (index, 198901=100)'
 'Measure of core inflation based on a trimmed mean approach, CPI-trim (index, 198901=100)'
 'Consumer Price Index (CPI), all-items excluding eight of the most volatile components as defined by the Bank of Canada and excluding the effect of changes in indirect taxes'
 'Consumer Price Index (CPI), all-items excluding eight of the most volatile components as defined by the Bank of Canada'
 'Consumer Price Index (CPI), all-items excluding the effect of indirect taxes'
 'Consumer Price Index (CPI), all-items excluding eight of the most volatile components as defined by the Bank of Canada an

In [8]:
cpi_filtered_data = cpi_filtered_data[['Date', 'Alternative measures', 'VALUE']]
condition = (
    cpi_filtered_data['Alternative measures'] ==
    "Consumer Price Index (CPI), all-items excluding eight of the most volatile components as defined by the Bank of Canada and excluding the effect of changes in indirect taxes"
)
cpi_filtered_data = cpi_filtered_data[condition]

cpi_filtered_data.pop('Alternative measures')
cpi_filtered_data.rename(columns={'VALUE': 'CPI value'}, inplace=True)

In [9]:
print(gdp_filtered_data['North American Industry Classification System (NAICS)'].unique())

['All industries [T001]' 'Goods-producing industries [T002]'
 'Service-producing industries [T003]' 'Business sector industries [T004]'
 'Business sector, goods [T005]' 'Business sector, services [T006]'
 'Non-business sector industries [T007]'
 'Non-business sector, goods [T008]'
 'Non-business sector, services [T009]' 'Industrial production [T010]'
 'Non-durable manufacturing industries [T011]'
 'Durable manufacturing industries [T012]'
 'Information and communication technology sector [T013]'
 'Information and communication technology, manufacturing [T014]'
 'Information and communication technology, services [T015]'
 'Energy sector [T016]' 'Industrial production (1950 definition) [T017]'
 'Public Sector [T018]' 'Content and media sector [T019]'
 'All industries (except cannabis sector) [T020]' 'Cannabis sector [T021]'
 'Cannabis sector (licensed) [T022]' 'Cannabis sector (unlicensed) [T023]'
 'All industries (except unlicensed cannabis sector) [T024]'
 'Agriculture, forestry, fishi

In [10]:
gdp_filtered_data = gdp_filtered_data[['Date', 'North American Industry Classification System (NAICS)', 'VALUE']]
condition = (
    gdp_filtered_data['North American Industry Classification System (NAICS)'] == "All industries [T001]"
)
gdp_filtered_data = gdp_filtered_data[condition]
gdp_filtered_data.rename(columns={'VALUE': 'GDP growth rate value'}, inplace=True)
gdp_filtered_data.pop('North American Industry Classification System (NAICS)')

134316    All industries [T001]
134603    All industries [T001]
134890    All industries [T001]
135177    All industries [T001]
135464    All industries [T001]
                  ...          
268058    All industries [T001]
268345    All industries [T001]
268632    All industries [T001]
268919    All industries [T001]
269206    All industries [T001]
Name: North American Industry Classification System (NAICS), Length: 471, dtype: object

In [11]:
print(unemployment_filtered_data['Educational attainment'].unique())
print(unemployment_filtered_data['Characteristics of the population aged 15 and over'].unique())

['All levels of education' 'Less than high school' 'High school'
 'College or trade' 'University']
['Population, Canada' 'Population, off reserve Indigenous']


In [12]:
unemployment_filtered_data = unemployment_filtered_data[['Date',
                                                         'Characteristics of the population aged 15 and over',
                                                         'Educational attainment',
                                                         'VALUE']]

condition = (
    (unemployment_filtered_data['Characteristics of the population aged 15 and over'] == "Population, Canada") &
    (unemployment_filtered_data['Educational attainment'] == "All levels of education")
)

unemployment_filtered_data = unemployment_filtered_data[condition]

unemployment_filtered_data.rename(columns={'VALUE': 'Unemployment rate value'}, inplace=True)

unemployment_filtered_data.pop('Characteristics of the population aged 15 and over')
unemployment_filtered_data.pop('Educational attainment')

200    All levels of education
210    All levels of education
220    All levels of education
230    All levels of education
240    All levels of education
250    All levels of education
260    All levels of education
270    All levels of education
280    All levels of education
290    All levels of education
300    All levels of education
310    All levels of education
320    All levels of education
330    All levels of education
Name: Educational attainment, dtype: object

In [13]:
# Merge datasets
# Expand CPI data to daily, assuming the CPI value is valid for the entire month
expanded_cpi = []
for idx, row in cpi_filtered_data.iterrows():
    start_date = pd.to_datetime(row['Date'])
    end_date = start_date + MonthEnd(0)  # End of the month
    daily_dates = pd.date_range(start=start_date, end=end_date, freq='D')
    for date in daily_dates:
        expanded_cpi.append({'Date': date.strftime('%Y/%m/%d'), 'CPI value': row['CPI value']})

cpi_expanded = pd.DataFrame(expanded_cpi)

# Expand GDP data to daily, assuming the GDP value is valid for the entire month
expanded_gdp = []
for idx, row in gdp_filtered_data.iterrows():
    start_date = pd.to_datetime(row['Date'])
    end_date = start_date + MonthEnd(0)  # End of the month
    daily_dates = pd.date_range(start=start_date, end=end_date, freq='D')
    for date in daily_dates:
        expanded_gdp.append({'Date': date.strftime('%Y/%m/%d'), 'GDP growth rate': row['GDP growth rate value']})

gdp_expanded = pd.DataFrame(expanded_gdp)

# Expand unemployment data to daily, assuming the same unemployment rate for the entire year
unemployment_expanded = []
for idx, row in unemployment_filtered_data.iterrows():
    start_date = pd.to_datetime(row['Date'])
    end_date = start_date + pd.DateOffset(years=1) - pd.DateOffset(days=1)  # End of the year
    daily_dates = pd.date_range(start=start_date, end=end_date, freq='D')
    for date in daily_dates:
        unemployment_expanded.append({'Date': date.strftime('%Y/%m/%d'), 'Unemployment rate': row['Unemployment rate value']})

unemployment_expanded = pd.DataFrame(unemployment_expanded)

In [14]:
tsx.columns = ['_'.join(filter(None, col)) if isinstance(col, tuple) else col for col in tsx.columns]

merged_data = pd.merge(tsx, cpi_expanded, on='Date', how='inner')
merged_data = pd.merge(merged_data, gdp_expanded, on='Date', how='inner')
merged_data = pd.merge(merged_data, unemployment_expanded, on='Date', how='inner')
merged_data.pop('Date')

0       2022/12/30
1       2022/12/30
2       2022/12/30
3       2022/12/29
4       2022/12/29
           ...    
9778    2010/01/05
9779    2010/01/05
9780    2010/01/04
9781    2010/01/04
9782    2010/01/04
Name: Date, Length: 9783, dtype: object

In [15]:
train_data, temp_data = train_test_split(merged_data, test_size=0.3, random_state=42)
validation_data, test_data = train_test_split(temp_data, test_size=0.3333, random_state=42)
target_columns = ['CPI value', 'GDP growth rate', 'Unemployment rate'] 

In [16]:
X_train = train_data.drop(columns=target_columns)
y_train = train_data[target_columns]

X_val = validation_data.drop(columns=target_columns)
y_val = validation_data[target_columns]

X_test = test_data.drop(columns=target_columns)
y_test = test_data[target_columns]

In [17]:
lr_model = LinearRegression()
lr_multi_output = MultiOutputRegressor(lr_model)
lr_multi_output.fit(X_train, y_train)

lr_preds = lr_multi_output.predict(X_val)

lr_mae = mean_absolute_error(y_val, lr_preds)
lr_rmse = mean_squared_error(y_val, lr_preds)
lr_r2 = r2_score(y_val, lr_preds)

print("Linear Regression ::: MAE: {:.4f}, RMSE: {:.4f}, R2: {}".format(lr_mae, lr_rmse, lr_r2))

Linear Regression ::: MAE: 263031.5553, RMSE: 235931173666.2933, R2: 0.33299064229650516


In [18]:
svr_model = SVR()
svr_multi_output = MultiOutputRegressor(svr_model)
svr_multi_output.fit(X_train, y_train)

svr_preds = svr_multi_output.predict(X_val)


svr_mae = mean_absolute_error(y_val, svr_preds)
svr_rmse = mean_squared_error(y_val, svr_preds)
svr_r2 = r2_score(y_val, svr_preds)
print("SVM ::: MAE: {:.4f}, RMSE: {:.4f}, R2: {}".format(lr_mae, lr_rmse, lr_r2))

SVM ::: MAE: 263031.5553, RMSE: 235931173666.2933, R2: 0.33299064229650516


In [19]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_multi_output = MultiOutputRegressor(rf_model)
rf_multi_output.fit(X_train, y_train)

rf_preds = rf_multi_output.predict(X_val)

rf_mae = mean_absolute_error(y_val, rf_preds)
rf_rmse = mean_squared_error(y_val, rf_preds)
rf_r2 = r2_score(y_val, rf_preds)
print("Random forest ::: MAE: {:.4f}, RMSE: {:.4f}, R2: {}".format(lr_mae, lr_rmse, lr_r2))

Random forest ::: MAE: 263031.5553, RMSE: 235931173666.2933, R2: 0.33299064229650516


In [23]:
X = merged_data.drop(columns=target_columns)  # Drop the target column to get the features
y = merged_data['CPI value']  # Target variable (dependent)
lr_cv = cross_val_score(lr_model, X, y, cv=5, scoring='neg_mean_absolute_error')
svr_cv = cross_val_score(svr_model, X, y, cv=5, scoring='neg_mean_absolute_error')
rf_cv = cross_val_score(rf_model, X, y, cv=5, scoring='neg_mean_absolute_error')

print(f"Linear Regression ::: CV MAE: {-lr_cv.mean()}")
print(f"SVR ::: CV MAE: {-svr_cv.mean()}")
print(f"Random Forest ::: CV MAE: {-rf_cv.mean()}")

Linear Regression ::: CV MAE: 3.4370395109355014
SVR ::: CV MAE: 8.708376330944109
Random Forest ::: CV MAE: 4.948426725518924
