In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

SALARIES_AND_WAGES_FILENAME = "salaries_and_wages_df.xlsx"
CPI_FILENAME = "consumers-price-index-june-2024-quarter-seasonally-adjusted.csv"

In [2]:
salaries_and_wages = pd.read_excel(f'./data/{SALARIES_AND_WAGES_FILENAME}')
cpi = pd.read_csv(f'./data/{CPI_FILENAME}')

In [3]:
# Of Interest is understanding the relationship between salaries and wages data and the consumer price index data
# Can we build a model to predict the CPI based on the salaries and wages data?

In [4]:
# Considerations:

# The model will be a regression model, as we are using numerical data to predict a numerical target variable

# The CPI dataset is seasonally adjusted-
# so we will use the seasonally adjusted salaries and wages data

# Because it is of interest to me to understand the relationship between salaries and wages and the CPI,
# it is prudent to use a model where the results are interpretable. i.e. Linear Regression

# As we are building a linear regression model, there isn't the need to split the data into train/test sets.

In [5]:
# There are only 32 unique periods in the salaries and wages dataset
# Failing at the first hurdle a little- not a lot of data points to train a model.
# Let's go for it anyway just for fun.
salaries_and_wages['Period'].nunique() 

32

In [6]:
# Take the logarithm of the salaries and wages data and store in X
X = pd.DataFrame(np.log(salaries_and_wages.groupby('Period')['Data_value'].agg('sum')))
y = cpi[(cpi['Period'] >= 2016.06) & (cpi['Series_title_2'] == 'Non-Tradable All Groups')].iloc[:-1, :]['Period'].to_numpy()

In [7]:
# Model

reg = LinearRegression().fit(X, y)
reg.score(X,y).round(4) # R^2 = 0.9044


0.9044

In [8]:
# Interpretation - 

# 90% of the variation in the Seasonally adjusted CPI is explained by the variation in our explanatory variable, in this case the 
# salaries and wages data.