This notebook builds classes that perform principal component analysis (PCA) and fit a hidden Markov model (HMM) on macroeconomic indicators — possibly as two separate classes, one for PCA and one for the HMM.

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns

In [13]:
class MacroPCA:
    """Standardise a table of macro indicators and run PCA on it.

    Intended call order: ``standardise()`` first, then ``run_pca()``. After
    that, ``get_explained_variance()`` and ``get_loadings()`` expose the fit.
    """

    def __init__(self, data, n_components=None):
        """
        Initialise the PCA wrapper.
        param: data - dataframe of macroeconomic indicators
        param: n_components - number of principal components to keep (default: all)
        """
        self.data = data
        self.n_components = n_components
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=self.n_components)
        self.fitted = False

    def standardise(self):
        """
        Fit the scaler on the stored data and keep the scaled values.
        return: the scaled data as produced by the scaler's fit_transform
        """
        self.data_scaled = self.scaler.fit_transform(self.data)
        return self.data_scaled

    def run_pca(self):
        """
        Fit PCA on the standardised data and project the data onto the components.
        return: dataframe of component scores, one column per component
                (PC1, PC2, ...), indexed like the input data
        raises: ValueError if standardise() has not been called first
        """
        if not hasattr(self, 'data_scaled'):
            raise ValueError("You must call standardise() before run_pca().")
        self.components = self.pca.fit_transform(self.data_scaled)
        self.explained_variance = self.pca.explained_variance_ratio_
        pc_labels = [f'PC{i+1}' for i in range(self.pca.n_components_)]
        # components_ is transposed so that rows are the input columns and
        # columns are the principal components.
        self.loadings = pd.DataFrame(self.pca.components_.T,
                                     index=self.data.columns,
                                     columns=pc_labels)
        self.fitted = True
        # Carry the input's row index over so each score row stays tied to the
        # observation (e.g. date) it was computed from.
        return pd.DataFrame(self.components,
                            index=self.data.index,
                            columns=pc_labels)

    def _require_fitted(self):
        """Raise RuntimeError unless run_pca() has completed."""
        if not self.fitted:
            raise RuntimeError("PCA has not been run yet.")

    def get_explained_variance(self):
        """
        return: explained-variance ratio of each retained component
        raises: RuntimeError if run_pca() has not been called
        """
        self._require_fitted()
        return self.explained_variance

    def get_loadings(self):
        """
        return: dataframe of loadings (rows: input columns, columns: PC1, PC2, ...)
        raises: RuntimeError if run_pca() has not been called
        """
        self._require_fitted()
        return self.loadings




In [16]:
# Get the macro data.
# 'Date' is parsed while reading so the index holds timestamps; the cutoff
# below is then a chronological comparison instead of a lexicographic
# comparison of raw strings.
macro_raw = pd.read_csv("monthly_macro.csv", index_col='Date', parse_dates=['Date'])

# Keep observations dated 1990-01-01 or later.
macro_df = macro_raw[macro_raw.index >= '1990-01-01']


In [18]:
# Fit a four-component PCA: the indicators are standardised first, then
# run_pca() fits the decomposition and returns the component scores.
model = MacroPCA(n_components=4, data=macro_df)
model.standardise()
pc_df = model.run_pca()

# Optional diagnostics: explained-variance ratios, followed by the loadings.
explained_variance_ratios = model.get_explained_variance()
component_loadings = model.get_loadings()
print(explained_variance_ratios)
print(component_loadings)


[0.44346413 0.18114306 0.13684418 0.09245506]
                       PC1       PC2       PC3       PC4
VIX              -0.177389  0.535876 -0.244452  0.448400
2Y                0.458181  0.280575  0.140574  0.008471
10Y               0.370743  0.344026  0.430815 -0.017367
YieldCurve       -0.351235  0.033261  0.565788 -0.057706
CreditSpread     -0.400851  0.304392 -0.115394  0.204326
FedFundsRate      0.459148  0.270138  0.026473  0.026666
Inflation_YoY     0.143042 -0.339343  0.207414  0.867295
UnemploymentRate -0.307593  0.179755  0.571097 -0.019255
GDP_YoY           0.105900 -0.448954  0.177692  0.014249
