<a href="https://colab.research.google.com/github/nsambel1980/causal_discovery/blob/main/HHMM_market.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas

In [2]:

import numpy as np
from scipy.stats import norm
from typing import List, Dict, Tuple

class StockHHMM:
    def __init__(self, n_market_states: int = 2, n_industry_states: int = 2,
                 n_stock_states: int = 2):
        self.n_market_states = n_market_states
        self.n_industry_states = n_industry_states
        self.n_stock_states = n_stock_states

    def initialize_parameters(self, industries: Dict[str, List[str]]):
        """
        Initialize model parameters

        Args:
            industries: Dictionary mapping industry names to lists of stock tickers
        """
        # Market-level parameters
        self.market_trans_matrix = np.random.dirichlet(
            np.ones(self.n_market_states), size=self.n_market_states)
        self.market_initial_probs = np.ones(self.n_market_states) / self.n_market_states

        # Industry-level parameters
        self.industry_params = {}
        for industry in industries.keys():
            self.industry_params[industry] = {
                'trans_matrices': np.array([
                    np.random.dirichlet(np.ones(self.n_industry_states),
                                      size=self.n_industry_states)
                    for _ in range(self.n_market_states)
                ]),
                'initial_probs': np.ones(self.n_industry_states) / self.n_industry_states
            }

        # Stock-level parameters
        self.stock_params = {}
        for industry, stocks in industries.items():
            for stock in stocks:
                self.stock_params[stock] = {
                    'trans_matrices': np.array([
                        [np.random.dirichlet(np.ones(self.n_stock_states),
                                           size=self.n_stock_states)
                         for _ in range(self.n_industry_states)]
                        for _ in range(self.n_market_states)
                    ]),
                    'initial_probs': np.ones(self.n_stock_states) / self.n_stock_states,
                    'emission_params': {
                        'mean': np.random.normal(0, 0.1, size=self.n_stock_states),
                        'std': np.abs(np.random.normal(0.01, 0.005,
                                                     size=self.n_stock_states))
                    }
                }

    def forward_algorithm(self, observations: Dict[str, np.ndarray],
                         industries: Dict[str, List[str]]) -> Tuple[float, Dict]:
        """
        Implement forward algorithm for the HHMM

        Args:
            observations: Dictionary mapping stock tickers to their volatility observations
            industries: Dictionary mapping industry names to lists of stock tickers

        Returns:
            log_likelihood: Log likelihood of the observations
            alpha: Forward variables
        """
        T = len(list(observations.values())[0])

        # Initialize forward variables
        alpha = {
            'market': np.zeros((T, self.n_market_states)),
            'industry': {ind: np.zeros((T, self.n_market_states, self.n_industry_states))
                        for ind in industries.keys()},
            'stock': {stock: np.zeros((T, self.n_market_states,
                                     self.n_industry_states, self.n_stock_states))
                     for ind in industries.values() for stock in ind}
        }

        # Initialize first time step
        for m in range(self.n_market_states):
            market_prob = self.market_initial_probs[m]

            for ind, stocks in industries.items():
                ind_probs = self.industry_params[ind]['initial_probs']

                for i in range(self.n_industry_states):
                    for stock in stocks:
                        stock_probs = self.stock_params[stock]['initial_probs']
                        emission_probs = self._compute_emission_probs(
                            observations[stock][0],
                            self.stock_params[stock]['emission_params'])

                        for s in range(self.n_stock_states):
                            alpha['stock'][stock][0, m, i, s] = (
                                market_prob * ind_probs[i] * stock_probs[s] *
                                emission_probs[s]
                            )

                    alpha['industry'][ind][0, m, i] = np.sum(
                        [alpha['stock'][stock][0, m, i, :].sum()
                         for stock in stocks]
                    )

            alpha['market'][0, m] = np.sum(
                [alpha['industry'][ind][0, m, :].sum() for ind in industries.keys()]
            )

        # Forward pass for remaining time steps
        for t in range(1, T):
            for m in range(self.n_market_states):
                for prev_m in range(self.n_market_states):
                    market_trans = self.market_trans_matrix[prev_m, m]

                    for ind, stocks in industries.items():
                        ind_trans = self.industry_params[ind]['trans_matrices'][m]

                        for i in range(self.n_industry_states):
                            for prev_i in range(self.n_industry_states):
                                for stock in stocks:
                                    stock_trans = (
                                        self.stock_params[stock]['trans_matrices'][m][i]
                                    )
                                    emission_probs = self._compute_emission_probs(
                                        observations[stock][t],
                                        self.stock_params[stock]['emission_params']
                                    )

                                    for s in range(self.n_stock_states):
                                        for prev_s in range(self.n_stock_states):
                                            alpha['stock'][stock][t, m, i, s] += (
                                                alpha['stock'][stock][t-1, prev_m,
                                                                    prev_i, prev_s] *
                                                market_trans * ind_trans[prev_i, i] *
                                                stock_trans[prev_s, s] *
                                                emission_probs[s]
                                            )

                            alpha['industry'][ind][t, m, i] = np.sum(
                                [alpha['stock'][stock][t, m, i, :].sum()
                                 for stock in stocks]
                            )

                alpha['market'][t, m] = np.sum(
                    [alpha['industry'][ind][t, m, :].sum()
                     for ind in industries.keys()]
                )

        log_likelihood = np.log(alpha['market'][-1].sum())
        return log_likelihood, alpha

    def _compute_emission_probs(self, observation: float,
                              emission_params: Dict) -> np.ndarray:
        """
        Compute emission probabilities for a given observation
        """
        return np.array([
            norm.pdf(observation, loc=emission_params['mean'][s],
                    scale=emission_params['std'][s])
            for s in range(len(emission_params['mean']))
        ])

    def fit(self, observations: Dict[str, np.ndarray],
            industries: Dict[str, List[str]],
            max_iter: int = 100,
            tol: float = 1e-6):
        """
        Fit the HHMM using the Baum-Welch algorithm
        """
        self.initialize_parameters(industries)

        prev_ll = float('-inf')
        for iteration in range(max_iter):
            # E-step
            log_likelihood, alpha = self.forward_algorithm(observations, industries)

            if abs(log_likelihood - prev_ll) < tol:
                break

            # M-step would go here - update transition matrices and emission parameters
            # using the forward-backward algorithm results

            prev_ll = log_likelihood

        return self

    def predict_states(self, observations: Dict[str, np.ndarray],
                      industries: Dict[str, List[str]]) -> Dict:
        """
        Predict the most likely state sequence using the Viterbi algorithm
        """
        # Implementation of Viterbi algorithm would go here
        pass

In [None]:

import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple

def download_stock_data(tickers: Dict[str, List[str]],
                       start_date: str = '2020-01-01',
                       end_date: str = '2024-01-01') -> Tuple[Dict[str, pd.DataFrame], Dict[str, np.ndarray]]:
    """
    Download stock data and calculate daily volatility
    """
    # Download data
    all_data = {}
    volatilities = {}

    for industry, stocks in tickers.items():
        for ticker in stocks:
            # Download daily data
            stock = yf.Ticker(ticker)
            df = stock.history(start=start_date, end=end_date)

            # Calculate daily log returns
            df['Returns'] = np.log(df['Close']).diff()

            # Calculate rolling 21-day volatility (annualized)
            df['Volatility'] = df['Returns'].rolling(window=21).std() * np.sqrt(252)

            # Store data
            all_data[ticker] = df
            volatilities[ticker] = df['Volatility'].dropna().values

    return all_data, volatilities

# Define industry-stock structure
tickers = {
    'Technology': ['AAPL', 'MSFT', 'NVDA'],
    'Finance': ['JPM', 'GS', 'MS'],
    'Consumer': ['AMZN', 'WMT', 'PG', 'KO']
}

# Download data and calculate volatilities
print("Downloading stock data...")
all_data, volatilities = download_stock_data(tickers)

# Initialize and fit the HHMM model
print("\nFitting HHMM model...")
model = StockHHMM(n_market_states=2, n_industry_states=2, n_stock_states=2)
fitted_model = model.fit(volatilities, tickers)

# Analyze the results
print("\nAnalyzing results...")

# Plot volatility regimes for each stock
plt.figure(figsize=(15, 10))
for i, (ticker, vol_data) in enumerate(volatilities.items(), 1):
    plt.subplot(4, 3, i)
    plt.plot(all_data[ticker].index[-len(vol_data):], vol_data, label='Volatility')
    plt.title(f'{ticker} Volatility')
    plt.xticks(rotation=45)
    plt.grid(True)
plt.tight_layout()
plt.show()

# Print summary statistics
print("\nSummary Statistics:")
for industry, stocks in tickers.items():
    print(f"\n{industry} Sector:")
    for stock in stocks:
        vol = volatilities[stock]
        print(f"{stock}:")
        print(f"  Mean Volatility: {vol.mean():.3f}")
        print(f"  Min Volatility: {vol.min():.3f}")
        print(f"  Max Volatility: {vol.max():.3f}")
        print(f"  Volatility Std: {vol.std():.3f}")

# Calculate and print correlations between stocks
print("\nVolatility Correlations:")
correlation_matrix = pd.DataFrame(
    {stock1: {stock2: np.corrcoef(volatilities[stock1], volatilities[stock2])[0,1]
              for stock2 in volatilities.keys()}
     for stock1 in volatilities.keys()}
)
print(correlation_matrix.round(3))

Downloading stock data...

Fitting HHMM model...


  log_likelihood = np.log(alpha['market'][-1].sum())
  if abs(log_likelihood - prev_ll) < tol:
