# Classification with Hidden Markov Models
Running this notebook will:

- load data generated by `01_generate_data.ipynb` from file chosen in setup part
- split data into train and test subsets with chosen ratio
- fit Hidden Markov Models using both AIC and BIC criteria for selection of number of hidden states
- show comparison of number of hidden states in true and fitted models
- run classification on test subset and show results for both AIC and BIC versions

## Setup

In [1]:
import pandas as pd
import numpy as np
from hmmlearn import hmm
import pickle

DATA_PICKLE_FILE = "./data/genrated_data_1657983167.pkl"

In [2]:
def fit_with_known_no_hidden_states(samples: list[np.ndarray], no_hidden_states: int, trials: int) -> hmm.GaussianHMM:
    """Fit GaussianHMM when number of hidden states is known.
    EM Algorithm will stuck in local optima, so it is recommended to try to fit model multiple times 
    and select the one with highest score. Number of iterations is set via `trials` argument.
    """
    lengths = [len(sample) for sample in samples]
    X = np.concatenate(samples)
    best_model = None
    best_score = -np.infty
    for t in range(trials):
        remodel = hmm.GaussianHMM(n_components = no_hidden_states).fit(X, lengths)
        if not remodel.monitor_.converged:
            print(f"Model in trial {t} didn't converge.") 
        score = remodel.score(X)
        if score > best_score:
            best_model = remodel
            best_score = score
    return best_model

def AIC(samples: list[np.ndarray], model: hmm.BaseHMM) -> float:
    """Akaike Information Criterion implemented for Hidden Markov Model."""
    lengths = [len(sample) for sample in samples]
    X = np.concatenate(samples)
    loglik = model.score(X, lengths)
    k = model.n_components
    return 2*k - 2*loglik

def BIC(samples: list[np.ndarray], model: hmm.BaseHMM) -> float:
    """Bayesian Information Criterion implemented for Hidden Markov Model."""
    lengths = [len(sample) for sample in samples]
    X = np.concatenate(samples)
    loglik = model.score(X, lengths)
    k = model.n_components
    n = sum(lengths) # TODO jak to dziala jak to rozbijam na czesci?
    return k*np.log(n) - 2*loglik

def fit_and_compute_criteria(samples: list[np.ndarray], search_space: list[int], trials: int) -> dict:
    """Return dict containing models and corresponding information criteria values."""
    models = [fit_with_known_no_hidden_states(samples, nhs, trials) for nhs in search_space]
    AICs = [AIC(samples, model) for model in models]
    BICs = [BIC(samples, model) for model in models]
    return {"models": models, "AIC": AICs, "BIC": BICs}

def fit_best_model(samples: list[np.ndarray], search_space: list[int], trials: int = 10, criterion: str = "AIC") -> hmm.BaseHMM:
    """Fit hmm.GaussianHMM models for all number of hidden states in `search_space` and return the one selected
    by given criterion."""
    results = fit_and_compute_criteria(samples, search_space, trials)
    best_model_id = np.argmin(results[criterion]) # Information criteria are minimized, not maximized.
    return results["models"][best_model_id]

def classify_sample(X: np.ndarray, fitted_models: list[hmm.BaseHMM]) -> int:
    """Return id of fitted model with best log-likelihood given (one) sample X."""
    scores = [model.score(X) for model in fitted_models]
    return np.argmax(scores)

## Load data and prepare train and test splits

In [3]:
with open(DATA_PICKLE_FILE, "rb") as f:
    data = pickle.load(f)

data.keys()


dict_keys(['models_lst', 'labels_df', 'all_X_samples', 'all_Z_samples'])

In [None]:
all_X_samples = data["all_X_samples"]
labels_df = data["labels_df"]

In [17]:
TRAIN_SUBSET_SIZE = 0.8 # stratified sampling
train_samples_ids = []
test_samples_ids = []
print("Number of samples in train and test splits grouped by true labels:")
print("label |train |test")
for label, sub_df in labels_df.groupby("true_label"):
    n = sub_df.shape[0]
    train_size = int(n*TRAIN_SUBSET_SIZE)
    test_size = n - train_size
    if test_size < 1:
        raise Exception(f"For {TRAIN_SUBSET_SIZE = } and {n = } test subset in group {label} is empty.")
    print(f"{label}     |{train_size}     |{test_size}")
    train_ids = sub_df.sample(8).index.values
    train_samples_ids.extend(train_ids)
    test_samples_ids.extend(sub_df.drop(train_ids).index.values)


Number of samples in train and test splits grouped by true labels:
label |train |test
0     |8     |2
1     |8     |2
2     |8     |2
3     |8     |2
4     |8     |2
5     |8     |2
6     |8     |2
7     |8     |2
8     |8     |2


## Fit the models: use both AIC and BIC

In [10]:
SEARCH_SPACE = list(range(1,6)) # numbers of hidden states to try

fitted_models_AIC = []
fitted_models_BIC = []
for label, sub_df in labels_df.loc[train_samples_ids].groupby("true_label"):
    int_ids = sub_df.index.values.astype('int')
    X = [all_X_samples[id] for id in int_ids]
    fitted_models_AIC.append(fit_best_model(X, SEARCH_SPACE, criterion="AIC"))
    fitted_models_BIC.append(fit_best_model(X, SEARCH_SPACE, criterion="BIC"))
    

### Compare numbers of hidden states fitted and true models

In [11]:
true_models = data["models_lst"]
print("Number of hidden states in model used to generate samples vs fitted model:")
print("label |true k |fitted k (AIC) |fitted k (BIC)")
for i in range(len(fitted_models_AIC)):
    k_true = true_models[i].n_components
    k_fit_AIC = fitted_models_AIC[i].n_components
    k_fit_BIC = fitted_models_BIC[i].n_components
    print(f"{i}     |{k_true}      |{k_fit_AIC}              |{k_fit_BIC}")

Number of hidden states in model used to generate samples vs fitted model:
label |true k |fitted k (AIC) |fitted k (BIC)
0     |2      |2              |2
1     |2      |3              |3
2     |2      |2              |2
3     |1      |4              |2
4     |1      |2              |1
5     |1      |3              |1
6     |5      |5              |5
7     |3      |3              |3
8     |3      |3              |3


BIC seems better than AIC (no overfitting, too many hidden states in HMM is overfitting). 

## Classify test samples

In [None]:
test_df = labels_df.loc[test_samples_ids]
int_ids = test_df.index.values.astype('int')
test_samples = [all_X_samples[id] for id in int_ids]
predictions_AIC = [classify_sample(X, fitted_models_AIC) for X in test_samples]
predictions_BIC = [classify_sample(X, fitted_models_BIC) for X in test_samples]

predictions_df = pd.DataFrame({
    "predicted_label_AIC": predictions_AIC,
    "predicted_label_BIC": predictions_BIC
}, index= test_df.index)

In [20]:
classification_results = test_df.join(predictions_df)
display(classification_results)

Unnamed: 0,true_label,predicted_label_AIC,predicted_label_BIC
3,0,0,0
5,0,0,0
17,1,1,1
19,1,0,0
25,2,2,2
29,2,2,2
36,3,3,3
38,3,3,3
41,4,4,4
47,4,4,4
