# Generate data

In [23]:
import pandas as pd
import numpy as np
from hmmlearn import hmm
import matplotlib.pyplot as plt
import pickle
from time import time

# Path of the pickle file that the loading cell opens and deserialises.
DATA_PICKLE_FILE = "./data/genrated_data_1657969230.pkl"

In [None]:
# Deserialise the previously generated dataset.
# NOTE(review): pickle executes arbitrary code on load; only open trusted files.
with open(DATA_PICKLE_FILE, mode="rb") as pickle_file:
    data = pickle.load(pickle_file)

In [6]:
def fit_with_known_no_hidden_states(X, lengths, no_hidden_states, trials, print_scores=False) -> "hmm.GaussianHMM | None":
    """Fit a GaussianHMM with a known number of hidden states.

    The model is fitted ``trials`` times from scratch and the fit with the
    highest log-likelihood on the training data is kept.

    Args:
        X: Observations forwarded to ``GaussianHMM.fit`` and ``score``
            (presumably shape ``(n_samples, n_features)`` with all
            sequences concatenated -- confirm against the caller).
        lengths: Lengths of the individual sequences in ``X``; forwarded
            to both ``fit`` and ``score``.
        no_hidden_states: Value used for ``n_components``.
        trials: Number of independent fits to attempt.
        print_scores: If true, print a line whenever a trial becomes the
            best one so far.

    Returns:
        The best-scoring fitted model, or ``None`` when ``trials`` is 0 or
        no trial produced a score greater than ``-inf``.
    """
    best_model = None
    best_score = -np.inf  # np.infty was removed in NumPy 2.0
    for t in range(trials):
        remodel = hmm.GaussianHMM(n_components=no_hidden_states).fit(X, lengths)
        if not remodel.monitor_.converged:
            print(f"Model in trial {t} didn't converge.")
        # Score with the same sequence boundaries that were used for fitting;
        # omitting `lengths` would score X as one single long sequence and
        # compare the trials on a different likelihood than the fitted one.
        score = remodel.score(X, lengths)
        if score > best_score:
            best_model = remodel
            best_score = score
            if print_scores:
                print(f"Model in trial {t} is best so far with score {score}.")
    return best_model

def _n_free_parameters(model) -> int:
    """Count the free scalar parameters of a fitted Gaussian HMM.

    Counts start probabilities, transition probabilities, means and
    covariances, reading ``n_components``, ``n_features`` and
    ``covariance_type`` from ``model``.
    """
    n_states = model.n_components
    n_features = model.n_features
    covariance_sizes = {
        "spherical": n_states,
        "diag": n_states * n_features,
        "full": n_states * n_features * (n_features + 1) // 2,
        "tied": n_features * (n_features + 1) // 2,
    }
    return (
        (n_states - 1)                  # start probabilities sum to one
        + n_states * (n_states - 1)     # each transition row sums to one
        + n_states * n_features         # means
        + covariance_sizes[model.covariance_type]
    )


def AIC(X, lengths, model) -> float:
    """Return the Akaike information criterion ``2k - 2*loglik`` of ``model``.

    ``k`` is the number of free model parameters (not the number of hidden
    states), and ``loglik`` is ``model.score(X, lengths)``.
    """
    loglik = model.score(X, lengths)
    k = _n_free_parameters(model)
    return 2*k - 2*loglik


def BIC(X, lengths, model) -> float:
    """Return the Bayesian information criterion ``k*ln(n) - 2*loglik``.

    ``k`` is the number of free model parameters and ``n`` the total number
    of observations: the sum of ``lengths``, or ``len(X)`` when ``lengths``
    is ``None``.
    """
    loglik = model.score(X, lengths)
    k = _n_free_parameters(model)
    # TODO: how does this behave when the data is split into several sequences?
    n = len(X) if lengths is None else sum(lengths)
    return k*np.log(n) - 2*loglik

def compute_AIC_BIC(X, lengths, search_space: list[int], trials):
    """Fit one model per candidate state count and score each with AIC and BIC.

    Returns a dict with the keys ``"models"``, ``"AIC"`` and ``"BIC"``; the
    three lists are aligned with ``search_space``.
    """
    fitted = []
    for n_states in search_space:
        fitted.append(fit_with_known_no_hidden_states(X, lengths, n_states, trials))

    aic_values = []
    for candidate in fitted:
        aic_values.append(AIC(X, lengths, candidate))

    bic_values = []
    for candidate in fitted:
        bic_values.append(BIC(X, lengths, candidate))

    return {"models": fitted, "AIC": aic_values, "BIC": bic_values}

def fit_best_model(X, lengths, trials=10, criterion="AIC", print_debug=False, search_space=None):
    """Fit models over a range of hidden-state counts and return the best one.

    Args:
        X: Observations forwarded to ``compute_AIC_BIC``.
        lengths: Sequence lengths forwarded to ``compute_AIC_BIC``.
        trials: Number of fits attempted per candidate state count.
        criterion: ``"AIC"`` or ``"BIC"``; the model minimising it is chosen.
        print_debug: If true, print the full results dict.
        search_space: Candidate numbers of hidden states; defaults to
            ``[1, 2, 3, 4, 5, 6]``.

    Returns:
        The fitted model with the lowest value of ``criterion``.

    Raises:
        KeyError: If ``criterion`` is not ``"AIC"`` or ``"BIC"``.
        ValueError: If ``search_space`` is empty (raised by ``np.argmin``).
    """
    # Fail fast: an unknown criterion would otherwise only surface after
    # every candidate model has already been fitted.
    if criterion not in ("AIC", "BIC"):
        raise KeyError(criterion)
    if search_space is None:
        search_space = [1, 2, 3, 4, 5, 6]
    results = compute_AIC_BIC(X, lengths, list(search_space), trials)
    best_model_id = int(np.argmin(results[criterion]))
    if print_debug:
        print("Fitting best model - results:")
        print(results)
    return results["models"][best_model_id]

# Fit one model per column of train_df; fitted_models_lst keeps them in
# column order.
fitted_models_lst = []
for i in range(train_df.shape[1]):
    column = train_df.iloc[:, i]
    X = np.array([column]).T  # (1, n) row turned into an (n, 1) column
    best_for_column = fit_best_model(X, sample_lengths[:-1])
    fitted_models_lst.append(best_for_column)


Use fitted models for classification

test_sample_id | true_label | predicted_label | (*) kind of probits


In [7]:
def classify(X, fitted_models) -> int:
    """Pick the fitted model that explains the single sample X best.

    Every model in ``fitted_models`` scores X; the position of the highest
    log-likelihood is returned (the first one on ties).
    """
    log_likelihoods = []
    for candidate in fitted_models:
        log_likelihoods.append(candidate.score(X))
    return np.argmax(log_likelihoods)

# Build the evaluation set: one list of samples and one list of true labels.
test_samples = []
true_labels = []
# The true label is the column's position, because classify() returns a
# position in fitted_models_lst, which holds one model per train_df column.
# Assumes test_df has its columns in the same order as train_df -- TODO confirm.
for label, col in enumerate(test_df):
    start_id = test_df.index[0]
    for i in range(samples_per_model):
        # NOTE(review): every window starts at start_id and .loc includes the
        # end label, so the samples are growing, overlapping prefixes rather
        # than consecutive chunks of length sample_n -- confirm this is intended.
        window = test_df[col].loc[start_id: start_id + (i+1)*sample_n]
        # Transpose to an (n, 1) column so the sample has the same layout as
        # the training data the models were fitted on.
        sample = np.array([window]).transpose()
        test_samples.append(sample)
        true_labels.append(label)

# Classify every test sample and tabulate the outcome next to its true label.
predictions = [
    classify(test_sample, fitted_models_lst) for test_sample in test_samples
]
predictions_dtf = pd.DataFrame(
    {"true_label": true_labels, "prediction": predictions}
)