# Compare results

## Setup

In [None]:
import pandas as pd
import numpy as np
import pickle 
import os
import matplotlib.pyplot as plt
import time 

# Directory whose entries are all loaded as pickled results files to analyze
RESULTS_DIR = "results_HMM" 
# Directory where pickled summaries (dataframes with summarized results etc.) are written
SUMMARY_OUTPUT_DIR = "summaries"

## Load results of classification

In [None]:
# Load every pickled results file found directly in RESULTS_DIR.
# Entries are visited in sorted order so that positions in `results_lst`
# (which is indexed by position further down) are reproducible between runs;
# os.listdir() on its own returns entries in arbitrary order.
# NOTE: pickle.load can execute arbitrary code - only point RESULTS_DIR at
# files you produced yourself.
results_lst = []
filepaths_lst = []
for filename in sorted(os.listdir(RESULTS_DIR)):
    path = f"{RESULTS_DIR}/{filename}"
    # Skip sub-directories (e.g. .ipynb_checkpoints); they cannot be opened as files
    if not os.path.isfile(path):
        continue
    print(f"Reading data from {path}")
    with open(path, "rb") as f:
        result = pickle.load(f)
    # Record path and result together, and only after a successful load,
    # so that the two lists always stay aligned
    filepaths_lst.append(path)
    results_lst.append(result)

In [None]:
# For each loaded result, unpickle the data file it refers to and print its keys.
for idx, result in enumerate(results_lst):
    with open(result["data_filename"], "rb") as data_file:
        data = pickle.load(data_file)
    print(f"Info about generated data in {idx}:")
    print(data.keys())

## Compare accuracies

- `time_id`: time index of file with generated data / results
- `filename`: name of the file with generated data (without directory and extension)
- `gen_with`: type of model the data was generated with ("HMM" / "ARIMA" / "ARIMA_all_statio" etc)
- `n_train`: number of train samples per model
- `n_test`: number of test samples per model
- `min_len`: min sample size parameter used in generation
- `max_len`: max sample size parameter used in generation
- `cls_with`: type of model used to classify samples ("HMM" / "DTW")
- `variant`: variant of classification, e.g. for HMM: "AIC" or "BIC", for DTW: "1NN" or "5NN"; taken from the keys of the `accuracies` dict (presumably the same keys as in the `predictions_dfs` dict)
- `acc`: accuracy of predictions


In [None]:
# Build `acc_summary`: one row per (results file, classification variant) pair.
# The per-column lists stay available as notebook-level names.
time_id = []
names = []
gen_with = []
n_train = []
n_test = []
min_len = []
max_len = []
cls_with = []
variants = []
accs = []

for r in results_lst:
    accuracies = r["accuracies"]
    if not accuracies:
        # No variants recorded for this result, so there is nothing to summarize
        continue

    # The generated-data file is the same for every variant of this result,
    # so it is unpickled once here instead of once per variant.
    data_path = r["data_filename"]
    with open(data_path, "rb") as f:
        data = pickle.load(f)
    metadata = data["metadata"]

    # Two naming schemes for the sample-count keys are supported. Both values
    # are resolved before anything is appended, so a partially matching
    # metadata dict cannot leave the column lists with different lengths.
    try:
        train_count = metadata["N_TRAIN_SAMPLES_PER_MODEL"]
        test_count = metadata["N_TEST_SAMPLES_PER_MODEL"]
    except KeyError:
        train_count = metadata["NO_TRAIN_SAMPLES"]
        test_count = metadata["NO_TEST_SAMPLES"]

    # File name up to its first dot, regardless of how many directories precede it
    name = os.path.basename(data_path).split(".")[0]

    for variant, acc in accuracies.items():
        time_id.append(r["time_index"])
        names.append(name)
        gen_with.append(data["generating_model"])
        n_train.append(train_count)
        n_test.append(test_count)
        min_len.append(metadata["MIN_SAMPLE_LEN"])
        max_len.append(metadata["MAX_SAMPLE_LEN"])
        cls_with.append(r["classificator"])
        variants.append(variant)
        accs.append(acc)

acc_summary = pd.DataFrame({
    "time_id": time_id,
    "filename": names,
    "gen_with": gen_with,
    "n_train": n_train,
    "n_test": n_test,
    "min_len": min_len,
    "max_len": max_len,
    "cls_with": cls_with,
    "variant": variants,
    "acc": accs,
})
            

In [None]:
# Show the accuracies best-first, one table per (data file, classifier) pair.
ranked = acc_summary.sort_values(by=['acc'], ascending=False)
ranked = ranked.drop(["time_id", "gen_with"], axis="columns")
for g, df in ranked.groupby(["filename", "cls_with"]):
    print(g)
    display(df)

In [None]:
# Persist the accuracy summary together with the list of results files it was built from.
t = int(time.time())  # Unix timestamp; stored in the summary and used in the file name
summary_data = {
    "creation_date": t,
    "results_files": filepaths_lst,
    "accuracies_df": acc_summary,
}
# open(..., "wb") does not create missing parent directories, so make sure
# the output directory exists before writing.
os.makedirs(SUMMARY_OUTPUT_DIR, exist_ok=True)
with open(f"{SUMMARY_OUTPUT_DIR}/summary_{t}.pkl", "wb") as f:
    pickle.dump(summary_data, f)

#

# Number of hidden states


In [None]:
# For every loaded result, print its data file path and show its hidden-states table.
for result in results_lst:
    source_file = result["data_filename"]
    print(source_file)
    states_df = result["hidden_states_df"]
    display(states_df)

# Misclassified samples check

### Find ids of misclassified

In [None]:
# Pick one results file and narrow its "1NN" predictions down to the
# misclassified samples whose true label is 3.
r = results_lst[2]
print(r["data_filename"])
pred_df = r["predictions_dfs"]["1NN"]
# Boolean mask: True where the prediction differs from the true label
wrong = pred_df["true_label"] != pred_df["pred"]
misclassified = pred_df[wrong]
sub_df = misclassified[misclassified["true_label"] == 3]
sub_df

In [None]:
# Collect the raw series of every misclassified sample in `pred_df`.
# The mask is recomputed from `pred_df` rather than read from `wrong`, because
# this cell rebinds `wrong` to a list below; reusing it would make a second
# run of the cell operate on that list instead of on the boolean mask.
is_wrong = pred_df["true_label"] != pred_df["pred"]
wrong_samples_ids = pred_df.loc[is_wrong, "sample_id"].values
with open(r["data_filename"], "rb") as f:
    data = pickle.load(f)
all_X = data["all_X_samples"]
# From here on `wrong` holds the misclassified samples themselves
wrong = [all_X[i] for i in wrong_samples_ids]

wrong[0].shape

### Plot samples

In [None]:
# Plot the i-th row of `sub_df` (misclassified samples of the selected class).
i = 0
# Look the series up by the row's own sample id. `wrong` lists ALL misclassified
# samples, while `sub_df` is filtered by true label, so position i in one does
# not correspond to position i in the other.
sample_id = int(sub_df["sample_id"].iloc[i])
X = all_X[sample_id]
true = sub_df.iloc[i]["true_label"]
pred = sub_df.iloc[i]["pred"]
# labels = ["DD", "UD", "DU", "UU"] # use for twopat
labels = list(range(9))
plt.rcParams["figure.figsize"] = (20, 8)
plt.rcParams['font.size'] = 20
fig, ax = plt.subplots(1, 1)
ax.plot(X, lw=3)
ax.set_title(f"Sample from {labels[true]} class classified as {labels[pred]}")
ax.grid()

plt.show()