# Extract Viterbi paths from HMM objects

---------------------------------
Exports subject-level Viterbi paths to CSV (one row per subject, one column
per TR; shorter paths are padded with NaN), preserving the original
subject IDs embedded in the pickle (entry["subject_ids"]).

Run:  python extract_statepaths_k14_withIDs.py

In [1]:
import argparse
import os
import pickle
import numpy as np
import pandas as pd

In [3]:
# Input pickle; main() unpickles it and load_entry() scans the result for
# the dict whose "k" equals K.
PKL_PATH = "/home/jovyan/narratives-project/hmm-objects/hmmlearn_consensus_results_k14_withIDs.pkl"
# Destination CSV written by main() (one row per subject).
OUT_CSV  = "/home/jovyan/narratives-project/viterbi-paths/statepaths_k14.csv"
# Used by main() to select which entry of the pickle to export.
K        = 14       # number of states in the model
# Default fill value for build_matrix(); NaN requires a float matrix.
PAD_VALUE = np.nan  # pad shorter runs with NaN

In [4]:
def load_entry(results, k):
    """Look up the single result dict whose ``"k"`` value equals *k*.

    Raises ``ValueError`` when no dict in *results* matches, or when more
    than one does.
    """
    candidates = [item for item in results if item.get("k") == k]
    if len(candidates) == 0:
        raise ValueError(f"No entry with k={k} in pickle.")

    # Split off the first hit; anything left over means the key is ambiguous.
    first, *extras = candidates
    if extras:
        raise ValueError(f"Duplicate entries with k={k}.")
    return first


def build_matrix(paths, pad_value=PAD_VALUE):
    """Pad variable-length paths into a rectangular float array.

    Args:
        paths: Iterable of sequences (one per row). Each sequence must be
            assignable into a float array slice.
        pad_value: Value used to fill the tail of rows shorter than the
            longest path. Defaults to ``PAD_VALUE``.

    Returns:
        A ``float`` ndarray of shape ``(n_paths, max_len)`` where row ``i``
        holds ``paths[i]`` followed by ``pad_value``.

    Raises:
        ValueError: If ``paths`` is empty.
    """
    # Materialise once: the input is traversed twice and len() is needed,
    # which a one-shot iterable (e.g. a generator) would not support.
    paths = list(paths)
    if not paths:
        # Without this guard max() fails with an opaque
        # "max() arg is an empty sequence" message.
        raise ValueError("Cannot build a matrix: 'paths' is empty.")

    max_len = max(len(p) for p in paths)
    mat = np.full((len(paths), max_len), pad_value, dtype=float)
    for i, seq in enumerate(paths):
        mat[i, :len(seq)] = seq        # up-casts ints to float automatically
    return mat


def main():
    """Export the Viterbi paths of the ``k == K`` entry in PKL_PATH to OUT_CSV.

    Loads the pickled results, selects the entry whose ``"k"`` equals ``K``,
    pads its ``"subject_paths"`` into a rectangular matrix and writes that
    matrix as a CSV with one row per subject. Rows are labelled with
    ``entry["subject_ids"]`` when that key is present; otherwise synthetic
    ``sub-001``, ``sub-002``, ... labels are used.

    Raises:
        FileNotFoundError: If PKL_PATH does not exist.
        ValueError: If the number of subject IDs differs from the number of
            paths (or propagated from ``load_entry`` / ``build_matrix``).
    """
    if not os.path.exists(PKL_PATH):
        raise FileNotFoundError(f"Pickle not found: {PKL_PATH}")

    # NOTE: unpickling can execute arbitrary code; only point PKL_PATH at
    # files from a trusted source.
    with open(PKL_PATH, "rb") as f:
        results = pickle.load(f)

    entry = load_entry(results, k=K)
    mat   = build_matrix(entry["subject_paths"])

    # ─── Pick real subject IDs if available ───────────────────────────────────
    ids = entry.get("subject_ids")
    if ids is None:
        # Fallback: artificial labels
        ids = [f"sub-{i+1:03d}" for i in range(mat.shape[0])]
        print("⚠️  subject_ids not found in pickle – using synthetic labels.")
    else:
        # Fail early with a readable message instead of a pandas shape error.
        if len(ids) != mat.shape[0]:
            raise ValueError(
                f"subject_ids has {len(ids)} entries but subject_paths has "
                f"{mat.shape[0]}; cannot label rows."
            )
        print(f"✔ Using {len(ids)} subject IDs from pickle.")

    # ─── Write CSV ────────────────────────────────────────────────────────────
    # to_csv does not create missing directories, so make sure the target
    # folder exists (dirname is empty when OUT_CSV is a bare filename).
    out_dir = os.path.dirname(OUT_CSV)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame(mat, index=ids, dtype=float)
    df.to_csv(OUT_CSV, index=True)
    print(f"✅  Wrote {df.shape[0]} subjects × {df.shape[1]} TRs → {OUT_CSV}")


# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


✔ Using 75 subject IDs from pickle.
✅  Wrote 75 subjects × 279 TRs → /home/jovyan/narratives-project/viterbi-paths/statepaths_k14.csv
