In [1]:
# =============================================
# FULL END-TO-END VITERBI DEMO FOR "HELLO"
# =============================================

# 1) INSTALL & IMPORT LIBRARIES
!pip install librosa soundfile scikit-learn --quiet

import numpy as np
import librosa
from sklearn.cluster import KMeans

# =========================
# HMM DEFINITION
# =========================

# Hidden states (phonemes)
states = ["S1", "S2", "S3", "S4"]
state_labels = ["/h/", "/e/", "/l/", "/o/"]  # for nice printing
num_states = len(states)

# Transition probability matrix A (from rows -> to cols)
A = np.array([
    [0.0, 0.7, 0.3, 0.0],  # from S1 (/h/)
    [0.0, 0.2, 0.6, 0.2],  # from S2 (/e/)
    [0.0, 0.0, 0.3, 0.7],  # from S3 (/l/)
    [0.0, 0.0, 0.1, 0.9]   # from S4 (/o/)
])

# Emission probability matrix B (Si emits Oj)
# rows: S1..S4, cols: O1..O4
B = np.array([
    [0.6, 0.2, 0.1, 0.1],  # S1
    [0.1, 0.7, 0.1, 0.1],  # S2
    [0.1, 0.1, 0.6, 0.2],  # S3
    [0.2, 0.1, 0.2, 0.5]   # S4
])

# Initial probabilities (π) – we always start in /h/
pi = np.array([1.0, 0.0, 0.0, 0.0])

# =========================
# 2) LOAD AUDIO & EXTRACT FEATURES
# =========================

# --- Load your recorded "hello" ---
# Upload `hello.wav` in Colab (left panel: Files → upload).
audio_path = "hello.wav"

y, sr = librosa.load(audio_path, sr=None)  # keep original sample rate

# Optional: trim leading/trailing silence
y, _ = librosa.effects.trim(y, top_db=30)

# --- Frame-level MFCC extraction ---
# Use a standard 25 ms window, 10 ms hop
frame_length = int(0.025 * sr)
hop_length   = int(0.010 * sr)

# 13 MFCCs per frame (common choice)
mfcc = librosa.feature.mfcc(
    y=y, sr=sr, n_mfcc=13, n_fft=frame_length, hop_length=hop_length
)  # shape: (13, num_frames)

# Transpose to shape (num_frames, 13)
mfcc_frames = mfcc.T
num_frames = mfcc_frames.shape[0]

print(f"Loaded audio: {audio_path}")
print(f"Sample rate: {sr} Hz")
print(f"Number of frames (MFCC): {num_frames}")

# =========================
# 3) VECTOR QUANTIZATION -> DISCRETE OBSERVATIONS
# =========================

# We use KMeans to quantize MFCC frames into 4 clusters.
# Each cluster corresponds to an observation symbol O1..O4.

num_observations = 4  # O1, O2, O3, O4

kmeans = KMeans(n_clusters=num_observations, random_state=42, n_init=10)
kmeans.fit(mfcc_frames)

# Labels for each frame: values in {0,1,2,3}
obs_seq = kmeans.labels_.tolist()

print("\nFirst 20 observation symbols (0→O1, 1→O2, 2→O3, 3→O4):")
print(obs_seq[:20])

# =========================
# 4) VITERBI ALGORITHM
# =========================

def viterbi(obs, A, B, pi):
    """
    obs : list of observation indices (0..M-1)
    A   : NxN transition matrix
    B   : NxM emission matrix
    pi  : length-N initial distribution
    """
    N = A.shape[0]       # number of states
    T = len(obs)         # length of observation sequence

    # delta[t, i]: best probability of any path ending in state i at time t
    delta = np.zeros((T, N))
    # psi[t, i]: argmax previous state index that leads to i at time t
    psi = np.zeros((T, N), dtype=int)

    # 1) Initialization
    delta[0, :] = pi * B[:, obs[0]]
    psi[0, :] = 0

    # 2) Recursion
    for t in range(1, T):
        for j in range(N):
            probs = delta[t - 1, :] * A[:, j]   # from all i -> j
            psi[t, j] = np.argmax(probs)
            delta[t, j] = probs[psi[t, j]] * B[j, obs[t]]

    # 3) Termination
    best_last_state = np.argmax(delta[T - 1, :])
    best_prob = delta[T - 1, best_last_state]

    # 4) Path backtracking
    best_path = [best_last_state]
    for t in range(T - 1, 0, -1):
        best_last_state = psi[t, best_last_state]
        best_path.insert(0, best_last_state)

    return best_path, best_prob, delta, psi

# Run Viterbi on the full observation sequence from the audio
best_path, best_prob, delta, psi = viterbi(obs_seq, A, B, pi)

# Map state indices → phoneme labels
best_states = [states[i] for i in best_path]
best_phonemes = [state_labels[i] for i in best_path]

# =========================
# 5) PRINT RESULTS
# =========================

print("\n===== VITERBI DECODING RESULT =====")
print(f"Length of observation sequence: {len(obs_seq)} frames")

print("\nMost likely hidden state sequence (indices):")
print(best_path[:50], "...")  # print first 50 to keep it short

print("\nMost likely hidden state sequence (state names):")
print(best_states[:50], "...")

print("\nMost likely phoneme sequence for first 50 frames:")
print(best_phonemes[:50], "...")

print("\nProbability of this most likely path:")
print(best_prob)

# Simple high-level inference (frame-wise):
# We can summarize by taking the majority phoneme in four equal segments
segments = 4
segment_len = len(best_phonemes) // segments
segment_summary = []

for i in range(segments):
    start = i * segment_len
    end = (i + 1) * segment_len if i < segments - 1 else len(best_phonemes)
    segment = best_phonemes[start:end]
    # majority phoneme in this segment
    values, counts = np.unique(segment, return_counts=True)
    majority = values[np.argmax(counts)]
    segment_summary.append(majority)

print("\nSegment-wise majority phonemes (approx. H-E-L-L/O pattern):")
print(segment_summary)

if segment_summary == ["/h/", "/e/", "/l/", "/o/"]:
    print("\nInference: Segment-wise decoding strongly matches /h/ /e/ /l/ /o/,")
    print("so the HMM with Viterbi successfully recognizes the word 'hello'.")
else:
    print("\nInference: The segment-wise decoded pattern is:")
    print(segment_summary)
    print("This is the most likely phoneme sequence the HMM associates with this recording.")


Loaded audio: hello.wav
Sample rate: 22050 Hz
Number of frames (MFCC): 42

First 20 observation symbols (0→O1, 1→O2, 2→O3, 3→O4):
[1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0]

===== VITERBI DECODING RESULT =====
Length of observation sequence: 42 frames

Most likely hidden state sequence (indices):
[np.int64(0), np.int64(1), np.int64(1), np.int64(2), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3), np.int64(3)] ...

Most likely hidden state sequence (state names):
['S1', 'S2', 'S2', 'S3', 'S4', 'S4', 'S4', 'S4', 'S4', 'S4', 'S4', 'S4', 'S4', 'S4'