In [2]:
pip install tensorflow librosa hmmlearn scikit-learn numpy tensorflow-datasets

Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading hmmlearn-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.9/165.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hmmlearn
Successfully installed hmmlearn-0.3.3


In [3]:
pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [4]:
import os
import numpy as np
import librosa
import tensorflow_datasets as tfds
from hmmlearn import hmm
from sklearn.model_selection import train_test_split
from collections import defaultdict, Counter

# Load the Speech Commands Dataset
def load_data():
    dataset, info = tfds.load('speech_commands', with_info=True, split='train[:5%]')
    label_names = info.features['label'].names
    data = []
    labels = []

    for example in tfds.as_numpy(dataset):
        audio = example['audio']
        label = example['label']
        sr = 16000  # default sample rate
        mfcc = librosa.feature.mfcc(y=audio.astype(float), sr=sr, n_mfcc=13).T
        data.append(mfcc)
        labels.append(label_names[label])

    return data, labels, label_names

# HMM Model Training per label
def train_hmm_models(X_train, y_train, label_names):
    models = {}
    for label in label_names:
        label_features = [x for x, y in zip(X_train, y_train) if y == label]
        lengths = [len(feat) for feat in label_features]
        X_concat = np.concatenate(label_features)

        model = hmm.GaussianHMM(n_components=5, covariance_type="diag", n_iter=100)
        model.fit(X_concat, lengths)
        models[label] = model
    return models

# N-gram (Bigram) Model Training
def train_ngram_model(sentences):
    bigrams = defaultdict(Counter)
    for sent in sentences:
        for i in range(len(sent)-1):
            bigrams[sent[i]][sent[i+1]] += 1
    return bigrams

def predict_ngram(bigram_model, prefix):
    return bigram_model[prefix].most_common(1)[0][0] if prefix in bigram_model else None

# Predict HMM Label
def predict_hmm(models, sample):
    scores = {label: model.score(sample) for label, model in models.items()}
    return max(scores, key=scores.get)

# Main flow
if __name__ == "__main__":
    print("Loading data...")
    X, y, label_names = load_data()

    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Training HMM models...")
    hmm_models = train_hmm_models(X_train, y_train, label_names)

    print("Training N-gram model...")
    ngram_model = train_ngram_model([y_train])  # sentence of labels

    print("Testing on sample inputs...")
    correct = 0
    for x, true_label in zip(X_test, y_test):
        pred_label = predict_hmm(hmm_models, x)
        print(f"True: {true_label}, Predicted: {pred_label}")
        if pred_label == true_label:
            correct += 1

    acc = correct / len(y_test)
    print(f"\nHMM Model Accuracy: {acc * 100:.2f}%")

    # N-gram example
    print("\nN-gram Prediction Example:")
    prefix = y_test[0]
    next_label = predict_ngram(ngram_model, prefix)
    print(f"Given '{prefix}', predicted next label is: '{next_label}'")


Loading data...




Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/speech_commands/0.0.3...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/incomplete.UWMEIO_0.0.3/speech_commands-train.tfrecord*...…

Generating validation examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/incomplete.UWMEIO_0.0.3/speech_commands-validation.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/incomplete.UWMEIO_0.0.3/speech_commands-test.tfrecord*...:…



Dataset speech_commands downloaded and prepared to /root/tensorflow_datasets/speech_commands/0.0.3. Subsequent calls will reuse this data.
Splitting data...
Training HMM models...
Training N-gram model...
Testing on sample inputs...
True: _unknown_, Predicted: no
True: _unknown_, Predicted: no
True: _unknown_, Predicted: _unknown_
True: _unknown_, Predicted: _unknown_
True: left, Predicted: no
True: _unknown_, Predicted: _unknown_
True: _unknown_, Predicted: yes
True: _silence_, Predicted: _silence_
True: _unknown_, Predicted: down
True: stop, Predicted: stop
True: _unknown_, Predicted: _unknown_
True: _unknown_, Predicted: _unknown_
True: _unknown_, Predicted: left
True: _unknown_, Predicted: no
True: _unknown_, Predicted: go
True: _unknown_, Predicted: no
True: stop, Predicted: go
True: yes, Predicted: _unknown_
True: _unknown_, Predicted: off
True: _unknown_, Predicted: yes
True: go, Predicted: up
True: yes, Predicted: yes
True: _unknown_, Predicted: down
True: go, Predicted: down
T