In [102]:
import librosa
import numpy as np
import os
import math
from sklearn.cluster import KMeans
import hmmlearn.hmm

## lấy mfcc của file wav

In [103]:
# Feature extraction for a single recording
def get_mfcc(file_path):
    """Compute mean-normalised MFCCs plus delta and delta-delta features.

    Parameters
    ----------
    file_path : str
        Path of the audio file, decoded with ``librosa.load`` defaults.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(T, 36)``: one row per analysis frame, holding
        12 MFCCs, 12 first-order deltas and 12 second-order deltas.
        The time axis comes first because hmmlearn expects ``(T, N)`` input.
    """
    y, sr = librosa.load(file_path)  # read .wav file
    hop_length = math.floor(sr * 0.010)  # 10 ms hop
    win_length = math.floor(sr * 0.025)  # 25 ms frame
    # mfcc is a 12 x T matrix.
    # Audio and sample rate are passed by keyword: librosa >= 0.10 rejects
    # positional arguments here, while older releases accept both forms.
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # Subtract the per-coefficient mean over time --> normalised mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1, 1))
    # Delta features of 1st and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0)
    # Return T x 36 (transpose of X)
    return X.T

## lấy mfcc của tất cả các file trong dir

In [104]:
# Get the MFCC features of every .wav file in a directory
def get_class_data(data_dir):
    """Return one MFCC feature matrix per ``.wav`` file found in ``data_dir``.

    File names are processed in sorted order. ``os.listdir`` gives no ordering
    guarantee, and the notebook later splits this list positionally into train
    and test sets, so a stable order is needed for reproducible results.

    Parameters
    ----------
    data_dir : str
        Directory to scan (not recursive). Files whose name does not end with
        ``.wav`` are ignored.

    Returns
    -------
    list
        The ``get_mfcc`` result for each ``.wav`` file, in sorted file-name
        order. Empty when the directory contains no ``.wav`` file.
    """
    wav_names = sorted(f for f in os.listdir(data_dir) if f.endswith(".wav"))
    return [get_mfcc(os.path.join(data_dir, name)) for name in wav_names]

## Clustering

In [105]:
def clustering(X, n_clusters=10):
    """Fit a K-Means codebook on ``X`` and return the fitted estimator.

    The estimator is created with ``n_init=50`` and ``random_state=0``.
    The shape of the learned centre matrix is printed as a quick check.
    """
    codebook = KMeans(
        n_clusters=n_clusters,
        n_init=50,
        random_state=0,
        verbose=0,
    ).fit(X)
    centers_shape = codebook.cluster_centers_.shape
    print("centers", centers_shape)
    return codebook

## Data

In [106]:
# Words to recognise. Each class name doubles as the (relative) directory
# that holds the recordings of that word.
class_names = ['bệnh nhân', 'chúng ta', 'có thể', 'người', 'Việt Nam']

# dataset[cname] -> list of per-utterance feature matrices for that class
dataset = dict()
for cname in class_names:
    print(f"Load {cname} dataset")
    class_dir = cname  # the directory path is just the class name
    dataset[cname] = get_class_data(class_dir)

Load bệnh nhân dataset
Load chúng ta dataset
Load có thể dataset
Load người dataset
Load Việt Nam dataset


In [107]:
# Sanity check: number of utterances loaded for the 'Việt Nam' class
print(len(dataset['Việt Nam']))

100


In [108]:
# Pool the frames of every utterance of every class into one big matrix
per_class_frames = [
    np.concatenate(utterances, axis=0) for utterances in dataset.values()
]
all_vectors = np.concatenate(per_class_frames, axis=0)
print("vectors", all_vectors.shape)

# Learn the vector-quantisation codebook with K-Means.
# NOTE(review): the codebook is fitted on all recordings, including those that
# later end up in the test split - consider fitting on training data only.
kmeans = clustering(all_vectors)
print("centers", kmeans.cluster_centers_.shape)
print(kmeans)

vectors (17412, 36)
centers (10, 36)
centers (10, 36)
KMeans(n_clusters=10, n_init=50, random_state=0)


## split train test

In [109]:
# Quantise every utterance into a sequence of codebook indices and split each
# class 70 % / 30 % (by position in the list) into train and test sets.
trainset = {}
testset = {}
n_test = {cname: 0 for cname in class_names}
for cname in class_names:
    # Each utterance becomes a (T, 1) column of cluster indices.
    # The result is kept in a separate variable instead of overwriting
    # dataset[cname]: the raw features stay available and this cell can be
    # re-run without feeding already-quantised data back into kmeans.predict.
    symbols = [kmeans.predict(v).reshape(-1, 1) for v in dataset[cname]]
    n = len(symbols)
    n_train = math.floor(n * 0.7)
    trainset[cname] = symbols[:n_train]
    testset[cname] = symbols[n_train:]
    n_test[cname] = len(testset[cname])

print(len(trainset['Việt Nam']))

70


# Train

In [110]:
# Trained HMMs, keyed by class name (filled in by the training cells below)
models = {}

## Models cho 'bệnh nhân' 6x3

In [111]:
# Discrete left-to-right HMM for 'bệnh nhân' with 6*3 = 18 states.
#
# Recent hmmlearn releases provide CategoricalHMM for sequences of integer
# symbols, and their MultinomialHMM models count vectors instead. Older
# releases only have MultinomialHMM, which there is the symbol model.
# Pick whichever class implements the symbol model.
_DiscreteHMM = getattr(hmmlearn.hmm, "CategoricalHMM", hmmlearn.hmm.MultinomialHMM)

n_states = 6 * 3
hmm = _DiscreteHMM(n_components=n_states, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')

# The chain starts in one of the first three states.
startprob = np.zeros(n_states)
startprob[:3] = [0.7, 0.2, 0.1]
hmm.startprob_ = startprob

# Banded transition matrix: stay (0.7), move to the next state (0.2) or skip
# one state (0.1). The last two rows are truncated so each row sums to 1.
transmat = np.zeros((n_states, n_states))
for i in range(n_states - 2):
    transmat[i, i:i + 3] = [0.7, 0.2, 0.1]
transmat[-2, -2:] = [0.7, 0.3]
transmat[-1, -1] = 1.0
hmm.transmat_ = transmat

# hmmlearn takes all sequences stacked into one array plus their lengths
Xbn = np.concatenate(trainset['bệnh nhân'])
lengths = [len(x) for x in trainset['bệnh nhân']]
print("training class", 'bệnh nhân')
print(Xbn.shape, lengths, len(lengths))
hmm.fit(Xbn, lengths=lengths)
models['bệnh nhân'] = hmm
print("Training done")

training class bệnh nhân
(2832, 1) [37, 37, 40, 38, 47, 35, 41, 41, 44, 32, 35, 28, 32, 44, 47, 33, 34, 45, 51, 41, 62, 40, 44, 47, 52, 39, 54, 35, 41, 43, 47, 41, 51, 39, 38, 38, 43, 44, 55, 45, 26, 29, 43, 33, 30, 37, 30, 45, 42, 42, 45, 41, 44, 52, 41, 43, 41, 36, 38, 46, 46, 34, 33, 35, 37, 28, 30, 45, 36, 44] 70


         1       -6157.3374             +nan
         2       -4500.6322       +1656.7052
         3       -3941.6309        +559.0013
         4       -3611.4185        +330.2124
         5       -3460.3957        +151.0228
         6       -3391.9661         +68.4295
         7       -3345.4623         +46.5038
         8       -3313.2878         +32.1745
         9       -3300.1127         +13.1751
        10       -3292.7102          +7.4026
        11       -3285.7912          +6.9189
        12       -3278.9471          +6.8441
        13       -3271.8799          +7.0672
        14       -3264.3188          +7.5610
        15       -3257.6852          +6.6337
        16       -3253.0933          +4.5919
        17       -3250.1822          +2.9111
        18       -3248.2813          +1.9008
        19       -3246.8701          +1.4112
        20       -3245.6611          +1.2091
        21       -3244.5638          +1.0973
        22       -3243.5640          +0.9998
        23

Training done


        89       -3178.4486          +0.0796
        90       -3178.4280          +0.0206
        91       -3178.4220          +0.0059


## Models cho 'chúng ta' 5x3

In [112]:
# Discrete left-to-right HMM for 'chúng ta' with 5*3 = 15 states.
#
# Recent hmmlearn releases provide CategoricalHMM for sequences of integer
# symbols, and their MultinomialHMM models count vectors instead. Older
# releases only have MultinomialHMM, which there is the symbol model.
# Pick whichever class implements the symbol model.
_DiscreteHMM = getattr(hmmlearn.hmm, "CategoricalHMM", hmmlearn.hmm.MultinomialHMM)

n_states = 5 * 3
hmm = _DiscreteHMM(n_components=n_states, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')

# The chain starts in one of the first three states.
startprob = np.zeros(n_states)
startprob[:3] = [0.7, 0.2, 0.1]
hmm.startprob_ = startprob

# Banded transition matrix: stay (0.7), move to the next state (0.2) or skip
# one state (0.1). The last two rows are truncated so each row sums to 1.
transmat = np.zeros((n_states, n_states))
for i in range(n_states - 2):
    transmat[i, i:i + 3] = [0.7, 0.2, 0.1]
transmat[-2, -2:] = [0.7, 0.3]
transmat[-1, -1] = 1.0
hmm.transmat_ = transmat

# hmmlearn takes all sequences stacked into one array plus their lengths
Xct = np.concatenate(trainset['chúng ta'])
lengths = [len(x) for x in trainset['chúng ta']]
print("training class", 'chúng ta')
print(Xct.shape, lengths, len(lengths))
hmm.fit(Xct, lengths=lengths)
models['chúng ta'] = hmm
print("Training done")

training class chúng ta
(2264, 1) [41, 38, 46, 47, 36, 43, 34, 36, 42, 36, 40, 36, 37, 30, 40, 36, 41, 36, 37, 36, 42, 40, 35, 39, 38, 38, 41, 36, 44, 38, 59, 49, 37, 31, 43, 52, 42, 50, 48, 46, 54, 55, 47, 45, 35, 38, 38, 42, 38, 43, 33, 35, 32, 42, 32, 39] 56


         1       -4948.7688             +nan
         2       -3647.3602       +1301.4086
         3       -3184.1604        +463.1998
         4       -2893.8665        +290.2939
         5       -2694.3797        +199.4867
         6       -2569.8097        +124.5700
         7       -2520.6194         +49.1903
         8       -2494.4006         +26.2188
         9       -2479.4805         +14.9201
        10       -2472.2739          +7.2066
        11       -2469.0580          +3.2159
        12       -2467.6227          +1.4353
        13       -2466.8965          +0.7261
        14       -2466.4610          +0.4356
        15       -2466.1769          +0.2841
        16       -2465.9887          +0.1882
        17       -2465.8641          +0.1247
        18       -2465.7814          +0.0827
        19       -2465.7262          +0.0552
        20       -2465.6889          +0.0373
        21       -2465.6633          +0.0256
        22       -2465.6454          +0.0179


Training done


        23       -2465.6326          +0.0128
        24       -2465.6231          +0.0094


## Models cho 'có thể' 4x3

In [113]:
# Discrete left-to-right HMM for 'có thể' with 4*3 = 12 states.
#
# Recent hmmlearn releases provide CategoricalHMM for sequences of integer
# symbols, and their MultinomialHMM models count vectors instead. Older
# releases only have MultinomialHMM, which there is the symbol model.
# Pick whichever class implements the symbol model.
_DiscreteHMM = getattr(hmmlearn.hmm, "CategoricalHMM", hmmlearn.hmm.MultinomialHMM)

n_states = 4 * 3
hmm = _DiscreteHMM(n_components=n_states, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')

# The chain starts in one of the first three states.
startprob = np.zeros(n_states)
startprob[:3] = [0.7, 0.2, 0.1]
hmm.startprob_ = startprob

# Banded transition matrix: stay (0.7), move to the next state (0.2) or skip
# one state (0.1). The last two rows are truncated so each row sums to 1.
transmat = np.zeros((n_states, n_states))
for i in range(n_states - 2):
    transmat[i, i:i + 3] = [0.7, 0.2, 0.1]
transmat[-2, -2:] = [0.7, 0.3]
transmat[-1, -1] = 1.0
hmm.transmat_ = transmat

# hmmlearn takes all sequences stacked into one array plus their lengths
Xcth = np.concatenate(trainset['có thể'])
lengths = [len(x) for x in trainset['có thể']]
print("training class", 'có thể')
print(Xcth.shape, lengths, len(lengths))
hmm.fit(Xcth, lengths=lengths)
models['có thể'] = hmm
print("Training done")

         1       -5622.6906             +nan
         2       -4323.8284       +1298.8622
         3       -3840.6512        +483.1772


training class có thể
(2537, 1) [34, 31, 56, 39, 44, 40, 46, 35, 45, 47, 37, 56, 36, 31, 39, 35, 42, 30, 30, 37, 36, 39, 26, 34, 47, 36, 41, 34, 26, 30, 25, 27, 16, 39, 27, 52, 25, 27, 20, 22, 36, 53, 44, 44, 28, 57, 53, 36, 37, 30, 34, 38, 51, 43, 40, 47, 38, 44, 102, 44, 52, 56, 35, 37, 39] 65


         4       -3606.2123        +234.4389
         5       -3468.8224        +137.3900
         6       -3386.7207         +82.1016
         7       -3355.6761         +31.0446
         8       -3333.9532         +21.7229
         9       -3310.4941         +23.4591
        10       -3279.0641         +31.4301
        11       -3241.6965         +37.3676
        12       -3215.2734         +26.4231
        13       -3201.7427         +13.5306
        14       -3183.3315         +18.4113
        15       -3176.4165          +6.9150
        16       -3173.1338          +3.2827
        17       -3171.2074          +1.9263
        18       -3169.9341          +1.2734
        19       -3168.9653          +0.9688
        20       -3168.1941          +0.7712
        21       -3167.7064          +0.4876
        22       -3167.3507          +0.3557
        23       -3166.8168          +0.5339
        24       -3166.0219          +0.7949
        25       -3165.6338          +0.3881
        26

Training done


        32       -3165.4055          +0.0100
        33       -3165.3973          +0.0082


## Models cho 'người' 3x3

In [114]:
# Discrete left-to-right HMM for 'người' with 3*3 = 9 states.
#
# Recent hmmlearn releases provide CategoricalHMM for sequences of integer
# symbols, and their MultinomialHMM models count vectors instead. Older
# releases only have MultinomialHMM, which there is the symbol model.
# Pick whichever class implements the symbol model.
_DiscreteHMM = getattr(hmmlearn.hmm, "CategoricalHMM", hmmlearn.hmm.MultinomialHMM)

n_states = 3 * 3
hmm = _DiscreteHMM(n_components=n_states, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')

# The chain starts in one of the first three states.
startprob = np.zeros(n_states)
startprob[:3] = [0.7, 0.2, 0.1]
hmm.startprob_ = startprob

# Banded transition matrix: stay (0.7), move to the next state (0.2) or skip
# one state (0.1). The last two rows are truncated so each row sums to 1.
transmat = np.zeros((n_states, n_states))
for i in range(n_states - 2):
    transmat[i, i:i + 3] = [0.7, 0.2, 0.1]
transmat[-2, -2:] = [0.7, 0.3]
transmat[-1, -1] = 1.0
hmm.transmat_ = transmat

# hmmlearn takes all sequences stacked into one array plus their lengths.
# Stored under its own name (Xng) rather than reusing the 'có thể' variable.
Xng = np.concatenate(trainset['người'])
lengths = [len(x) for x in trainset['người']]
print("training class", 'người')
print(Xng.shape, lengths, len(lengths))
hmm.fit(Xng, lengths=lengths)
models['người'] = hmm
print("Training done")

         1       -4312.7406             +nan
         2       -2884.6245       +1428.1160


training class người
(1951, 1) [29, 27, 25, 18, 18, 19, 20, 22, 26, 23, 31, 30, 21, 19, 18, 22, 15, 29, 26, 19, 22, 51, 37, 42, 37, 18, 32, 62, 26, 32, 21, 28, 43, 23, 25, 36, 42, 20, 18, 20, 21, 38, 22, 22, 50, 18, 18, 21, 16, 19, 23, 19, 23, 17, 24, 27, 21, 21, 13, 22, 36, 35, 26, 30, 35, 28, 18, 58, 23, 35, 25, 27, 28] 73


         3       -2497.4535        +387.1710
         4       -2385.6280        +111.8255
         5       -2333.7765         +51.8516
         6       -2299.7037         +34.0727
         7       -2286.1781         +13.5257
         8       -2280.8986          +5.2794
         9       -2276.8413          +4.0573
        10       -2272.1707          +4.6706
        11       -2266.6379          +5.5328
        12       -2261.6744          +4.9635
        13       -2256.4254          +5.2490
        14       -2248.5036          +7.9218
        15       -2239.9802          +8.5234
        16       -2233.9014          +6.0788
        17       -2229.9097          +3.9918
        18       -2225.8433          +4.0663
        19       -2220.5980          +5.2453
        20       -2215.0949          +5.5031
        21       -2210.6336          +4.4613
        22       -2207.7265          +2.9071
        23       -2206.1579          +1.5687
        24       -2205.4643          +0.6936
        25

Training done


        37       -2198.6509          +0.0229
        38       -2198.6400          +0.0109
        39       -2198.6331          +0.0069


## Models cho 'Việt Nam' 6x3

In [115]:
# Discrete left-to-right HMM for 'Việt Nam' with 6*3 = 18 states.
#
# Recent hmmlearn releases provide CategoricalHMM for sequences of integer
# symbols, and their MultinomialHMM models count vectors instead. Older
# releases only have MultinomialHMM, which there is the symbol model.
# Pick whichever class implements the symbol model.
_DiscreteHMM = getattr(hmmlearn.hmm, "CategoricalHMM", hmmlearn.hmm.MultinomialHMM)

n_states = 6 * 3
hmm = _DiscreteHMM(n_components=n_states, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')

# The chain starts in one of the first three states.
startprob = np.zeros(n_states)
startprob[:3] = [0.7, 0.2, 0.1]
hmm.startprob_ = startprob

# Banded transition matrix: stay (0.7), move to the next state (0.2) or skip
# one state (0.1). The last two rows are truncated so each row sums to 1.
transmat = np.zeros((n_states, n_states))
for i in range(n_states - 2):
    transmat[i, i:i + 3] = [0.7, 0.2, 0.1]
transmat[-2, -2:] = [0.7, 0.3]
transmat[-1, -1] = 1.0
hmm.transmat_ = transmat

# hmmlearn takes all sequences stacked into one array plus their lengths
Xvn = np.concatenate(trainset['Việt Nam'])
lengths = [len(x) for x in trainset['Việt Nam']]
print("training class", 'Việt Nam')
print(Xvn.shape, lengths, len(lengths))
hmm.fit(Xvn, lengths=lengths)
models['Việt Nam'] = hmm
print("Training done")

         1       -5836.1032             +nan


training class Việt Nam
(2648, 1) [42, 34, 48, 36, 42, 34, 37, 42, 32, 38, 35, 40, 44, 37, 38, 37, 31, 27, 29, 32, 28, 22, 26, 32, 37, 39, 27, 33, 29, 41, 34, 30, 31, 36, 26, 27, 40, 27, 37, 26, 59, 53, 46, 32, 35, 30, 57, 36, 35, 60, 34, 57, 35, 44, 33, 39, 31, 28, 35, 43, 57, 56, 47, 44, 44, 47, 42, 41, 44, 41] 70


         2       -4195.2101       +1640.8931
         3       -3693.1392        +502.0709
         4       -3448.4671        +244.6720
         5       -3337.3010        +111.1662
         6       -3286.7410         +50.5600
         7       -3262.6474         +24.0936
         8       -3248.7688         +13.8785
         9       -3238.9332          +9.8356
        10       -3228.8349         +10.0983
        11       -3217.6969         +11.1381
        12       -3206.7237         +10.9732
        13       -3197.7962          +8.9275
        14       -3187.9611          +9.8351
        15       -3179.0737          +8.8874
        16       -3172.3528          +6.7209
        17       -3165.3033          +7.0495
        18       -3155.2225         +10.0808
        19       -3144.4843         +10.7382
        20       -3138.2987          +6.1856
        21       -3131.7901          +6.5086
        22       -3127.7793          +4.0108
        23       -3124.7609          +3.0184
        24

Training done


        82       -2975.1718          +0.0100


# Test

In [116]:
# Classify every held-out utterance: score it under each class model and
# predict the class whose model gives the highest log-likelihood.
print("Testing")
n_correct = {cname: 0 for cname in class_names}
for true_cname in class_names:
    for O in testset[true_cname]:
        score = {cname: model.score(O, [len(O)]) for cname, model in models.items()}
        predicted = max(score, key=score.get)
        if predicted == true_cname:
            n_correct[true_cname] += 1
        print(true_cname, score, 'predict:', predicted)

Testing
bệnh nhân {'bệnh nhân': -49.10216538313062, 'chúng ta': -136.5790578447051, 'có thể': -86.77747133541487, 'người': -110.7386567533027, 'Việt Nam': -506.10885236828443} predict: bệnh nhân
bệnh nhân {'bệnh nhân': -49.88290607521227, 'chúng ta': -123.78419448594474, 'có thể': -88.76231784655232, 'người': -122.25319115199531, 'Việt Nam': -355.36926868648106} predict: bệnh nhân
bệnh nhân {'bệnh nhân': -63.29253961595809, 'chúng ta': -297.76171580543223, 'có thể': -177.65655631436587, 'người': -2873.003277746407, 'Việt Nam': -1471.2722774073295} predict: bệnh nhân
bệnh nhân {'bệnh nhân': -50.07574753883376, 'chúng ta': -158.29135002320388, 'có thể': -107.72598994573337, 'người': -3794.5236366892013, 'Việt Nam': -inf} predict: bệnh nhân
bệnh nhân {'bệnh nhân': -46.087144163077305, 'chúng ta': -155.3776188505524, 'có thể': -103.97278123180602, 'người': -3708.76944684147, 'Việt Nam': -inf} predict: bệnh nhân
bệnh nhân {'bệnh nhân': -46.492659333402, 'chúng ta': -121.96694721736583, 'có 

In [117]:
# Per-class accuracy = correctly classified / number of test utterances
for cname in class_names:
    accuracy = n_correct[cname] / n_test[cname]
    print('Accuracy:', cname, accuracy)

Accuracy: bệnh nhân 0.8666666666666667
Accuracy: chúng ta 0.75
Accuracy: có thể 0.896551724137931
Accuracy: người 0.875
Accuracy: Việt Nam 0.9


In [118]:
# Number of held-out test utterances per class
print(n_test)

{'bệnh nhân': 30, 'chúng ta': 24, 'có thể': 29, 'người': 32, 'Việt Nam': 30}


In [124]:
# Inspect the trained transition matrix of the 'người' model, rounded to 2 decimals
np.around(models['người'].transmat_, 2)

array([[0.86, 0.03, 0.11, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.76, 0.24, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.74, 0.07, 0.19, 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.72, 0.27, 0.01, 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.78, 0.09, 0.13, 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.8 , 0.  , 0.2 , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.87, 0.13, 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.79, 0.21],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  ]])