In [297]:
import librosa
import numpy as np
import os
import math
from sklearn.cluster import KMeans
import hmmlearn.hmm

## lấy mfcc của file wav

In [298]:
# read file
def get_mfcc(file_path):
    y, sr = librosa.load(file_path) # read .wav file
    hop_length = math.floor(sr*0.010) # 10ms hop
    win_length = math.floor(sr*0.025) # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y, sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # substract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1,1)) 
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0) # O^r
    # return T x 36 (transpose of X)
    return X.T # hmmlearn use T x N matrix

## lấy mfcc của tất cả các file trong dir

In [299]:
# lấy mfcc của tất cả các file wav trong wav
def get_class_data(data_dir):
    files = os.listdir(data_dir)
    mfcc = [get_mfcc(os.path.join(data_dir,f)) for f in files if f.endswith(".wav")]
    return mfcc

## Clustering

In [300]:
def clustering(X, n_clusters=13):
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0, verbose=0)
    kmeans.fit(X)
    print("centers", kmeans.cluster_centers_.shape)
    return kmeans

## Data

In [301]:
class_names = ['bệnh nhân', 'chúng ta', 'có thể', 'người','Việt Nam']
dataset = {}
for cname in class_names:
    print(f"Load {cname} dataset")
    dataset[cname] = get_class_data(os.path.join(cname))

Load bệnh nhân dataset
Load chúng ta dataset
Load có thể dataset
Load người dataset
Load Việt Nam dataset


In [302]:
print(len(dataset['Việt Nam']))

100


In [303]:
# Get all vectors in the datasets
all_vectors = np.concatenate([np.concatenate(v, axis=0) for k, v in dataset.items()], axis=0)
print("vectors", all_vectors.shape)
# Run K-Means algorithm to get clusters
kmeans = clustering(all_vectors)
print("centers", kmeans.cluster_centers_.shape)
print(kmeans)

vectors (18138, 36)
centers (13, 36)
centers (13, 36)
KMeans(n_clusters=13, n_init=50, random_state=0)


## split train test

In [304]:
trainset = {}
testset = {}
n_test = {'bệnh nhân': 0, 'chúng ta': 0, 'có thể': 0, 'người': 0,'Việt Nam': 0}
for cname in class_names:
    dataset[cname] = list([kmeans.predict(v).reshape(-1, 1) for v in dataset[cname]])
    n = len(dataset[cname])
    n_train = math.floor(n*0.7)
    trainset[cname] = dataset[cname][:n_train]
    testset[cname] = dataset[cname][n_train:]
    n_test[cname] += len(testset[cname])
    
print(len(trainset['Việt Nam']))

70


# Train

In [305]:
models = {}

## Models cho 'bệnh nhân' 6x3

In [306]:
hmm = hmmlearn.hmm.MultinomialHMM(n_components=6*3, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')
hmm.startprob_ = np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
hmm.transmat_ =np.array([
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ],])

Xbn = np.concatenate(trainset['bệnh nhân'])
lengths = list([len(x) for x in trainset['bệnh nhân']])
print("training class", 'bệnh nhân')
print(Xbn.shape, lengths, len(lengths))
hmm.fit(Xbn, lengths=lengths)
models['bệnh nhân'] = hmm
print("Training done")

training class bệnh nhân
(2832, 1) [37, 37, 40, 38, 47, 35, 41, 41, 44, 32, 35, 28, 32, 44, 47, 33, 34, 45, 51, 41, 62, 40, 44, 47, 52, 39, 54, 35, 41, 43, 47, 41, 51, 39, 38, 38, 43, 44, 55, 45, 26, 29, 43, 33, 30, 37, 30, 45, 42, 42, 45, 41, 44, 52, 41, 43, 41, 36, 38, 46, 46, 34, 33, 35, 37, 28, 30, 45, 36, 44] 70


         1       -7011.5751             +nan
         2       -5196.1622       +1815.4130
         3       -4536.9018        +659.2603
         4       -4233.5116        +303.3903
         5       -4085.1570        +148.3546
         6       -4013.4692         +71.6878
         7       -3972.7949         +40.6742
         8       -3953.3399         +19.4551
         9       -3941.2277         +12.1122
        10       -3928.6214         +12.6063
        11       -3912.2816         +16.3397
        12       -3893.5142         +18.7674
        13       -3874.4352         +19.0790
        14       -3860.2609         +14.1744
        15       -3850.1875         +10.0733
        16       -3839.4714         +10.7161
        17       -3825.5035         +13.9679
        18       -3810.9672         +14.5363
        19       -3800.9368         +10.0304
        20       -3795.9363          +5.0005
        21       -3793.1929          +2.7434
        22       -3791.3413          +1.8516
        23

Training done


       125       -3718.5438          +0.0096


## Models cho 'chúng ta' 5x3

In [307]:
hmm = hmmlearn.hmm.MultinomialHMM(n_components=5*3, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')
hmm.startprob_ = np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ])
hmm.transmat_ =np.array([
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ],])

Xct = np.concatenate(trainset['chúng ta'])
lengths = list([len(x) for x in trainset['chúng ta']])
print("training class", 'chúng ta')
print(Xct.shape, lengths, len(lengths))
hmm.fit(Xct, lengths=lengths)
models['chúng ta'] = hmm
print("Training done")

         1       -6958.9586             +nan
         2       -5055.1134       +1903.8452
         3       -4197.3881        +857.7253


training class chúng ta
(2759, 1) [41, 38, 28, 46, 47, 36, 43, 34, 36, 42, 36, 40, 36, 37, 30, 40, 36, 41, 36, 37, 36, 42, 40, 35, 39, 38, 38, 41, 36, 44, 38, 59, 49, 37, 31, 43, 52, 42, 50, 48, 46, 54, 55, 47, 45, 35, 38, 38, 42, 38, 43, 33, 35, 32, 42, 32, 39, 41, 40, 37, 35, 37, 47, 31, 34, 36, 34, 34, 31, 30] 70


         4       -3703.6214        +493.7667
         5       -3552.6837        +150.9377
         6       -3492.2620         +60.4218
         7       -3465.2442         +27.0177
         8       -3451.3295         +13.9148
         9       -3441.5688          +9.7606
        10       -3435.3430          +6.2258
        11       -3431.5076          +3.8354
        12       -3428.7012          +2.8064
        13       -3426.2123          +2.4889
        14       -3423.8977          +2.3146
        15       -3421.9507          +1.9470
        16       -3420.5486          +1.4021
        17       -3419.6401          +0.9086
        18       -3419.0453          +0.5947
        19       -3418.6152          +0.4302
        20       -3418.2713          +0.3439
        21       -3417.9808          +0.2904
        22       -3417.7302          +0.2506
        23       -3417.5115          +0.2187
        24       -3417.3176          +0.1939
        25       -3417.1407          +0.1768
        26

Training done


        56       -3413.1841          +0.0078


## Models cho 'có thể' 4x3

In [308]:
hmm = hmmlearn.hmm.MultinomialHMM(n_components=4*3, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')
hmm.startprob_ = np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ])
hmm.transmat_ =np.array([
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ],])

Xcth = np.concatenate(trainset['có thể'])
lengths = list([len(x) for x in trainset['có thể']])
print("training class", 'có thể')
print(Xcth.shape, lengths, len(lengths))
hmm.fit(Xcth, lengths=lengths)
models['có thể'] = hmm
print("Training done")

         1       -6095.4031             +nan
         2       -5072.3400       +1023.0631


training class có thể
(2537, 1) [34, 31, 56, 39, 44, 40, 46, 35, 45, 47, 37, 56, 36, 31, 39, 35, 42, 30, 30, 37, 36, 39, 26, 34, 47, 36, 41, 34, 26, 30, 25, 27, 16, 39, 27, 52, 25, 27, 20, 22, 36, 53, 44, 44, 28, 57, 53, 36, 37, 30, 34, 38, 51, 43, 40, 47, 38, 44, 102, 44, 52, 56, 35, 37, 39] 65


         3       -4228.5706        +843.7694
         4       -3842.7625        +385.8081
         5       -3687.5667        +155.1958
         6       -3637.2827         +50.2840
         7       -3615.9657         +21.3170
         8       -3604.8262         +11.1394
         9       -3598.2582          +6.5681
        10       -3593.4904          +4.7677
        11       -3588.8575          +4.6329
        12       -3583.5655          +5.2920
        13       -3577.6057          +5.9598
        14       -3571.4727          +6.1331
        15       -3565.6251          +5.8475
        16       -3559.9546          +5.6705
        17       -3553.0418          +6.9128
        18       -3541.0475         +11.9943
        19       -3522.1011         +18.9465
        20       -3504.6317         +17.4694
        21       -3493.8637         +10.7680
        22       -3486.8209          +7.0428
        23       -3480.6733          +6.1477
        24       -3473.6546          +7.0187
        25

Training done





## Models cho 'người' 3x3

In [309]:
hmm = hmmlearn.hmm.MultinomialHMM(n_components=3*3, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')
hmm.startprob_ = np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ])
hmm.transmat_ =np.array([
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ],])

Xcth = np.concatenate(trainset['người'])
lengths = list([len(x) for x in trainset['người']])
print("training class", 'người')
print(Xcth.shape, lengths, len(lengths))
hmm.fit(Xcth, lengths=lengths)
models['người'] = hmm
print("Training done")

         1       -4760.8928             +nan
         2       -3677.8784       +1083.0143
         3       -3353.1064        +324.7720

training class người
(1951, 1) [29, 27, 25, 18, 18, 19, 20, 22, 26, 23, 31, 30, 21, 19, 18, 22, 15, 29, 26, 19, 22, 51, 37, 42, 37, 18, 32, 62, 26, 32, 21, 28, 43, 23, 25, 36, 42, 20, 18, 20, 21, 38, 22, 22, 50, 18, 18, 21, 16, 19, 23, 19, 23, 17, 24, 27, 21, 21, 13, 22, 36, 35, 26, 30, 35, 28, 18, 58, 23, 35, 25, 27, 28] 73



         4       -3197.4247        +155.6818
         5       -2993.0277        +204.3970
         6       -2823.6435        +169.3841
         7       -2784.4078         +39.2357
         8       -2737.6359         +46.7719
         9       -2690.7454         +46.8905
        10       -2649.3634         +41.3820
        11       -2620.5414         +28.8221
        12       -2612.5240          +8.0173
        13       -2609.6135          +2.9105
        14       -2607.0513          +2.5622
        15       -2604.3679          +2.6834
        16       -2601.5267          +2.8412
        17       -2598.7624          +2.7643
        18       -2596.5768          +2.1856
        19       -2594.6021          +1.9746
        20       -2592.3633          +2.2389
        21       -2590.1689          +2.1944
        22       -2588.4626          +1.7063
        23       -2587.3471          +1.1155
        24       -2586.6716          +0.6755
        25       -2586.2234          +0.4482
        2

Training done


        33       -2582.9959          +0.0375
        34       -2582.9731          +0.0228
        35       -2582.9580          +0.0150
        36       -2582.9474          +0.0106
        37       -2582.9394          +0.0080


## Models cho 'Việt Nam' 6x3

In [310]:
hmm = hmmlearn.hmm.MultinomialHMM(n_components=6*3, random_state=0, n_iter=1000, verbose=True, init_params='e', params='ste')
hmm.startprob_ = np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
hmm.transmat_ =np.array([
    [0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, 0.0, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.2, 0.1, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7, 0.3, ],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ],])

Xvn = np.concatenate(trainset['Việt Nam'])
lengths = list([len(x) for x in trainset['Việt Nam']])
print("training class", 'Việt Nam')
print(Xvn.shape, lengths, len(lengths))
hmm.fit(Xvn, lengths=lengths)
models['Việt Nam'] = hmm
print("Training done")

training class Việt Nam
(2648, 1) [42, 34, 48, 36, 42, 34, 37, 42, 32, 38, 35, 40, 44, 37, 38, 37, 31, 27, 29, 32, 28, 22, 26, 32, 37, 39, 27, 33, 29, 41, 34, 30, 31, 36, 26, 27, 40, 27, 37, 26, 59, 53, 46, 32, 35, 30, 57, 36, 35, 60, 34, 57, 35, 44, 33, 39, 31, 28, 35, 43, 57, 56, 47, 44, 44, 47, 42, 41, 44, 41] 70


         1       -6501.7665             +nan
         2       -4759.6760       +1742.0905
         3       -4151.0491        +608.6269
         4       -3825.0691        +325.9800
         5       -3649.1205        +175.9486
         6       -3555.4147         +93.7058
         7       -3497.3970         +58.0177
         8       -3454.2361         +43.1609
         9       -3418.8316         +35.4045
        10       -3384.7485         +34.0831
        11       -3352.1801         +32.5685
        12       -3328.1147         +24.0654
        13       -3308.5302         +19.5844
        14       -3290.1281         +18.4021
        15       -3279.3157         +10.8124
        16       -3266.3179         +12.9978
        17       -3263.0295          +3.2884
        18       -3260.9108          +2.1186
        19       -3259.4921          +1.4187
        20       -3258.4548          +1.0373
        21       -3257.6254          +0.8294
        22       -3256.9501          +0.6753
        23

Training done


        52       -3251.5609          +0.0096


# Test

In [311]:
print("Testing")
n_correct = {'bệnh nhân': 0, 'chúng ta': 0, 'có thể': 0, 'người': 0,'Việt Nam': 0}
for true_cname in class_names:
    for O in testset[true_cname]:
        score = {cname: model.score(O, [len(O)]) for cname, model in models.items()}
        if (true_cname == max(score, key=score.get)): n_correct[true_cname] += 1
        print(true_cname, score, 'predict:', max(score, key=score.get))

Testing
bệnh nhân {'bệnh nhân': -52.43462965196605, 'chúng ta': -256.7931257055144, 'có thể': -209.72691917386913, 'người': -90.35889120439035, 'Việt Nam': -216.0088893987384} predict: bệnh nhân
bệnh nhân {'bệnh nhân': -58.430040134692625, 'chúng ta': -218.9584431985581, 'có thể': -187.03128277863706, 'người': -129.72944240076544, 'Việt Nam': -232.8922633066236} predict: bệnh nhân
bệnh nhân {'bệnh nhân': -76.46892974091331, 'chúng ta': -395.2221025855082, 'có thể': -584.5122627027931, 'người': -451.9087225094313, 'Việt Nam': -240.45472925483497} predict: bệnh nhân
bệnh nhân {'bệnh nhân': -146.61560667824497, 'chúng ta': -366.63743702932885, 'có thể': -186.42636198030252, 'người': -1174.36785074458, 'Việt Nam': -408.7738788355435} predict: bệnh nhân
bệnh nhân {'bệnh nhân': -55.8888364619792, 'chúng ta': -256.94447070504555, 'có thể': -317.69942261982607, 'người': -775.1884850384574, 'Việt Nam': -497.61389368082155} predict: bệnh nhân
bệnh nhân {'bệnh nhân': -53.776073698538, 'chúng ta':

In [312]:
for cname in class_names:
    print('Accuracy:', cname, n_correct[cname]/n_test[cname])

Accuracy: bệnh nhân 1.0
Accuracy: chúng ta 0.9666666666666667
Accuracy: có thể 0.896551724137931
Accuracy: người 0.96875
Accuracy: Việt Nam 0.9666666666666667


In [313]:
print('All Accuracy:', sum(n_correct.values())/sum(n_test.values()))

All Accuracy: 0.9602649006622517


In [314]:
print(n_test)

{'bệnh nhân': 30, 'chúng ta': 30, 'có thể': 29, 'người': 32, 'Việt Nam': 30}


In [315]:
np.around(models['người'].transmat_, 2)

array([[0.84, 0.15, 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.81, 0.19, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.67, 0.14, 0.19, 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.78, 0.03, 0.18, 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.83, 0.06, 0.11, 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.77, 0.23, 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.74, 0.1 , 0.15],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  ]])