# DEEE725 Speech Signal Processing Lab
### 2023 Spring, Kyungpook National University 
### Instructor: Gil-Jin Jang

## Lab 01 Korean digit recognition using python-hmmlearn
version 2, 2023/03/24
source: [jayaram1125's github repository](https://github.com/jayaram1125/Single-Word-Speech-Recognition-using-GMM-HMM-)

__update description:__

1. assigns sound files 8 and 9 for test out of 0...9, the rest (0...7) are for training
    no random selection for reproducibility
2. folder structure change

> segmented/${username}/${dnum}/kdigits${trial}-${dnum}.wav
> > for example, for user "gjang", digit 2, recording trial 0 (1st)
> > "segmented/gjang/2/kdigits0-2.wav"

In [1]:
# import necessary packages
import numpy as np
import matplotlib.pyplot as plt
#from scikits.talkbox.features import mfcc
#librosa.feature.mfcc(*, y=None, sr=22050, S=None, n_mfcc=20, dct_type=2, norm='ortho', lifter=0, **kwargs)[source]
from librosa.feature import mfcc
from scipy.io import wavfile
from hmmlearn import hmm
import numpy as np
import os
import warnings
import scipy.stats as sp
from time import time

warnings.filterwarnings("ignore")

__hyperparameters__ - CHANGE THEM TO IMPROVE PERFORMANCE
1. number of MFCC (feature dimension), try `num_mfcc` 6, 10, 13

2. Parameters needed to train GMMHMM: number of HMM states, number of Gaussian mixtures, diagonal or full covariance matrix, etc.

In [2]:
# 1. number of MFCC (feature dimension)
num_mfcc = 6
#num_mfcc = 10
#num_mfcc = 13
# 2. Parameters needed to train GMMHMM
m_num_of_HMMStates = 3  # number of states
m_num_of_mixtures = 2  # number of mixtures for each hidden state
m_covarianceType = 'diag'  # covariance type
m_n_iter = 10  # number of iterations
m_bakisLevel = 2

In [3]:
# extract MFCC features
def extmfcc(file):
    samplerate, d = wavfile.read(file)
    #features.append(mfcc(d, nwin=int(samplerate * 0.03), fs=samplerate, nceps= 6)[0])
    x = np.float32(d)
    hop=samplerate//100
    mc = mfcc(y=x, sr=samplerate, n_mfcc=num_mfcc, hop_length=hop, win_length=hop*2)
    return np.transpose(mc, (1,0))

__load data files__

1. find files: 
    for user `"gjang"`, digit 2, recording trial 0 (1st)
    `"segmented/gjang/2/kdigits0-2.wav"`
2. extract MFCC features for training and testing
    for each digit, indexes 4 and 9 for test, and the rest for training

In [4]:
#fpaths = []
#labels = []
spoken = []
m_trainingsetfeatures = []
m_trainingsetlabels = []
m_testingsetfeatures = []
m_testingsetlabels = []
n_folds = 5   # 0...3 for training, 4 for testing

apath = 'segmented'
count = 0
for username in os.listdir(apath):
    apath2 = apath + '/' + username    # example: segmented/gjang
    for ii in range(10):   #dnum in os.listdir(apath2):
        dnum = str(ii)
        apath3 = apath2 + '/' + dnum     # example: segmented/gjang/2
        if dnum not in spoken:
            spoken.append(dnum)
        for trial in range(10):
            file = apath3 + '/' + "kdigits{}-{}.wav".format(trial,dnum)      # segmented/gjang/2/kdigits0-2.wav
            mc = extmfcc(file)

            # display file names for the first 20 files only
            count += 1
            if count <= 20:
                print(file, dnum, end=' '); print(mc.shape, end=' ')
            elif count == 21:
                print('...'); print('')

            # 0...3 for training, 4 for testing
            if trial % n_folds == (n_folds-1):
                if count <= 20: print('testing')
                m_testingsetfeatures.append(mc)
                m_testingsetlabels.append(dnum)
            else:
                if count <= 20: print('training')
                m_trainingsetfeatures.append(mc)
                m_trainingsetlabels.append(dnum)


print('Words spoken:', spoken)
#print("number of labels and features = %d, %d" % ( len(labels), len(features) ))
#print("feature shape = ", end='')
#print(features[0].shape)

segmented/gjang/0/kdigits0-0.wav 0 (266, 6) training
segmented/gjang/0/kdigits1-0.wav 0 (222, 6) training
segmented/gjang/0/kdigits2-0.wav 0 (291, 6) training
segmented/gjang/0/kdigits3-0.wav 0 (216, 6) training
segmented/gjang/0/kdigits4-0.wav 0 (228, 6) testing
segmented/gjang/0/kdigits5-0.wav 0 (270, 6) training
segmented/gjang/0/kdigits6-0.wav 0 (291, 6) training
segmented/gjang/0/kdigits7-0.wav 0 (403, 6) training
segmented/gjang/0/kdigits8-0.wav 0 (320, 6) training
segmented/gjang/0/kdigits9-0.wav 0 (318, 6) testing
segmented/gjang/1/kdigits0-1.wav 1 (231, 6) training
segmented/gjang/1/kdigits1-1.wav 1 (129, 6) training
segmented/gjang/1/kdigits2-1.wav 1 (212, 6) training
segmented/gjang/1/kdigits3-1.wav 1 (262, 6) training
segmented/gjang/1/kdigits4-1.wav 1 (189, 6) testing
segmented/gjang/1/kdigits5-1.wav 1 (235, 6) training
segmented/gjang/1/kdigits6-1.wav 1 (231, 6) training
segmented/gjang/1/kdigits7-1.wav 1 (281, 6) training
segmented/gjang/1/kdigits8-1.wav 1 (338, 6) train

In [5]:
# gjang: shuffling the data (x)
# c = list(zip(features, labels))
# np.random.shuffle(c)
# features,labels = zip(*c)

In [6]:
# test and training for 100 files
ntest  = len(m_testingsetlabels)
ntrain = len(m_trainingsetlabels)
nfiles = ntest + ntrain

print("[training] number of labels and features = %d, %d" % 
        ( len(m_trainingsetlabels), len(m_trainingsetfeatures)) )
print("[test] number of labels and features = %d, %d" % 
        ( len(m_testingsetlabels), len(m_testingsetfeatures)) )

print ('Loading data completed')

[training] number of labels and features = 80, 80
[test] number of labels and features = 20, 20
Loading data completed


In [7]:
# model initialization
gmmhmmindexdict = {}
index = 0
for word in spoken:
    gmmhmmindexdict[word] = index
    index = index +1

def initByBakis(inumstates, ibakisLevel):
    startprobPrior = np.zeros(inumstates)
    startprobPrior[0: ibakisLevel - 1] = 1/float((ibakisLevel - 1))
    transmatPrior = getTransmatPrior(inumstates, ibakisLevel)
    return startprobPrior, transmatPrior

def getTransmatPrior(inumstates, ibakisLevel):
    transmatPrior = (1 / float(ibakisLevel)) * np.eye(inumstates)

    for i in range(inumstates - (ibakisLevel - 1)):
        for j in range(ibakisLevel - 1):
            transmatPrior[i, i + j + 1] = 1. / ibakisLevel

    for i in range(inumstates - ibakisLevel + 1, inumstates):
        for j in range(inumstates - i - j):
            transmatPrior[i, i + j] = 1. / (inumstates - i)

    return transmatPrior

m_startprobPrior ,m_transmatPrior = initByBakis(m_num_of_HMMStates,m_bakisLevel)

print("StartProbPrior=")
print(m_startprobPrior)

print("TransMatPrior=")
print(m_transmatPrior)

StartProbPrior=
[1. 0. 0.]
TransMatPrior=
[[0.5 0.5 0. ]
 [0.  0.5 0.5]
 [0.  0.  1. ]]


In [8]:
# acoustic model definition
class SpeechModel:
    def __init__(self,Class,label):
        self.traindata = np.zeros((0,num_mfcc))
        self.Class = Class
        self.label = label
        self.model  = hmm.GMMHMM(n_components = m_num_of_HMMStates, n_mix = m_num_of_mixtures, \
                transmat_prior = m_transmatPrior, startprob_prior = m_startprobPrior, \
                covariance_type = m_covarianceType, n_iter = m_n_iter)

In [9]:
# training GMMHMM Models 
start = time()

speechmodels = [None] * len(spoken)
for key in gmmhmmindexdict:
    speechmodels[gmmhmmindexdict[key]] = SpeechModel(gmmhmmindexdict[key],key)

for i in range(0,len(m_trainingsetfeatures)):
     for j in range(0,len(speechmodels)):
         if int(speechmodels[j].Class) == int(gmmhmmindexdict[m_trainingsetlabels[i]]):
            speechmodels[j].traindata = np.concatenate((speechmodels[j].traindata , m_trainingsetfeatures[i]))

for speechmodel in speechmodels:
    speechmodel.model.fit(speechmodel.traindata)

print ('Training completed -- {0} GMM-HMM models are built for {0} different types of words'.format(len(spoken)))
print('time elapsed: %.2f seconds' % ( time() - start ))
print (" "); print(" ")

Training completed -- 10 GMM-HMM models are built for 10 different types of words
time elapsed: 19.78 seconds
 
 


In [10]:
# testing
print("Prediction started")
m_PredictionlabelList = []

for i in range(0,len(m_testingsetfeatures)):
    scores = []
    for speechmodel in speechmodels:
         scores.append(speechmodel.model.score(m_testingsetfeatures[i]))
    id  = scores.index(max(scores))
    m_PredictionlabelList.append(speechmodels[id].Class)
    print(str(np.round(scores, 3)) + " " + str(max(np.round(scores, 3))) +" "+":"+ speechmodels[id].label)

accuracy = 0.0
count = 0
print("")
print("Prediction for Testing DataSet:")

for i in range(0,len(m_testingsetlabels)):
    print( "Label"+str(i+1)+":"+m_testingsetlabels[i])
    if gmmhmmindexdict[m_testingsetlabels[i]] == m_PredictionlabelList[i]:
       count = count+1

accuracy = 100.0*count/float(len(m_testingsetlabels))

print("")
print("accuracy ="+str(accuracy))
print("")

# end of testing

Prediction started
[-4972.822 -5302.009 -5280.65  -5228.55  -5381.846 -5190.518 -5077.571
 -5355.306 -5086.189 -5273.42 ] -4972.822 :0
[-6872.99  -7303.117 -7337.985 -7135.205 -7380.618 -7080.457 -6988.068
 -7373.42  -6948.682 -7168.201] -6872.99 :0
[-4522.783 -4198.668 -4653.116 -4684.291 -4892.336 -4334.099 -4364.667
 -4742.523 -4443.112 -4515.993] -4198.668 :1
[-6881.595 -6569.895 -6997.281 -7085.501 -7312.743 -6687.811 -6699.438
 -7119.151 -6778.356 -6887.102] -6569.895 :1
[-4033.262 -4164.371 -3936.343 -4099.433 -4092.416 -4089.772 -4054.873
 -4099.949 -4054.728 -4367.528] -3936.343 :2
[-5364.    -5456.338 -5206.391 -5401.502 -5441.407 -5331.168 -5297.741
 -5382.797 -5347.156 -5513.784] -5206.391 :2
[-4054.901 -4325.412 -4170.382 -4065.937 -4057.112 -4168.681 -4082.255
 -4226.582 -4143.508 -4270.511] -4054.901 :0
[-5682.098 -5925.023 -5721.647 -5814.41  -5762.081 -5814.923 -5739.34
 -5871.699 -5761.708 -5971.43 ] -5682.098 :0
[-3566.295 -3918.934 -3731.218 -3619.126 -3486.782 -368

## End of Lab 01