# I. Declare dependencies

In [98]:
!pip install dtw
!pip install hmmlearn



In [99]:
import matplotlib.pyplot as plt
import matplotlib.style as ms
ms.use('seaborn-muted')
%matplotlib inline

import glob
import os

import numpy as np
import pandas as pd

import librosa
import librosa.display
import IPython.display

In [100]:
class Config:
    n_mfcc = 13
    hop_length = 220
    n_frame = 12
    sr = 22050
    n_fft = 512
    
c = Config()

# II. Extrac MFCC features / Segmantal k-means

Use the collected data (in groups), extracting the  MFCC  feature (39 features, including  MFCC, delta, deltadelta) of each voice command / number.

#### 2.1 Set path, create dataframe

Create list contain all paths to my group member

In [101]:
all_member = ['./19021372_BuiVanToan', './19021381_NguyenVanTu', './19020093_HoangHuuBach', './19021217_DinhVietAnh'
            './19021384_NguyenManhTuan', './19021396_PhamThanhVinh', './19021347_BuiThuPhuong', './19021354_HoangMinhQuang']

Create dataframe

In [102]:
# audio file data, hand-labeled with audacity software, 
# has a .wav audio file name with the same name as its label .txt file
df = pd.DataFrame(columns=["fid", "label", "start", "end"])

In [103]:
df

Unnamed: 0,fid,label,start,end


In [104]:
for mname in all_member:
    df_mi = pd.DataFrame(columns=["fid", "label", "start", "end"])
    data_dir = mname
    label_fnames = glob.glob(os.path.join(data_dir, "*.txt"))
    label_fnames = [os.path.basename(label_fname) for label_fname in label_fnames]

    for fname in label_fnames:
        df_i = pd.read_csv(os.path.join(data_dir, fname), sep="\t", header=None)
        df_i = df_i.rename(columns={2:"label", 0:"start", 1:"end"})
        df_i["fid"] = [mname + '\\' + fname.split('.')[0]]*len(df_i)
        df_mi = pd.concat([df_mi, df_i], axis=0, ignore_index=True)

    df = pd.concat([df, df_mi], axis=0, ignore_index=True)

In [105]:
df.head(3)

Unnamed: 0,fid,label,start,end
0,./19021372_BuiVanToan\1-10,sil,0.0,2.426485
1,./19021372_BuiVanToan\1-10,7,2.426485,2.821224
2,./19021372_BuiVanToan\1-10,tram,3.181134,3.657143


Check unique label in dataframe, there are 20

In [106]:
labels = list(df['label'].unique())
print(len(labels), len(labels)== 20 )
print(labels)

30 False
['sil', '7', 'tram', '6', 'muoi', '9', 'trieu', '8', '3', '2', 'nghin', 'linh', '4', '5', 'tu', 'mot', 'm1', 'lam', '1', '0', 'tram ', '3   ', '9  ', 'nghin ', 'sli', ' tram', 'nam', nan, 'ngin', '8\\']


Correct the wrong label name

In [107]:
df['label'].replace('lnh', 'linh', inplace=True)
df['label'].replace('trsm', 'tram', inplace=True)
df['label'].replace('sli', 'sil', inplace=True)
df['label'].replace('tram ', 'tram', inplace=True)
df['label'].replace('3   ', '3', inplace=True)
df['label'].replace('9  ', '9', inplace=True)
df['label'].replace(' tram', 'tram', inplace=True)
df['label'].replace('nam', '5', inplace=True)
df['label'].replace('ngin', 'nghin', inplace=True)
df['label'].replace('nghin ', 'nghin', inplace=True)
df['label'].replace('8\\', '8', inplace=True)

df.dropna(subset = ["label"], inplace=True)

In [108]:
labels = list(df['label'].unique())
print(len(labels), len(labels)== 20 )
print(labels)

20 True
['sil', '7', 'tram', '6', 'muoi', '9', 'trieu', '8', '3', '2', 'nghin', 'linh', '4', '5', 'tu', 'mot', 'm1', 'lam', '1', '0']


#### 2.2 Func to calculate mfcc for each 'word'/'sound'

In [109]:
# Get unique file names (fids)
fids = df['fid'].unique()

In [110]:
def get_mfcc_features(x, sound, sr, hop_length):
#   print(sound.shape)
    # start sample, end sample, and calculate mfcc in the interval (s, e)
    s, e = int(np.floor(x.loc["start"]*sr)), int(np.ceil(x.loc["end"]*sr))
#     if e - s < 1024 + 512:
#         print(e-s)
#     # print(e - s)
    mfcc = librosa.feature.mfcc(y=sound[s:e], sr=sr, n_mfcc=c.n_mfcc, hop_length=hop_length, n_fft=512)

    delta = librosa.feature.delta(mfcc, width=3)
    delta_2 = librosa.feature.delta(mfcc, order=2, width=3)
    features = np.concatenate((mfcc, delta, delta_2))
    
    # The 'features' calculated array is of the form (39, 74), whose mfcc features are vectors of length 39
    # we will transpose the matrix
    return features.T

#### 2.3 Calculate MFCC over dataframe

In [111]:
# del df['mfcc_origin']

In [112]:
for fid in fids:
    sound_file_path = os.path.join(fid+".wav")
    # optimize calculation speed by read files only once
    sound, sr = librosa.load(sound_file_path)
    dfi = df[df["fid"] == fid]
    
    # store mfcc in 
    # 'apply' function is used dataframe along data vertical axis
    df.loc[df["fid"] == fid, "mfcc_origin"] = dfi.apply(get_mfcc_features, args=(sound, c.sr, c.hop_length), axis=1)

In [113]:
df.head(3)

Unnamed: 0,fid,label,start,end,mfcc_origin
0,./19021372_BuiVanToan\1-10,sil,0.0,2.426485,"[[-296.9526, 123.99669, -0.2854234, 9.288038, ..."
1,./19021372_BuiVanToan\1-10,7,2.426485,2.821224,"[[-378.6088, 114.13159, 6.1494083, 7.1754503, ..."
2,./19021372_BuiVanToan\1-10,tram,3.181134,3.657143,"[[-488.46213, 128.77538, -14.592409, 19.957142..."


In [114]:
df.isnull().sum()

fid            0
label          0
start          0
end            0
mfcc_origin    0
dtype: int64

#### 2.4 Function to normalize MFCC.

In [115]:
def preprocess_mfcc(mfcc):
    # distance normalization (39 dimensional data)
    # return librosa.util.normalize(mfcc, axis=1)
    return librosa.util.normalize(mfcc, norm=1)

#### 2.5 Normalize MFCC over dataframe

In [116]:
df['mfcc_norma'] = df['mfcc_origin'].apply(preprocess_mfcc)

In [117]:
df.head(3)

Unnamed: 0,fid,label,start,end,mfcc_origin,mfcc_norma
0,./19021372_BuiVanToan\1-10,sil,0.0,2.426485,"[[-296.9526, 123.99669, -0.2854234, 9.288038, ...","[[-0.0034401976, 0.0034305984, -6.283296e-05, ..."
1,./19021372_BuiVanToan\1-10,7,2.426485,2.821224,"[[-378.6088, 114.13159, 6.1494083, 7.1754503, ...","[[-0.03010174, 0.016774641, 0.004588213, 0.007..."
2,./19021372_BuiVanToan\1-10,tram,3.181134,3.657143,"[[-488.46213, 128.77538, -14.592409, 19.957142...","[[-0.034174513, 0.021232355, -0.006877562, 0.0..."


### 2.6 Reduce the number of states(frame) of MFCC (Segmental k-means).
Shape of MFCC (frame, band)

### 2.7 Create state dict for 'word'/'sound'

In [118]:
i_th = 12
print("label: ", df['label'].iloc[i_th])
m = df['mfcc_origin'].iloc[i_th]

label:  nghin


In [119]:
m.shape

(51, 39)

In [120]:
n_frames = []
n_labels = list(df['label'])

for i in range(len(df)):
    f, b = df['mfcc_origin'].iloc[i].shape
    n_frames.append(f)

In [121]:
len(n_frames)

9770

In [122]:
state_dict = {}

from collections import Counter

for l in set(labels):
    if l == 'sil':
        continue
        
    a = []
    for i in range(len(df)):
        if n_labels[i] == l:
            # print(n_labels[i], ":", n_frames[i])
            a.append(n_frames[i])
    
    state_dict[l] = int(np.ceil(np.mean(a)) - 9)
    
state_dict['sil'] = 38

In [123]:
state_dict

{'linh': 20,
 '9': 25,
 '6': 29,
 '5': 20,
 '2': 25,
 '7': 20,
 '3': 21,
 '4': 20,
 'mot': 16,
 '0': 26,
 'tram': 23,
 'tu': 24,
 '1': 19,
 'muoi': 15,
 '8': 21,
 'lam': 21,
 'trieu': 29,
 'nghin': 27,
 'm1': 19,
 'sil': 38}

In [124]:
state_dict = {'linh': 18,
 '9': 21,
 '6': 24,
 '5': 21,
 '2': 21,
 '7': 20,
 '3': 21,
 '4': 20,
 'mot': 16,
 '0': 22,
 'tram': 20,
 'tu': 24,
 '1': 19,
 'muoi': 15,
 '8': 21,
 'lam': 21,
 'trieu': 24,
 'nghin': 23,
 'm1': 16,
 'sil': 38}

### 2.8 Reduce state / Segmantal K-means MFCCs

In [125]:
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans

In [126]:
def reduce_mfcc_state(mfcc_norma, label, state_dict):
    
    if label == 'sil':
        return mfcc_norma
    
    n_state, b = mfcc_norma.shape
    if n_state > state_dict[label] :
        n_state = state_dict[label]
    
    clustering = KMeans(n_clusters=n_state,
#          assign_labels='discretize',
         random_state=0).fit(mfcc_norma)
    
    state = list(clustering.labels_)
    state.append(None)

    # lấy ra mảng chỉ mục của các frame giống nhau ở cạnh nhau
    idx_arr = []
    idx_arr_of_states = []
    for i in range(len(state)-1):
        idx_arr.append(i)

        if state[i] != state[i+1]:
            idx_arr_of_states.append(idx_arr)
            idx_arr = []
    
    # tính mean các frame giống nhau, là 1 frame mới
    mfcc_feat = []
    for arr in idx_arr_of_states:
        
        mfcc_i = np.mean([mfcc_norma[idx] for idx in arr], axis=0)

        mfcc_feat.append(mfcc_i)
        
    return np.array(mfcc_feat)

Check the reduce_mfcc_state function has output same as the expected output

In [127]:
a = df['mfcc_norma'].iloc[1]
b = df['label'].iloc[1]

# first frame / expected output ( for data i_th)
t = (a[0]+a[1])/2

# real output
r = reduce_mfcc_state(a, b, state_dict)[0]

print(t)
print(r)

print(sum(t-r) == 0 )

[-0.03063438  0.02194486 -0.00625493  0.00240583  0.00709179  0.00765732
  0.02965748  0.00097861 -0.0087722  -0.01955744 -0.0106822  -0.01197451
 -0.01671942  0.0013625   0.11110691 -0.05285965  0.00851604  0.02376341
 -0.03457373 -0.00994516  0.02201542 -0.00578886 -0.01572862 -0.02038442
 -0.00365612 -0.02766761  0.04630173 -0.07330771  0.06353993  0.04092852
  0.0507398  -0.00488324 -0.06163172 -0.01466724  0.03506643  0.01801992
  0.02044023  0.00036431  0.04758816]
[-0.03063438  0.02194486 -0.00625493  0.00240583  0.00709179  0.00765732
  0.02965748  0.00097861 -0.0087722  -0.01955744 -0.0106822  -0.01197451
 -0.01671942  0.0013625   0.11110691 -0.05285965  0.00851604  0.02376341
 -0.03457373 -0.00994516  0.02201542 -0.00578886 -0.01572862 -0.02038442
 -0.00365612 -0.02766761  0.04630173 -0.07330771  0.06353993  0.04092852
  0.0507398  -0.00488324 -0.06163172 -0.01466724  0.03506643  0.01801992
  0.02044023  0.00036431  0.04758816]
True


In [128]:
# del df['mfcc_feat']

In [129]:
df['mfcc_feat'] = df.apply(lambda x: reduce_mfcc_state(x.mfcc_norma, x.label, state_dict),  axis=1)

In [130]:
df.head(3)

Unnamed: 0,fid,label,start,end,mfcc_origin,mfcc_norma,mfcc_feat
0,./19021372_BuiVanToan\1-10,sil,0.0,2.426485,"[[-296.9526, 123.99669, -0.2854234, 9.288038, ...","[[-0.0034401976, 0.0034305984, -6.283296e-05, ...","[[-0.0034401976, 0.0034305984, -6.283296e-05, ..."
1,./19021372_BuiVanToan\1-10,7,2.426485,2.821224,"[[-378.6088, 114.13159, 6.1494083, 7.1754503, ...","[[-0.03010174, 0.016774641, 0.004588213, 0.007...","[[-0.030634377, 0.021944862, -0.006254934, 0.0..."
2,./19021372_BuiVanToan\1-10,tram,3.181134,3.657143,"[[-488.46213, 128.77538, -14.592409, 19.957142...","[[-0.034174513, 0.021232355, -0.006877562, 0.0...","[[-0.0329436, 0.020918448, -0.006619167, 0.013..."


In [131]:
df['mfcc_feat'].iloc[100].shape

(28, 39)

# III. HMM for speech regconition

In [132]:
from hmmlearn import hmm

In [133]:
print(labels)

['sil', '7', 'tram', '6', 'muoi', '9', 'trieu', '8', '3', '2', 'nghin', 'linh', '4', '5', 'tu', 'mot', 'm1', 'lam', '1', '0']


### 3.1 Function to divide train/test data
Format dictinary:  ( label: [ mfcc_feats ] )

In [134]:
def buildDataDictForMFCC(df, set_labels, train_rate):
    
    trainDataDict = {}
    testDataDict = {}
    
    for l in set_labels:
        df_sl = df[df['label'] == l]
        msk = np.random.rand(len(df_sl)) < train_rate
        
        train_l = df_sl['mfcc_feat'][msk]
        test_l = df_sl['mfcc_feat'][~msk]
        
        trainDataDict[l] = np.array(train_l)
        testDataDict[l] = np.array(test_l)
        
    return trainDataDict, testDataDict

Test sum amount data of label

In [135]:
trainDataDict, testDataDict = buildDataDictForMFCC(df, labels , 0.8)

In [136]:
len(df[df['label'] == 'sil'])

1546

In [137]:
len(trainDataDict['sil'])

1212

In [138]:
len(testDataDict['sil'])

334

### 3.2 Function to train the model hmm
input is a data dictionary by each label

In [151]:
def train_GMMHMM(trainDataDict):
    
    GMMHMM_Models = {}
#     states_num = 5
#     GMM_mix_num = 6
#     tmp_p = 1.0/(states_num-2)
#     transmatPrior = np.array([[tmp_p, tmp_p, tmp_p, 0 ,0], \
#                                [0, tmp_p, tmp_p, tmp_p , 0], \
#                                [0, 0, tmp_p, tmp_p,tmp_p], \
#                                [0, 0, 0, 0.5, 0.5], \
#                                [0, 0, 0, 0, 1]],dtype=np.float64)


#     startprobPrior = np.array([0.3, 0.3, 0.1, 0, 0],dtype=np.float64)    
    
    states_num = 8
    GMM_mix_num = 6
    tmp_p = 1.0/(3)
    transmatPrior = np.array([[tmp_p, tmp_p, tmp_p,0, 0, 0 ,0, 0], \
                               [0, tmp_p, tmp_p, tmp_p, 0, 0, 0 ,0], \
                               [0, 0, tmp_p, tmp_p, tmp_p, 0, 0, 0], \
                               [0, 0, 0, tmp_p, tmp_p, tmp_p, 0, 0], \
                               [0, 0, 0, 0, tmp_p, tmp_p, tmp_p, 0], \
                               [0, 0, 0, 0, 0, tmp_p, tmp_p, tmp_p], \
                               [0, 0, 0, 0, 0, 0, 0.5, 0.5], \
                                  [0, 0, 0, 0, 0, 0, 0, 1]],dtype=np.float64)


    startprobPrior = np.array([0.3, 0.3, 0.1, 0, 0, 0, 0, 0],dtype=np.float64)    
    
    
    
    for label in trainDataDict.keys():
        model = hmm.GMMHMM(n_components=states_num, n_mix=GMM_mix_num, \
                           transmat_prior=transmatPrior, startprob_prior=startprobPrior, \
                           covariance_type='diag', n_iter=10)
        trainData = trainDataDict[label]
        length = np.zeros([len(trainData), ], dtype=np.int32)
        for m in range(len(trainData)):
            length[m] = trainData[m].shape[0]
        trainData = np.vstack(trainData)
        model.fit(trainData, lengths=length)  # get optimal parameters
        GMMHMM_Models[label] = model
    return GMMHMM_Models

Convert test data to format (label: mfcc_feat)

In [152]:
def prepare_test_data(testDataDict):
    pairOfTestData = []
    
    for l in testDataDict.keys():
        for mfcc_feat in testDataDict[l]:
            pairOfTestData.append((l, mfcc_feat))
            
    return pairOfTestData

### 3.3 Training

In [153]:
trainDataDict, testDataDict = buildDataDictForMFCC(df, labels , 0.8)
pairOfTestData = prepare_test_data(testDataDict)
print("Finish prepare the training/testing data")

hmmModels = train_GMMHMM(trainDataDict)
print("Finish training of the GMM_HMM models for voice recognition reading numbers Vietnamese.")

Finish prepare the training/testing data
Finish training of the GMM_HMM models for voice recognition reading numbers Vietnamese.


### 3.4 Testing

In [154]:
true_label = []
pred_label = []

import logging
logging.getLogger("hmmlearn").setLevel("CRITICAL")

i_th = 0

print("i_th : (True_label, pred_label)")

for l, mfcc_feat in pairOfTestData:
    scoreList = {}
    for model_label in hmmModels.keys():
        model = hmmModels[model_label]
        score = model.score(mfcc_feat)
        scoreList[model_label] = score
    predict = max(scoreList, key=scoreList.get)
#     print(i_th, "th :", (l, predict))
#     i_th += 1
    
    true_label.append(l)
    pred_label.append(predict)
    
print("Warning: Degenerate mixture covariance")

i_th : (True_label, pred_label)


### 3.5 Result / Metrics

In [155]:
# confusion matrix in sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(labels)

# actual values
actual = true_label
# predicted values
predicted = pred_label

# confusion matrix
matrix = confusion_matrix(actual,predicted, labels=labels)
print('\nConfusion matrix on test data: \n',matrix)

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(actual,predicted,labels=labels)
print('\nClassification report on test data: \n',matrix)

['sil', '7', 'tram', '6', 'muoi', '9', 'trieu', '8', '3', '2', 'nghin', 'linh', '4', '5', 'tu', 'mot', 'm1', 'lam', '1', '0']

Confusion matrix on test data: 
 [[285   0   0   0   7   0   0   0   0   1   1   0   1   1   0   0   0   0
    1   0]
 [  0  92   0   0   2   0   0   0   2   1   0   0   0   0   0   0   0   0
    0   0]
 [  0   0 289   2   0   0   1   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   3  82   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0 217   0   0   0   0   0   2   3   0   1   1   0  17   0
    0   0]
 [  0   0   1   0   0  77   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   1  98   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   0 106   1   1   0   0   0   1   0   0   0   0
    0   0]
 [  0   1   0   0   0   0   0   0  84   3   0   0   0   1   0   0   0   0
    0   0]
 [  0   0   0   0   1   0   0   0   3  91   0   0   0   1   0   0   0   0
 